In [2]:
import os
import pandas as pd
import numpy as np
import math
from matplotlib.pyplot import *
import matplotlib.pyplot as plt
from matplotlib import animation
from matplotlib import cm
from parse import *
import seaborn as sns

In [3]:
CHUNKSIZE = 100000

TRAIN_PATH        = "../train_StationPathInfo.csv"
TRAIN_PATH_EX     = "../train_StationPathInfoEx.csv"
TEST_PATH         = "../test_StationPathInfo.csv"
TEST_PATH_EX      = "../test_StationPathInfoEx.csv"

TRAIN_DATE        = "../train_date.csv"
TRAIN_NUMERIC     = "../train_numeric.csv"
TRAIN_CATEGORICAL = "../train_categorical.csv"
TEST_DATE         = "../test_date.csv"
TEST_NUMERIC      = "../test_numeric.csv"
TEST_CATEGORICAL  = "../test_categorical.csv"

SEED = 0
CHUNKSIZE = 50000
NROWS = 1200000

ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
    
    

In [4]:
chunk_train_num = pd.read_csv(TRAIN_NUMERIC, nrows=NROWS, chunksize=CHUNKSIZE)
chunk_test_num  = pd.read_csv(TEST_NUMERIC, nrows=NROWS, chunksize=CHUNKSIZE)
chunk_train_cat = pd.read_csv(TRAIN_CATEGORICAL, nrows=NROWS, chunksize=CHUNKSIZE)
chunk_test_cat  = pd.read_csv(TEST_CATEGORICAL, nrows=NROWS, chunksize=CHUNKSIZE)

df_train_num = chunk_train_num.get_chunk(10)
df_test_num  = chunk_test_num.get_chunk(10)
df_train_cat = chunk_train_cat.get_chunk(10)
df_test_cat  = chunk_test_cat.get_chunk(10)

df_train_num.head()

Unnamed: 0,Id,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,...,L3_S50_F4245,L3_S50_F4247,L3_S50_F4249,L3_S50_F4251,L3_S50_F4253,L3_S51_F4256,L3_S51_F4258,L3_S51_F4260,L3_S51_F4262,Response
0,4,0.03,-0.034,-0.197,-0.179,0.118,0.116,-0.015,-0.032,0.02,...,,,,,,,,,,0
1,6,,,,,,,,,,...,,,,,,,,,,0
2,7,0.088,0.086,0.003,-0.052,0.161,0.025,-0.015,-0.072,-0.225,...,,,,,,,,,,0
3,9,-0.036,-0.064,0.294,0.33,0.074,0.161,0.022,0.128,-0.026,...,,,,,,,,,,0
4,11,-0.055,-0.086,0.294,0.33,0.118,0.025,0.03,0.168,-0.169,...,,,,,,,,,,0


In [5]:
STATION_NUM = 52

num_cols = df_train_num.columns
cat_cols = df_train_cat.columns

num_feature_list = []
cat_feature_list = []

for cur_station_id in range(STATION_NUM):
    num_feature_list_one = []
    cat_feature_list_one = []
    for column_name in num_cols:
        if column_name != 'Id':
            r = parse("L{}_S{}_F{}", column_name)
            if r is not None:
                col_line_id    = int(r[0])
                col_station_id = int(r[1])
                if cur_station_id == col_station_id:
                    num_feature_list_one.append(column_name)
    for column_name in cat_cols:
        if column_name != 'Id':
            r = parse("L{}_S{}_F{}", column_name)
            if r is not None:
                col_line_id    = int(r[0])
                col_station_id = int(r[1])
                if cur_station_id == col_station_id:
                    cat_feature_list_one.append(column_name)
        
    num_feature_list.append(num_feature_list_one)
    cat_feature_list.append(cat_feature_list_one)

In [6]:
for i in range(STATION_NUM):
    print(str(i) + '   ' + str(len(num_feature_list[i])) + '   ' + str(len(cat_feature_list[i])))

0   12   0
1   2   4
2   9   18
3   9   18
4   2   6
5   2   0
6   3   10
7   3   0
8   3   0
9   12   39
10   12   39
11   12   26
12   12   0
13   2   0
14   9   18
15   9   9
16   2   6
17   2   0
18   3   10
19   3   0
20   3   0
21   14   45
22   14   45
23   14   30
24   229   660
25   284   567
26   14   53
27   14   53
28   14   53
29   53   63
30   68   204
31   4   8
32   1   3
33   10   0
34   4   0
35   8   18
36   8   8
37   4   0
38   3   6
39   4   8
40   8   0
41   14   0
42   0   24
43   8   24
44   8   8
45   5   0
46   0   3
47   11   33
48   6   0
49   7   21
50   7   0
51   4   0


S24, S25, S30 の feature が多いので、ここを外すと普通に扱えそう…
だが、そんなことはなかった。
feature importance で上位に来る、29, 30, 31, 33 の numeric でやってみる。

### 通過ステーション情報・時刻情報を読み出す。

In [10]:
# 読み出す。

df_train_bin = pd.read_csv(TRAIN_PATH)
df_test_bin  = pd.read_csv(TEST_PATH)
# df_pass = pd.concat([df_train_bin, df_test_bin])
df_pass = df_train_bin

In [11]:
# 時系列でソートしたヒートマップを出してみる。

TEST_START_TIME      = "../test_StartEndTime.csv"
TRAIN_START_TIME     = "../train_StartEndTime.csv"

df_start_train = pd.read_csv(TRAIN_START_TIME)
df_start_train = df_start_train.ix[:,['Id','StartTime','EndTime', 'Response']]
df_start_test = pd.read_csv(TEST_START_TIME)
df_start_test['Response'] = -1
df_time = pd.concat([df_start_train, df_start_test])


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  import sys


## station のnumerical data, categorical data とマージする。

### station 0編


S0 は Categorical Feature がない。

### station 1 編

In [13]:
df_station1_train = pd.read_csv("./train_categorical_station_1.csv")
df_station1_all = pd.merge(df_pass, df_time, on='Id', how='left')
df_station1_all = pd.merge(df_station1_all, df_station1_train, on = 'Id', how='left')

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
df_station1_pass = df_station1_all[df_station1_all['L0_S1_D26'] ==1]
df_station1_pass

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S49_D4208,L3_S50_D4242,L3_S51_D4255,StartTime,EndTime,Response,L0_S1_F25,L0_S1_F27,L0_S1_F29,L0_S1_F31
0,4,1,1,1,0,1,0,0,1,1,...,0,0,0,82.24,87.29,0,,,,
2,7,1,1,1,0,0,1,1,0,1,...,0,0,0,1618.70,1624.42,0,,,,
3,9,1,1,1,0,1,0,0,1,1,...,0,0,0,1149.20,1154.16,0,,,,
4,11,1,1,0,1,1,0,0,1,1,...,0,0,0,602.64,606.02,0,,,,
5,13,1,1,0,1,1,0,0,1,1,...,0,0,0,1331.66,1339.73,0,,,,
8,18,1,1,1,0,1,0,0,1,1,...,0,0,0,517.64,518.08,0,,,,
10,26,1,1,1,0,1,0,0,1,1,...,0,0,0,1104.78,1105.95,0,,,,
11,27,1,1,0,1,0,1,1,0,1,...,0,0,0,392.85,401.41,0,,,,
12,28,1,1,0,1,1,0,0,1,1,...,0,0,0,55.44,62.10,0,,,,
13,31,1,1,1,0,1,0,1,0,1,...,0,0,0,98.99,99.67,0,,,,


S1 を通過したものでも、S1のCategorical Data の計測値がNanなものがある。<br>
Nanかどうかは使えるかもしれない。

In [15]:
df_station1_valid = df_station1_all[df_station1_all['L0_S0_D1'] ==1]
df_station1_valid_1 = df_station1_valid.dropna(subset=['L0_S1_F25'])
df_station1_valid_1

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S49_D4208,L3_S50_D4242,L3_S51_D4255,StartTime,EndTime,Response,L0_S1_F25,L0_S1_F27,L0_S1_F29,L0_S1_F31
7800,15498,1,1,0,1,0,1,1,0,1,...,0,0,0,263.44,265.22,0,T1,T9,T1,T9
12063,24042,1,1,1,0,1,0,1,0,1,...,0,0,0,622.81,625.46,0,T1,T9,T1,T9
41533,83154,1,1,0,1,0,1,0,1,1,...,0,0,0,263.43,265.22,0,T1,T9,T1,T9
41534,83155,1,1,0,1,0,1,0,1,1,...,0,0,0,263.43,266.37,0,T1,T9,T1,T9
51457,103078,1,1,0,1,0,1,0,1,1,...,0,0,0,1661.83,1661.97,0,T1,T9,T1,T9
91731,183688,1,1,1,0,1,0,0,1,1,...,0,0,0,267.34,271.49,0,T1,T9,T1,T9
96543,193268,1,1,0,1,0,1,0,1,1,...,0,0,0,1661.83,1661.96,0,T1,T9,T1,T9
159886,319566,1,1,0,1,0,1,0,1,1,...,0,0,0,640.33,652.81,0,T1,T9,T1,T9
278255,556400,1,1,0,1,1,0,1,0,1,...,0,0,0,640.33,652.81,0,T1,T9,T1,T9
304205,608617,1,1,0,1,0,1,0,1,1,...,0,0,0,623.3,626.04,0,T1,T9,T1,T9


有効な値が入ってる行は、すべて同じ値になっている。<br>
Nanかどうかは使えるかもしれない。

Nanかどうかで値の分布をみる。
T0 以外のものはResponse=0と、値の分布に差はあるが、そもそも母数が少ないので使い物にならないかも。

In [25]:
df_station1_T0 = (df_station1_valid.fillna('T0'))
df_station1_T0 = df_station1_T0[df_station1_T0['L0_S1_F25'] == 'T0']
ratio1 = (len(df_station1_T0[df_station1_T0['Response']==1])) / len(df_station1_T0)
print(ratio1)

df_station1_T_not0 = df_station1_valid.dropna()
ratio2 = (len(df_station1_T_not0[df_station1_T_not0['Response']==1])) / len(df_station1_T_not0)
print(ratio2)


0.005354410636400198
0.0


In [40]:
del df_station1_train
del df_station1_all
del df_station1_valid_1
del df_station1_pass

### station 2 編

In [43]:
df_station2_train = pd.read_csv("./train_categorical_station_2.csv")
df_station2_all = pd.merge(df_pass, df_time, on='Id', how='left')
df_station2_all = pd.merge(df_station2_all, df_station2_train, on = 'Id', how='left')
df_station2_pass = df_station2_all[df_station2_all['L0_S2_D34'] ==1]
df_station2_pass

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L0_S2_F49,L0_S2_F51,L0_S2_F53,L0_S2_F55,L0_S2_F57,L0_S2_F59,L0_S2_F61,L0_S2_F63,L0_S2_F65,L0_S2_F67
0,4,1,1,1,0,1,0,0,1,1,...,,,,,,,,,,
2,7,1,1,1,0,0,1,1,0,1,...,,,,,,,,,,
3,9,1,1,1,0,1,0,0,1,1,...,,,,,,,,,,
8,18,1,1,1,0,1,0,0,1,1,...,,,,,,,,,,
10,26,1,1,1,0,1,0,0,1,1,...,,,,,,,,,,
13,31,1,1,1,0,1,0,1,0,1,...,,,,,,,,,,
14,34,1,1,1,0,0,1,1,0,1,...,,,,,,,,,,
15,38,1,1,1,0,1,0,0,1,1,...,,,,,,,,,,
17,44,1,1,1,0,0,1,0,1,1,...,,,,,,,,,,
22,56,1,1,1,0,1,0,0,1,1,...,,,,,,,,,,


In [63]:
df_station2_valid = df_station2_all[df_station2_all['L0_S0_D1'] ==1]
df_station2_valid_2 = df_station2_valid.dropna(subset=['L0_S2_F49'])
df_station2_valid_2

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L0_S2_F49,L0_S2_F51,L0_S2_F53,L0_S2_F55,L0_S2_F57,L0_S2_F59,L0_S2_F61,L0_S2_F63,L0_S2_F65,L0_S2_F67
8522,16927,1,1,1,1,1,0,1,0,1,...,T1,T32,T1,T32,T1,T32,T1,T32,T1,T32
10846,21591,1,1,1,1,0,1,1,0,1,...,T1,T32,T1,T32,T1,T32,T1,T32,T1,T32
17811,35611,1,1,1,1,0,1,0,1,1,...,T1,T32,T1,T32,T1,T32,T1,T32,T1,T32
19349,38775,1,1,1,1,1,0,1,0,1,...,T1,T32,T1,T32,T1,T32,T1,T32,T1,T32
19755,39584,1,1,1,1,0,1,0,1,1,...,T1,T32,T1,T32,T1,T32,T1,T32,T1,T32
21431,42908,1,1,1,1,0,1,0,1,1,...,T1,T32,T1,T32,T1,T32,T1,T32,T1,T32
27023,54135,1,1,1,1,1,0,0,1,1,...,T1,T32,T1,T32,T1,T32,T1,T32,T1,T32
34487,69159,1,1,1,1,1,0,1,0,1,...,T1,T48,T1,T48,T1,T48,T1,T48,T1,T48
38022,76191,1,1,1,1,0,1,1,0,1,...,T1,T96,T1,T96,T1,T96,T1,T96,T1,T96
38023,76192,1,1,1,1,0,1,1,0,1,...,T1,T96,T1,T96,T1,T96,T1,T96,T1,T96


In [47]:
df_station2_valid_2_rsp1 = df_station2_valid_2[df_station2_valid_2['Response'] == 1]
df_station2_valid_2_rsp1

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L0_S2_F49,L0_S2_F51,L0_S2_F53,L0_S2_F55,L0_S2_F57,L0_S2_F59,L0_S2_F61,L0_S2_F63,L0_S2_F65,L0_S2_F67
235258,470536,1,1,1,1,0,1,1,0,1,...,T1,T48,T1,T48,T1,T48,T1,T48,T1,T48
974489,1948933,1,1,1,1,1,0,1,0,1,...,T1,T32,T1,T32,T1,T32,T1,T32,T1,T32
1122775,2245875,1,1,1,1,0,1,0,1,1,...,T1,T32,T1,T32,T1,T32,T1,T32,T1,T32


In [48]:
df_station2_valid_2_rsp0 = df_station2_valid_2[df_station2_valid_2['Response'] == 0]
df_station2_valid_2_rsp0

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L0_S2_F49,L0_S2_F51,L0_S2_F53,L0_S2_F55,L0_S2_F57,L0_S2_F59,L0_S2_F61,L0_S2_F63,L0_S2_F65,L0_S2_F67
8522,16927,1,1,1,1,1,0,1,0,1,...,T1,T32,T1,T32,T1,T32,T1,T32,T1,T32
10846,21591,1,1,1,1,0,1,1,0,1,...,T1,T32,T1,T32,T1,T32,T1,T32,T1,T32
17811,35611,1,1,1,1,0,1,0,1,1,...,T1,T32,T1,T32,T1,T32,T1,T32,T1,T32
19349,38775,1,1,1,1,1,0,1,0,1,...,T1,T32,T1,T32,T1,T32,T1,T32,T1,T32
19755,39584,1,1,1,1,0,1,0,1,1,...,T1,T32,T1,T32,T1,T32,T1,T32,T1,T32
21431,42908,1,1,1,1,0,1,0,1,1,...,T1,T32,T1,T32,T1,T32,T1,T32,T1,T32
27023,54135,1,1,1,1,1,0,0,1,1,...,T1,T32,T1,T32,T1,T32,T1,T32,T1,T32
34487,69159,1,1,1,1,1,0,1,0,1,...,T1,T48,T1,T48,T1,T48,T1,T48,T1,T48
38022,76191,1,1,1,1,0,1,1,0,1,...,T1,T96,T1,T96,T1,T96,T1,T96,T1,T96
38023,76192,1,1,1,1,0,1,1,0,1,...,T1,T96,T1,T96,T1,T96,T1,T96,T1,T96


0 と 1 で有意な差分が見られない。
あと、列単位で全部同じものが入ってるように見える。

In [61]:
df_station2_valid_2_rsp0.T.drop_duplicates().T


Unnamed: 0,Id,L0_S0_D1,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S9_D152,L0_S10_D216,L0_S11_D280,L0_S12_D331,...,L3_S38_D3953,L3_S39_D3966,L3_S43_D4062,L3_S44_D4101,L3_S49_D4208,L3_S50_D4242,StartTime,EndTime,L0_S2_F33,L0_S2_F35
8522,16927,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,912.75,921.3,T1,T32
10846,21591,1,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,80.54,87.21,T1,T32
17811,35611,1,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,1374.03,1376.27,T1,T32
19349,38775,1,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,653.82,655.43,T1,T32
19755,39584,1,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,1624.84,1625.95,T1,T32
21431,42908,1,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,1660.44,1661.92,T1,T32
27023,54135,1,1,0,0,1,0,1,0,0,...,1,0,0,0,0,0,670.48,673.43,T1,T32
34487,69159,1,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1365.11,1367.17,T1,T48
38022,76191,1,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,1658.32,1660.23,T1,T96
38023,76192,1,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,1658.32,1660.23,T1,T96


S2の有効な特徴量は、列方向に全部重複しててT1とT32/T48/T96/T128 しかない…

### station3 編

In [70]:
df_station3_train = pd.read_csv("./train_categorical_station_3.csv")
df_station3_all = pd.merge(df_pass, df_time, on='Id', how='left')
df_station3_all = pd.merge(df_station3_all, df_station3_train, on = 'Id', how='left')
df_station3_pass = df_station3_all[df_station3_all['L0_S3_D70'] ==1]
df_station3_pass

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L0_S3_F85,L0_S3_F87,L0_S3_F89,L0_S3_F91,L0_S3_F93,L0_S3_F95,L0_S3_F97,L0_S3_F99,L0_S3_F101,L0_S3_F103
4,11,1,1,0,1,1,0,0,1,1,...,,,,,,,,,,
5,13,1,1,0,1,1,0,0,1,1,...,,,,,,,,,,
11,27,1,1,0,1,0,1,1,0,1,...,,,,,,,,,,
12,28,1,1,0,1,1,0,0,1,1,...,,,,,,,,,,
19,49,1,1,0,1,1,0,1,0,1,...,,,,,,,,,,
25,68,1,1,0,1,0,1,0,1,1,...,,,,,,,,,,
27,71,1,1,0,1,1,0,0,1,1,...,,,,,,,,,,
32,78,1,1,0,1,0,1,0,1,1,...,,,,,,,,,,
34,81,1,1,0,1,1,0,0,1,1,...,,,,,,,,,,
37,86,1,1,0,1,0,1,0,1,1,...,,,,,,,,,,


In [94]:
df_station3_valid_3 = df_station3_pass.dropna(subset=['L0_S3_F69'])
df_station3_valid_3

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L0_S3_F85,L0_S3_F87,L0_S3_F89,L0_S3_F91,L0_S3_F93,L0_S3_F95,L0_S3_F97,L0_S3_F99,L0_S3_F101,L0_S3_F103


station3 の categorical data はNanばっかり?
全列試してそうなってた…


### station4 編

In [108]:
df_station4_train = pd.read_csv("./train_categorical_station_4.csv")
df_station4_all = pd.merge(df_pass, df_time, on='Id', how='left')
df_station4_all = pd.merge(df_station4_all, df_station4_train, on = 'Id', how='left')
df_station4_pass = df_station4_all[df_station4_all['L0_S4_D106'] ==1]
df_station4_pass

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S51_D4255,StartTime,EndTime,Response,L0_S4_F105,L0_S4_F107,L0_S4_F108,L0_S4_F110,L0_S4_F112,L0_S4_F113
0,4,1,1,1,0,1,0,0,1,1,...,0,82.24,87.29,0,,,,,,
3,9,1,1,1,0,1,0,0,1,1,...,0,1149.20,1154.16,0,,,,,,
4,11,1,1,0,1,1,0,0,1,1,...,0,602.64,606.02,0,,,,,,
5,13,1,1,0,1,1,0,0,1,1,...,0,1331.66,1339.73,0,,,,,,
8,18,1,1,1,0,1,0,0,1,1,...,0,517.64,518.08,0,,,,,,
10,26,1,1,1,0,1,0,0,1,1,...,0,1104.78,1105.95,0,,,,,,
12,28,1,1,0,1,1,0,0,1,1,...,0,55.44,62.10,0,,,,,,
13,31,1,1,1,0,1,0,1,0,1,...,0,98.99,99.67,0,,,,,,
15,38,1,1,1,0,1,0,0,1,1,...,0,1633.80,1636.37,0,,,,,,
19,49,1,1,0,1,1,0,1,0,1,...,0,675.84,677.68,0,,,,,,


In [109]:
df_station4_valid_4 = df_station4_pass.dropna(subset=['L0_S4_F105'])
df_station4_valid_4

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S51_D4255,StartTime,EndTime,Response,L0_S4_F105,L0_S4_F107,L0_S4_F108,L0_S4_F110,L0_S4_F112,L0_S4_F113
8223,16330,1,1,1,0,1,1,0,1,1,...,0,1111.22,1112.39,0,T1,T1,T97,,,
35936,72081,1,1,0,1,1,1,0,1,1,...,0,1528.77,1530.59,0,T1,T1,T98,T1,T1,T98
39231,78568,1,1,0,1,1,1,1,0,1,...,0,375.83,382.08,0,T1,T1,T97,,,
40929,81956,1,1,0,1,1,1,0,1,1,...,0,314.31,318.82,0,T1,,T98,T1,,T98
48205,96421,1,1,0,1,1,1,1,0,1,...,0,885.81,888.99,1,T1,,T98,T1,,T98
58866,117847,1,1,0,1,1,1,0,1,1,...,1,508.42,510.16,0,T1,T1,T98,T1,T1,T98
67420,134878,1,1,1,0,1,1,0,1,1,...,0,972.51,975.04,0,T1,T1,T97,,,
80736,161884,1,1,0,1,1,1,1,0,1,...,0,1111.22,1112.39,0,T1,,T97,,,
108831,217931,1,1,0,1,1,1,0,1,1,...,0,1163.86,1166.49,0,T1,T1,T97,,,
126967,253729,1,1,0,1,1,1,0,1,1,...,0,625.27,637.64,0,T1,T1,T97,,,


In [110]:
df_station4_valid_4[df_station4_valid_4['Response']==1]



Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S51_D4255,StartTime,EndTime,Response,L0_S4_F105,L0_S4_F107,L0_S4_F108,L0_S4_F110,L0_S4_F112,L0_S4_F113
48205,96421,1,1,0,1,1,1,1,0,1,...,0,885.81,888.99,1,T1,,T98,T1,,T98


In [111]:
df_station4_valid_4[df_station4_valid_4['Response']==0]

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S51_D4255,StartTime,EndTime,Response,L0_S4_F105,L0_S4_F107,L0_S4_F108,L0_S4_F110,L0_S4_F112,L0_S4_F113
8223,16330,1,1,1,0,1,1,0,1,1,...,0,1111.22,1112.39,0,T1,T1,T97,,,
35936,72081,1,1,0,1,1,1,0,1,1,...,0,1528.77,1530.59,0,T1,T1,T98,T1,T1,T98
39231,78568,1,1,0,1,1,1,1,0,1,...,0,375.83,382.08,0,T1,T1,T97,,,
40929,81956,1,1,0,1,1,1,0,1,1,...,0,314.31,318.82,0,T1,,T98,T1,,T98
58866,117847,1,1,0,1,1,1,0,1,1,...,1,508.42,510.16,0,T1,T1,T98,T1,T1,T98
67420,134878,1,1,1,0,1,1,0,1,1,...,0,972.51,975.04,0,T1,T1,T97,,,
80736,161884,1,1,0,1,1,1,1,0,1,...,0,1111.22,1112.39,0,T1,,T97,,,
108831,217931,1,1,0,1,1,1,0,1,1,...,0,1163.86,1166.49,0,T1,T1,T97,,,
126967,253729,1,1,0,1,1,1,0,1,1,...,0,625.27,637.64,0,T1,T1,T97,,,
134431,268628,1,1,1,0,1,1,0,1,1,...,0,885.85,888.95,0,T1,T1,T98,T1,T1,T98


F105, F108 は通過したものに値があるが、いずれにせよサンプルが少なすぎて使えないか。

### statoin 32 編

In [115]:
df_station32_train = pd.read_csv("./train_categorical_station_32.csv")
df_station32_all = pd.merge(df_pass, df_time, on='Id', how='left')
df_station32_all = pd.merge(df_station32_all, df_station32_train, on = 'Id', how='left')
df_station32_pass = df_station32_all[df_station32_all['L3_S32_D3852'] ==1]
df_station32_pass

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S48_D4194,L3_S49_D4208,L3_S50_D4242,L3_S51_D4255,StartTime,EndTime,Response,L3_S32_F3851,L3_S32_F3853,L3_S32_F3854
55,116,0,0,0,0,0,0,0,0,0,...,0,0,0,0,555.38,556.25,0,T1,,T16
71,146,1,1,0,1,0,1,0,1,1,...,0,0,0,0,1164.57,1170.89,0,T1,,T16
80,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,456.80,493.38,0,T1,,T256
141,293,1,1,1,0,0,1,0,1,1,...,0,0,0,0,1339.10,1342.73,0,,,
321,616,1,1,1,0,0,1,1,0,1,...,0,0,0,0,1575.48,1578.18,0,T1,,T128
424,825,1,1,0,1,1,0,0,1,1,...,0,0,0,0,1154.04,1155.52,0,T1,,T128
552,1078,1,1,0,1,1,0,1,0,1,...,0,0,0,0,677.64,686.48,0,T1,,T512
587,1150,1,1,1,0,0,1,1,0,1,...,0,0,0,0,1446.27,1448.33,0,,,
633,1250,1,1,0,1,1,0,0,1,1,...,0,0,0,0,619.18,622.21,1,T1,,T2
634,1251,1,1,0,1,1,0,0,1,1,...,0,0,0,0,619.18,625.71,0,T1,,T2


In [122]:
df_station32_pass[df_station32_pass['Response']==1]


Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S48_D4194,L3_S49_D4208,L3_S50_D4242,L3_S51_D4255,StartTime,EndTime,Response,L3_S32_F3851,L3_S32_F3853,L3_S32_F3854
633,1250,1,1,0,1,1,0,0,1,1,...,0,0,0,0,619.18,622.21,1,T1,,T2
2489,5017,0,0,0,0,0,0,0,0,0,...,0,0,0,0,712.63,755.72,1,T1,,T4
2490,5018,0,0,0,0,0,0,0,0,0,...,0,0,0,0,712.63,755.72,1,T1,,T4
3969,7944,1,1,0,1,1,0,1,0,1,...,0,0,0,0,1643.29,1646.25,1,T1,,T16
6178,12280,1,1,1,0,1,0,1,0,1,...,0,0,0,0,241.43,246.42,1,T1,,T2
10658,21212,0,0,0,0,0,0,0,0,0,...,0,0,0,0,712.61,755.70,1,T1,,T4
12231,24386,0,0,0,0,0,0,0,0,0,...,0,0,0,0,675.21,677.43,1,T1,,T16
12232,24387,0,0,0,0,0,0,0,0,0,...,0,0,0,0,675.21,677.43,1,T1,,T16
13228,26379,1,1,1,0,1,0,0,1,1,...,0,0,0,0,1426.75,1428.01,1,T1,,T48
13537,27071,0,0,0,0,0,0,0,0,0,...,0,0,0,0,197.52,198.22,1,T1,,T2


In [117]:
df_station32_pass[df_station32_pass['Response']==0]

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S48_D4194,L3_S49_D4208,L3_S50_D4242,L3_S51_D4255,StartTime,EndTime,Response,L3_S32_F3851,L3_S32_F3853,L3_S32_F3854
55,116,0,0,0,0,0,0,0,0,0,...,0,0,0,0,555.38,556.25,0,T1,,T16
71,146,1,1,0,1,0,1,0,1,1,...,0,0,0,0,1164.57,1170.89,0,T1,,T16
80,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,456.80,493.38,0,T1,,T256
141,293,1,1,1,0,0,1,0,1,1,...,0,0,0,0,1339.10,1342.73,0,,,
321,616,1,1,1,0,0,1,1,0,1,...,0,0,0,0,1575.48,1578.18,0,T1,,T128
424,825,1,1,0,1,1,0,0,1,1,...,0,0,0,0,1154.04,1155.52,0,T1,,T128
552,1078,1,1,0,1,1,0,1,0,1,...,0,0,0,0,677.64,686.48,0,T1,,T512
587,1150,1,1,1,0,0,1,1,0,1,...,0,0,0,0,1446.27,1448.33,0,,,
634,1251,1,1,0,1,1,0,0,1,1,...,0,0,0,0,619.18,625.71,0,T1,,T2
689,1366,0,0,0,0,0,0,0,0,0,...,0,0,0,0,624.13,626.60,0,T1,,T512


In [123]:
g = df_station32_pass[df_station32_pass['Response']==1].groupby('L3_S32_F3854')
g.count()

Unnamed: 0_level_0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S47_D4140,L3_S48_D4194,L3_S49_D4208,L3_S50_D4242,L3_S51_D4255,StartTime,EndTime,Response,L3_S32_F3851,L3_S32_F3853
L3_S32_F3854,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T-2147482432,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,0
T-2147482688,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,0
T-2147482816,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,0
T-2147483648,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,0
T1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,0
T128,31,31,31,31,31,31,31,31,31,31,...,31,31,31,31,31,31,31,31,31,0
T16,159,159,159,159,159,159,159,159,159,159,...,159,159,159,159,159,159,159,159,159,0
T2,667,667,667,667,667,667,667,667,667,667,...,667,667,667,667,667,667,667,667,667,0
T256,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,0
T4,156,156,156,156,156,156,156,156,156,156,...,156,156,156,156,156,156,156,156,156,0


In [124]:
g = df_station32_pass[df_station32_pass['Response']==0].groupby('L3_S32_F3854')
g.count()

Unnamed: 0_level_0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S47_D4140,L3_S48_D4194,L3_S49_D4208,L3_S50_D4242,L3_S51_D4255,StartTime,EndTime,Response,L3_S32_F3851,L3_S32_F3853
L3_S32_F3854,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T-2147481664,34,34,34,34,34,34,34,34,34,34,...,34,34,34,34,34,34,34,34,34,0
T-21474819,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,0
T-2147482176,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,0
T-2147482432,371,371,371,371,371,371,371,371,371,371,...,371,371,371,371,371,371,371,371,371,1
T-21474825,12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,0
T-2147482688,363,363,363,363,363,363,363,363,363,363,...,363,363,363,363,363,363,363,363,363,1
T-2147482816,383,383,383,383,383,383,383,383,383,383,...,383,383,383,383,383,383,383,383,383,0
T-2147482944,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,0
T-2147483648,627,627,627,627,627,627,627,627,627,627,...,627,627,627,627,627,627,627,627,627,0
T-21474872,115,115,115,115,115,115,115,115,115,115,...,115,115,115,115,115,115,115,115,115,0


偏りがあるようなないような。ただ、L3_S32_F3854 は有効な feature にはなるかもしれない。

### station31編

In [125]:
df_station31_train = pd.read_csv("./train_categorical_station_31.csv")
df_station31_all = pd.merge(df_pass, df_time, on='Id', how='left')
df_station31_all = pd.merge(df_station31_all, df_station31_train, on = 'Id', how='left')
df_station31_pass = df_station31_all[df_station31_all['L3_S31_D3836'] ==1]
df_station31_pass

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,EndTime,Response,L3_S31_F3835,L3_S31_F3837,L3_S31_F3839,L3_S31_F3841,L3_S31_F3843,L3_S31_F3845,L3_S31_F3847,L3_S31_F3849
0,4,1,1,1,0,1,0,0,1,1,...,87.29,0,,,,,,,,
13,31,1,1,1,0,1,0,1,0,1,...,99.67,0,,,,,,,,
31,76,0,0,0,0,0,0,0,0,0,...,120.66,0,,,,,,,,
32,78,1,1,0,1,0,1,0,1,1,...,146.15,0,,,,,,,,
51,106,1,1,1,0,0,1,1,0,1,...,65.73,0,,,,,,,,
54,115,1,1,0,1,1,0,1,0,1,...,87.18,0,,,,,,,,
60,124,0,0,0,0,0,0,0,0,0,...,89.91,0,,,,,,,,
65,135,1,1,0,1,1,0,1,0,1,...,140.03,0,,,,,,,,
72,152,1,1,1,0,1,0,0,1,1,...,99.16,0,,,,,,,,
73,153,1,1,1,0,1,0,0,1,1,...,99.16,0,,,,,,,,


In [134]:
df_station31_valid = df_station31_pass.dropna(subset=['L3_S31_F3849'])
df_station31_valid

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,EndTime,Response,L3_S31_F3835,L3_S31_F3837,L3_S31_F3839,L3_S31_F3841,L3_S31_F3843,L3_S31_F3845,L3_S31_F3847,L3_S31_F3849
34112,68412,1,1,1,0,0,1,0,1,1,...,118.66,0,T1,T128,T1,T128,T1,T128,T1,T128
411772,824035,1,1,0,1,0,1,0,1,1,...,97.82,0,T1,T128,T1,T128,T1,T128,T1,T128
540001,1080678,1,1,0,1,1,0,0,1,1,...,171.8,0,T1,T128,T1,T128,T1,T128,T1,T128
732909,1466068,0,0,0,0,0,0,0,0,0,...,130.91,0,T1,T128,T1,T128,T1,T128,T1,T128
1026421,2053110,1,1,0,1,1,0,0,1,1,...,118.62,0,T1,T128,T1,T128,T1,T128,T1,T128
1078115,2156582,1,1,1,0,0,1,0,1,1,...,28.89,0,T1,T128,T1,T128,T1,T128,T1,T128
1113328,2226927,1,1,0,1,0,1,1,0,1,...,118.63,0,T1,T128,T1,T128,T1,T128,T1,T128


有効な特徴がはいっているものが7つしかない。
有効なものには同じ値になっていて、使えない。

### station 46編

In [137]:
df_station46_train = pd.read_csv("./train_categorical_station_46.csv")
df_station46_all = pd.merge(df_pass, df_time, on='Id', how='left')
df_station46_all = pd.merge(df_station46_all, df_station46_train, on = 'Id', how='left')
df_station46_pass = df_station46_all[df_station46_all['L3_S46_D4135'] ==1]
df_station46_pass

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S48_D4194,L3_S49_D4208,L3_S50_D4242,L3_S51_D4255,StartTime,EndTime,Response,L3_S46_F4134,L3_S46_F4136,L3_S46_F4137
1171451,2342862,1,1,1,0,1,0,1,0,1,...,1,1,0,1,566.34,570.44,0,T3,,


In [140]:
df_station46_1 = df_station46_all.dropna(subset=['L3_S46_F4134'])
df_station46_1

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S48_D4194,L3_S49_D4208,L3_S50_D4242,L3_S51_D4255,StartTime,EndTime,Response,L3_S46_F4134,L3_S46_F4136,L3_S46_F4137
1171451,2342862,1,1,1,0,1,0,1,0,1,...,1,1,0,1,566.34,570.44,0,T3,,


In [141]:
df_station46_2 = df_station46_all.dropna(subset=['L3_S46_F4136'])
df_station46_2

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S48_D4194,L3_S49_D4208,L3_S50_D4242,L3_S51_D4255,StartTime,EndTime,Response,L3_S46_F4134,L3_S46_F4136,L3_S46_F4137


In [142]:
df_station46_3 = df_station46_all.dropna(subset=['L3_S46_F4137'])
df_station46_3

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S48_D4194,L3_S49_D4208,L3_S50_D4242,L3_S51_D4255,StartTime,EndTime,Response,L3_S46_F4134,L3_S46_F4136,L3_S46_F4137


station46 の categorical data は、1行を除いてすべて Nan

In [112]:
df_pass.columns

Index(['Id', 'L0_S0_D1', 'L0_S1_D26', 'L0_S2_D34', 'L0_S3_D70', 'L0_S4_D106',
       'L0_S5_D115', 'L0_S6_D120', 'L0_S7_D137', 'L0_S8_D145', 'L0_S9_D152',
       'L0_S10_D216', 'L0_S11_D280', 'L0_S12_D331', 'L0_S13_D355',
       'L0_S14_D360', 'L0_S15_D395', 'L0_S16_D423', 'L0_S17_D432',
       'L0_S18_D437', 'L0_S19_D454', 'L0_S20_D462', 'L0_S21_D469',
       'L0_S22_D543', 'L0_S23_D617', 'L1_S24_D677', 'L1_S25_D1854',
       'L2_S26_D3037', 'L2_S27_D3130', 'L2_S28_D3223', 'L3_S29_D3316',
       'L3_S30_D3496', 'L3_S31_D3836', 'L3_S32_D3852', 'L3_S33_D3856',
       'L3_S34_D3875', 'L3_S35_D3886', 'L3_S36_D3919', 'L3_S37_D3942',
       'L3_S38_D3953', 'L3_S39_D3966', 'L3_S40_D3981', 'L3_S41_D3997',
       'L3_S42_D4029', 'L3_S43_D4062', 'L3_S44_D4101', 'L3_S45_D4125',
       'L3_S46_D4135', 'L3_S47_D4140', 'L3_S48_D4194', 'L3_S49_D4208',
       'L3_S50_D4242', 'L3_S51_D4255'],
      dtype='object')

In [30]:
df_cat_train = pd.read_csv(TEST_CATEGORICAL,  
                            usecols=use_cat_columns, nrows=NROWS)
df_cat_test  = pd.read_csv(TEST_CATEGORICAL,  
                            usecols=use_cat_columns, nrows=NROWS)

  interactivity=interactivity, compiler=compiler, result=result)


In [31]:
df_num_train = pd.read_csv(TEST_NUMERIC,  
                            usecols=use_num_columns, nrows=NROWS)
df_num_test  = pd.read_csv(TEST_NUMERIC,  
                            usecols=use_num_columns, nrows=NROWS)

In [29]:
use_station_list = [29, 30, 31, 33]

use_cat_columns = ['Id']
use_num_columns = ['Id']

for i in range(STATION_NUM):
    if i in use_station_list:
        use_cat_columns += cat_feature_list[i]
        use_num_columns += num_feature_list[i]

print(use_cat_columns)
print(use_num_columns)
print(len(use_cat_columns))
print(len(use_num_columns))

['Id', 'L3_S29_F3317', 'L3_S29_F3320', 'L3_S29_F3323', 'L3_S29_F3326', 'L3_S29_F3329', 'L3_S29_F3332', 'L3_S29_F3335', 'L3_S29_F3338', 'L3_S29_F3341', 'L3_S29_F3344', 'L3_S29_F3347', 'L3_S29_F3350', 'L3_S29_F3353', 'L3_S29_F3356', 'L3_S29_F3359', 'L3_S29_F3362', 'L3_S29_F3364', 'L3_S29_F3366', 'L3_S29_F3369', 'L3_S29_F3372', 'L3_S29_F3375', 'L3_S29_F3378', 'L3_S29_F3381', 'L3_S29_F3384', 'L3_S29_F3387', 'L3_S29_F3390', 'L3_S29_F3392', 'L3_S29_F3394', 'L3_S29_F3397', 'L3_S29_F3400', 'L3_S29_F3403', 'L3_S29_F3406', 'L3_S29_F3409', 'L3_S29_F3411', 'L3_S29_F3414', 'L3_S29_F3416', 'L3_S29_F3418', 'L3_S29_F3420', 'L3_S29_F3423', 'L3_S29_F3426', 'L3_S29_F3429', 'L3_S29_F3432', 'L3_S29_F3435', 'L3_S29_F3438', 'L3_S29_F3441', 'L3_S29_F3444', 'L3_S29_F3446', 'L3_S29_F3448', 'L3_S29_F3451', 'L3_S29_F3454', 'L3_S29_F3457', 'L3_S29_F3460', 'L3_S29_F3463', 'L3_S29_F3466', 'L3_S29_F3469', 'L3_S29_F3472', 'L3_S29_F3475', 'L3_S29_F3478', 'L3_S29_F3481', 'L3_S29_F3484', 'L3_S29_F3487', 'L3_S29_F3490', '

In [34]:
df_start_test.head()
df_start = pd.concat([df_start_train, df_start_test])
df_start.head()

Unnamed: 0,Id,StartTime,EndTime,Response
0,4,82.24,87.29,0
1,6,1313.12,1315.75,0
2,7,1618.7,1624.42,0
3,9,1149.2,1154.16,0
4,11,602.64,606.02,0


In [35]:
df_all = pd.merge(df_start, df_pass, on='Id',  how='left')
df_all = df_all.fillna(-1500)

In [36]:
df_all = pd.merge(df_all, df_num_train, on='Id', how='left')
df_all = pd.merge(df_all, df_num_test,  on='Id', how='left')


In [38]:
df_all = df_all.fillna(-1500)
df_all.head

<bound method NDFrame.head of               Id  StartTime  EndTime  Response  L0_S0_D1  L0_S1_D26  \
0              4      82.24    87.29         0         1          1   
1              6    1313.12  1315.75         0         0          0   
2              7    1618.70  1624.42         0         1          1   
3              9    1149.20  1154.16         0         1          1   
4             11     602.64   606.02         0         1          1   
5             13    1331.66  1339.73         0         1          1   
6             14    1662.63  1664.04         0         0          0   
7             16     791.22   804.36         0         0          0   
8             18     517.64   518.08         0         1          1   
9             23     156.27   157.89         0         0          0   
10            26    1104.78  1105.95         0         1          1   
11            27     392.85   401.41         0         1          1   
12            28      55.44    62.10         0 

In [39]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, log_loss, make_scorer

def calc_mcc(cf_mat):
    tn, fp, fn, tp = cf_mat.ravel()
    print(tn, fp, fn, tp)
    mcc = (tp * tn - fp * fn) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return mcc

def mcc_scorer(y_true, y_pred):
    cf_mat = confusion_matrix(y_true, y_pred)
    return calc_mcc(cf_mat)

def separate_X_y(df):
    X = df.drop(['Response'], axis=1)
    y = df['Response']
    return X, y

def train_with_r_forest(df):
    X_train, y_train = separate_X_y(df)

    cl_weight = {0:0.3, 1:0.7}
    
    # Grid Search
    params={'max_depth': [25],
            'subsample': [0.95],
            'colsample_bytree': [1.0],
            'scale_pos_weight': [2]
    }

    xgb_model = xgb.XGBClassifier()
    gs = GridSearchCV(xgb_model,
                      params,
                      cv=5,
                      scoring={'mcc' : make_scorer(mcc_scorer)},
                      n_jobs=1,
                      verbose=2,
                      refit='mcc')

    gs.fit(X_train, y_train)
#rf = RandomForestClassifier(max_depth=10,n_estimators=50, random_state=33)
#    rf.fit(X_train, y_train)
    return gs




In [41]:
def train_rf_parameter(df):
    df = df[(df['Response'] == 0) | (df['Response'] == 1)]
    df_train, df_test = train_test_split(df, random_state=33)
    
    df_train_ok   = df_train[df_train['Response'] == 0]
    df_train_ng   = df_train[df_train['Response'] == 1]
    undersample_rate = len(df_train_ng) * 30 / len(df_train_ok)
    df_train_ok_sample = df_train_ok.sample(frac = undersample_rate)
    df_train = pd.concat([df_train_ok_sample, df_train_ng])

    rf = train_with_r_forest(df_train)
    X_test, y_test = separate_X_y(df_test)
    y_pred = rf.predict(X_test)
    print(y_pred)   
    
    cf_mat = confusion_matrix(y_test, y_pred)
    print(cf_mat)

    mcc = calc_mcc(cf_mat)
    print(mcc)    
    return rf

model = train_rf_parameter(df_all)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] colsample_bytree=1.0, max_depth=25, scale_pos_weight=2, subsample=0.95 
31041 141 852 188
124717 11 572 3585
[CV]  colsample_bytree=1.0, max_depth=25, scale_pos_weight=2, subsample=0.95, total= 4.5min
[CV] colsample_bytree=1.0, max_depth=25, scale_pos_weight=2, subsample=0.95 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.6min remaining:    0.0s


31060 122 859 181
124720 8 621 3536
[CV]  colsample_bytree=1.0, max_depth=25, scale_pos_weight=2, subsample=0.95, total= 4.6min
[CV] colsample_bytree=1.0, max_depth=25, scale_pos_weight=2, subsample=0.95 
31055 127 875 164
124718 10 676 3482
[CV]  colsample_bytree=1.0, max_depth=25, scale_pos_weight=2, subsample=0.95, total= 4.5min
[CV] colsample_bytree=1.0, max_depth=25, scale_pos_weight=2, subsample=0.95 
31061 121 859 180
124714 14 520 3638
[CV]  colsample_bytree=1.0, max_depth=25, scale_pos_weight=2, subsample=0.95, total= 4.2min
[CV] colsample_bytree=1.0, max_depth=25, scale_pos_weight=2, subsample=0.95 
31051 131 849 190
124718 10 660 3498
[CV]  colsample_bytree=1.0, max_depth=25, scale_pos_weight=2, subsample=0.95, total= 4.3min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 22.4min finished


[0 0 0 ..., 0 0 0]
[[293022   1233]
 [  1371    311]]
293022 1233 1371 311
0.18856737844
