In [1]:
import os
import pandas as pd
import numpy as np
import math
from matplotlib.pyplot import *
import matplotlib.pyplot as plt
from matplotlib import animation
from matplotlib import cm
from parse import *
import seaborn as sns

In [2]:
CHUNKSIZE = 100000

TRAIN_PATH        = "../../data/train_StationPathInfo.csv"
TRAIN_PATH_EX     = "../../data/train_StationPathInfoEx.csv"
TEST_PATH         = "../../data/test_StationPathInfo.csv"
TEST_PATH_EX      = "../../data/test_StationPathInfoEx.csv"

TRAIN_DATE        = "../../data/train_date.csv"
TRAIN_NUMERIC     = "../../data/train_numeric.csv"
TRAIN_CATEGORICAL = "../../data/train_categorical.csv"
TEST_DATE         = "../../data/test_date.csv"
TEST_NUMERIC      = "../../data/test_numeric.csv"
TEST_CATEGORICAL  = "../../data/test_categorical.csv"
DUPLICATE_FEAT_PATH     = "../../data/Feature_Duplicates.csv"
DUPLICATE_NUM_FEAT_PATH = "../../data/Feature_Numeric_Duplicates.csv"


SEED = 0
CHUNKSIZE = 50000
NROWS = 1200000

ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
    
    

In [3]:
chunk_train_num = pd.read_csv(TRAIN_NUMERIC, nrows=NROWS, chunksize=CHUNKSIZE)
chunk_test_num  = pd.read_csv(TEST_NUMERIC, nrows=NROWS, chunksize=CHUNKSIZE)
chunk_train_cat = pd.read_csv(TRAIN_CATEGORICAL, nrows=NROWS, chunksize=CHUNKSIZE)
chunk_test_cat  = pd.read_csv(TEST_CATEGORICAL, nrows=NROWS, chunksize=CHUNKSIZE)

df_train_num = chunk_train_num.get_chunk(10)
df_test_num  = chunk_test_num.get_chunk(10)
df_train_cat = chunk_train_cat.get_chunk(10)
df_test_cat  = chunk_test_cat.get_chunk(10)

df_train_num.head()

Unnamed: 0,Id,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,...,L3_S50_F4245,L3_S50_F4247,L3_S50_F4249,L3_S50_F4251,L3_S50_F4253,L3_S51_F4256,L3_S51_F4258,L3_S51_F4260,L3_S51_F4262,Response
0,4,0.03,-0.034,-0.197,-0.179,0.118,0.116,-0.015,-0.032,0.02,...,,,,,,,,,,0
1,6,,,,,,,,,,...,,,,,,,,,,0
2,7,0.088,0.086,0.003,-0.052,0.161,0.025,-0.015,-0.072,-0.225,...,,,,,,,,,,0
3,9,-0.036,-0.064,0.294,0.33,0.074,0.161,0.022,0.128,-0.026,...,,,,,,,,,,0
4,11,-0.055,-0.086,0.294,0.33,0.118,0.025,0.03,0.168,-0.169,...,,,,,,,,,,0


### 通過ステーション情報・時刻情報を読み出す。

In [4]:
# 読み出す。
TEST_START_TIME      = "../../data/test_StartEndTime.csv"
TRAIN_START_TIME     = "../../data/train_StartEndTime.csv"

df_train_bin = pd.read_csv(TRAIN_PATH, dtype='float32')
df_test_bin  = pd.read_csv(TEST_PATH, dtype='float32')
df_pass = pd.concat([df_train_bin, df_test_bin])



In [5]:
df_duplicate_feat = pd.read_csv(DUPLICATE_FEAT_PATH, dtype='float32')
df_pass = pd.merge(df_pass, df_duplicate_feat, how='left', on='Id')

df_duplicate_num_feat = pd.read_csv(DUPLICATE_NUM_FEAT_PATH, dtype='float32')
df_pass = pd.merge(df_pass, df_duplicate_num_feat, how='left', on='Id')



In [6]:

df_start_train = pd.read_csv(TRAIN_START_TIME, dtype='float32')
df_start_train = df_start_train.ix[:,['Id','StartTime','EndTime', 'Response']]
df_start_test = pd.read_csv(TEST_START_TIME, dtype='float32')
df_start_test['Response'] = -1
df_time = pd.concat([df_start_train, df_start_test])

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


## S32 の numeric, categorical とマージし、S32に限定して予想してみる。

In [7]:
def merge_one_sation_info(df_merge, station_id):
    categorical_file_name = "../../data/all_categorical_bitdec_station_" + str(station_id) + ".csv"
    if os.path.exists(categorical_file_name):
        df_cat = pd.read_csv(categorical_file_name, dtype='float32')
        df_merge = pd.merge(df_merge, df_cat, on="Id", how="left", copy=False)
    
    numeric_train_file_name = "../../data/train_numeric_station_" + str(station_id) + ".csv"
    numeric_test_file_name  = "../../data/test_numeric_station_"  + str(station_id) + ".csv"
    if os.path.exists(numeric_train_file_name):
        df_train_num = pd.read_csv(numeric_train_file_name, dtype='float32')
        df_test_num  = pd.read_csv(numeric_test_file_name, dtype='float32')
        df_num = pd.concat([df_train_num, df_test_num])    
        df_merge = pd.merge(df_merge, df_num, on='Id', how='left', copy=False)
 
    return df_merge

In [8]:
df_analyze = pd.merge(df_pass, df_time, on='Id', how='left')
df_analyze_S32 = df_analyze[df_analyze['L3_S32_D3852'] == 1]
for station_id in range(39):
    print('merging station ', station_id)
    df_analyze_S32 = merge_one_sation_info(df_analyze_S32, station_id)


merging station  0
merging station  1
merging station  2
merging station  3
merging station  4
merging station  5
merging station  6
merging station  7
merging station  8
merging station  9
merging station  10
merging station  11
merging station  12
merging station  13
merging station  14
merging station  15
merging station  16
merging station  17
merging station  18
merging station  19
merging station  20
merging station  21
merging station  22
merging station  23
merging station  24
merging station  25
merging station  26
merging station  27
merging station  28
merging station  29
merging station  30
merging station  31
merging station  32
merging station  33
merging station  34
merging station  35
merging station  36
merging station  37
merging station  38


In [9]:
df_analyze_S32.head()

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S36_F3938,L3_S37_F3944,L3_S37_F3946,L3_S37_F3948,L3_S37_F3950,L3_S38_F3954_bit_0,L3_S38_F3955_bit_9,L3_S38_F3952,L3_S38_F3956,L3_S38_F3960
0,116.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
1,146.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,,,
2,166.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
3,293.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,,,
4,616.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,


In [10]:
import lightgbm as lgb
from lightgbm import LGBMModel, LGBMClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
#from sklearn.model_selection  import cross_val_score, StratifiedKFold
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline



In [None]:
%%time

param = {
    'num_leaves':48,
    'min_data_in_leaf':1,
    'learning_rate':0.01,
    'objective':'binary',
    'boosting':'gbdt',
    'num_threads':16,
    'device_type':'cpu',
    'max_depth':-1,
}

def train_and_predict(df):
    dfs_train = df[df['Response'] != -1]
    dfs_test  = df[df['Response'] == -1]

    dfs_test  = dfs_test.astype({'Id':'int32'})
    dfs_train = dfs_train.astype({'Id':'int32'})
    id_test  = dfs_test['Id'].values
    id_train = dfs_train['Id'].values

    X_test = dfs_test.drop(['Response', 'Id'], axis=1)
    X = dfs_train.drop(['Response', 'Id'], axis=1)
    y = dfs_train['Response']   
    y_train = y.astype(np.float32).values.ravel()
    print(id_test.shape)
    print(id_train.shape)
    print(X_test.shape)

    n_folds=5

    cv = StratifiedKFold(y_train, n_folds)
    models = []
    preds_oof  = np.ones(y_train.shape[0])
    preds_test = np.empty((n_folds, X_test.shape[0]))

    for i, (train, valid) in enumerate(cv):
        train_data = lgb.Dataset(X.iloc[train], y_train[train])
        test_data = lgb.Dataset(X.iloc[valid], y_train[valid])

        model = lgb.train(param, train_data, 1000, early_stopping_rounds=50, valid_sets=[test_data])

        preds_oof[valid] = model.predict(X.iloc[valid])
        preds_test[i, :] = model.predict(X_test)

        models.append(model)
    lgb.plot_importance(model, max_num_features=100, figsize=(20,20))
    plt.figure()
    thresholds = np.linspace(0.01, 1.00, 100)
    mcc = np.array([metrics.matthews_corrcoef(y, preds_oof>thr) for thr in thresholds])
    plt.plot(thresholds, mcc)
    best_threshold = thresholds[mcc.argmax()]
    print("MCC: %f" % mcc.max())
    print("best_threshold: %f" % best_threshold)

    preds = (preds_test.mean(axis=0) > best_threshold).astype(np.int8)
    df_pred = pd.DataFrame(data={"Id":id_test, "Response":preds})
    
    return df_pred


Wall time: 0 ns


In [None]:
df_result_S32 = train_and_predict(df_analyze_S32)


(24135,)
(24543,)
(24135, 1451)
[1]	valid_0's binary_logloss: 0.176632
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's binary_logloss: 0.170473
[3]	valid_0's binary_logloss: 0.165388
[4]	valid_0's binary_logloss: 0.161014
[5]	valid_0's binary_logloss: 0.157109
[6]	valid_0's binary_logloss: 0.153526
[7]	valid_0's binary_logloss: 0.150184
[8]	valid_0's binary_logloss: 0.147085
[9]	valid_0's binary_logloss: 0.144252
[10]	valid_0's binary_logloss: 0.141582
[11]	valid_0's binary_logloss: 0.139134
[12]	valid_0's binary_logloss: 0.136809
[13]	valid_0's binary_logloss: 0.134649
[14]	valid_0's binary_logloss: 0.132598
[15]	valid_0's binary_logloss: 0.130644
[16]	valid_0's binary_logloss: 0.128814
[17]	valid_0's binary_logloss: 0.126912
[18]	valid_0's binary_logloss: 0.125171
[19]	valid_0's binary_logloss: 0.123505
[20]	valid_0's binary_logloss: 0.121932
[21]	valid_0's binary_logloss: 0.120426
[22]	valid_0's binary_logloss: 0.118969
[23]	valid_0's binary_logloss: 0.11

[199]	valid_0's binary_logloss: 0.0531218
[200]	valid_0's binary_logloss: 0.0530362
[201]	valid_0's binary_logloss: 0.0529369
[202]	valid_0's binary_logloss: 0.0528361
[203]	valid_0's binary_logloss: 0.0527467
[204]	valid_0's binary_logloss: 0.0526642
[205]	valid_0's binary_logloss: 0.0525537
[206]	valid_0's binary_logloss: 0.0524648
[207]	valid_0's binary_logloss: 0.0523934
[208]	valid_0's binary_logloss: 0.0522873
[209]	valid_0's binary_logloss: 0.0522119
[210]	valid_0's binary_logloss: 0.0521239
[211]	valid_0's binary_logloss: 0.052048
[212]	valid_0's binary_logloss: 0.0519854
[213]	valid_0's binary_logloss: 0.0519166
[214]	valid_0's binary_logloss: 0.0518386
[215]	valid_0's binary_logloss: 0.0517621
[216]	valid_0's binary_logloss: 0.0516803
[217]	valid_0's binary_logloss: 0.0515957
[218]	valid_0's binary_logloss: 0.0515376
[219]	valid_0's binary_logloss: 0.0514533
[220]	valid_0's binary_logloss: 0.051391
[221]	valid_0's binary_logloss: 0.0513032
[222]	valid_0's binary_logloss: 0.05

[395]	valid_0's binary_logloss: 0.0464664
[396]	valid_0's binary_logloss: 0.0464489
[397]	valid_0's binary_logloss: 0.0464484
[398]	valid_0's binary_logloss: 0.0464439
[399]	valid_0's binary_logloss: 0.0464389
[400]	valid_0's binary_logloss: 0.0464222
[401]	valid_0's binary_logloss: 0.0464087
[402]	valid_0's binary_logloss: 0.0463978
[403]	valid_0's binary_logloss: 0.0463917
[404]	valid_0's binary_logloss: 0.0463961
[405]	valid_0's binary_logloss: 0.0463914
[406]	valid_0's binary_logloss: 0.0463979
[407]	valid_0's binary_logloss: 0.0463841
[408]	valid_0's binary_logloss: 0.0463871
[409]	valid_0's binary_logloss: 0.0463758
[410]	valid_0's binary_logloss: 0.0463748
[411]	valid_0's binary_logloss: 0.0463755
[412]	valid_0's binary_logloss: 0.0463695
[413]	valid_0's binary_logloss: 0.0463728
[414]	valid_0's binary_logloss: 0.0463714
[415]	valid_0's binary_logloss: 0.0463566
[416]	valid_0's binary_logloss: 0.0463412
[417]	valid_0's binary_logloss: 0.0463536
[418]	valid_0's binary_logloss: 0.

[106]	valid_0's binary_logloss: 0.071298
[107]	valid_0's binary_logloss: 0.0710336
[108]	valid_0's binary_logloss: 0.07077
[109]	valid_0's binary_logloss: 0.0705093
[110]	valid_0's binary_logloss: 0.0702618
[111]	valid_0's binary_logloss: 0.0700256
[112]	valid_0's binary_logloss: 0.0697796
[113]	valid_0's binary_logloss: 0.0695457
[114]	valid_0's binary_logloss: 0.0693046
[115]	valid_0's binary_logloss: 0.0690672
[116]	valid_0's binary_logloss: 0.0688339
[117]	valid_0's binary_logloss: 0.0686181
[118]	valid_0's binary_logloss: 0.0683935
[119]	valid_0's binary_logloss: 0.0681803
[120]	valid_0's binary_logloss: 0.0679538
[121]	valid_0's binary_logloss: 0.0677406
[122]	valid_0's binary_logloss: 0.0675338
[123]	valid_0's binary_logloss: 0.0673278
[124]	valid_0's binary_logloss: 0.067113
[125]	valid_0's binary_logloss: 0.0669173
[126]	valid_0's binary_logloss: 0.0667204
[127]	valid_0's binary_logloss: 0.0665235
[128]	valid_0's binary_logloss: 0.0663364
[129]	valid_0's binary_logloss: 0.0661

[302]	valid_0's binary_logloss: 0.053654
[303]	valid_0's binary_logloss: 0.0536478
[304]	valid_0's binary_logloss: 0.0536119
[305]	valid_0's binary_logloss: 0.0536013
[306]	valid_0's binary_logloss: 0.0535797
[307]	valid_0's binary_logloss: 0.0535735
[308]	valid_0's binary_logloss: 0.0535582
[309]	valid_0's binary_logloss: 0.0535523
[310]	valid_0's binary_logloss: 0.0535315
[311]	valid_0's binary_logloss: 0.0535491
[312]	valid_0's binary_logloss: 0.0535514
[313]	valid_0's binary_logloss: 0.0535503
[314]	valid_0's binary_logloss: 0.0535376
[315]	valid_0's binary_logloss: 0.0535468
[316]	valid_0's binary_logloss: 0.0535297
[317]	valid_0's binary_logloss: 0.0535284
[318]	valid_0's binary_logloss: 0.0535345
[319]	valid_0's binary_logloss: 0.0535336
[320]	valid_0's binary_logloss: 0.0535279
[321]	valid_0's binary_logloss: 0.0535317
[322]	valid_0's binary_logloss: 0.0535298
[323]	valid_0's binary_logloss: 0.0535226
[324]	valid_0's binary_logloss: 0.0535264
[325]	valid_0's binary_logloss: 0.0

### Station32 を通過していないサンプルで解析する。

更にStation29を通過しているものとそれ以外に分離。

In [None]:
df_analyze_S28_exS32 = df_analyze[(df_analyze['L3_S29_D3316'] == 1) & (df_analyze['L3_S32_D3852'] == 0)]

print(df_analyze.shape)
print(df_analyze_S28_exS32.shape)


feature を足す。


In [None]:
for i in range(0, 39):
    df_analyze_S28_exS32 = merge_one_sation_info(df_analyze_S28_exS32, i)

df_analyze_S28_exS32.shape

In [None]:
df_analyze_S28_exS32.head()

In [None]:
df_result_S28_ex_S32 = train_and_predict(df_analyze_S28_exS32)


<6/27 朝>
Subsample = 0.2

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1.0,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=2, seed=0, silent=True, subsample=0.95)
       
<Subsampleなし>

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1.0,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=20,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=2, seed=0, silent=True, subsample=0.95)

In [None]:
df_analyze_exS28     = df_analyze[(df_analyze['L3_S29_D3316'] == 0) & (df_analyze['L3_S32_D3852'] == 0)]
print(df_analyze_exS28.shape)

for i in range(0, 29):
    df_analyze_exS28 = merge_one_sation_info(df_analyze_exS28, i)

for i in range(39, 52):
    df_analyze_exS28 = merge_one_sation_info(df_analyze_exS28, i)


In [None]:
df_result_ex_S28 = train_and_predict(df_analyze_exS28)

In [None]:
df_result_ex_S28

In [None]:
df_result = pd.concat([df_result_S32, df_result_S28_ex_S32, df_result_ex_S28]).sort_values('Id')

In [None]:
df_result['Id'] = df_result['Id'].astype('int32')
df_result.head

In [None]:
df_result.to_csv("../../submission/submit_20180717_2.csv", index=False)