In [1]:
import os
import pandas as pd
import numpy as np
import math
from matplotlib.pyplot import *
import matplotlib.pyplot as plt
from matplotlib import animation
from matplotlib import cm
from parse import *
import seaborn as sns

In [2]:
CHUNKSIZE = 100000

TRAIN_PATH        = "../../data/train_StationPathInfo.csv"
TRAIN_PATH_EX     = "../../data/train_StationPathInfoEx.csv"
TEST_PATH         = "../../data/test_StationPathInfo.csv"
TEST_PATH_EX      = "../../data/test_StationPathInfoEx.csv"

TRAIN_DATE        = "../../data/train_date.csv"
TRAIN_NUMERIC     = "../../data/train_numeric.csv"
TRAIN_CATEGORICAL = "../../data/train_categorical.csv"
TEST_DATE         = "../../data/test_date.csv"
TEST_NUMERIC      = "../../data/test_numeric.csv"
TEST_CATEGORICAL  = "../../data/test_categorical.csv"

SEED = 0
CHUNKSIZE = 50000
NROWS = 1200000

ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
    
    

In [3]:
chunk_train_num = pd.read_csv(TRAIN_NUMERIC, nrows=NROWS, chunksize=CHUNKSIZE)
chunk_test_num  = pd.read_csv(TEST_NUMERIC, nrows=NROWS, chunksize=CHUNKSIZE)
chunk_train_cat = pd.read_csv(TRAIN_CATEGORICAL, nrows=NROWS, chunksize=CHUNKSIZE)
chunk_test_cat  = pd.read_csv(TEST_CATEGORICAL, nrows=NROWS, chunksize=CHUNKSIZE)

df_train_num = chunk_train_num.get_chunk(10)
df_test_num  = chunk_test_num.get_chunk(10)
df_train_cat = chunk_train_cat.get_chunk(10)
df_test_cat  = chunk_test_cat.get_chunk(10)

df_train_num.head()

Unnamed: 0,Id,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,...,L3_S50_F4245,L3_S50_F4247,L3_S50_F4249,L3_S50_F4251,L3_S50_F4253,L3_S51_F4256,L3_S51_F4258,L3_S51_F4260,L3_S51_F4262,Response
0,4,0.03,-0.034,-0.197,-0.179,0.118,0.116,-0.015,-0.032,0.02,...,,,,,,,,,,0
1,6,,,,,,,,,,...,,,,,,,,,,0
2,7,0.088,0.086,0.003,-0.052,0.161,0.025,-0.015,-0.072,-0.225,...,,,,,,,,,,0
3,9,-0.036,-0.064,0.294,0.33,0.074,0.161,0.022,0.128,-0.026,...,,,,,,,,,,0
4,11,-0.055,-0.086,0.294,0.33,0.118,0.025,0.03,0.168,-0.169,...,,,,,,,,,,0


### 通過ステーション情報・時刻情報を読み出す。

In [4]:
# 読み出す。
TEST_START_TIME      = "../../data/test_StartEndTime.csv"
TRAIN_START_TIME     = "../../data/train_StartEndTime.csv"

df_train_bin = pd.read_csv(TRAIN_PATH, dtype='float32')
df_test_bin  = pd.read_csv(TEST_PATH, dtype='float32')
df_pass = pd.concat([df_train_bin, df_test_bin])

df_start_train = pd.read_csv(TRAIN_START_TIME, dtype='float32')
df_start_train = df_start_train.ix[:,['Id','StartTime','EndTime', 'Response']]
df_start_test = pd.read_csv(TEST_START_TIME, dtype='float32')
df_start_test['Response'] = -1
df_time = pd.concat([df_start_train, df_start_test])


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  # Remove the CWD from sys.path while we load stuff.


## S32 の numeric, categorical とマージし、S32に限定して予想してみる。

In [5]:
def merge_one_sation_info(df_merge, station_id):
    categorical_file_name = "../../data/all_categorical_bitdec_station_" + str(station_id) + ".csv"
    if os.path.exists(categorical_file_name):
        df_cat = pd.read_csv(categorical_file_name, dtype='float32')
        df_merge = pd.merge(df_merge, df_cat, on="Id", how="left", copy=False)
    
    numeric_train_file_name = "../../data/train_numeric_station_" + str(station_id) + ".csv"
    numeric_test_file_name  = "../../data/test_numeric_station_"  + str(station_id) + ".csv"
    if os.path.exists(numeric_train_file_name):
        df_train_num = pd.read_csv(numeric_train_file_name, dtype='float32')
        df_test_num  = pd.read_csv(numeric_test_file_name, dtype='float32')
        df_num = pd.concat([df_train_num, df_test_num])    
        df_merge = pd.merge(df_merge, df_num, on='Id', how='left', copy=False)
 
    return df_merge

In [34]:
df_analyze = pd.merge(df_pass, df_time, on='Id', how='left')
df_analyze_S32 = df_analyze[df_analyze['L3_S32_D3852'] == 1]
for station_id in range(39):
    df_analyze_S32 = merge_one_sation_info(df_analyze_S32, station_id)


In [35]:
df_analyze_S32.head()

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S36_F3938,L3_S37_F3944,L3_S37_F3946,L3_S37_F3948,L3_S37_F3950,L3_S38_F3954_bit_0,L3_S38_F3955_bit_9,L3_S38_F3952,L3_S38_F3956,L3_S38_F3960
0,116.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
1,146.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,,,
2,166.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
3,293.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,,,
4,616.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,


In [36]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, log_loss, make_scorer

def calc_mcc(cf_mat):
    tn, fp, fn, tp = cf_mat.ravel()
    if (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) == 0:
        return 0
    mcc = (tp * tn - fp * fn) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return mcc

def mcc_scorer(y_true, y_pred):
    cf_mat = confusion_matrix(y_true, y_pred)
    return calc_mcc(cf_mat)

def separate_X_y(df):
    X = df.drop(['Response'], axis=1)
    y = df['Response']
    return X, y

def serach_best_threshold(y_pred_proba, y_test):
    vals = []
    thresholds = []
    for i in range(1, 90):
        threshold = i / 100.0
        y_pred = (y_pred_proba[:, 1] > threshold).astype(int)
        cf_mat = confusion_matrix(y_test, y_pred)

        mcc = calc_mcc(cf_mat)
        vals.append(mcc)
        thresholds.append(threshold)
    best_threshold = thresholds[np.argmax(vals)]
    best_mcc = np.max(vals)
    print(best_threshold, best_mcc)
    return best_mcc, best_threshold 

from sklearn.model_selection import KFold

def train_with_r_forest(df, param):
    kf = KFold(n_splits=3, random_state=2, shuffle=True)
    vals = []
    thresholds = []
    
    for train_index, test_index in kf.split(df):
        data_tr   = df.iloc[train_index]
        data_val  = df.iloc[test_index]
        
        X_train, y_train = separate_X_y(data_tr)
        xgb_model = xgb.XGBClassifier(max_depth=param['max_depth'],
                                    subsample=param['subsample'],
                                    colsample_bytree =param['colsample_bytree'],
                                    scale_pos_weight=param['scale_pos_weight'])
        xgb_model.fit(X_train, y_train)
        
        X_test, y_test = separate_X_y(data_val)
        y_pred_proba = xgb_model.predict_proba(X_test)
        mcc, threshold = serach_best_threshold(y_pred_proba, y_test)
        vals.append(mcc)
        thresholds.append(threshold)
        del X_train, y_train
        del data_tr, data_val
    return np.mean(vals), np.mean(threshold)

def grid_search(df, param_grid):
    scores = []
    params = []
    thresholds = []
    for max_depth in param_grid['max_depth']:
        for subsample in param_grid['subsample']:
            for colsample_bytree in param_grid['colsample_bytree']:
                for scale_pos_weight in param_grid['scale_pos_weight']:
                    param = {'max_depth': max_depth,
                                'subsample': subsample,
                                'colsample_bytree': colsample_bytree,
                                'scale_pos_weight': scale_pos_weight
                            }
                    score, threshold = train_with_r_forest(df, param)
                    print(score, threshold)
                    scores.append(score)
                    params.append(param)
                    thresholds.append(threshold)
    # ベストスコアのパラメータを使って再トレーニング
    best_estimator = params[np.argmax(scores)]
    best_threshold = thresholds[np.argmax(thresholds)]
    xgb_model = xgb.XGBClassifier(max_depth=best_estimator['max_depth'],
                                subsample=best_estimator['subsample'],
                                colsample_bytree =best_estimator['colsample_bytree'],
                                scale_pos_weight=best_estimator['scale_pos_weight'])
    X_train, y_train = separate_X_y(df)
    xgb_model.fit(X_train, y_train)
    del X_train, y_train
    return xgb_model, best_estimator, best_threshold
    


In [37]:
def train_rf_parameter(df, params, undersample_rate):
    print(df.shape)
    df = df[(df['Response'] == 0) | (df['Response'] == 1)]
    df = df.drop(['Id'], axis=1)
    print(df.shape)
    df_train, df_test = train_test_split(df, random_state=33)
    
    df_train_ok   = df_train[df_train['Response'] == 0]
    df_train_ng   = df_train[df_train['Response'] == 1]
    df_train_ok_sample = df_train_ok.sample(frac = undersample_rate)
    df_train = pd.concat([df_train_ok_sample, df_train_ng])
 
    rf, params, threshold = grid_search(df_train, params)
    X_test, y_test = separate_X_y(df_test)
    y_pred_proba = rf.predict_proba(X_test)
    mcc, threshold = serach_best_threshold(y_pred_proba, y_test)
    print(mcc)    
    return rf, params, threshold


def train_and_predict_submission(df, param, threshold, undersample_rate):
    df_train_ok   = df[df['Response'] == 0]
    df_train_ng   = df[df['Response'] == 1]
    df_test       = df[df['Response'] == -1]
    
    df_train_ok_sample = df_train_ok.sample(frac = undersample_rate)
    df_train = pd.concat([df_train_ok_sample, df_train_ng])
    df_train_balance = df_train.drop(['Id'], axis=1)
    df_test_ex_id    = df_test.drop(['Id'], axis=1)

    del df_train_ok
    del df_train_ng
    
    X_train, y_train = separate_X_y(df_train_balance)
    xgb_model = xgb.XGBClassifier(max_depth=param['max_depth'],
                                subsample=param['subsample'],
                                colsample_bytree =param['colsample_bytree'],
                                scale_pos_weight=param['scale_pos_weight'])
    xgb_model.fit(X_train, y_train)

    X_test, y_test = separate_X_y(df_test_ex_id)
    y_pred_proba = xgb_model.predict_proba(X_test)
    y_pred = (y_pred_proba[:, 1] > threshold).astype(int)
      
    df_result_add = pd.DataFrame(columns=['Id', 'Response'])
    df_result_add.loc[:, 'Id']       = df_test['Id'].values
    df_result_add.loc[:, 'Response'] = y_pred
    
    return xgb_model, df_result_add


In [38]:

# Grid Search
params={'max_depth': [5, 10],
        'subsample': [0.95],
        'colsample_bytree': [1.0],
        'scale_pos_weight': [2, 4, 8]
}
model, best_param, best_threshold = train_rf_parameter(df_analyze_S32, params, 1)
print(best_param)
print('threshold:', best_threshold)

(48678, 1441)
(24543, 1440)
0.21 0.497112066566
0.21 0.476451397174
0.29 0.461385917277
0.478316460339 0.29
0.35 0.495541544535
0.36 0.478045637547
0.31 0.476067135437
0.48321810584 0.31
0.46 0.487645776108
0.36 0.47404835801
0.36 0.457726102838
0.473140078985 0.36
0.13 0.474001046442
0.1 0.462189091829
0.06 0.447461571166
0.461217236479 0.06
0.14 0.465653856123
0.1 0.452393080906
0.06 0.447010710978
0.455019216003 0.06
0.11 0.452958754886
0.16 0.467329604943
0.07 0.437421477972
0.452569945934 0.07
0.33 0.464532270671
0.464532270671
{'max_depth': 5, 'subsample': 0.95, 'colsample_bytree': 1.0, 'scale_pos_weight': 4}
threshold: 0.33


In [39]:
# predict

model, df_result_S32 = train_and_predict_submission(df_analyze_S32, best_param, best_threshold, 1)


In [12]:
model.feature_importances_


array([ 0.        ,  0.        ,  0.        , ...,  0.00202532,
        0.00556962,  0.01721519], dtype=float32)

In [13]:
df_feature_importance = pd.DataFrame(columns=['name', 'importance'])

In [14]:
df_feature_importance.loc[:, 'importance'] = model.feature_importances_
df_feature_importance.loc[:, 'name'] = np.array(df_analyze_S32.drop(['Id','Response'], axis=1).columns)
df_feature_importance.sort_values('importance', ascending=False)
del df_analyze_S32

### Station32 を通過していないサンプルで解析する。

更にStation29を通過しているものとそれ以外に分離。

In [None]:
df_analyze_S28_exS32 = df_analyze[(df_analyze['L3_S29_D3316'] == 1) & (df_analyze['L3_S32_D3852'] == 0)]

print(df_analyze.shape)
print(df_analyze_S28_exS32.shape)


feature を足す。


In [None]:
for i in range(0, 39):
    df_analyze_S28_exS32 = merge_one_sation_info(df_analyze_S28_exS32, i)

df_analyze_S28_exS32.shape

In [20]:
df_analyze_S28_exS32.head()

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S36_F3938,L3_S37_F3944,L3_S37_F3946,L3_S37_F3948,L3_S37_F3950,L3_S38_F3954_bit_0,L3_S38_F3955_bit_9,L3_S38_F3952,L3_S38_F3956,L3_S38_F3960
0,4.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,,,
1,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,,,
2,7.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,,,
3,9.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
4,11.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,


In [21]:

# Grid Search
params={'max_depth': [10],
        'subsample': [0.95],
        'colsample_bytree': [1.0],
        'scale_pos_weight': [2, 4, 8]
}
model_ex_s32, best_param_ex_S32, best_threshold_ex_S32 = train_rf_parameter(df_analyze_S28_exS32, params, 0.1)
print(best_param_ex_S32)
print('threshold:', best_threshold_ex_S32)



(2190476, 1441)
(1095118, 1440)
0.37 0.271811644448
0.38 0.279417152451
0.37 0.260841289743
0.270690028881 0.37
0.52 0.269047793083
0.44 0.265773865941
0.45 0.255525664407
0.26344910781 0.45
0.62 0.257148566708
0.63 0.259306285155
0.62 0.253338906226
0.256597919363 0.62
0.76 0.201773088334
0.201773088334
{'max_depth': 10, 'subsample': 0.95, 'colsample_bytree': 1.0, 'scale_pos_weight': 2}
threshold: 0.76


In [22]:
import pickle
def save_model_and_param(model, name):
    filename = name + '.model_sav'
    pickle.dump(model, open(filename, 'wb'))

model_param_ex_s32 = {'model': model_ex_s32, 'param': best_param_ex_S32, 'threshold': best_threshold_ex_S32}
save_model_and_param(model_param_ex_s32, 'ex_s_32')

In [23]:
del model_ex_s32


In [24]:
model_result_S28_ex_S32_all, df_result_S28_ex_S32 = train_and_predict_submission(df_analyze_S28_exS32, best_param_ex_S32, best_threshold_ex_S32, 0.1)


<6/27 朝>
Subsample = 0.2

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1.0,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=2, seed=0, silent=True, subsample=0.95)
       
<Subsampleなし>

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1.0,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=20,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=2, seed=0, silent=True, subsample=0.95)

In [40]:
df_analyze_exS28     = df_analyze[(df_analyze['L3_S29_D3316'] == 0) & (df_analyze['L3_S32_D3852'] == 0)]
print(df_analyze_exS28.shape)

for i in range(0, 29):
    df_analyze_exS28 = merge_one_sation_info(df_analyze_exS28, i)

for i in range(39, 52):
    df_analyze_exS28 = merge_one_sation_info(df_analyze_exS28, i)


(128341, 56)


In [42]:

# Grid Search
params={'max_depth': [5, 10, 20],
        'subsample': [0.95],
        'colsample_bytree': [1.0],
        'scale_pos_weight': [2, 4, 8]
}
model_exS28, best_param_ex_S28, best_threshold_ex_S28= train_rf_parameter(df_analyze_exS28, params, 1)

(128341, 1347)
(64086, 1346)
0.33 0.172549857579
0.2 0.138958181279
0.42 0.153907909905
0.155138649588 0.42
0.54 0.162926829876
0.22 0.126609678196
0.47 0.153907909905
0.147814805992 0.47
0.55 0.199550027553
0.44 0.155655367038
0.32 0.122552764391
0.159252719661 0.32
0.69 0.15273921855
0.8 0.1470271841
0.65 0.145510434951
0.148425612534 0.65
0.63 0.144734287054
0.86 0.1470271841
0.63 0.132602187421
0.141454552859 0.63
0.89 0.162926829876
0.81 0.155655367038
0.32 0.153125378877
0.157235858597 0.32
0.73 0.144734287054
0.2 0.138958181279
0.89 0.145510434951
0.143067634428 0.89
0.88 0.144734287054
0.26 0.138958181279
0.16 0.144181253771
0.142624574035 0.16
0.06 0.134179155678
0.23 0.138958181279
0.13 0.136605181258
0.136580839405 0.13
0.37 0.214320591483
0.214320591483


In [None]:
model_result_ex_S28_all, df_result_ex_S28 = train_and_predict_submission(df_analyze_exS28,  best_param_ex_S28,best_threshold_ex_S28, 1)

In [None]:
df_result_ex_S28

In [None]:
df_result = pd.concat([df_result_S32, df_result_S28_ex_S32, df_result_ex_S28]).sort_values('Id')

In [None]:
df_result['Id'] = df_result['Id'].astype('int32')
df_result.head

In [None]:
df_result.to_csv("../../submission/submit_20180713_2.csv", index=False)