In [1]:
import os
import pandas as pd
import numpy as np
import math
from matplotlib.pyplot import *
import matplotlib.pyplot as plt
from matplotlib import animation
from matplotlib import cm
from parse import *
import seaborn as sns

In [2]:
CHUNKSIZE = 100000

TRAIN_PATH        = "../../data/train_StationPathInfo.csv"
TRAIN_PATH_EX     = "../../data/train_StationPathInfoEx.csv"
TEST_PATH         = "../../data/test_StationPathInfo.csv"
TEST_PATH_EX      = "../../data/test_StationPathInfoEx.csv"

TRAIN_DATE        = "../../data/train_date.csv"
TRAIN_NUMERIC     = "../../data/train_numeric.csv"
TRAIN_CATEGORICAL = "../../data/train_categorical.csv"
TEST_DATE         = "../../data/test_date.csv"
TEST_NUMERIC      = "../../data/test_numeric.csv"
TEST_CATEGORICAL  = "../../data/test_categorical.csv"

SEED = 0
CHUNKSIZE = 50000
NROWS = 1200000

ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
    
    

In [3]:
chunk_train_num = pd.read_csv(TRAIN_NUMERIC, nrows=NROWS, chunksize=CHUNKSIZE)
chunk_test_num  = pd.read_csv(TEST_NUMERIC, nrows=NROWS, chunksize=CHUNKSIZE)
chunk_train_cat = pd.read_csv(TRAIN_CATEGORICAL, nrows=NROWS, chunksize=CHUNKSIZE)
chunk_test_cat  = pd.read_csv(TEST_CATEGORICAL, nrows=NROWS, chunksize=CHUNKSIZE)

df_train_num = chunk_train_num.get_chunk(10)
df_test_num  = chunk_test_num.get_chunk(10)
df_train_cat = chunk_train_cat.get_chunk(10)
df_test_cat  = chunk_test_cat.get_chunk(10)

df_train_num.head()

Unnamed: 0,Id,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,...,L3_S50_F4245,L3_S50_F4247,L3_S50_F4249,L3_S50_F4251,L3_S50_F4253,L3_S51_F4256,L3_S51_F4258,L3_S51_F4260,L3_S51_F4262,Response
0,4,0.03,-0.034,-0.197,-0.179,0.118,0.116,-0.015,-0.032,0.02,...,,,,,,,,,,0
1,6,,,,,,,,,,...,,,,,,,,,,0
2,7,0.088,0.086,0.003,-0.052,0.161,0.025,-0.015,-0.072,-0.225,...,,,,,,,,,,0
3,9,-0.036,-0.064,0.294,0.33,0.074,0.161,0.022,0.128,-0.026,...,,,,,,,,,,0
4,11,-0.055,-0.086,0.294,0.33,0.118,0.025,0.03,0.168,-0.169,...,,,,,,,,,,0


### 通過ステーション情報・時刻情報を読み出す。

In [4]:
# 読み出す。
TEST_START_TIME      = "../../data/test_StartEndTime.csv"
TRAIN_START_TIME     = "../../data/train_StartEndTime.csv"

df_train_bin = pd.read_csv(TRAIN_PATH, dtype='float32')
df_test_bin  = pd.read_csv(TEST_PATH, dtype='float32')
df_pass = pd.concat([df_train_bin, df_test_bin])

df_start_train = pd.read_csv(TRAIN_START_TIME, dtype='float32')
df_start_train = df_start_train.ix[:,['Id','StartTime','EndTime', 'Response']]
df_start_test = pd.read_csv(TEST_START_TIME, dtype='float32')
df_start_test['Response'] = -1
df_time = pd.concat([df_start_train, df_start_test])


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  # Remove the CWD from sys.path while we load stuff.


## S32 の numeric, categorical とマージし、S32に限定して予想してみる。

In [5]:
def merge_one_sation_info(df_merge, station_id):
    categorical_file_name = "../../data/all_categorical_bitdec_station_" + str(station_id) + ".csv"
    if os.path.exists(categorical_file_name):
        df_cat = pd.read_csv(categorical_file_name, dtype='float32')
        df_merge = pd.merge(df_merge, df_cat, on="Id", how="left", copy=False)
    
    numeric_train_file_name = "../../data/train_numeric_station_" + str(station_id) + ".csv"
    numeric_test_file_name  = "../../data/test_numeric_station_"  + str(station_id) + ".csv"
    if os.path.exists(numeric_train_file_name):
        df_train_num = pd.read_csv(numeric_train_file_name, dtype='float32')
        df_test_num  = pd.read_csv(numeric_test_file_name, dtype='float32')
        df_num = pd.concat([df_train_num, df_test_num])    
        df_merge = pd.merge(df_merge, df_num, on='Id', how='left', copy=False)
 
    return df_merge

In [6]:
df_analyze = pd.merge(df_pass, df_time, on='Id', how='left')
df_analyze_S32 = df_analyze[df_analyze['L3_S32_D3852'] == 1]
for station_id in range(39):
    df_analyze_S32 = merge_one_sation_info(df_analyze_S32, station_id)


In [7]:
df_analyze_S32.head()

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S36_F3938,L3_S37_F3944,L3_S37_F3946,L3_S37_F3948,L3_S37_F3950,L3_S38_F3954_bit_0,L3_S38_F3955_bit_9,L3_S38_F3952,L3_S38_F3956,L3_S38_F3960
0,116.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
1,146.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,,,
2,166.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
3,293.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,,,
4,616.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,


In [8]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, log_loss, make_scorer

def calc_mcc(cf_mat):
    tn, fp, fn, tp = cf_mat.ravel()
    if (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) == 0:
        return 0
    mcc = (tp * tn - fp * fn) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return mcc

def mcc_scorer(y_true, y_pred):
    cf_mat = confusion_matrix(y_true, y_pred)
    return calc_mcc(cf_mat)

def separate_X_y(df):
    X = df.drop(['Response'], axis=1)
    y = df['Response']
    return X, y

def serach_best_threshold(y_pred_proba, y_test):
    vals = []
    thresholds = []
    for i in range(1, 90):
        threshold = i / 100.0
        y_pred = (y_pred_proba[:, 1] > threshold).astype(int)
        cf_mat = confusion_matrix(y_test, y_pred)

        mcc = calc_mcc(cf_mat)
        vals.append(mcc)
        thresholds.append(threshold)
    best_threshold = thresholds[np.argmax(vals)]
    best_mcc = np.max(vals)
    print(best_threshold, best_mcc)
    return best_mcc, best_threshold 

from sklearn.model_selection import KFold

def train_with_r_forest(df, param):
    kf = KFold(n_splits=3, random_state=2, shuffle=True)
    vals = []
    thresholds = []
    
    for train_index, test_index in kf.split(df):
        data_tr   = df.iloc[train_index]
        data_val  = df.iloc[test_index]
        
        X_train, y_train = separate_X_y(data_tr)
        xgb_model = xgb.XGBClassifier(max_depth=param['max_depth'],
                                    subsample=param['subsample'],
                                    colsample_bytree =param['colsample_bytree'],
                                    scale_pos_weight=param['scale_pos_weight'])
        xgb_model.fit(X_train, y_train)
        
        X_test, y_test = separate_X_y(data_val)
        y_pred_proba = xgb_model.predict_proba(X_test)
        mcc, threshold = serach_best_threshold(y_pred_proba, y_test)
        vals.append(mcc)
        thresholds.append(threshold)
    return np.mean(vals), np.mean(threshold)

def grid_search(df, param_grid):
    scores = []
    params = []
    thresholds = []
    for max_depth in param_grid['max_depth']:
        for subsample in param_grid['subsample']:
            for colsample_bytree in param_grid['colsample_bytree']:
                for scale_pos_weight in param_grid['scale_pos_weight']:
                    param = {'max_depth': max_depth,
                                'subsample': subsample,
                                'colsample_bytree': colsample_bytree,
                                'scale_pos_weight': scale_pos_weight
                            }
                    score, threshold = train_with_r_forest(df, param)
                    print(score, threshold)
                    scores.append(score)
                    params.append(param)
                    thresholds.append(threshold)
    # ベストスコアのパラメータを使って再トレーニング
    best_estimator = params[np.argmax(scores)]
    best_threshold = thresholds[np.argmax(thresholds)]
    xgb_model = xgb.XGBClassifier(max_depth=best_estimator['max_depth'],
                                subsample=best_estimator['subsample'],
                                colsample_bytree =best_estimator['colsample_bytree'],
                                scale_pos_weight=best_estimator['scale_pos_weight'])
    X_train, y_train = separate_X_y(df)
    xgb_model.fit(X_train, y_train)
    return xgb_model, best_estimator, best_threshold
    




In [9]:


def train_rf_parameter(df, params, undersample_rate):
    print(df.shape)
    df = df[(df['Response'] == 0) | (df['Response'] == 1)]
    df = df.drop(['Id'], axis=1)
    print(df.shape)
    df_train, df_test = train_test_split(df, random_state=33)
    
    df_train_ok   = df_train[df_train['Response'] == 0]
    df_train_ng   = df_train[df_train['Response'] == 1]
    df_train_ok_sample = df_train_ok.sample(frac = undersample_rate)
    df_train = pd.concat([df_train_ok_sample, df_train_ng])
 
    rf, params, threshold = grid_search(df_train, params)
    X_test, y_test = separate_X_y(df_test)
    y_pred_proba = rf.predict_proba(X_test)
    mcc, threshold = serach_best_threshold(y_pred_proba, y_test)
    print(mcc)    
    return rf, params, threshold


def train_and_predict_submission(df, param, threshold):
    df_train_ok   = df[df['Response'] == 0]
    df_train_ng   = df[df['Response'] == 1]
    df_test       = df[df['Response'] == -1]
    
    df_train_balance = pd.concat([df_train_ok, df_train_ng]).drop(['Id'], axis=1)
    df_test_ex_id    = df_test.drop(['Id'], axis=1)
    
    X_train, y_train = separate_X_y(df_train_balance)
    xgb_model = xgb.XGBClassifier(max_depth=param['max_depth'],
                                subsample=param['subsample'],
                                colsample_bytree =param['colsample_bytree'],
                                scale_pos_weight=param['scale_pos_weight'])
    xgb_model.fit(X_train, y_train)

    X_test, y_test = separate_X_y(df_test_ex_id)
    y_pred_proba = xgb_model.predict_proba(X_test)
    y_pred = (y_pred_proba[:, 1] > (threshold*1.2)).astype(int)
      
    df_result_add = pd.DataFrame(columns=['Id', 'Response'])
    df_result_add.loc[:, 'Id']       = df_test['Id'].values
    df_result_add.loc[:, 'Response'] = y_pred
    
    return xgb_model, df_result_add


In [10]:

# Grid Search
params={'max_depth': [5, 10],
        'subsample': [0.95],
        'colsample_bytree': [1.0],
        'scale_pos_weight': [2]
}
model, best_param, best_threshold = train_rf_parameter(df_analyze_S32, params, 1)
print(best_param)
print('threshold:', best_threshold)

(48678, 1441)
(24543, 1440)
0.23 0.486979362092
0.23 0.470909453805
0.16 0.467285417507
0.475058077801 0.16
0.06 0.469781036553
0.11 0.458495759141
0.09 0.465956749546
0.46474451508 0.09
0.25 0.461362366488
0.461362366488
{'max_depth': 5, 'subsample': 0.95, 'colsample_bytree': 1.0, 'scale_pos_weight': 2}
threshold: 0.25


In [11]:
# predict

model, df_result_S32 = train_and_predict_submission(df_analyze_S32, best_param, best_threshold)


In [12]:
model.feature_importances_

array([ 0.        ,  0.        ,  0.        , ...,  0.00152827,
        0.00662252,  0.01630158], dtype=float32)

In [13]:
df_feature_importance = pd.DataFrame(columns=['name', 'importance'])

In [14]:
df_feature_importance.loc[:, 'importance'] = model.feature_importances_
df_feature_importance.loc[:, 'name'] = np.array(df_analyze_S32.drop(['Id','Response'], axis=1).columns)
df_feature_importance.sort_values('importance', ascending=False)

Unnamed: 0,name,importance
1352,L3_S30_F3754,0.018849
1438,L3_S38_F3960,0.016302
1380,L3_S32_F3854_bit_1,0.013754
1397,L3_S33_F3865,0.012736
1238,L3_S29_F3321,0.012736
1312,L3_S30_F3554,0.012226
1255,L3_S29_F3376,0.011717
1394,L3_S33_F3859,0.010698
1423,L3_S36_F3920,0.010188
52,StartTime,0.009679


### Station32 を通過していないサンプルで解析する。

更にStation29を通過しているものとそれ以外に分離。

In [15]:
df_analyze_S28_exS32 = df_analyze[(df_analyze['L3_S29_D3316'] == 1) & (df_analyze['L3_S32_D3852'] == 0)]
df_analyze_exS28     = df_analyze[(df_analyze['L3_S29_D3316'] == 0) & (df_analyze['L3_S32_D3852'] == 0)]


print(df_analyze.shape)
print(df_analyze_S28_exS32.shape)
print(df_analyze_exS28.shape)

(2367495, 56)
(2190476, 56)
(128341, 56)


feature を足す。


In [16]:


df_analyze_S28_exS32 = merge_one_sation_info(df_analyze_S28_exS32, 29)
df_analyze_S28_exS32 = merge_one_sation_info(df_analyze_S28_exS32, 30)
df_analyze_S28_exS32 = merge_one_sation_info(df_analyze_S28_exS32, 31)
df_analyze_S28_exS32 = merge_one_sation_info(df_analyze_S28_exS32, 33)
df_analyze_S28_exS32 = merge_one_sation_info(df_analyze_S28_exS32, 34)
df_analyze_S28_exS32 = merge_one_sation_info(df_analyze_S28_exS32, 35)
df_analyze_S28_exS32 = merge_one_sation_info(df_analyze_S28_exS32, 36)
df_analyze_S28_exS32 = merge_one_sation_info(df_analyze_S28_exS32, 37)
df_analyze_S28_exS32 = merge_one_sation_info(df_analyze_S28_exS32, 38)

df_analyze_S28_exS32.shape

(2190476, 243)

In [17]:
df_analyze_S28_exS32.head()

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S36_F3938,L3_S37_F3944,L3_S37_F3946,L3_S37_F3948,L3_S37_F3950,L3_S38_F3954_bit_0,L3_S38_F3955_bit_9,L3_S38_F3952,L3_S38_F3956,L3_S38_F3960
0,4.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,,,
1,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,,,
2,7.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,,,
3,9.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
4,11.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,


In [18]:
# Grid Search
params={'max_depth': [10],
        'subsample': [0.95],
        'colsample_bytree': [1.0],
        'scale_pos_weight': [2, 4, 8]
}
model_ex_s32, best_param_ex_S32, best_threshold_ex_S32 = train_rf_parameter(df_analyze_S28_exS32, params, 1)
print(best_param_ex_S32)
print('threshold:', best_threshold_ex_S32)


(2190476, 243)
(1095118, 242)
0.08 0.0905819923564
0.12 0.111272832291
0.1 0.103684351704
0.101846392117 0.1
0.15 0.0956803319768
0.15 0.104176523291
0.16 0.10149169362
0.100449516296 0.16
0.2 0.107243310696
0.4 0.104551146764
0.37 0.1046494497
0.105481302387 0.37
0.21 0.124588700736
0.124588700736
{'max_depth': 10, 'subsample': 0.95, 'colsample_bytree': 1.0, 'scale_pos_weight': 8}
threshold: 0.21


undersample しない場合の記録。結果が安定していない。
(2190476, 243)
(1095118, 242)
0.08 0.0905819923564
0.12 0.111272832291
0.1 0.103684351704
0.101846392117 0.1
0.15 0.0956803319768
0.15 0.104176523291
0.16 0.10149169362
0.100449516296 0.16
0.2 0.107243310696
0.4 0.104551146764
0.37 0.1046494497
0.105481302387 0.37
0.21 0.124588700736
0.124588700736
{'max_depth': 10, 'subsample': 0.95, 'colsample_bytree': 1.0, 'scale_pos_weight': 8}
threshold: 0.21

In [19]:
df_result_S28_ex_S32 = train_and_predict_submission(df_analyze_S28_exS32, model_ex_s32.best_params_, 0.1)


AttributeError: 'XGBClassifier' object has no attribute 'best_params_'

In [None]:
model_ex_s32.best_estimator_

<6/27 朝>
Subsample = 0.2

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1.0,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=2, seed=0, silent=True, subsample=0.95)
       
<Subsampleなし>

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1.0,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=20,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=2, seed=0, silent=True, subsample=0.95)

In [None]:
df_station39_cat = get_categorical_cols(
    "../../data/train_categorical_station_39.csv",
    "../../data/test_categorical_station_39.csv",
    ['Id', 'L3_S39_F3967'])

df_station47_cat = get_categorical_cols(
    "../../data/train_categorical_station_47.csv",
    "../../data/test_categorical_station_47.csv",
    ['Id', 'L3_S47_F4141'])

df_analyze_exS28 = pd.merge(df_analyze_exS28, df_station39_cat, on='Id', how='left')
df_analyze_exS28 = pd.merge(df_analyze_exS28, df_station47_cat, on='Id', how='left')
decode_categorical_data(df_analyze_exS28, ['L3_S39_F3967', 'L3_S47_F4141'])

df_station47_train_num = pd.read_csv("../../data/train_numeric_station_47.csv")
df_station47_test_num  = pd.read_csv("../../data/test_numeric_station_47.csv")
df_station47_num = pd.concat([df_station47_train_num, df_station47_test_num])
df_analyze_exS28 = pd.merge(df_analyze_exS28, df_station47_num, on='Id', how='left')


In [None]:

# Grid Search
params={'max_depth': [20],
        'subsample': [0.95],
        'colsample_bytree': [1.0],
        'n_estimators': [100],
        'scale_pos_weight': [2]
}
model_exS28 = train_rf_parameter(df_analyze_exS28, params, 1)

In [None]:
df_result_ex_S28 = train_and_predict_submission(df_analyze_exS28, model_exS28.best_params_, 1)

In [None]:
df_result = pd.concat([df_result_S32, df_result_S28_ex_S32, df_result_ex_S28]).sort_values('Id')

In [None]:
df_result.head

In [None]:
df_result.to_csv("../../submission/submit_20180627_1.csv", index=False)