In [1]:
import os
import pandas as pd
import numpy as np
import math
from matplotlib.pyplot import *
import matplotlib.pyplot as plt
from matplotlib import animation
from matplotlib import cm
from parse import *
import seaborn as sns

In [2]:
CHUNKSIZE = 100000

TRAIN_PATH        = "../../data/train_StationPathInfo.csv"
TRAIN_PATH_EX     = "../../data/train_StationPathInfoEx.csv"
TEST_PATH         = "../../data/test_StationPathInfo.csv"
TEST_PATH_EX      = "../../data/test_StationPathInfoEx.csv"

TRAIN_DATE        = "../../data/train_date.csv"
TRAIN_NUMERIC     = "../../data/train_numeric.csv"
TRAIN_CATEGORICAL = "../../data/train_categorical.csv"
TEST_DATE         = "../../data/test_date.csv"
TEST_NUMERIC      = "../../data/test_numeric.csv"
TEST_CATEGORICAL  = "../../data/test_categorical.csv"

SEED = 0
CHUNKSIZE = 50000
NROWS = 1200000

ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
    
    

In [3]:
chunk_train_num = pd.read_csv(TRAIN_NUMERIC, nrows=NROWS, chunksize=CHUNKSIZE)
chunk_test_num  = pd.read_csv(TEST_NUMERIC, nrows=NROWS, chunksize=CHUNKSIZE)
chunk_train_cat = pd.read_csv(TRAIN_CATEGORICAL, nrows=NROWS, chunksize=CHUNKSIZE)
chunk_test_cat  = pd.read_csv(TEST_CATEGORICAL, nrows=NROWS, chunksize=CHUNKSIZE)

df_train_num = chunk_train_num.get_chunk(10)
df_test_num  = chunk_test_num.get_chunk(10)
df_train_cat = chunk_train_cat.get_chunk(10)
df_test_cat  = chunk_test_cat.get_chunk(10)

df_train_num.head()

Unnamed: 0,Id,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,...,L3_S50_F4245,L3_S50_F4247,L3_S50_F4249,L3_S50_F4251,L3_S50_F4253,L3_S51_F4256,L3_S51_F4258,L3_S51_F4260,L3_S51_F4262,Response
0,4,0.03,-0.034,-0.197,-0.179,0.118,0.116,-0.015,-0.032,0.02,...,,,,,,,,,,0
1,6,,,,,,,,,,...,,,,,,,,,,0
2,7,0.088,0.086,0.003,-0.052,0.161,0.025,-0.015,-0.072,-0.225,...,,,,,,,,,,0
3,9,-0.036,-0.064,0.294,0.33,0.074,0.161,0.022,0.128,-0.026,...,,,,,,,,,,0
4,11,-0.055,-0.086,0.294,0.33,0.118,0.025,0.03,0.168,-0.169,...,,,,,,,,,,0


### 通過ステーション情報・時刻情報を読み出す。

In [4]:
# 読み出す。

df_train_bin = pd.read_csv(TRAIN_PATH)
df_test_bin  = pd.read_csv(TEST_PATH)
df_pass = pd.concat([df_train_bin, df_test_bin])

In [5]:
# 時系列でソートしたヒートマップを出してみる。

TEST_START_TIME      = "../../data/test_StartEndTime.csv"
TRAIN_START_TIME     = "../../data/train_StartEndTime.csv"

df_start_train = pd.read_csv(TRAIN_START_TIME)
df_start_train = df_start_train.ix[:,['Id','StartTime','EndTime', 'Response']]
df_start_test = pd.read_csv(TEST_START_TIME)
df_start_test['Response'] = -1
df_time = pd.concat([df_start_train, df_start_test])


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  import sys


## S32 の numeric, categorical とマージし、S32に限定して予想してみる。

In [6]:
df_station32_train_cat = pd.read_csv("../../data/train_categorical_station_32.csv")
df_station32_test_cat  = pd.read_csv("../../data/test_categorical_station_32.csv")
df_station32_cat = pd.concat([df_station32_train_cat, df_station32_test_cat]).fillna('T0')


df_station32_train_num = pd.read_csv("../../data/train_numeric_station_32.csv")
df_station32_test_num  = pd.read_csv("../../data/test_numeric_station_32.csv")
df_station32_num = pd.concat([df_station32_train_num, df_station32_test_num])


  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
df_analyze = pd.merge(df_pass, df_time, on='Id', how='left')
df_analyze_S32 = df_analyze[df_analyze['L3_S32_D3852'] == 1]

df_analyze_S32 = pd.merge(df_analyze_S32, df_station32_cat, on='Id', how='left')
df_analyze_S32 = pd.merge(df_analyze_S32, df_station32_num, on='Id', how='left')

In [8]:
df_analyze_S32.head(5)

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S49_D4208,L3_S50_D4242,L3_S51_D4255,StartTime,EndTime,Response,L3_S32_F3851,L3_S32_F3853,L3_S32_F3854,L3_S32_F3850
0,116,0,0,0,0,0,0,0,0,0,...,0,0,0,555.38,556.25,0,T1,T0,T16,0.044
1,146,1,1,0,1,0,1,0,1,1,...,0,0,0,1164.57,1170.89,0,T1,T0,T16,-0.055
2,166,0,0,0,0,0,0,0,0,0,...,0,0,0,456.8,493.38,0,T1,T0,T256,0.009
3,293,1,1,1,0,0,1,0,1,1,...,0,0,0,1339.1,1342.73,0,T0,T0,T0,0.006
4,616,1,1,1,0,0,1,1,0,1,...,0,0,0,1575.48,1578.18,0,T1,T0,T128,-0.019


In [9]:

def is_bit_on(s, pos):
    if type(s) is str:
        r = parse("T{}", s)
        val = (int(r[0]))
        mask = 1 << pos
        if val & mask:
            return 1
        else:
            return 0
    else:
        return 0

In [10]:
def decode_categorical_data(df, column_list):
    for column_name, item in df.iteritems():
        if column_name in column_list:
            vallist = item.unique()
            bitlist = []
            for c in vallist:
                if type(c) is str:
                    r = parse("T{}", c)
                    val = int(r[0])
                    if val < 0:
                        val = val + 4294967296
                    if 0 < val:
                        bitpos = int(np.log2(val))
                        bitlist.append(bitpos)

            for bit in bitlist:
                df[column_name + "_bit_" + str(bit)] = df[column_name].apply(lambda x:is_bit_on(x, bit)).astype('int8')
            df.drop(column_name, axis=1, inplace=True)


In [11]:
decode_categorical_data(df_analyze_S32, ['L3_S32_F3851', 'L3_S32_F3853', 'L3_S32_F3854'])


In [12]:
df_analyze_S32.head(5)

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S32_F3854_bit_3,L3_S32_F3854_bit_2,L3_S32_F3854_bit_31,L3_S32_F3854_bit_0,L3_S32_F3854_bit_10,L3_S32_F3854_bit_5,L3_S32_F3854_bit_15,L3_S32_F3854_bit_6,L3_S32_F3854_bit_16,L3_S32_F3854_bit_14
0,116,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,146,1,1,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,293,1,1,1,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,616,1,1,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [13]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, log_loss, make_scorer

def calc_mcc(cf_mat):
    tn, fp, fn, tp = cf_mat.ravel()
    print(tn, fp, fn, tp)
    mcc = (tp * tn - fp * fn) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return mcc

def mcc_scorer(y_true, y_pred):
    cf_mat = confusion_matrix(y_true, y_pred)
    return calc_mcc(cf_mat)

def separate_X_y(df):
    X = df.drop(['Response'], axis=1)
    y = df['Response']
    return X, y

def train_with_r_forest(df, params):
    X_train, y_train = separate_X_y(df)

    xgb_model = xgb.XGBClassifier()
    gs = GridSearchCV(xgb_model,
                      params,
                      cv=5,
                      scoring={'mcc' : make_scorer(mcc_scorer)},
                      n_jobs=1,
                      verbose=2,
                      refit='mcc')

    gs.fit(X_train, y_train)
#rf = RandomForestClassifier(max_depth=10,n_estimators=50, random_state=33)
#    rf.fit(X_train, y_train)
    return gs




In [14]:


def train_rf_parameter(df, params, undersample_rate):
    print(df.shape)
    df = df[(df['Response'] == 0) | (df['Response'] == 1)]
    df = df.drop(['Id'], axis=1)
    print(df.shape)
    df_train, df_test = train_test_split(df, random_state=33)
    
    df_train_ok   = df_train[df_train['Response'] == 0]
    df_train_ng   = df_train[df_train['Response'] == 1]
    df_train_ok_sample = df_train_ok.sample(frac = undersample_rate)
    df_train = pd.concat([df_train_ok_sample, df_train_ng])
 
    rf = train_with_r_forest(df_train, params)
    X_test, y_test = separate_X_y(df_test)
    y_pred = rf.predict(X_test)
    print(y_pred)   
    
    cf_mat = confusion_matrix(y_test, y_pred)
    print(cf_mat)

    mcc = calc_mcc(cf_mat)
    print(mcc)    
    return rf


def train_and_predict_submission(df, param, undersample_rate):
    df_train_ok   = df[df['Response'] == 0]
    df_train_ng   = df[df['Response'] == 1]
    df_test       = df[df['Response'] == -1]
    
    df_train_ok_sample = df_train_ok.sample(frac = undersample_rate)
    df_train_balance = pd.concat([df_train_ok_sample, df_train_ng]).drop(['Id'], axis=1)
    df_test_ex_id    = df_test.drop(['Id'], axis=1)
    
    X_train, y_train = separate_X_y(df_train_balance)
    xgb_model = xgb.XGBClassifier(max_depth=param['max_depth'],
                                subsample=param['subsample'],
                                colsample_bytree =param['colsample_bytree'],
                                scale_pos_weight=param['scale_pos_weight'])
    xgb_model.fit(X_train, y_train)

    X_test, y_test = separate_X_y(df_test_ex_id)
    y_pred = xgb_model.predict(X_test)
      
    df_result_add = pd.DataFrame(columns=['Id', 'Response'])
    df_result_add.loc[:, 'Id']       = df_test['Id'].values
    df_result_add.loc[:, 'Response'] = y_pred
    
    return df_result_add


In [15]:

# Grid Search
params={'max_depth': [5, 10, 20],
        'subsample': [0.95],
        'colsample_bytree': [1.0],
        'scale_pos_weight': [2, 3]
}
model = train_rf_parameter(df_analyze_S32, params, 1)

(48678, 74)
(24543, 73)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] colsample_bytree=1.0, max_depth=5, scale_pos_weight=2, subsample=0.95 
3414 102 92 74
13806 256 267 396
[CV]  colsample_bytree=1.0, max_depth=5, scale_pos_weight=2, subsample=0.95, total=   1.6s
[CV] colsample_bytree=1.0, max_depth=5, scale_pos_weight=2, subsample=0.95 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s


3432 84 93 73
13789 273 259 404
[CV]  colsample_bytree=1.0, max_depth=5, scale_pos_weight=2, subsample=0.95, total=   1.5s
[CV] colsample_bytree=1.0, max_depth=5, scale_pos_weight=2, subsample=0.95 
3438 78 99 67
13814 248 272 391
[CV]  colsample_bytree=1.0, max_depth=5, scale_pos_weight=2, subsample=0.95, total=   1.5s
[CV] colsample_bytree=1.0, max_depth=5, scale_pos_weight=2, subsample=0.95 
3432 83 79 87
13806 257 283 380
[CV]  colsample_bytree=1.0, max_depth=5, scale_pos_weight=2, subsample=0.95, total=   1.6s
[CV] colsample_bytree=1.0, max_depth=5, scale_pos_weight=2, subsample=0.95 
3440 75 95 70
13784 279 276 388
[CV]  colsample_bytree=1.0, max_depth=5, scale_pos_weight=2, subsample=0.95, total=   1.6s
[CV] colsample_bytree=1.0, max_depth=5, scale_pos_weight=3, subsample=0.95 
3358 158 69 97
13650 412 182 481
[CV]  colsample_bytree=1.0, max_depth=5, scale_pos_weight=3, subsample=0.95, total=   1.6s
[CV] colsample_bytree=1.0, max_depth=5, scale_pos_weight=3, subsample=0.95 
3387

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.9min finished


[0 0 0 ..., 0 0 0]
[[5676  183]
 [ 137  140]]
5676 183 137 140
0.440853719017


In [16]:
model.best_params_

{'colsample_bytree': 1.0,
 'max_depth': 5,
 'scale_pos_weight': 3,
 'subsample': 0.95}

In [17]:
# predict

df_result_S32 = train_and_predict_submission(df_analyze_S32, model.best_params_, 1)


In [18]:
model.best_estimator_.feature_importances_

array([ 0.00257334,  0.00051467,  0.00720535,  0.0128667 ,  0.01080803,
        0.00874936,  0.01801338,  0.00669068,  0.00102934,  0.02367473,
        0.00823469,  0.01183737,  0.00102934,  0.        ,  0.00566135,
        0.00411734,  0.00566135,  0.01544004,  0.00720535,  0.00617602,
        0.        ,  0.00720535,  0.00360268,  0.00823469,  0.00463201,
        0.00360268,  0.01646938,  0.01698405,  0.00205867,  0.        ,
        0.        ,  0.001544  ,  0.        ,  0.02727741,  0.02264539,
        0.01441071,  0.01029336,  0.        ,  0.03190942,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.19145651,  0.16006176,  0.15748842,
        0.00720535,  0.        ,  0.02058672,  0.00566135,  0.00772002,
        0.00772002,  0.02882141,  0.01441071,  0.01904272,  0.01698405,
        0.01183737,  0.00411734,  0.01595471,  0.00102934,  0.00

In [19]:
df_feature_importance = pd.DataFrame(columns=['name', 'importance'])

In [20]:
df_feature_importance.loc[:, 'importance'] = model.best_estimator_.feature_importances_
df_feature_importance.loc[:, 'name'] = np.array(df_analyze_S32.drop(['Id','Response'], axis=1).columns)
df_feature_importance.sort_values('importance', ascending=False)

Unnamed: 0,name,importance
52,StartTime,0.191457
53,EndTime,0.160062
54,L3_S32_F3850,0.157488
38,L3_S38_D3953,0.031909
61,L3_S32_F3854_bit_1,0.028821
33,L3_S33_D3856,0.027277
9,L0_S9_D152,0.023675
34,L3_S34_D3875,0.022645
57,L3_S32_F3854_bit_4,0.020587
63,L3_S32_F3854_bit_2,0.019043


In [21]:
np.array(df_analyze.columns)

array(['Id', 'L0_S0_D1', 'L0_S1_D26', 'L0_S2_D34', 'L0_S3_D70',
       'L0_S4_D106', 'L0_S5_D115', 'L0_S6_D120', 'L0_S7_D137',
       'L0_S8_D145', 'L0_S9_D152', 'L0_S10_D216', 'L0_S11_D280',
       'L0_S12_D331', 'L0_S13_D355', 'L0_S14_D360', 'L0_S15_D395',
       'L0_S16_D423', 'L0_S17_D432', 'L0_S18_D437', 'L0_S19_D454',
       'L0_S20_D462', 'L0_S21_D469', 'L0_S22_D543', 'L0_S23_D617',
       'L1_S24_D677', 'L1_S25_D1854', 'L2_S26_D3037', 'L2_S27_D3130',
       'L2_S28_D3223', 'L3_S29_D3316', 'L3_S30_D3496', 'L3_S31_D3836',
       'L3_S32_D3852', 'L3_S33_D3856', 'L3_S34_D3875', 'L3_S35_D3886',
       'L3_S36_D3919', 'L3_S37_D3942', 'L3_S38_D3953', 'L3_S39_D3966',
       'L3_S40_D3981', 'L3_S41_D3997', 'L3_S42_D4029', 'L3_S43_D4062',
       'L3_S44_D4101', 'L3_S45_D4125', 'L3_S46_D4135', 'L3_S47_D4140',
       'L3_S48_D4194', 'L3_S49_D4208', 'L3_S50_D4242', 'L3_S51_D4255',
       'StartTime', 'EndTime', 'Response'], dtype=object)

### Station32 を通過していないサンプルで解析する。

更にStation29を通過しているものとそれ以外に分離。

In [22]:
df_analyze_S28_exS32 = df_analyze[(df_analyze['L3_S29_D3316'] == 1) & (df_analyze['L3_S32_D3852'] == 0)]
df_analyze_exS28     = df_analyze[(df_analyze['L3_S29_D3316'] == 0) & (df_analyze['L3_S32_D3852'] == 0)]


print(df_analyze.shape)
print(df_analyze_S28_exS32.shape)
print(df_analyze_exS28.shape)

(2367495, 56)
(2190476, 56)
(128341, 56)


feature を足す。


In [23]:
def get_categorical_cols(f_train, f_test, col):
    df_train = pd.read_csv(f_train, usecols=col)
    df_test  = pd.read_csv(f_test, usecols=col)
    df = pd.concat([df_train, df_test]).fillna('T0')
    return df

df_station29_cat = get_categorical_cols(
    "../../data/train_categorical_station_29.csv",
    "../../data/test_categorical_station_29.csv",
    ['Id', 'L3_S29_F3481', 'L3_S29_F3484'])

df_station30_cat = get_categorical_cols(
    "../../data/train_categorical_station_30.csv",
    "../../data/test_categorical_station_30.csv",
    ['Id', 'L3_S30_F3818'])

df_station31_cat = get_categorical_cols(
    "../../data/train_categorical_station_31.csv",
    "../../data/test_categorical_station_31.csv",
    ['Id', 'L3_S31_F3835'])


df_station35_cat = get_categorical_cols(
    "../../data/train_categorical_station_35.csv",
    "../../data/test_categorical_station_35.csv",
    ['Id', 'L3_S35_F3902'])

df_analyze_S28_exS32 = pd.merge(df_analyze_S28_exS32, df_station29_cat, on='Id', how='left')
df_analyze_S28_exS32 = pd.merge(df_analyze_S28_exS32, df_station30_cat, on='Id', how='left')
df_analyze_S28_exS32 = pd.merge(df_analyze_S28_exS32, df_station31_cat, on='Id', how='left')
df_analyze_S28_exS32 = pd.merge(df_analyze_S28_exS32, df_station35_cat, on='Id', how='left')
decode_categorical_data(df_analyze_S28_exS32, ['L3_S29_F3481', 'L3_S29_F3484', 'L3_S30_F3818',
                                            'L3_S31_F3835',
                                            'L3_S35_F3902'])

df_station29_train_num = pd.read_csv("../../data/train_numeric_station_29.csv")
df_station29_test_num  = pd.read_csv("../../data/test_numeric_station_29.csv")
df_station29_num = pd.concat([df_station29_train_num, df_station29_test_num])
df_analyze_S28_exS32 = pd.merge(df_analyze_S28_exS32, df_station29_num, on='Id', how='left')

df_analyze_S28_exS32.shape

  if self.run_code(code, result):
  if self.run_code(code, result):
  if self.run_code(code, result):


(2190476, 116)

In [24]:
df_analyze_S28_exS32

Unnamed: 0,Id,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,L0_S6_D120,L0_S7_D137,L0_S8_D145,...,L3_S29_F3464,L3_S29_F3467,L3_S29_F3470,L3_S29_F3473,L3_S29_F3476,L3_S29_F3479,L3_S29_F3482,L3_S29_F3485,L3_S29_F3488,L3_S29_F3491
0,4,1,1,1,0,1,0,0,1,1,...,0.52,-0.240,0.52,0.015,0.015,0.067,0.000,0.0,0.0,0.0
1,6,0,0,0,0,0,0,0,0,0,...,-0.48,0.017,-0.48,-0.019,-0.020,-0.071,0.000,0.0,0.0,0.0
2,7,1,1,1,0,0,1,1,0,1,...,-0.48,0.017,-0.48,-0.019,-0.009,-0.027,0.000,0.0,0.0,0.0
3,9,1,1,1,0,1,0,0,1,1,...,-0.48,0.017,-0.48,-0.028,-0.020,-0.043,0.000,0.0,0.0,0.0
4,11,1,1,0,1,1,0,0,1,1,...,0.52,-0.040,0.52,0.039,0.031,0.106,0.000,0.0,0.0,0.0
5,13,1,1,0,1,1,0,0,1,1,...,-0.48,0.017,-0.48,-0.023,-0.020,-0.075,0.000,0.0,0.0,0.0
6,14,0,0,0,0,0,0,0,0,0,...,-0.48,0.017,-0.48,-0.023,-0.020,-0.090,0.000,0.0,0.0,0.0
7,16,0,0,0,0,0,0,0,0,0,...,0.52,-0.040,0.52,-0.004,-0.005,-0.004,0.000,0.0,0.0,0.0
8,18,1,1,1,0,1,0,0,1,1,...,0.52,0.017,0.52,0.058,0.038,0.125,0.000,0.0,0.0,0.0
9,23,0,0,0,0,0,0,0,0,0,...,0.52,-0.240,0.52,0.001,-0.005,-0.012,0.000,0.0,0.0,0.0


In [25]:


# Grid Search
params={'max_depth': [20],
        'subsample': [0.95],
        'colsample_bytree': [1.0],
        'scale_pos_weight': [2]
}
model_ex_s32 = train_rf_parameter(df_analyze_S28_exS32, params, 0.1)

(2190476, 116)
(1095118, 115)
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] colsample_bytree=1.0, max_depth=20, scale_pos_weight=2, subsample=0.95 
16319 26 767 55
65345 33 0 3287
[CV]  colsample_bytree=1.0, max_depth=20, scale_pos_weight=2, subsample=0.95, total=  51.4s
[CV] colsample_bytree=1.0, max_depth=20, scale_pos_weight=2, subsample=0.95 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   52.2s remaining:    0.0s


16329 16 754 68
65346 32 0 3287
[CV]  colsample_bytree=1.0, max_depth=20, scale_pos_weight=2, subsample=0.95, total=  53.6s
[CV] colsample_bytree=1.0, max_depth=20, scale_pos_weight=2, subsample=0.95 
16329 16 761 61
65341 37 0 3287
[CV]  colsample_bytree=1.0, max_depth=20, scale_pos_weight=2, subsample=0.95, total=  56.8s
[CV] colsample_bytree=1.0, max_depth=20, scale_pos_weight=2, subsample=0.95 
16314 30 760 62
65351 28 0 3287
[CV]  colsample_bytree=1.0, max_depth=20, scale_pos_weight=2, subsample=0.95, total=  57.4s
[CV] colsample_bytree=1.0, max_depth=20, scale_pos_weight=2, subsample=0.95 
16318 26 761 60
65348 31 1 3287
[CV]  colsample_bytree=1.0, max_depth=20, scale_pos_weight=2, subsample=0.95, total=  56.3s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.7min finished


[0 0 0 ..., 0 0 0]
[[271992    456]
 [  1215    117]]
271992 456 1215 117
0.13118892674


In [26]:
df_result_S28_ex_S32 = train_and_predict_submission(df_analyze_S28_exS32, model_ex_s32.best_params_, 0.1)


In [27]:
model_ex_s32.best_estimator_

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1.0,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=20,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=2, seed=0, silent=True, subsample=0.95)

<6/27 朝>
Subsample = 0.2

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1.0,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=2, seed=0, silent=True, subsample=0.95)
       
<Subsampleなし>

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1.0,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=20,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=2, seed=0, silent=True, subsample=0.95)

In [28]:
df_station39_cat = get_categorical_cols(
    "../../data/train_categorical_station_39.csv",
    "../../data/test_categorical_station_39.csv",
    ['Id', 'L3_S39_F3967'])

df_station47_cat = get_categorical_cols(
    "../../data/train_categorical_station_47.csv",
    "../../data/test_categorical_station_47.csv",
    ['Id', 'L3_S47_F4141'])

df_analyze_exS28 = pd.merge(df_analyze_exS28, df_station39_cat, on='Id', how='left')
df_analyze_exS28 = pd.merge(df_analyze_exS28, df_station47_cat, on='Id', how='left')
decode_categorical_data(df_analyze_exS28, ['L3_S39_F3967', 'L3_S47_F4141'])

df_station47_train_num = pd.read_csv("../../data/train_numeric_station_47.csv")
df_station47_test_num  = pd.read_csv("../../data/test_numeric_station_47.csv")
df_station47_num = pd.concat([df_station47_train_num, df_station47_test_num])
df_analyze_exS28 = pd.merge(df_analyze_exS28, df_station47_num, on='Id', how='left')


  if self.run_code(code, result):


In [29]:

# Grid Search
params={'max_depth': [20],
        'subsample': [0.95],
        'colsample_bytree': [1.0],
        'n_estimators': [100],
        'scale_pos_weight': [2]
}
model_exS28 = train_rf_parameter(df_analyze_exS28, params, 1)

(128341, 69)
(64086, 68)
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] colsample_bytree=1.0, max_depth=20, n_estimators=100, scale_pos_weight=2, subsample=0.95 
9561 2 49 2
38242 8 6 194
[CV]  colsample_bytree=1.0, max_depth=20, n_estimators=100, scale_pos_weight=2, subsample=0.95, total=  14.0s
[CV] colsample_bytree=1.0, max_depth=20, n_estimators=100, scale_pos_weight=2, subsample=0.95 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.3s remaining:    0.0s


9562 1 49 1
38242 8 5 196
[CV]  colsample_bytree=1.0, max_depth=20, n_estimators=100, scale_pos_weight=2, subsample=0.95, total=  13.8s
[CV] colsample_bytree=1.0, max_depth=20, n_estimators=100, scale_pos_weight=2, subsample=0.95 
9559 4 47 3
38244 6 5 196
[CV]  colsample_bytree=1.0, max_depth=20, n_estimators=100, scale_pos_weight=2, subsample=0.95, total=  14.0s
[CV] colsample_bytree=1.0, max_depth=20, n_estimators=100, scale_pos_weight=2, subsample=0.95 
9558 4 49 1
38247 4 4 197
[CV]  colsample_bytree=1.0, max_depth=20, n_estimators=100, scale_pos_weight=2, subsample=0.95, total=  14.4s
[CV] colsample_bytree=1.0, max_depth=20, n_estimators=100, scale_pos_weight=2, subsample=0.95 
9560 2 47 3
38242 9 2 199
[CV]  colsample_bytree=1.0, max_depth=20, n_estimators=100, scale_pos_weight=2, subsample=0.95, total=  14.4s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished


[0 0 0 ..., 0 0 0]
[[15932     9]
 [   72     9]]
15932 9 72 9
0.234042580656


In [30]:
df_result_ex_S28 = train_and_predict_submission(df_analyze_exS28, model_exS28.best_params_, 1)

In [31]:
df_result = pd.concat([df_result_S32, df_result_S28_ex_S32, df_result_ex_S28]).sort_values('Id')

In [32]:
df_result.head

<bound method NDFrame.head of               Id  Response
0              1         0
1              2         0
2              3         0
3              5         0
4              8         0
5             10         0
6             12         0
7             15         0
0             17         0
8             19         0
9             20         0
10            21         0
11            22         0
12            24         0
13            25         0
14            29         0
15            30         0
16            32         0
17            33         0
18            35         0
19            36         0
20            37         0
21            39         0
22            40         0
23            42         0
24            43         0
25            45         0
26            46         0
27            48         0
0             50         0
...          ...       ...
1095328  2367436         0
1095329  2367438         0
1095330  2367441         0
1095331  2367445         

In [33]:
df_result.to_csv("../../submission/submit_20180627_1.csv", index=False)