In [1]:
import os
import pandas as pd
import numpy as np
import math
from matplotlib.pyplot import *
import matplotlib.pyplot as plt
from matplotlib import animation
from matplotlib import cm
from parse import *
import seaborn as sns

In [2]:
CHUNKSIZE = 100000

TRAIN_PATH        = "../../data/train_StationPathInfo.csv"
TRAIN_PATH_EX     = "../../data/train_StationPathInfoEx.csv"
TEST_PATH         = "../../data/test_StationPathInfo.csv"
TEST_PATH_EX      = "../../data/test_StationPathInfoEx.csv"

SEED = 0
CHUNKSIZE = 50000
NROWS = 1200000

ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
    
    
    

In [3]:
# 読み出す。

df_train_bin = pd.read_csv(TRAIN_PATH)
df_test_bin  = pd.read_csv(TEST_PATH)
df_pass = pd.concat([df_train_bin, df_test_bin])


通過ステーションをヒートマップにしてみる。

In [4]:
# 時系列でソートしたヒートマップを出してみる。

TEST_START_TIME      = "../../data/test_StartEndTime.csv"
TRAIN_START_TIME     = "../../data/train_StartEndTime.csv"

df_start_train = pd.read_csv(TRAIN_START_TIME)
df_start_train = df_start_train.ix[:,['Id','StartTime','EndTime', 'Response']]
df_start_test = pd.read_csv(TEST_START_TIME)
df_start_test['Response'] = -1


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  import sys


In [5]:
df_start_train.head()

Unnamed: 0,Id,StartTime,EndTime,Response
0,4,82.24,87.29,0
1,6,1313.12,1315.75,0
2,7,1618.7,1624.42,0
3,9,1149.2,1154.16,0
4,11,602.64,606.02,0


In [6]:
df_start_test.head()
df_start = pd.concat([df_start_train, df_start_test])
df_start.head(100)

Unnamed: 0,Id,StartTime,EndTime,Response
0,4,82.24,87.29,0
1,6,1313.12,1315.75,0
2,7,1618.70,1624.42,0
3,9,1149.20,1154.16,0
4,11,602.64,606.02,0
5,13,1331.66,1339.73,0
6,14,1662.63,1664.04,0
7,16,791.22,804.36,0
8,18,517.64,518.08,0
9,23,156.27,157.89,0


In [7]:
df_train = pd.merge(df_start, df_pass, how='left')
df_train = df_train.fillna(-1500)




In [8]:
df_train[df_train.isnull().any(axis=1)]

Unnamed: 0,Id,StartTime,EndTime,Response,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S4_D106,L0_S5_D115,...,L3_S42_D4029,L3_S43_D4062,L3_S44_D4101,L3_S45_D4125,L3_S46_D4135,L3_S47_D4140,L3_S48_D4194,L3_S49_D4208,L3_S50_D4242,L3_S51_D4255


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


def separate_X_y(df):
    X = df.drop(['Response'], axis=1)
    y = df['Response']
    return X, y

def train_with_r_forest(df):
    X_train, y_train = separate_X_y(df)

    cl_weight = {0:0.3, 1:0.7}
    
    rf = RandomForestClassifier(max_depth=10,n_estimators=50, random_state=33)
    rf.fit(X_train, y_train)
    return rf


In [10]:
from sklearn.metrics import confusion_matrix


def calc_mcc(cf_mat):
    tn, fp, fn, tp = cf_mat.ravel()
    print(tn, fp, fn, tp)
    mcc = (tp * tn - fp * fn) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return mcc


def train_rf_parameter(df):
    df = df[(df['Response'] == 0) | (df['Response'] == 1)]
    df_train, df_test = train_test_split(df, random_state=33)
    
    df_train_ok   = df_train[df_train['Response'] == 0]
    df_train_ng   = df_train[df_train['Response'] == 1]
    undersample_rate = len(df_train_ng) * 30 / len(df_train_ok)
    df_train_ok_sample = df_train_ok.sample(frac = undersample_rate)
    df_train = pd.concat([df_train_ok_sample, df_train_ng])

    rf = train_with_r_forest(df_train)
    X_test, y_test = separate_X_y(df_test)
    y_pred = rf.predict(X_test)
    print(y_pred)   
    
    cf_mat = confusion_matrix(y_test, y_pred)
    print(cf_mat)

    mcc = calc_mcc(cf_mat)
    print(mcc)    

train_rf_parameter(df_train)




[0 0 0 ..., 0 0 0]
[[293898    357]
 [  1537    145]]
293898 357 1537 145
0.1552668126


In [11]:
def train_and_predict_one_chnuk(df_train):
    df_train_ok   = df_train[df_train['Response'] == 0]
    df_train_ng   = df_train[df_train['Response'] == 1]
    df_test = df_train[df_train['Response'] == -1]
    
    undersample_rate = len(df_train_ng) * 20 / len(df_train_ok)
    df_train_ok_sample = df_train_ok.sample(frac = undersample_rate)

    df_train_balance = pd.concat([df_train_ok_sample, df_train_ng])
    
    rf = train_with_r_forest(df_train_balance)

    X_test, y_test = separate_X_y(df_test)
    y_pred = rf.predict(X_test)
      
    df_result_add = pd.DataFrame(columns=['Id', 'Response'])
    df_result_add.loc[:, 'Id']       = df_test['Id'].values
    df_result_add.loc[:, 'Response'] = y_pred
    
    return df_result_add

df_result = train_and_predict_one_chnuk(df_train)
    

In [12]:
df_result[df_result['Response']==1].shape


(3291, 2)

In [13]:
df_result[df_result['Response']==0].shape

(1180457, 2)

In [15]:
df_result.to_csv("../../submission/submit_20180620_1.csv", index=False)