In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import sklearn as sk

import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D
mpl.rcParams['figure.figsize'] = (8, 5)

import seaborn as sns
sns.set()
sns.set_style("whitegrid")
sns.set_color_codes()


from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, roc_auc_score, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import binarize
from sklearn.preprocessing import label_binarize
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


import io
import pydot
from IPython.core.display import Image 
from sklearn.tree import export_graphviz


pd.options.display.max_columns = 400
pd.options.display.max_rows = 200
pd.options.display.max_colwidth = 600
pd.options.display.precision = 10

  from pandas.core import datetools


In [47]:
train = pd.read_csv('./__data/act_train.csv', parse_dates=['date'])
test = pd.read_csv('./__data/act_test.csv', parse_dates=['date'])
ppl = pd.read_csv('./__data/people.csv', parse_dates=['date'])

In [48]:
df_train = pd.merge(train, ppl, on='people_id', suffixes=('_act', '_ppl'))
df_test = pd.merge(test, ppl, on='people_id', suffixes=('_act', '_ppl'))

In [49]:
del train, test, ppl

# 1. Preprocessing

* activity_id value를 act1 / act2 에 따라 1 / 2 로 구분
* act data 와 people data 의 data feature를 year / month / day / weekday 로 구분 
* group_1 / people_id feature 의 string 제거 (숫자만 존재)
* 가입 후 특정 행동까지의 기간 생성
* activity_id value의 string을 제거 후 숫자만으로 id 유지
* 기존 연/월/일 이 합쳐진 date feature 제거
* NaN 값 0 으로 대체
* boolean type feature 0 or 1 로 encodint

In [5]:
def cleansing_data(train, test):
    
    # activity_id value를 act1 / act2 에 따라 type1 / type2 로 구분
    for table in [train, test]:
        ait = pd.DataFrame([item[0].lstrip('act') for item in table['activity_id'].str.split('_')])
        table.insert(1, 'activity_id_type', ait)

    # act data 와 people data 의 date feature를 year / month / day / weekday 로 구분    
    for table in [train, test]:
        
        table.insert(3, 'year_act', table['date_act'].dt.year)
        table.insert(4, 'month_act',table['date_act'].dt.month)
        table.insert(5, 'day_act', table['date_act'].dt.day)
        table.insert(6, 'weekday_act', table['date_act'].dt.weekday)
    
        table.insert(19, 'year_ppl', table['date_ppl'].dt.year)
        table.insert(20, 'month_ppl', table['date_ppl'].dt.month)
        table.insert(21, 'day_ppl', table['date_ppl'].dt.day)
        table.insert(22, 'weekday_ppl', table['date_ppl'].dt.weekday)
    
        # group_1 / people_id feature 의 string 제거
        table['group_1'] = table['group_1'].str.lstrip('group')
        table['people_id'] = table['people_id'].str.lstrip('ppl_')
    
        # 가입 후 특정 행동까지의 기간 생성
        table['from_join_to_act'] = (table['date_act'] - table['date_ppl']).astype('timedelta64[D]')
     
        # activity_id value의 string을 제거 후 숫자만으로 id 유지
        table['activity_id'] = table['activity_id'].str.lstrip('act2_')
        table['activity_id'] = table['activity_id'].str.lstrip('act1_')
        
        
        # 기존 연/월/일 이 합쳐진 date feature 제거
        del table['date_act']
        del table['date_ppl']
        
        # NaN 값 0 으로 대체 
        table.fillna(0, inplace=True)
              
    # boolean type feature 0 or 1 로 encodint
    features = ['char_{}'.format(i) for i in range(11, 38)]
    features.insert(0, 'char_10_ppl')

    tf_map = {True: 0, False: 1}
    for table in [train, test]:
        for feature in features:
            table[feature] = table[feature].map(tf_map)
    
    # Memory Error 로 함께 실행 불가
    # type 문자열 제거
#     for table in [train, test]:
#         table = table.replace('type ', '', regex=True).apply(pd.to_numeric, errors='ignore')

    return None

In [6]:
%%time
cleansing_data(df_train, df_test)

Wall time: 30.1 s


#### * type string 제거

In [7]:
%%time
df_train = df_train.replace('type ', '', regex=True).apply(pd.to_numeric, errors='ignore')
df_test = df_test.replace('type ', '', regex=True).apply(pd.to_numeric, errors='ignore')

Wall time: 2min 14s


In [8]:
y = df_train['outcome']
df_train = df_train.drop('outcome', axis=1)

In [77]:
df_train.tail()

Unnamed: 0,people_id,activity_id_type,activity_id,year_act,month_act,day_act,weekday_act,activity_category,char_1_act,char_2_act,char_3_act,char_4_act,char_5_act,char_6_act,char_7_act,char_8_act,char_9_act,char_10_act,year_ppl,month_ppl,day_ppl,weekday_ppl,char_1_ppl,group_1,char_2_ppl,char_3_ppl,char_4_ppl,char_5_ppl,char_6_ppl,char_7_ppl,char_8_ppl,char_9_ppl,char_10_ppl,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,from_join_to_act
2197286,99994.0,2,4668076,2023,6,16,4,4,0,0,0,0,0,0,0,0,0,418,2023,1,6,4,2,17764,3,2,7,2,1,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,95,161.0
2197287,99994.0,2,4743548,2023,3,30,3,4,0,0,0,0,0,0,0,0,0,1832,2023,1,6,4,2,17764,3,2,7,2,1,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,95,83.0
2197288,99994.0,2,536973,2023,1,19,3,2,0,0,0,0,0,0,0,0,0,1,2023,1,6,4,2,17764,3,2,7,2,1,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,95,13.0
2197289,99994.0,2,688656,2023,5,2,1,4,0,0,0,0,0,0,0,0,0,199,2023,1,6,4,2,17764,3,2,7,2,1,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,95,116.0
2197290,99994.0,2,715089,2023,6,15,3,2,0,0,0,0,0,0,0,0,0,1,2023,1,6,4,2,17764,3,2,7,2,1,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,95,160.0


### feature 간 상관계수 확인을 통한 feature selection

In [10]:
df_concat = pd.concat([df_train, y], axis=1)

In [20]:
%%time
corr_train = df_concat.corr()

Wall time: 38.5 s


####  target feature를 제외한 feature 간의 상관계수 중 0.9 이상 -0.9 이하 확인
   * 해당 feature와 target feature 간의 상관계수 재확인
   * feature 간의 상관계수가 높은 feature 들 중 target과의 상관계수가 적은 feature 제거

In [21]:
for num in corr_train.columns:
    for i in corr_train[num].keys():
        if ((corr_train[num][i] > 0.9) | (corr_train[num][i] < -0.9) and (corr_train[num][i] != 1.0)):
            print(num, " / ", i, " : ", corr_train[num][i])
            print("with outcome and", i, " : ", corr_train['outcome'][i])

activity_id_type  /  char_4_act  :  -0.930650834198
with outcome and char_4_act  :  -0.0166410010782
activity_id_type  /  char_6_act  :  -0.921646835621
with outcome and char_6_act  :  -0.0167425176382
char_4_act  /  activity_id_type  :  -0.930650834198
with outcome and activity_id_type  :  0.0182558467491
char_6_act  /  activity_id_type  :  -0.921646835621
with outcome and activity_id_type  :  0.0182558467491
char_6_act  /  char_7_act  :  0.904142191127
with outcome and char_7_act  :  -0.016196438308
char_7_act  /  char_6_act  :  0.904142191127
with outcome and char_6_act  :  -0.0167425176382
char_3_ppl  /  char_4_ppl  :  0.908604270796
with outcome and char_4_ppl  :  -0.114526082512
char_4_ppl  /  char_3_ppl  :  0.908604270796
with outcome and char_3_ppl  :  -0.142767035454
char_21  /  char_28  :  0.979756634551
with outcome and char_28  :  -0.277761893976
char_28  /  char_21  :  0.979756634551
with outcome and char_21  :  -0.27874843315


* 유지 feature : char_21, char_3_ppl, char_6_act, activity_id_type  
* 제거 feature : char_28, char_4_ppl, char_7_act, char_4_act  

In [78]:
del_feature = ['char_28', 'char_4_ppl', 'char_7_act', 'char_4_act']  
for num in del_feature:
    df_train = df_train.drop(num, axis=1)

#### feature 속성 확인

In [79]:
df_train.dtypes

people_id            float64
activity_id_type       int64
activity_id           object
year_act               int64
month_act              int64
day_act                int64
weekday_act            int64
activity_category      int64
char_1_act             int64
char_2_act             int64
char_3_act             int64
char_5_act             int64
char_6_act             int64
char_8_act             int64
char_9_act             int64
char_10_act            int64
year_ppl               int64
month_ppl              int64
day_ppl                int64
weekday_ppl            int64
char_1_ppl             int64
group_1                int64
char_2_ppl             int64
char_3_ppl             int64
char_5_ppl             int64
char_6_ppl             int64
char_7_ppl             int64
char_8_ppl             int64
char_9_ppl             int64
char_10_ppl            int64
char_11                int64
char_12                int64
char_13                int64
char_14                int64
char_15       

In [128]:
import re

In [129]:
id_str = re.compile('e\+[0-9]+')
for num in range(len(df_train['activity_id'])):
    m = id_str.search(df_train['activity_id'][num])
    if m:
        print('index :', num, " / ", m.group())

index : 396743  /  e+06
index : 513730  /  e+06
index : 721590  /  e+05
index : 847795  /  e+05
index : 1459459  /  e+05


In [130]:
str_index = [396743, 513730, 721590, 847795, 1459459]
for num in str_index:
    print("index :", num, " / ", "value :", df_train['activity_id'][num])

index : 396743  /  value : e+06
index : 513730  /  value : 4e+06
index : 721590  /  value : e+05
index : 847795  /  value : 5e+05
index : 1459459  /  value : 2e+05


In [80]:
df_train['activity_id'] = df_train['activity_id'].replace('e+06', 1000000)
df_train['activity_id'] = df_train['activity_id'].replace('4e+06', 4000000)
df_train['activity_id'] = df_train['activity_id'].replace('e+05', 100000)
df_train['activity_id'] = df_train['activity_id'].replace('5e+05', 500000)
df_train['activity_id'] = df_train['activity_id'].replace('2e+05', 200000)
df_train['activity_id'] = df_train['activity_id'].replace('', 0)

In [81]:
df_train['activity_id'] = df_train['activity_id'].apply(pd.to_numeric, errors='ignore')

In [110]:
df_train['people_id'] = df_train['people_id'].astype(np.int64)

In [113]:
df_train.dtypes

people_id              int64
activity_id_type       int64
activity_id            int64
year_act               int64
month_act              int64
day_act                int64
weekday_act            int64
activity_category      int64
char_1_act             int64
char_2_act             int64
char_3_act             int64
char_5_act             int64
char_6_act             int64
char_8_act             int64
char_9_act             int64
char_10_act            int64
year_ppl               int64
month_ppl              int64
day_ppl                int64
weekday_ppl            int64
char_1_ppl             int64
group_1                int64
char_2_ppl             int64
char_3_ppl             int64
char_5_ppl             int64
char_6_ppl             int64
char_7_ppl             int64
char_8_ppl             int64
char_9_ppl             int64
char_10_ppl            int64
char_11                int64
char_12                int64
char_13                int64
char_14                int64
char_15       

In [123]:
pd.DataFrame(df_train.isnull().sum().sort_values(ascending=False))

Unnamed: 0,0
from_join_to_act,0
char_9_act,0
char_7_ppl,0
char_6_ppl,0
char_5_ppl,0
char_3_ppl,0
char_2_ppl,0
group_1,0
char_1_ppl,0
weekday_ppl,0


In [114]:
%%time
df_train.to_csv('df_train.csv')

Wall time: 2min 5s


## X_test 생성

In [115]:
df_test.dtypes

people_id              int64
activity_id_type       int64
activity_id          float64
year_act               int64
month_act              int64
day_act                int64
weekday_act            int64
activity_category      int64
char_1_act             int64
char_2_act             int64
char_3_act             int64
char_5_act             int64
char_6_act             int64
char_8_act             int64
char_9_act             int64
char_10_act            int64
year_ppl               int64
month_ppl              int64
day_ppl                int64
weekday_ppl            int64
char_1_ppl             int64
group_1                int64
char_2_ppl             int64
char_3_ppl             int64
char_5_ppl             int64
char_6_ppl             int64
char_7_ppl             int64
char_8_ppl             int64
char_9_ppl             int64
char_10_ppl            int64
char_11                int64
char_12                int64
char_13                int64
char_14                int64
char_15       

In [119]:
df_test['activity_id'] = df_test['activity_id'].fillna(0).astype(np.int64)

In [84]:
del_feature = ['char_28', 'char_4_ppl', 'char_7_act', 'char_4_act']  
for num in del_feature:
    df_test = df_test.drop(num, axis=1)

In [116]:
print(len(df_train.columns))
print(len(df_test.columns))

58
58


In [121]:
df_test.tail()

Unnamed: 0,people_id,activity_id_type,activity_id,year_act,month_act,day_act,weekday_act,activity_category,char_1_act,char_2_act,char_3_act,char_5_act,char_6_act,char_8_act,char_9_act,char_10_act,year_ppl,month_ppl,day_ppl,weekday_ppl,char_1_ppl,group_1,char_2_ppl,char_3_ppl,char_5_ppl,char_6_ppl,char_7_ppl,char_8_ppl,char_9_ppl,char_10_ppl,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,from_join_to_act
498682,99997,2,4367092,2023,4,22,5,2,0,0,0,0,0,0,0,1,2022,3,12,5,2,17304,2,40,9,3,8,6,6,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,36,406.0
498683,99997,2,4404220,2022,11,12,5,2,0,0,0,0,0,0,0,1,2022,3,12,5,2,17304,2,40,9,3,8,6,6,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,36,245.0
498684,99997,2,448830,2022,8,2,1,2,0,0,0,0,0,0,0,1,2022,3,12,5,2,17304,2,40,9,3,8,6,6,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,36,143.0
498685,99997,2,450133,2022,8,2,1,2,0,0,0,0,0,0,0,1,2022,3,12,5,2,17304,2,40,9,3,8,6,6,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,36,143.0
498686,99997,2,847967,2022,10,15,5,2,0,0,0,0,0,0,0,1,2022,3,12,5,2,17304,2,40,9,3,8,6,6,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,36,217.0


In [122]:
pd.DataFrame(df_test.isnull().sum().sort_values(ascending=False))

Unnamed: 0,0
from_join_to_act,0
char_9_act,0
char_7_ppl,0
char_6_ppl,0
char_5_ppl,0
char_3_ppl,0
char_2_ppl,0
group_1,0
char_1_ppl,0
weekday_ppl,0


In [124]:
df_test.to_csv('df_test.csv')

# 2. Modeling

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc

In [10]:
dfX = pd.read_csv('./df_train.csv')
dfX_test = pd.read_csv('./df_test.csv')

In [11]:
dfX = dfX.drop('Unnamed: 0', axis=1)
dfX_test = dfX_test.drop('Unnamed: 0', axis=1)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(dfX, y, test_size=0.20, random_state=42)

### (1) Logistic Regression

In [16]:
%%time
model_logit = LogisticRegression().fit(X_train, y_train)

Wall time: 1min 46s


In [17]:
logit_pred = model_logit.predict(X_test)

In [18]:
score_logit = roc_auc_score(y_test, logit_pred)
score_logit

0.81969153009766504

### (2) QDA / LDA

In [18]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [20]:
%%time
model_qda = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
model_lda = LinearDiscriminantAnalysis().fit(X_train, y_train)



Wall time: 30.5 s


In [21]:
qda_pred = model_qda.predict(X_test)
lda_pred = model_lda.predict(X_test)

In [22]:
score_qda = roc_auc_score(y_test, qda_pred)
score_qda

0.8484463713831415

In [23]:
score_lda = roc_auc_score(y_test, lda_pred)
score_lda

0.84942525812102077

### (3) Gaussian NB

In [24]:
clf_norm = GaussianNB().fit(X_train, y_train)

In [25]:
norm_pred = clf_norm.predict(X_test)

In [26]:
score_norm = roc_auc_score(y_test, norm_pred)
score_norm

0.74290968083171183

### (4) Multinomial NB

In [27]:
%%time
clf_mult = MultinomialNB().fit(X_train, y_train)

Wall time: 1.71 s


In [28]:
mult_pred = clf_mult.predict(X_test)

In [29]:
score_norm = roc_auc_score(y_test, norm_pred)
score_norm

0.74290968083171183

### (5) Decision Tree

In [73]:
def tree_depth_cv(start_num, last_num):
    for num in range(start_num, last_num):
        clf_tree = DecisionTreeClassifier(criterion='entropy', max_depth=num).fit(X_train, y_train)
        tree_pred = clf_tree.predict(X_test)
        score_tree = roc_auc_score(y_test, tree_pred)
        print('max_depth', num, "roc_auc :", score_tree)

In [76]:
%%time
tree_depth_cv(4, 16)

max_depth 4 roc_auc : 0.847964153961
max_depth 5 roc_auc : 0.857940499621
max_depth 6 roc_auc : 0.85732693423
max_depth 7 roc_auc : 0.869228405685
max_depth 8 roc_auc : 0.872955283524
max_depth 9 roc_auc : 0.87775084123
max_depth 10 roc_auc : 0.882876957192
max_depth 11 roc_auc : 0.887650374013
max_depth 12 roc_auc : 0.894765060338
max_depth 13 roc_auc : 0.900993494643
max_depth 14 roc_auc : 0.907827748908
max_depth 15 roc_auc : 0.916984640805
Wall time: 9min 56s


In [17]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [79]:
model_tree = DecisionTreeClassifier(criterion='entropy', max_depth=15)
cv = KFold(10)

In [80]:
cross_val_score(model_tree, dfX, y, scoring="roc_auc", cv=cv)

array([ 0.93301923,  0.9139052 ,  0.91814841,  0.91622672,  0.92232743,
        0.95160699,  0.91479927,  0.91627849,  0.91769192,  0.91131418])

### (6) Random Forest / Extra Tree

In [16]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [85]:
def forest_depth_cv(start_num, last_num):
    for num in range(start_num, last_num):
        clf_forest = RandomForestClassifier(max_depth=num).fit(X_train, y_train)
        forest_pred = clf_forest.predict(X_test)
        score_forest = roc_auc_score(y_test, forest_pred)
        print('max_depth', num, "roc_auc :", score_forest)

In [86]:
def extra_depth_cv(start_num, last_num):
    for num in range(start_num, last_num):
        clf_extra = ExtraTreesClassifier(max_depth=num).fit(X_train, y_train)
        extra_pred = clf_extra.predict(X_test)
        score_extra = roc_auc_score(y_test, extra_pred)
        print('max_depth', num, "roc_auc :", score_extra)

In [87]:
%%time
forest_depth_cv(4, 16)

max_depth 4 roc_auc : 0.851426404994
max_depth 5 roc_auc : 0.8483555663
max_depth 6 roc_auc : 0.85855364241
max_depth 7 roc_auc : 0.861371785403
max_depth 8 roc_auc : 0.866959875234
max_depth 9 roc_auc : 0.876028689194
max_depth 10 roc_auc : 0.876114416837
max_depth 11 roc_auc : 0.885906584748
max_depth 12 roc_auc : 0.898429728656
max_depth 13 roc_auc : 0.900752626958
max_depth 14 roc_auc : 0.91566384838
max_depth 15 roc_auc : 0.925988047006
Wall time: 13min 41s


In [89]:
model_forest = RandomForestClassifier(max_depth=15)
cv = KFold(10)

In [90]:
%%time
cross_val_score(model_forest, dfX, y, scoring="roc_auc", cv=cv)

Wall time: 9min 24s


array([ 0.95297259,  0.93329085,  0.92751301,  0.92515409,  0.93576942,
        0.96128846,  0.93352199,  0.95356465,  0.93863979,  0.91363566])

In [142]:
%%time
extra_depth_cv(4, 16)

max_depth 4 roc_auc : 0.764296861902
max_depth 5 roc_auc : 0.806148374892
max_depth 6 roc_auc : 0.796401002579
max_depth 7 roc_auc : 0.834317272946
max_depth 8 roc_auc : 0.835493922279
max_depth 9 roc_auc : 0.854304677233
max_depth 10 roc_auc : 0.866282534095
max_depth 11 roc_auc : 0.872890201047
max_depth 12 roc_auc : 0.876869755431
max_depth 13 roc_auc : 0.890649080452
max_depth 14 roc_auc : 0.894783985885
max_depth 15 roc_auc : 0.902485102164
Wall time: 8min 58s


In [143]:
model_extra = ExtraTreesClassifier(max_depth=15)
cv = KFold(10)

In [145]:
%%time
cross_val_score(model_extra, dfX, y, scoring="roc_auc", cv=cv)

Wall time: 2h 18min 12s


array([ 0.95334965,  0.92182121,  0.9176043 ,  0.92014351,  0.92447981,
        0.95650228,  0.92757451,  0.95168339,  0.93040021,  0.91436364])

### importances

In [147]:
fit_model_forest = RandomForestClassifier(max_depth=15).fit(X_train, y_train)
fit_model_extra = ExtraTreesClassifier(max_depth=15).fit(X_train, y_train)

In [148]:
importances_forest = fit_model_forest.feature_importances_
importances_extra = fit_model_extra.feature_importances_

In [149]:
indices_forest = np.argsort(importances_forest)[::-1]
indices_extra = np.argsort(importances_extra)[::-1]

In [153]:
# random forest의 importance

for f in range(dfX.shape[1]):
    print("%d. feature %d : %s (%f)" % (f + 1, indices_forest[f], dfX.keys()[indices_forest[f]], importances_forest[indices_forest[f]]))

1. feature 57 : char_38 (0.340481)
2. feature 22 : group_1 (0.204389)
3. feature 23 : char_2_ppl (0.100308)
4. feature 53 : char_34 (0.043149)
5. feature 28 : char_8_ppl (0.040801)
6. feature 27 : char_7_ppl (0.035768)
7. feature 26 : char_6_ppl (0.028346)
8. feature 29 : char_9_ppl (0.023446)
9. feature 0 : Unnamed: 0 (0.013318)
10. feature 19 : day_ppl (0.012203)
11. feature 34 : char_14 (0.011033)
12. feature 1 : people_id (0.010911)
13. feature 58 : from_join_to_act (0.009720)
14. feature 24 : char_3_ppl (0.009265)
15. feature 18 : month_ppl (0.008427)
16. feature 36 : char_16 (0.007754)
17. feature 45 : char_25 (0.007510)
18. feature 25 : char_5_ppl (0.007355)
19. feature 20 : weekday_ppl (0.005818)
20. feature 50 : char_31 (0.005175)
21. feature 55 : char_36 (0.005063)
22. feature 17 : year_ppl (0.004630)
23. feature 33 : char_13 (0.003719)
24. feature 47 : char_27 (0.003400)
25. feature 30 : char_10_ppl (0.003274)
26. feature 21 : char_1_ppl (0.003168)
27. feature 6 : day_act (0

In [154]:
# extra tree의 importance

for f in range(dfX.shape[1]):
    print("%d. feature %d : %s (%f)" % (f + 1, indices_extra[f], dfX.keys()[indices_extra[f]], importances_extra[indices_extra[f]]))

1. feature 23 : char_2_ppl (0.227132)
2. feature 57 : char_38 (0.198528)
3. feature 22 : group_1 (0.043227)
4. feature 33 : char_13 (0.040249)
5. feature 26 : char_6_ppl (0.036777)
6. feature 53 : char_34 (0.035551)
7. feature 56 : char_37 (0.030172)
8. feature 28 : char_8_ppl (0.029568)
9. feature 55 : char_36 (0.026445)
10. feature 21 : char_1_ppl (0.025826)
11. feature 45 : char_25 (0.024788)
12. feature 29 : char_9_ppl (0.019890)
13. feature 37 : char_17 (0.018249)
14. feature 39 : char_19 (0.017394)
15. feature 51 : char_32 (0.017272)
16. feature 27 : char_7_ppl (0.016643)
17. feature 19 : day_ppl (0.013324)
18. feature 31 : char_11 (0.012731)
19. feature 47 : char_27 (0.012183)
20. feature 24 : char_3_ppl (0.008199)
21. feature 18 : month_ppl (0.007442)
22. feature 30 : char_10_ppl (0.007305)
23. feature 34 : char_14 (0.007010)
24. feature 48 : char_29 (0.006936)
25. feature 40 : char_20 (0.006326)
26. feature 43 : char_23 (0.006130)
27. feature 0 : Unnamed: 0 (0.006034)
28. feat

### (7) XGboost

In [15]:
from xgboost import XGBClassifier



In [158]:
%%time
xgb = XGBClassifier().fit(X_train, y_train)

Wall time: 3min 42s


In [159]:
xgb_pred = xgb.predict(X_test)

In [160]:
score_xgb = roc_auc_score(y_test, xgb_pred)
score_xgb

0.87554826050304957

In [11]:
def xgb_depth_cv(start_num, last_num):
    for num in range(start_num, last_num):
        clf_xgb = XGBClassifier(max_depth=num).fit(X_train, y_train)
        xgb_pred = clf_xgb.predict(X_test)
        score_xgb = roc_auc_score(y_test, xgb_pred)
        print('max_depth', num, "roc_auc :", score_xgb)

In [12]:
%%time
xgb_depth_cv(4, 16)

max_depth 4 roc_auc : 0.880805518293
max_depth 5 roc_auc : 0.886237835939
max_depth 6 roc_auc : 0.894063783836
max_depth 7 roc_auc : 0.902763827277
max_depth 8 roc_auc : 0.912608216242
max_depth 9 roc_auc : 0.929702050162
max_depth 10 roc_auc : 0.941208316548
max_depth 11 roc_auc : 0.952275710772
max_depth 12 roc_auc : 0.964963433399
max_depth 13 roc_auc : 0.975402867733
max_depth 14 roc_auc : 0.981144819305
max_depth 15 roc_auc : 0.985586124611
Wall time: 6h 14min 46s


In [15]:
model_xgb = XGBClassifier(max_depth=15)
cv = KFold(10)

In [16]:
%%time
cross_val_score(model_xgb, dfX, y, scoring="roc_auc", cv=cv)

Wall time: 2h 51min 47s


array([ 0.96943006,  0.95558905,  0.95386729,  0.95302876,  0.95916341,
        0.97666811,  0.95515991,  0.73373593,  0.95916279,  0.94600736])

# 3. Predict

### 모델 별 'roc_auc' 확인

1. Logistic Regression : 0.81977578707364129
2. QDA : 0.84836803000416583
3. LDA : 0.8494365670929438
4. GNB : 0.74290032673068318
5. MNB : 0.74290032673068318
6. Decision Tree (depth 1 / 15 ) : 0.847964153961 / 0.916984640805
7. Random Forest (depth 1 / 15 ) : 0.851426404994 / 0.925988047006
8. Extra Tree (depth 1 / 15 ) : 0.764296861902 / 0.902485102164
9. XGBoost (depth 1 / 15 ) : 0.880805518293 / 0.985586124611

###  모델 중 'roc_auc' 가 0.8 이상인 것으로 votingclassifier

In [13]:
from sklearn.ensemble import VotingClassifier

In [19]:
LOG = LogisticRegression()
QDA = QuadraticDiscriminantAnalysis()
LDA = LinearDiscriminantAnalysis()
DTC = DecisionTreeClassifier() 
RFC = RandomForestClassifier()
ETC = ExtraTreesClassifier()
XGB = XGBClassifier()

In [20]:
DTC_15 = DecisionTreeClassifier(max_depth=15) 
RFC_15 = RandomForestClassifier(max_depth=15)
ETC_15 = ExtraTreesClassifier(max_depth=15)
XGB_15 = XGBClassifier(max_depth=15)

### All Model

In [21]:
all_clf = VotingClassifier(estimators=[('LOG', LOG), ('QDA', QDA), ('LDA', LDA), ('DTC', DTC), 
                                     ('RFC', RFC), ('ETC',ETC), ('XGB',XGB)], voting='soft')

In [31]:
%%time
all_clf.fit(X_train, y_train)



Wall time: 10min 3s


VotingClassifier(estimators=[('LOG', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('QDA', Quadr...logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))],
         n_jobs=1, voting='soft', weights=None)

In [32]:
all_clf_pred = all_clf.predict(X_test)

In [33]:
score_all = roc_auc_score(y_test, all_clf_pred)
score_all

0.96749020543769404

### Decision Tree / Random Forest / Extra Tree : max_depth 1

In [34]:
tree_clf = VotingClassifier(estimators=[('DTC', DTC), ('RFC', RFC), ('ETC',ETC), ('XGB',XGB)], voting='soft')

In [35]:
%%time
tree_clf.fit(X_train, y_train)

Wall time: 7min 29s


VotingClassifier(estimators=[('DTC', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, rand...logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))],
         n_jobs=1, voting='soft', weights=None)

In [36]:
tree_clf_pred = tree_clf.predict(X_test)

In [37]:
score_tree_clf = roc_auc_score(y_test, tree_clf_pred)
score_tree_clf

0.99449035344378156

### Decision Tree / Random Forest / Extra Tree : max_depth 15

In [38]:
tree15_clf = VotingClassifier(estimators=[('DTC_15', DTC_15), ('RFC_15', RFC_15), ('ETC_15',ETC_15), ('XGB_15',XGB_15)], voting='soft')

In [39]:
%%time
tree15_clf.fit(X_train, y_train)

Wall time: 25min 27s


VotingClassifier(estimators=[('DTC_15', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, ran...logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))],
         n_jobs=1, voting='soft', weights=None)

In [40]:
tree15_clf_pred = tree15_clf.predict(X_test)

In [41]:
score_tree15_clf = roc_auc_score(y_test, tree15_clf_pred)
score_tree15_clf

0.95354261505025006

#### Predict Save

* all model

In [42]:
%%time
all_clf = VotingClassifier(estimators=[('LOG', LOG), ('QDA', QDA), ('LDA', LDA), ('DTC', DTC), 
                                       ('RFC', RFC), ('ETC',ETC), ('XGB',XGB)], voting='soft')
all_clf.fit(dfX, y)



Wall time: 9min 20s


In [43]:
y_hat_all = all_clf.predict(dfX_test)

In [50]:
all_hat_answer = pd.concat([pd.DataFrame(df_test['activity_id'], columns=['activity_id']), pd.DataFrame(y_hat_all, columns=['outcome'])], axis=1)

In [52]:
all_hat_answer.to_csv('./all_hat_answer.csv', index=False)

* tree model

In [54]:
%%time
tree_clf = VotingClassifier(estimators=[('DTC', DTC), ('RFC', RFC), ('ETC',ETC), ('XGB',XGB)], voting='soft')
tree_clf.fit(dfX, y)

Wall time: 6min 26s


In [55]:
y_hat_tree = tree_clf.predict(dfX_test)

In [58]:
tree_hat_answer = pd.concat([pd.DataFrame(df_test['activity_id'], columns=['activity_id']), pd.DataFrame(y_hat_tree, columns=['outcome'])], axis=1)

In [60]:
tree_hat_answer.to_csv('./tree_hat_answer.csv', index=False)

* tree model : max_depth 15

In [61]:
%%time
tree15_clf = VotingClassifier(estimators=[('DTC_15', DTC_15), ('RFC_15', RFC_15), ('ETC_15',ETC_15), ('XGB_15',XGB_15)], voting='soft')
tree15_clf.fit(dfX, y)

Wall time: 21min 38s


In [62]:
y_hat_tree15 = tree15_clf.predict(dfX_test)

In [63]:
tree15_hat_answer = pd.concat([pd.DataFrame(df_test['activity_id'], columns=['activity_id']), pd.DataFrame(y_hat_tree15, columns=['outcome'])], axis=1)

In [64]:
tree15_hat_answer.to_csv('./tree15_hat_answer.csv', index=False)