In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import warnings
from collections import Counter
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.feature_selection import RFECV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
from sklearn.model_selection import GridSearchCV

warnings.filterwarnings('ignore')

# SET PARAMETERS
file_model = 'train'
file_score = 'test'

# トレーニングデータをロード  
train = pd.read_csv('./data/'+ file_model + '.csv', header=0)

# Sex/ Embarked : 数値に変換
# 欠損値は平均で補完
train= train.replace("male",0).replace("female",1).replace("S",0).replace("C",1).replace("Q",2)
train["Age"].fillna(train.Age.mean(), inplace=True)
train["Embarked"].fillna(train.Embarked.mean(), inplace=True)

# Name "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5
# 上記敬称パターンに無い例外的な敬称はRareとする
# 欠損は0
combine1 = [train]
for train in combine1: 
        train['Salutation'] = train.Name.str.extract(' ([A-Za-z]+).', expand=False) 

for train in combine1: 
        train['Salutation'] = train['Salutation'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
        train['Salutation'] = train['Salutation'].replace('Mlle', 'Miss')
        train['Salutation'] = train['Salutation'].replace('Ms', 'Miss')
        train['Salutation'] = train['Salutation'].replace('Mme', 'Mrs')
        del train['Name']

Salutation_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5} 

for train in combine1: 
        train['Salutation'] = train['Salutation'].map(Salutation_mapping) 
        train['Salutation'] = train['Salutation'].fillna(0)


# Ticket 
# Ticketの先頭の文字で分類
# 文字列の長さでも分類
for train in combine1: 
        train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
        train['Ticket_Lett'] = train['Ticket_Lett'].apply(lambda x: str(x)) 
        train['Ticket_Lett'] = np.where((train['Ticket_Lett']).isin(['1', '2', '3', 'S', 'P', 'C', 'A']), train['Ticket_Lett'], np.where((train['Ticket_Lett']).isin(['W', '4', '7', '6', 'L', '5', '8']), '0','0')) 
        train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x)) 
        del train['Ticket'] 

train['Ticket_Lett']=train['Ticket_Lett'].replace("1",1).replace("2",2).replace("3",3).replace("0",0).replace("S",3).replace("P",0).replace("C",3).replace("A",3)

# Cabin
# 先頭の文字で分類
for train in combine1: 
    train['Cabin_Lett'] = train['Cabin'].apply(lambda x: str(x)[0]) 
    train['Cabin_Lett'] = train['Cabin_Lett'].apply(lambda x: str(x)) 
    train['Cabin_Lett'] = np.where((train['Cabin_Lett']).isin([ 'F', 'E', 'D', 'C', 'B', 'A']),train['Cabin_Lett'], np.where((train['Cabin_Lett']).isin(['W', '4', '7', '6', 'L', '5', '8']), '0','0'))
del train['Cabin'] 
train['Cabin_Lett']=train['Cabin_Lett'].replace("A",1).replace("B",2).replace("C",1).replace("0",0).replace("D",2).replace("E",2).replace("F",1)

# Add FamilySize/IsAlome
# 一緒に乗船している人数によって生存に大きく差が出る為、FamilySize/isAloneを項目として追加します。
# ここまででまだ使われていないものはPclass、SibspとParchです。
# Pclassは何等級のところに乗っていたかを表すものなのでこのままでいいです。
# Sibspは乗っていた夫婦と兄弟の人数を表したものです。Parchは乗っていた親と子供の人数を表したものです。
# よってSibsp+Parch+1がFamilySizeとなります。また、FamilySizeが1だとIsAlone一人で乗っているかどうかが1となります。
train["FamilySize"] = train["SibSp"] + train["Parch"] + 1
for train in combine1:
    train['IsAlone'] = 0
    train.loc[train['FamilySize'] == 1, 'IsAlone'] = 1

ID = train.iloc[:,0] 
X  = train.iloc[:, 2:] # Pclass以降の変数
y  = train.iloc[:, 1]  # 正解データ    
X_columns = X.columns.values

X.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Salutation,Ticket_Lett,Ticket_Len,Cabin_Lett,FamilySize,IsAlone
0,3,0,22.0,1,0,7.25,0.0,1.0,3,9,0,2,0
1,1,1,38.0,1,0,71.2833,1.0,3.0,0,8,1,2,0
2,3,1,26.0,0,0,7.925,0.0,2.0,3,16,0,1,1
3,1,1,35.0,1,0,53.1,0.0,3.0,1,6,1,2,0
4,3,0,35.0,0,0,8.05,0.0,1.0,3,6,0,1,1
5,3,0,29.699118,0,0,8.4583,2.0,1.0,3,6,0,1,1
6,1,0,54.0,0,0,51.8625,0.0,1.0,1,5,2,1,1
7,3,0,2.0,3,1,21.075,0.0,4.0,3,6,0,5,0
8,3,1,27.0,0,2,11.1333,0.0,3.0,3,6,0,3,0
9,2,1,14.0,1,0,30.0708,1.0,3.0,2,6,0,2,0


In [2]:
# テストデータをロード
test = pd.read_csv('./data/'+ file_score + '.csv', header=0)
test= test.replace("male",0).replace("female",1).replace("S",0).replace("C",1).replace("Q",2)

test["Age"].fillna(train.Age.mean(), inplace=True)
test["Fare"].fillna(train.Fare.mean(), inplace=True)

combine = [test]
for test in combine:
    test['Salutation'] = test.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
for test in combine:
    test['Salutation'] = test['Salutation'].replace(['Lady', 'Countess','Capt', 'Col',\
         'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    test['Salutation'] = test['Salutation'].replace('Mlle', 'Miss')
    test['Salutation'] = test['Salutation'].replace('Ms', 'Miss')
    test['Salutation'] = test['Salutation'].replace('Mme', 'Mrs')
    del test['Name']
Salutation_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for test in combine:
    test['Salutation'] = test['Salutation'].map(Salutation_mapping)
    test['Salutation'] = test['Salutation'].fillna(0)

for test in combine:
        test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
        test['Ticket_Lett'] = test['Ticket_Lett'].apply(lambda x: str(x))
        test['Ticket_Lett'] = np.where((test['Ticket_Lett']).isin(['1', '2', '3', 'S', 'P', 'C', 'A']), test['Ticket_Lett'],
                                   np.where((test['Ticket_Lett']).isin(['W', '4', '7', '6', 'L', '5', '8']),
                                            '0', '0'))
        test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
        del test['Ticket']
test['Ticket_Lett']=test['Ticket_Lett'].replace("1",1).replace("2",2).replace("3",3).replace("0",0).replace("S",3).replace("P",0).replace("C",3).replace("A",3) 

for test in combine:
        test['Cabin_Lett'] = test['Cabin'].apply(lambda x: str(x)[0])
        test['Cabin_Lett'] = test['Cabin_Lett'].apply(lambda x: str(x))
        test['Cabin_Lett'] = np.where((test['Cabin_Lett']).isin(['T', 'H', 'G', 'F', 'E', 'D', 'C', 'B', 'A']),test['Cabin_Lett'],
                                   np.where((test['Cabin_Lett']).isin(['W', '4', '7', '6', 'L', '5', '8']),
                                            '0','0'))        
        del test['Cabin']
test['Cabin_Lett']=test['Cabin_Lett'].replace("A",1).replace("B",2).replace("C",1).replace("0",0).replace("D",2).replace("E",2).replace("F",1).replace("G",1) 

test["FamilySize"] = train["SibSp"] + train["Parch"] + 1

for test in combine:
    test['IsAlone'] = 0
    test.loc[test['FamilySize'] == 1, 'IsAlone'] = 1
    
IDs = test.iloc[:,[0]] 
Xs  = test.iloc[:, 1:]

IDs.head(10)

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896
5,897
6,898
7,899
8,900
9,901


In [3]:
# パラメータチューニング
param_grid_rf = {'n_estimators'      : [10,25,50,75,100],
                 'n_jobs'            : [4],
                 'min_samples_split' : [5,10,15,20,25,30],
                 'max_depth'         : [5,10,15,20,25,30]}

rf_clf = GridSearchCV(RandomForestClassifier(), param_grid_rf)
rf_clf.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 25, 50, 75, 100], 'n_jobs': [4], 'min_samples_split': [5, 10, 15, 20, 25, 30], 'max_depth': [5, 10, 15, 20, 25, 30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [4]:
# CLASSIFIER
# pipe_knn = Pipeline([('scl',StandardScaler()), ('est',KNeighborsClassifier())])
pipe_logistic = Pipeline([('scl', StandardScaler()), ('est',LogisticRegression())])
pipe_rf = Pipeline([('scl',StandardScaler()),('est',rf_clf.best_estimator_)])
pipe_gb = Pipeline([('scl',StandardScaler()),('est',GradientBoostingClassifier())])
pipe_mlp = Pipeline([('scl',StandardScaler()),('est',MLPClassifier(max_iter=2000,hidden_layer_sizes=(4,2)))])
# pipe_svc = Pipeline([('scl',StandardScaler()),('est',LinearSVC(random_state=1))])
pipe_xgb = Pipeline([('scl',StandardScaler()),('est',XGBClassifier())])

In [5]:
# Process for Imbalanced data
# 不均衡データ対応処理

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler,SMOTE

# Original
y = y.as_matrix().ravel()

# データが不均衡である場合、Smoteのみ行うこととする。（全部やると時間がかかりすぎる為）
smt = SMOTE()
X_smt,y_smt = smt.fit_sample(X, y)
print('SMOTE', Counter(y_smt))

SMOTE Counter({0: 549, 1: 549})


In [6]:
# Modeling & Scoring
# トレーニングデータでモデルを生成し、テストデータで検証
# cross_validateを使用

warnings.filterwarnings('ignore')

metrics_dict = {'1':'accuracy', '2':'precision', '3':'recall', '4':'f1', '5':'roc_auc' }
scores_df = pd.DataFrame(index=[], columns=['algorithm','org_train_score','org_test_score','smt_train_score','smt_test_score'] )
pipe_names = ['Logistic','RandomForest','GradientBoosting','MLP','Xgboost']

pipe_scores_dict = {}

print('Enter one of the following metrics number')
input_num = input('1:accuracy, 2:precision, 3:recall, 4:f1, 5:roc_auc ')
print(metrics_dict[input_num])


# pipe_lines = [pipe_knn, pipe_logistic, pipe_rf, pipe_gb, pipe_mlp, pipe_svc]
pipe_lines = [pipe_logistic, pipe_rf, pipe_gb, pipe_mlp, pipe_xgb]

for (i,pipe) in enumerate(pipe_lines):
    skf = StratifiedKFold(n_splits=10,shuffle=True)
    
    # Train Original
    pipe.fit(X, y)
    joblib.dump(pipe, './model/'+ metrics_dict[input_num] + '_' +  pipe_names[i] + '_org.pkl')
    org_scores = cross_validate(pipe, X, y, cv=skf, scoring=metrics_dict[input_num])
    
    # Scoring (Original)    
    org_score = pd.DataFrame(pipe.predict(Xs), columns=['Survived'])
    IDs.join(org_score).to_csv('./data/'+ metrics_dict[input_num] + '_' + pipe_names[i] + '_org_with_pred.csv', index=False)

    # Train Smote
    pipe.fit(X_smt, y_smt)
    joblib.dump(pipe, './model/'+ metrics_dict[input_num] + '_' + pipe_names[i] + '_smt.pkl')
    smt_scores = cross_validate(pipe, X_smt, y_smt, cv=skf, scoring=metrics_dict[input_num])
    
    # Scoring (Smote)    
    smt_score = pd.DataFrame(pipe.predict(Xs), columns=['Survived'])
    IDs.join(smt_score).to_csv('./data/'+ metrics_dict[input_num] + '_' + pipe_names[i] + '_smt_with_pred.csv', index=False)
   
    # Make Dataframe    
    series = pd.Series([pipe_names[i],
                        np.mean(org_scores['train_score']),
                        np.mean(org_scores['test_score']), 
                        np.mean(smt_scores['train_score']),
                        np.mean(smt_scores['test_score'])],
                        index=scores_df.columns)
    scores_df = scores_df.append(series, ignore_index = True)
    
    # Make scores for ranking    
    pipe_scores_dict[pipe_names[i]] = max(np.mean(org_scores['test_score']), np.mean(smt_scores['test_score']))


# 評価結果を性能順にソートしてprint
for pipe, score in sorted(pipe_scores_dict.items(), key=lambda x: x[1], reverse=True):
    print('%s: %.5f' %(pipe, score))
    
scores_df

Enter one of the following metrics number
1:accuracy, 2:precision, 3:recall, 4:f1, 5:roc_auc 5
roc_auc
Xgboost: 0.91870
RandomForest: 0.91754
GradientBoosting: 0.91505
MLP: 0.88294
Logistic: 0.87307


Unnamed: 0,algorithm,org_train_score,org_test_score,smt_train_score,smt_test_score
0,Logistic,0.870577,0.864198,0.881796,0.873072
1,RandomForest,0.9518,0.88575,0.967018,0.917538
2,GradientBoosting,0.960659,0.879868,0.969891,0.915055
3,MLP,0.813402,0.79823,0.890988,0.882942
4,Xgboost,0.947273,0.888814,0.962804,0.918702
