In [1]:
#ライブラリのインポート
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline 

# Load the Kaggle Titanic training and test CSV files into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Kaggle/titanic/data/train.csv",
        "Kaggle/titanic/data/test.csv",
    )
)

In [2]:
full_data = [train,test]

# Everything that is "learned" for preprocessing comes from the training set
# only and is computed once, before either frame is modified:
#   * medians used to fill missing Fare / Age values
#   * the set of Embarked levels used for one-hot encoding
fare_median = train["Fare"].median()
age_median = train["Age"].median()
embarked_levels = sorted(train["Embarked"].fillna("S").unique())

processed = []
for dataset in full_data:
    # Missing-value handling: Embarked falls back to "S", numeric columns
    # fall back to the training-set median.
    dataset["Embarked"] = dataset["Embarked"].fillna("S")
    dataset["Fare"] = dataset["Fare"].fillna(fare_median)
    dataset["Age"] = dataset["Age"].fillna(age_median)

    # Categorical handling: encode Embarked against the fixed list of levels
    # so that train and test always end up with the same dummy columns, even
    # if one of them happens not to contain every level.
    encoded = pd.get_dummies(
        dataset.assign(
            Embarked=pd.Categorical(dataset["Embarked"], categories=embarked_levels)
        ),
        columns=["Embarked"],
    )
    encoded["Sex"] = encoded["Sex"].map({"male":0,"female":1})

    processed.append(encoded)

# Rebind the notebook-level names to the preprocessed frames
train, test = processed

In [3]:
# Remove the columns that are not needed, from both frames in the same way
train, test = (
    frame.drop(columns=["PassengerId","Name","Ticket","Cabin"])
    for frame in (train, test)
)

In [4]:
# Feature matrix: every column except the target. The target is dropped by
# name rather than by position so a change in column order cannot leak
# "Survived" into the features or silently discard a real feature.
X_set = train.drop(columns=["Survived"])
# Target vector (copied so later edits to it do not touch `train`)
y_set = train["Survived"].copy()

In [5]:
%%time
#モデル評価のためのライブラリ
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

import xgboost as xgb
from xgboost import XGBClassifier
 
kf = StratifiedKFold(n_splits=10,shuffle=True,random_state=1)
 
xgb_scores = []
for i,(train_index,test_index) in enumerate(kf.split(X_set,y_set)):
    X_cv_train = X_set.iloc[train_index]
    X_cv_test = X_set.iloc[test_index]
    y_cv_train = y_set[train_index]
    y_cv_test = y_set[test_index]
    
    clf = xgb.XGBClassifier(random_state=1)
    
    clf.fit(X_cv_train,y_cv_train)
 
    y_cv_pred = clf.predict(X_cv_test)
    
    acc_score = accuracy_score(y_cv_test,y_cv_pred)
    
    xgb_scores.append(acc_score)
print(xgb_scores)
print("mean : {}".format(np.mean(xgb_scores)))

[0.8111111111111111, 0.8202247191011236, 0.8314606741573034, 0.898876404494382, 0.7415730337078652, 0.797752808988764, 0.7752808988764045, 0.8539325842696629, 0.8764044943820225, 0.797752808988764]
mean : 0.8204369538077405
CPU times: user 890 ms, sys: 47.2 ms, total: 937 ms
Wall time: 941 ms


In [6]:
%%time
#ランダムフォレスト
from sklearn.ensemble import RandomForestClassifier

#10分割交差検証を指定
kf = StratifiedKFold(n_splits=10,shuffle=True,random_state=1)

#交差検証の各スコアを入れる箱
rf_scores = []

for i,(train_index,test_index) in enumerate(kf.split(X_set,y_set)):
    X_cv_train = X_set.iloc[train_index]
    X_cv_test = X_set.iloc[test_index]
    y_cv_train = y_set[train_index]
    y_cv_test = y_set[test_index]
    
    clf = RandomForestClassifier(random_state=1)
    clf.fit(X_cv_train,y_cv_train)

    y_cv_pred = clf.predict(X_cv_test)

    acc = accuracy_score(y_cv_test,y_cv_pred)

    rf_scores.append(acc)

print(rf_scores)
print("mean : {}".format(np.mean(rf_scores)))

[0.7888888888888889, 0.7865168539325843, 0.8426966292134831, 0.898876404494382, 0.7640449438202247, 0.797752808988764, 0.7865168539325843, 0.8314606741573034, 0.8651685393258427, 0.8202247191011236]
mean : 0.818214731585518
CPU times: user 2.23 s, sys: 3.93 ms, total: 2.23 s
Wall time: 2.23 s


In [7]:
%%time
import lightgbm as gbm

kf = StratifiedKFold(n_splits=10,shuffle=True,random_state=1)

gbm_scores = []

for i,(train_index,test_index) in enumerate(kf.split(X_set,y_set)):
    X_cv_train = X_set.iloc[train_index]
    X_cv_test = X_set.iloc[test_index]
    y_cv_train = y_set[train_index]
    y_cv_test = y_set[test_index]
    
    clf = gbm.LGBMClassifier(random_state=1)
    
    clf.fit(X_cv_train,y_cv_train)

    y_cv_pred = clf.predict(X_cv_test)
    
    acc_score = accuracy_score(y_cv_test,y_cv_pred)
    
    gbm_scores.append(acc_score)
print(gbm_scores)
print("mean : {}".format(np.mean(gbm_scores)))

[0.8333333333333334, 0.7865168539325843, 0.8426966292134831, 0.9101123595505618, 0.797752808988764, 0.797752808988764, 0.7752808988764045, 0.8202247191011236, 0.8764044943820225, 0.7752808988764045]
mean : 0.8215355805243446
CPU times: user 766 ms, sys: 36.1 ms, total: 802 ms
Wall time: 802 ms
