In [83]:
import numpy as np
import pandas as pd
import pandas_profiling
import statistics as st
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline

In [84]:
# Read the three input CSV files (train, test, gender_submission) from ./input
data_train, data_test, data_gender_submission = map(
    pd.read_csv,
    ('./input/train.csv', './input/test.csv', './input/gender_submission.csv'),
)

In [85]:
# Preview the first five rows of the training data
data_train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [86]:
# Preview the first five rows of the test data
data_test.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [87]:
# Preview the first five rows of the gender_submission file
data_gender_submission.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [88]:
# Inspect the training data: build a profiling report for the raw training
# frame and export it as an HTML file.
profile = pandas_profiling.ProfileReport(data_train)
profile.to_file("profile.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [89]:
# Feature engineering
#
# The same preparation is applied to the training and the test frame:
#   1. drop the columns that are not used as features,
#   2. fill missing values (Age -> mean, Fare -> median, Embarked -> mode),
#   3. convert the string columns Sex and Embarked to integer codes.
#
# The work is done on new frames and the results are re-bound to
# data_train / data_test. Chained in-place calls such as
# df['col'].fillna(..., inplace=True) operate on a temporary column object and
# leave the parent frame untouched under pandas Copy-on-Write, so they are
# avoided here. The cell can also be re-run safely: already-dropped columns are
# ignored and already-encoded values pass through unchanged.

# Columns that are not used as features
DROP_COLUMNS = ['PassengerId', 'Name', 'Parch', 'SibSp', 'Ticket', 'Cabin']
# String -> integer encodings
SEX_CODES = {'male': 0, 'female': 1}
EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}


def _encode(column, codes):
    """Map the values of ``column`` through ``codes`` and return an int64 Series.

    Values that are not keys of ``codes`` are passed through unchanged, which
    keeps an already-encoded column intact. A leftover non-numeric value makes
    the final ``astype`` raise ``ValueError``.
    """
    return column.map(lambda value: codes.get(value, value)).astype('int64')


def _prepare_features(frame):
    """Return a cleaned copy of ``frame``; the input frame is not modified.

    Missing values are filled with statistics computed from ``frame`` itself.
    """
    # Drop unused columns ('ignore' makes a second run a no-op)
    prepared = frame.drop(columns=DROP_COLUMNS, errors='ignore')

    # Fill missing data; mean(), median() and mode() all skip NaN
    prepared = prepared.fillna({
        'Age': prepared['Age'].mean(),
        'Fare': prepared['Fare'].median(),
        'Embarked': prepared['Embarked'].mode().iloc[0],
    })

    # Convert strings to numbers
    prepared['Sex'] = _encode(prepared['Sex'], SEX_CODES)
    prepared['Embarked'] = _encode(prepared['Embarked'], EMBARKED_CODES)
    return prepared


# NOTE(review): the test frame is imputed with its own statistics, as before;
# consider reusing the training statistics for the test frame instead.
data_train = _prepare_features(data_train)
data_test = _prepare_features(data_test)

In [90]:
# Re-inspect the training data: build a second profiling report after the
# feature-engineering step and export it as an HTML file.
profile2 = pandas_profiling.ProfileReport(data_train)
profile2.to_file("profile2.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [91]:
# Split the training data: separate the target column from the features,
# then hold out 20% of the rows as a validation set (fixed seed).
y = data_train['Survived']
X = data_train.drop(columns=['Survived'])

X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Report the shape of every split
for caption, part in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_val shape:", X_val),
    ("y_val shape:", y_val),
):
    print(caption, part.shape)

X_train shape: (712, 5)
y_train shape: (712,)
X_val shape: (179, 5)
y_val shape: (179,)


In [92]:
# Test data
# X_test is a second name for the data_test frame (plain assignment, no copy),
# so both names refer to the same object.
X_test = data_test

In [93]:
# Random forest: grid-search the number of trees and the maximum depth
# with 5-fold cross-validation on the training split.
rf_params = {
    "randomforestclassifier__n_estimators": [25, 50, 75],
    "randomforestclassifier__max_depth": [10, 20, 30, 40],
}

rf_clf = make_pipeline(RandomForestClassifier(random_state=42))

# fit() returns the fitted search object itself
rf_model = GridSearchCV(
    rf_clf,
    param_grid=rf_params,
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Show the best parameter combination found by the search
rf_model.best_params_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


{'randomforestclassifier__max_depth': 10,
 'randomforestclassifier__n_estimators': 75}

In [94]:
# Score the tuned random forest on the training and the validation split
rf_acc_train, rf_acc_val = (
    rf_model.score(features, target)
    for features, target in ((X_train, y_train), (X_val, y_val))
)

# Print both scores rounded to four decimals
print("RF Training Accuracy:", round(rf_acc_train, 4))
print("RF Validation Accuracy:", round(rf_acc_val, 4))

RF Training Accuracy: 0.9452
RF Validation Accuracy: 0.8156


In [95]:
# Predict on the test features with the tuned random forest
y_test_rf = rf_model.predict(X_test)

# Put the predictions into a one-column integer frame named 'Survived'
result_rf = pd.DataFrame(y_test_rf, columns=['Survived']).astype(int)

# Place the PassengerId column from the gender_submission file next to the
# predictions and write the result to CSV without the index.
submit_rf = pd.concat(
    [data_gender_submission['PassengerId'].astype(int), result_rf],
    axis='columns',
)
submit_rf.to_csv('./output/submit_v1.csv', index=False)

In [96]:
# Gradient boosting: grid-search the number of boosting stages and the tree
# depth with 5-fold cross-validation on the training split.
#
# random_state is fixed (same seed as the random forest pipeline) so that
# repeated runs give the same fitted model: without it, ties between equally
# good splits are broken randomly and scores can vary from run to run.
gb_clf = make_pipeline(GradientBoostingClassifier(random_state=42))

gb_params = {
    "gradientboostingclassifier__n_estimators": range(20, 31, 5),
    "gradientboostingclassifier__max_depth": range(2, 5)
}

gb_model = GridSearchCV(gb_clf, param_grid=gb_params, cv=5, n_jobs=-1, verbose=1)
gb_model.fit(X_train, y_train)

# Show the best parameter combination found by the search
gb_model.best_params_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


{'gradientboostingclassifier__max_depth': 2,
 'gradientboostingclassifier__n_estimators': 30}

In [97]:
# Score the tuned gradient boosting model on the training and the validation split
gb_acc_train, gb_acc_val = (
    gb_model.score(features, target)
    for features, target in ((X_train, y_train), (X_val, y_val))
)

# Print both scores rounded to four decimals
print("GB Training Accuracy:", round(gb_acc_train, 4))
print("GB Validation Accuracy:", round(gb_acc_val, 4))

GB Training Accuracy: 0.8385
GB Validation Accuracy: 0.7989


In [98]:
# Predict on the test features with the tuned gradient boosting model
y_test_gb = gb_model.predict(X_test)

# Put the predictions into a one-column integer frame named 'Survived'
result_gb = pd.DataFrame(y_test_gb, columns=['Survived']).astype(int)

# Place the PassengerId column from the gender_submission file next to the
# predictions and write the result to CSV without the index.
submit_gb = pd.concat(
    [data_gender_submission['PassengerId'].astype(int), result_gb],
    axis='columns',
)
submit_gb.to_csv('./output/submit_v2.csv', index=False)