# ランダムフォレストのハイパーパラメーターチューニング

## ライブラリを読み込む

In [3]:
%matplotlib inline

# 標準ライブラリ
import sys
import os
import csv as csv

# データ処理ライブラリ
import pandas as pd

# 可視化ライブラリ
import matplotlib.pyplot as plt

# 機械学習ライブラリ
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC



## データを読み込む
### トレーニングデータ

In [4]:
trFile = 'Titanic/all/train.csv'
df_train = pd.read_csv(trFile)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### テストデータ

In [5]:
teFile = 'Titanic/all/test.csv'
df_test = pd.read_csv(teFile)
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## 前処理を行う
### トレーニングデータの性別 female/male を数値 0/1 に変換する

In [6]:
# Load training data
train_df = pd.read_csv(trFile, header=0)

# Convert "Sex" to be dummy variable (female = 0, male = 1)
train_df["Gender"] = train_df["Sex"].map({"female": 0, "male": 1}).astype(int)
train_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Gender
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0


### トレーニングデータの欠損値を補う

In [7]:
train_df.Age.head(10)

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [8]:
# Number of raws that have null values
len(train_df.Age[train_df.Age.isnull()])

177

In [9]:
# Complement the missing values of "Age" column with average of "Age"
median_age = train_df["Age"].dropna().median()
median_age

28.0

In [10]:
if len(train_df.Age[train_df.Age.isnull()]) > 0:
    train_df.loc[(train_df.Age.isnull()), "Age"] = median_age

In [11]:
# Copy test data's "PassengerId" column, and remove un-used columns
train_df.Age[train_df.Age.isnull()]

Series([], Name: Age, dtype: float64)

In [12]:
train_df.Age.head(10)

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5    28.0
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [13]:
# Show train_df columns
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Gender'],
      dtype='object')

In [14]:
# Remove un-used columns
train_df = train_df.drop(["Name", "Ticket", "Sex", "SibSp", "Parch", 
                         "Fare", "Cabin", "Embarked", "PassengerId"], 
                         axis=1)
train_df.head(3)

Unnamed: 0,Survived,Pclass,Age,Gender
0,0,3,22.0,1
1,1,1,38.0,0
2,1,3,26.0,0


### テストデータの性別を数値に置き換える

In [15]:
# Load test data, Convert "Sex" tobe a dummy variable
test_df = pd.read_csv(teFile, header=0)
test_df["Gender"] = test_df["Sex"].map({"female":0, "male":1}).astype(int)
test_df.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Gender
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1


### テストデータの欠損データを補う

In [16]:
# Complement the missing values of "Age" column with average of "Age"
median_age = test_df["Age"].dropna().median()
if len(test_df.Age[test_df.Age.isnull()]) > 0:
    test_df.loc[(test_df.Age.isnull()), "Age"] = median_age

In [17]:
# Copy test data's "PassengerId" column, and remove un-used columns
ids = test_df["PassengerId"].values
test_df = test_df.drop(["Name", "Ticket", "Sex", "SibSp", 
                        "Parch", "Fare", "Cabin", "Embarked", "PassengerId"], axis=1)
test_df.head(3)


Unnamed: 0,Pclass,Age,Gender
0,3,34.5,1
1,3,47.0,0
2,2,62.0,1


In [18]:
# Training data
X_train = train_df.drop(columns='Survived')
y_train = train_df['Survived']


# Model
RF = RandomForestClassifier()

# Hyper parameter 
param_grid = {'bootstrap': [True],
              'max_depth': [2,3,5,1020,100],
              'min_samples_leaf': [3,4,5],
              'min_samples_split': [8,10,12],
              'n_estimators': [100]}
# Grid search 
grid_searchlog = GridSearchCV(RF, param_grid, cv=5, scoring='accuracy')
grid_searchlog

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'bootstrap': [True], 'max_depth': [2, 3, 5, 1020, 100], 'min_samples_leaf': [3, 4, 5], 'min_samples_split': [8, 10, 12], 'n_estimators': [100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [19]:
grid_searchlog.fit(X_train, y_train)
grid_searchlog.grid_scores_



[mean: 0.80135, std: 0.02102, params: {'bootstrap': True, 'max_depth': 2, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 100},
 mean: 0.79125, std: 0.03955, params: {'bootstrap': True, 'max_depth': 2, 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 100},
 mean: 0.79349, std: 0.02999, params: {'bootstrap': True, 'max_depth': 2, 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 100},
 mean: 0.78563, std: 0.02035, params: {'bootstrap': True, 'max_depth': 2, 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 100},
 mean: 0.79686, std: 0.03077, params: {'bootstrap': True, 'max_depth': 2, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100},
 mean: 0.78676, std: 0.02294, params: {'bootstrap': True, 'max_depth': 2, 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 100},
 mean: 0.79686, std: 0.01801, params: {'bootstrap': True, 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 8, 'n_estimators': 1

In [21]:
optimized_rf = grid_searchlog.best_estimator_
optimized_rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=1020, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=8,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [22]:
y_pred_optimized = optimized_rf.predict(test_df)
my_solution = pd.DataFrame({'PassengerId':ids,'Survived':y_pred_optimized})
my_solution

my_solution.to_csv('titanic_submit04.csv',index=None)

## 機械学習モデルの交差検証

5種類のモデル + ハイパーパラメーター最適化したランダムフォレスト
    + ロジスティック回帰
    + 決定木、
    + ランダムフォレスト、
    + 勾配ブースティング、
    + サポートベクターマシン
    + 最適化 ランダムフォレスト
についての交差検証を行い予測精度を比較する。

In [23]:
# Training data
X_train = train_df.drop(columns='Survived')
y_train = train_df['Survived']


# Predict with multiple (3) models
LR = LogisticRegression()
DT = DecisionTreeClassifier()
RF = RandomForestClassifier()
GB = GradientBoostingClassifier()
SVM = SVC()

# Cross varidation preparation
scores = []
modelnames = ['LogisticRegression', 'DecisionTree', 'RandomForest', 
              'GradientBoosting','SupportVectorMachine','Optimized_RF']
models = [LR, DT, RF, GB, SVM, optimized_rf]

# zip modelnames and models to a dictionary
modelsDict = dict(zip(modelnames,models))

# Loop over models for cross varidation
for i in models:
    score = cross_val_score(i, X_train, y_train, scoring = 'accuracy', cv = 5).mean()
    scores.append(score)

# Output the results
cv_ranking = pd.DataFrame(scores, index=modelnames,
             columns=['CV Scores']).sort_values(by = 'CV Scores', ascending=False)
cv_ranking

Unnamed: 0,CV Scores
Optimized_RF,0.80367
RandomForest,0.803619
DecisionTree,0.80141
GradientBoosting,0.795767
SupportVectorMachine,0.793501
LogisticRegression,0.792333


## 交差検証で最も精度が高いモデルについてテストデータの予測を行う

In [24]:
# Show the best Model that has the highest score 
bestModelName = cv_ranking['CV Scores'].idxmax()
bestModelName

'Optimized_RF'

In [25]:
bestModel = modelsDict[bestModelName]
bestModel

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=1020, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=8,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
bestModel.fit(X_train,y_train)
y_pred = bestModel.predict(test_df).astype(int)

my_solution = pd.DataFrame({'PassengerId':ids,'Survived':y_pred})
my_solution

my_solution.to_csv('titanic_submit04_best.csv',index=None)
