# Load libraries & data

In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [23]:
# Read the Titanic train and test splits from the local "titanic" directory.
train, test = map(pd.read_csv, ("titanic/train.csv", "titanic/test.csv"))

# Quick sanity check: (rows, columns) of each frame.
train.shape, test.shape

((891, 12), (418, 11))

# 결측치 대체 (Missing-value imputation)

In [24]:
# Fill missing values using the previous or the next value in row order.
# This is mostly meaningful when the rows are ordered, e.g. time-series data.

# Series.ffill()/Series.bfill() are used instead of fillna(method=...),
# which pandas has deprecated.
# Forward fill: copy the last preceding valid value into each gap.
train["Age_ffill"] = train["Age"].ffill()
# Backward fill: copy the next following valid value into each gap.
train["Age_bfill"] = train["Age"].bfill()
# Show the raw column next to both filled variants for comparison.
train[["Age", "Age_ffill", "Age_bfill"]]

Unnamed: 0,Age,Age_ffill,Age_bfill
0,22.0,22.0,22.0
1,38.0,38.0,38.0
2,26.0,26.0,26.0
3,35.0,35.0,35.0
4,35.0,35.0,35.0
...,...,...,...
886,27.0,27.0,27.0
887,19.0,19.0,19.0
888,,19.0,26.0
889,26.0,26.0,26.0


In [25]:
# When filling only in the forward direction, a missing value at the very
# beginning of the column is not filled (there is no earlier value to start from).
train["Age"].interpolate(method="linear", limit_direction="forward")

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    22.5
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [26]:
# Linear interpolation applied in both directions, stored as a new column
# so the raw "Age" column stays untouched.
age_linear = train["Age"].interpolate(limit_direction="both", method="linear")
train["Age_interpolate"] = age_linear
# Compare the raw column with all three imputed variants.
train.loc[:, ["Age", "Age_ffill", "Age_bfill", "Age_interpolate"]]

Unnamed: 0,Age,Age_ffill,Age_bfill,Age_interpolate
0,22.0,22.0,22.0,22.0
1,38.0,38.0,38.0,38.0
2,26.0,26.0,26.0,26.0
3,35.0,35.0,35.0,35.0
4,35.0,35.0,35.0,35.0
...,...,...,...,...
886,27.0,27.0,27.0,27.0
887,19.0,19.0,19.0,19.0
888,,19.0,26.0,22.5
889,26.0,26.0,26.0,26.0


In [27]:
# Apply the same three imputations to the test split.
# Series.ffill()/Series.bfill() are used instead of fillna(method=...),
# which pandas has deprecated.
test["Age_ffill"] = test["Age"].ffill()
# Note: bfill cannot fill gaps at the end of the column, because there is no
# later value to copy from, so Age_bfill may still contain missing values.
test["Age_bfill"] = test["Age"].bfill()
test["Age_interpolate"] = test["Age"].interpolate(method="linear", limit_direction="both")
# Fixed random_state so the displayed sample is identical on every run.
test[["Age", "Age_ffill", "Age_bfill", "Age_interpolate"]].sample(20, random_state=42)

Unnamed: 0,Age,Age_ffill,Age_bfill,Age_interpolate
279,22.0,22.0,22.0,22.0
109,18.5,18.5,18.5,18.5
311,22.0,22.0,22.0,22.0
138,23.0,23.0,23.0,23.0
128,42.0,42.0,42.0,42.0
256,,32.5,28.0,29.5
261,21.0,21.0,21.0,21.0
131,53.0,53.0,53.0,53.0
386,24.0,24.0,24.0,24.0
373,44.0,44.0,44.0,44.0


In [28]:
# Train "Fare" is copied unchanged (presumably it has no missing values -- verify).
train["Fare_fill"] = train["Fare"]

# Remember which test rows lack a fare, then fill them by linear interpolation.
fare_missing = test["Fare"].isnull()
test["Fare_fill"] = test["Fare"].interpolate(limit_direction="both", method="linear")

# Display the originally-missing rows together with their filled value.
test.loc[fare_missing]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_ffill,Age_bfill,Age_interpolate,Fare_fill
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,60.5,60.5,60.5,10.03955


In [35]:
# Count the missing values left in each Age variant of the training split.
train.loc[:, ["Age", "Age_ffill", "Age_bfill", "Age_interpolate"]].isna().sum()

Age                177
Age_ffill            0
Age_bfill            0
Age_interpolate      0
dtype: int64

In [34]:
# Count the missing values left in each Age variant of the test split.
test.loc[:, ["Age", "Age_ffill", "Age_bfill", "Age_interpolate"]].isna().sum()

Age                86
Age_ffill           0
Age_bfill           2
Age_interpolate     0
dtype: int64

# label & features 

In [29]:
# Name of the target column; used below to select y_train from the training frame.
label = "Survived"

In [30]:
# List every column of the training frame, including the imputed ones added above.
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Age_ffill',
       'Age_bfill', 'Age_interpolate', 'Fare_fill'],
      dtype='object')

In [31]:
# Columns fed to the model: the imputed Age/Fare variants are used
# in place of the raw "Age" and "Fare" columns.
feature_names = [
    "Pclass",
    "Sex",
    "Age_interpolate",
    "Fare_fill",
    "Embarked",
]
feature_names

['Pclass', 'Sex', 'Age_interpolate', 'Fare_fill', 'Embarked']

# train & test

In [33]:
# One-hot encode the selected features for training and pick the target column.
X_train = pd.get_dummies(train[feature_names])
y_train = train[label]

# The test split is encoded on its own, so its dummy columns depend on which
# categories happen to occur in it. Reindex to the training columns so the
# matrix always has exactly the features (and order) the model is fitted on:
# a category absent from test becomes an all-zero column, and a category that
# appears only in test is dropped.
X_test = pd.get_dummies(test[feature_names]).reindex(columns=X_train.columns, fill_value=0)

# Shapes of the design matrices and the target vector.
X_train.shape, y_train.shape, X_test.shape

((891, 8), (891,), (418, 8))

In [36]:
# Encoded columns that exist in train but not in test (expected to be empty).
set(X_train.columns).difference(X_test.columns)

set()

In [37]:
# Encoded columns that exist in test but not in train (expected to be empty).
set(X_test.columns).difference(X_train.columns)

set()

# ML : RandomForest

In [38]:
from sklearn.ensemble import RandomForestClassifier

# Base estimator for the grid search: 100 trees, fixed seed for repeatable
# results, and n_jobs=-1 to use all available CPU cores.
rf_settings = {"n_estimators": 100, "random_state": 42, "n_jobs": -1}
model = RandomForestClassifier(**rf_settings)
model

In [39]:
from sklearn.model_selection import GridSearchCV

In [40]:
# Candidate values for the random forest grid search.
max_depth = [*range(3, 20, 2)]            # odd depths 3, 5, ..., 19
max_features = [0.3, 0.5, 0.7, 0.8, 0.9]  # floats, i.e. fractions of the features
n_estimators = [*range(100, 1001, 300)]   # 100, 400, 700, 1000 trees

In [41]:
# Hyperparameter grid: maps each estimator argument to its list of candidate values.
parameters = dict(
    max_depth=max_depth,
    max_features=max_features,
    n_estimators=n_estimators,
)

In [42]:
# Search the `parameters` grid with 5-fold cross-validation, running the
# candidate fits in parallel (n_jobs=-1), then fit on the training data.
clf = GridSearchCV(estimator=model, param_grid=parameters, cv=5, n_jobs=-1)
clf.fit(X_train, y_train)

In [43]:
# Show the estimator (with its hyperparameters) that the grid search selected.
clf.best_estimator_

In [44]:
# Predict on the same rows the search was fitted on (in-sample predictions).
y_predict = clf.predict(X_train)

In [45]:
# Share of training rows predicted correctly. These rows were used for fitting,
# so this is in-sample accuracy and tends to be optimistic.
# NOTE(review): clf.best_score_ holds the cross-validated score of the selected
# parameters and is likely the fairer estimate -- consider reporting it as well.
y_train.eq(y_predict).mean()

0.9034792368125701

