# load library & data 

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt



In [3]:
train = pd.read_csv("titanic/train.csv")
test = pd.read_csv("titanic/test.csv")

train.shape, test.shape

((891, 12), (418, 11))

# 결측치 대체 

In [4]:
# 이전 값과 다음 값을 이용하여 채움
# 대부분 시계열 데이터에서 데이터가 순서대로 있을 때 사용

# method로 채우는 방법
# 앞에 있는 값으로 결측치 채움
train["Age_ffill"] = train["Age"].fillna(method="ffill")
# 뒤에 있는 값으로 결측치 채움
train["Age_bfill"] = train["Age"].fillna(method="bfill")
train[["Age", "Age_ffill", "Age_bfill"]]

Unnamed: 0,Age,Age_ffill,Age_bfill
0,22.0,22.0,22.0
1,38.0,38.0,38.0
2,26.0,26.0,26.0
3,35.0,35.0,35.0
4,35.0,35.0,35.0
...,...,...,...
886,27.0,27.0,27.0
887,19.0,19.0,19.0
888,,19.0,26.0
889,26.0,26.0,26.0


In [5]:
# forward 방향으로 채우게 되면 맨 앞이 결측치일 경우 채워지지 않음
train["Age"].interpolate(method="linear", limit_direction="forward")

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    22.5
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [6]:
train["Age_interpolate"] = train["Age"].interpolate(method="linear", limit_direction="both")
train[["Age", "Age_ffill", "Age_bfill", "Age_interpolate"]]

Unnamed: 0,Age,Age_ffill,Age_bfill,Age_interpolate
0,22.0,22.0,22.0,22.0
1,38.0,38.0,38.0,38.0
2,26.0,26.0,26.0,26.0
3,35.0,35.0,35.0,35.0
4,35.0,35.0,35.0,35.0
...,...,...,...,...
886,27.0,27.0,27.0,27.0
887,19.0,19.0,19.0,19.0
888,,19.0,26.0,22.5
889,26.0,26.0,26.0,26.0


In [7]:
test["Age_ffill"] = test["Age"].fillna(method="ffill")
test["Age_bfill"] = test["Age"].fillna(method="bfill")
test["Age_interpolate"] = test["Age"].interpolate(method="linear", limit_direction="both")
test[["Age", "Age_ffill", "Age_bfill", "Age_interpolate"]].sample(20)

Unnamed: 0,Age,Age_ffill,Age_bfill,Age_interpolate
413,,28.0,39.0,33.5
392,13.0,13.0,13.0,13.0
311,22.0,22.0,22.0,22.0
97,29.0,29.0,29.0,29.0
6,30.0,30.0,30.0,30.0
171,27.0,27.0,27.0,27.0
362,31.0,31.0,31.0,31.0
303,24.0,24.0,24.0,24.0
4,22.0,22.0,22.0,22.0
205,,25.0,35.0,30.0


In [8]:
train["Fare_fill"] = train["Fare"]
test["Fare_fill"] = test["Fare"].interpolate(method="linear", limit_direction="both")
test[test["Fare"].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_ffill,Age_bfill,Age_interpolate,Fare_fill
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,60.5,60.5,60.5,10.03955


In [9]:
train[["Age", "Age_ffill", "Age_bfill", "Age_interpolate"]].isnull().sum()

Age                177
Age_ffill            0
Age_bfill            0
Age_interpolate      0
dtype: int64

In [10]:
test[["Age", "Age_ffill", "Age_bfill", "Age_interpolate"]].isnull().sum()

Age                86
Age_ffill           0
Age_bfill           2
Age_interpolate     0
dtype: int64

# label & features 

In [11]:
label = "Survived"

In [12]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Age_ffill',
       'Age_bfill', 'Age_interpolate', 'Fare_fill'],
      dtype='object')

In [13]:
feature_names = ["Pclass", "Sex", "Age_interpolate", "Fare_fill", "Embarked"]
feature_names

['Pclass', 'Sex', 'Age_interpolate', 'Fare_fill', 'Embarked']

# train & test

In [14]:
X_train = pd.get_dummies(train[feature_names])
y_train = train[label]

X_test = pd.get_dummies(test[feature_names])

X_train.shape, y_train.shape, X_test.shape

((891, 8), (891,), (418, 8))

In [15]:
set(X_train.columns) - set(X_test.columns)

set()

In [16]:
set(X_test.columns) - set(X_train.columns)

set()

# ML : RandomForest

In [40]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=333, random_state=42, n_jobs=-1, max_depth=7, 
                               max_features=0.5439066747874877)
model

In [41]:
model.fit(X_train, y_train)

In [42]:
y_predict = model.predict(X_test)

# GridSearchCV

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
max_depth = list(range(3, 20, 2))
max_features = [0.3, 0.5, 0.7, 0.8, 0.9]
n_estimators = [100, 400, 700, 1000]

In [20]:
parameters = {"max_depth" : max_depth, "max_features":max_features, 
              "n_estimators":n_estimators}

In [21]:
clf = GridSearchCV(model, parameters, n_jobs=-1, cv=5)
clf.fit(X_train, y_train)

In [22]:
clf.best_estimator_

In [23]:
y_predict = clf.predict(X_train)

In [24]:
(y_predict == y_train).mean()

0.9034792368125701

# RandomizedSearchCV

In [36]:
from sklearn.model_selection import RandomizedSearchCV

In [37]:
parameters = {"max_depth": np.random.randint(3, 10, 5), 
              "max_features": np.random.uniform(0.5, 1, 10), 
              "n_estimators": np.random.randint(200, 1000, 100)}

In [38]:
clfr = RandomizedSearchCV(model, param_distributions=parameters, n_iter=10, random_state=42)
clfr.fit(X_train, y_train)

In [39]:
clfr.best_params_

{'n_estimators': 333, 'max_features': 0.5439066747874877, 'max_depth': 7}

# submit

In [46]:
submit = pd.read_csv("titanic/gender_submission.csv")
submit.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [47]:
submit["Survived"] = y_predict

In [48]:
submit.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [51]:
submit.to_csv("titanic/randomized.csv", index=False)

In [52]:
pd.read_csv("titanic/grid.csv")

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
