参考URL
- https://ishitonton.hatenablog.com/entry/2019/02/24/184253
- https://naotaka1128.hatenadiary.jp/entry/kaggle-compe-tips

# import

In [281]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

# configs

In [282]:
# Data directories (relative to the notebook) and the experiment tag that is
# embedded in the submission filename.
NAME = "baseline002"
INPUT = "../data/input"
OUTPUT = "../data/output"

# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# load data

In [283]:
# Read both splits, then stack them (train rows first, fresh 0..n-1 index) so
# preprocessing and feature engineering run over every row in one pass.
train_csv = os.path.join(INPUT, "train.csv")
test_csv = os.path.join(INPUT, "test.csv")
data_train = pd.read_csv(train_csv)
data_test = pd.read_csv(test_csv)
data = pd.concat([data_train, data_test], ignore_index=True)

In [284]:
# Preview the first rows of the combined train+test frame.
data.head()

Unnamed: 0,PassengerId,Perished,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,1.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,0.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,0.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,0.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,1.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [285]:
# (rows, columns) of the combined frame.
data.shape

(1309, 12)

In [286]:
# Missing-value count per column; Perished is missing for the test rows,
# which carry no label.
data.isnull().sum()

PassengerId       0
Perished        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

# preprocess

In [287]:
# Handle missing values on the combined train+test frame.
#
# The filled column is assigned back rather than calling
# data[col].fillna(..., inplace=True): that chained in-place form acts on a
# temporary object under pandas Copy-on-Write and leaves `data` unchanged.
data["Age"] = data["Age"].fillna(data["Age"].mean())
data["Fare"] = data["Fare"].fillna(data["Fare"].mean())
# Cabin is missing for most rows, so the column is dropped instead of imputed.
data.drop("Cabin", axis=1, inplace=True)
# The few missing embarkation ports are filled with "S".
data["Embarked"] = data["Embarked"].fillna("S")

In [288]:
# Re-check missing values after imputation; only the unlabeled test rows'
# Perished values should remain missing.
data.isnull().sum()

PassengerId      0
Perished       418
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         0
dtype: int64

# feature engineering

In [289]:
# One-hot encoding of Sex (one indicator column per category).
# This frame is built for inspection only; it is not part of features_list.
sex_column = data["Sex"]
sex_onehot = pd.get_dummies(sex_column)
sex_onehot.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [290]:
# Integer label encoding of Sex.
le = LabelEncoder()
sex_codes = le.fit_transform(data["Sex"])
sex_label = pd.DataFrame({"sex_label": sex_codes})
sex_label.head()

Unnamed: 0,sex_label
0,1
1,0
2,0
3,0
4,1


In [291]:
# Frequency encoding: replace each Sex value with the share of rows
# (train and test combined) that carry that value.
sex_share = data.groupby("Sex").size() / len(data)
sex_freq = data["Sex"].map(sex_share).to_frame("sex_freq")
sex_freq.head()

Unnamed: 0,sex_freq
0,0.644003
1,0.355997
2,0.355997
3,0.355997
4,0.644003


In [292]:
# One-hot encoding of the embarkation port.
# This frame is built for inspection only; it is not part of features_list.
embarked_column = data["Embarked"]
embarked_onehot = pd.get_dummies(embarked_column)
embarked_onehot.head()

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [293]:
# Integer label encoding of the embarkation port.
le = LabelEncoder()
embarked_codes = le.fit_transform(data["Embarked"])
embarked_label = pd.DataFrame({"embarked_label": embarked_codes})
embarked_label.head()

Unnamed: 0,embarked_label
0,2
1,0
2,2
3,2
4,2


In [294]:
# Frequency encoding: replace each port with the share of rows embarking there.
embarked_share = data.groupby("Embarked").size() / len(data)
embarked_freq = data["Embarked"].map(embarked_share).to_frame("embarked_freq")
embarked_freq.head()

Unnamed: 0,embarked_freq
0,0.699771
1,0.206264
2,0.699771
3,0.699771
4,0.699771


In [295]:
# Ticket class used as-is, wrapped in a one-column frame for concatenation.
pclass = data["Pclass"].to_frame()
pclass.head()

Unnamed: 0,Pclass
0,3
1,1
2,3
3,1
4,3


In [308]:
# Age used as-is, wrapped in a one-column frame for concatenation.
age = data["Age"].to_frame()
age.head()

Unnamed: 0,Age
0,22.0
1,38.0
2,26.0
3,35.0
4,35.0


In [309]:
# Frequency encoding of Age: every distinct age value is treated as a category
# and mapped to the share of rows that have exactly that value.
age_share = data.groupby("Age").size() / len(data)
age_freq = data["Age"].map(age_share).to_frame("age_freq")
age_freq.head()

Unnamed: 0,age_freq
0,0.03285
1,0.010695
2,0.022918
3,0.017571
4,0.017571


In [310]:
# Binary flag: 1 when Age is 10 or below, otherwise 0.
is_child = data["Age"] <= 10
age_less10 = is_child.astype("int64").to_frame("age_less10")
age_less10.head()

Unnamed: 0,age_less10
0,0
1,0
2,0
3,0
4,0


In [299]:
# SibSp column used as-is, wrapped in a one-column frame for concatenation.
sibsp = data["SibSp"].to_frame()
sibsp.head()

Unnamed: 0,SibSp
0,1
1,1
2,0
3,1
4,0


In [300]:
# Parch column used as-is, wrapped in a one-column frame for concatenation.
parch = data["Parch"].to_frame()
parch.head()

Unnamed: 0,Parch
0,0
1,0
2,0
3,0
4,0


In [301]:
# Fare used as-is, wrapped in a one-column frame for concatenation.
fare = data["Fare"].to_frame()
fare.head()

Unnamed: 0,Fare
0,7.25
1,71.2833
2,7.925
3,53.1
4,8.05


In [302]:
# Ticket class as a one-column frame. This repeats the earlier pclass cell and
# rebinds the same name to the same content.
pclass = data["Pclass"].to_frame()
pclass.head()

Unnamed: 0,Pclass
0,3
1,1
2,3
3,1
4,3


In [303]:
# Frequency encoding: replace each ticket class with its share of all rows.
pclass_share = data.groupby("Pclass").size() / len(data)
pclass_freq = data["Pclass"].map(pclass_share).to_frame("pclass_freq")
pclass_freq.head()

Unnamed: 0,pclass_freq
0,0.541635
1,0.246753
2,0.541635
3,0.246753
4,0.541635


In [304]:
# Family size = parents/children + siblings/spouses + the passenger themself.
family_members = data["Parch"] + data["SibSp"] + 1
familysize = family_members.to_frame("familysize")
familysize.head()

Unnamed: 0,familysize
0,2
1,2
2,1
3,2
4,1


In [311]:
# Feature frames to join column-wise; the order here is the column order the
# model sees. The one-hot frames built earlier are intentionally not listed.
features_list = [
    sex_label, sex_freq,
    embarked_label, embarked_freq,
    sibsp,
    age, age_freq, age_less10,
    pclass, pclass_freq,
    parch,
    fare,
    familysize,
]

In [312]:
# features_list = [sex_label,
#                  embarked_label,
#                  sibsp,
#                  age,
#                  pclass,
#                  parch,
#                  fare,
#                  familysize]

In [313]:
# Join all feature frames side by side (rows are aligned on the shared index).
features = pd.concat(features_list, axis="columns")

In [314]:
# Preview the assembled feature matrix.
features.head()

Unnamed: 0,sex_label,sex_freq,embarked_label,embarked_freq,SibSp,Age,age_freq,age_less10,Pclass,pclass_freq,Parch,Fare,familysize
0,1,0.644003,2,0.699771,1,22.0,0.03285,0,3,0.541635,0,7.25,2
1,0,0.355997,0,0.206264,1,38.0,0.010695,0,1,0.246753,0,71.2833,2
2,0,0.355997,2,0.699771,0,26.0,0.022918,0,3,0.541635,0,7.925,1
3,0,0.355997,2,0.699771,1,35.0,0.017571,0,1,0.246753,0,53.1,2
4,1,0.644003,2,0.699771,0,35.0,0.017571,0,3,0.541635,0,8.05,1


In [315]:
# The combined frame was built train-first, so the first len(data_train) rows
# are the labeled training rows and the remainder are the test rows.
n_train = len(data_train)
X_train = features.iloc[:n_train]
X_test = features.iloc[n_train:]
y_train = data_train["Perished"]

# validation

In [316]:
# Shuffled K-fold cross-validation of the random forest on the training rows.
# tr_scores / val_scores collect one score per fold.
tr_scores = []
val_scores = []
n_splits = 5

cv = KFold(n_splits=n_splits, shuffle=True, random_state=0)
for train_index, valid_index in cv.split(X_train):
    # cv.split yields positional row numbers, so select with .iloc; label-based
    # .loc / [] only give the same rows while the index is exactly 0..n-1.
    X_tr = X_train.iloc[train_index]
    X_val = X_train.iloc[valid_index]
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[valid_index]

    model = RandomForestClassifier(max_depth=12, min_samples_leaf=2, random_state=0)
    model.fit(X_tr, y_tr)
    # Score on the rows used for fitting and on the held-out fold.
    tr_scores.append(model.score(X_tr, y_tr))
    val_scores.append(model.score(X_val, y_val))

In [317]:
# Average the per-fold scores; val_score is reused later in the submission
# filename.
tr_score, val_score = (
    sum(fold_scores) / n_splits for fold_scores in (tr_scores, val_scores)
)
print(f"train score : {tr_score}")
print(f"valid score : {val_score}")

train score : 0.9161034243661061
valid score : 0.8260121775155357


# training

In [318]:
# Refit on all training rows with the same hyperparameters used in
# cross-validation, report the score on those rows, then predict the test rows.
clf = RandomForestClassifier(max_depth=12, min_samples_leaf=2, random_state=0)
clf.fit(X_train, y_train)
train_accuracy = clf.score(X_train, y_train)
print(train_accuracy)
pred = clf.predict(X_test)

0.9135802469135802


# submission

In [319]:
# Start from the sample submission and overwrite its label column with the
# model's predictions.
# NOTE(review): the sample file is read from OUTPUT, not INPUT - confirm that
# is where it lives.
sample_path = os.path.join(OUTPUT, "sub_sample.csv")
submission = pd.read_csv(sample_path)
submission["Perished"] = pred
# Timestamp used later in the submission filename.
now = datetime.datetime.now()

In [22]:
# Preview the submission frame before writing it out.
submission.head()

Unnamed: 0,PassengerId,Perished
0,892,1
1,893,1
2,894,1
3,895,1
4,896,0


In [23]:
# Write the submission as sub_<YYYYmmddHHMM>_<rounded valid score>_<NAME>.csv.
# Only the filename is formatted, so braces in OUTPUT cannot break the path.
submission_name = "sub_{0:%Y%m%d%H%M}_{1}_{2}.csv".format(now, round(val_score, 2), NAME)
# index=False keeps the file to the sample submission's columns; the default
# would prepend the row index as an extra unnamed column.
submission.to_csv(os.path.join(OUTPUT, submission_name), index=False)