# Titanic

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import time

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import RepeatedStratifiedKFold,GridSearchCV

from sklearn.pipeline import Pipeline

In [3]:
# データロード
df_train = pd.read_csv('csv/train.csv')
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 欠損値の対応

In [4]:
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].median())
df_train['Embarked'] = df_train['Embarked'].fillna('S')

df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## 特徴量

In [5]:
# # 家族数1～3だと生存確率高いはず
# df_train['family_num'] = df_train['SibSp'] + df_train['Parch']
# df_train['family_1-3'] = df_train.apply(lambda row: '1' if (row['family_num']>=1 and row['family_num']<=3) else '0', axis=1)

In [6]:
y_col = 'Survived'
X = df_train.drop(columns=[y_col, 'PassengerId', 'Name', 'Ticket', 'Cabin', 'family_num', 'SibSp', 'Parch'])
y = df_train[y_col]

In [7]:
# 標準化のために数値列のみ抽出
numeric_cols = X.select_dtypes(include=np.number).columns.to_list()
categol_cols = X.select_dtypes(exclude=np.number).columns.to_list()

In [8]:
ct = make_column_transformer(
    (StandardScaler(), numeric_cols),
    (OneHotEncoder(sparse=False, drop="first"), categol_cols)
)

In [9]:
# パイプライン定義
pipeline = Pipeline(steps=[('ct', ct), ('model', LogisticRegression())])

In [10]:
lr_param_grid = {
    'model__penalty': ["l2",  "none"], 
    'model__C': [i / 10 for i in range(3, 20, 2)],  # out -> [0.3, 0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9]
    'model__solver':["newton-cg", "sag", "saga", "lbfgs"],
    'model__max_iter': list(range(10, 101, 10)) # out -> [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
}

In [11]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
grid_search = GridSearchCV(estimator=pipeline, param_grid=lr_param_grid, scoring='f1', cv=cv)
grid_search.fit(X, y)

time.sleep(5)
print(grid_search.best_params_, grid_search.best_score_)

{'model__C': 1.1, 'model__max_iter': 10, 'model__penalty': 'l2', 'model__solver': 'newton-cg'} 0.7277644874209056
