In [1]:
# --- Standard library ---
import sys

# --- Third-party ---
import pandas as pd
# Fixed: this line used to read `import numbers as np`, which bound the
# stdlib `numbers` module to the conventional NumPy alias.
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib  # not referenced by name below; presumably imported for its import-time font setup
import openpyxl
import yaml

# --- Project helpers ---
# The helper modules live outside the notebook directory, so their folder
# has to be on sys.path before they can be imported.
sys.path.append('../titanic_utils')

import Data_check as ch
import Processing as ps
import Modeling_rf as rf

In [2]:
# Read the training and test CSV files from ../data through the project loader.
df_train, df_test = [
    ch.load_titanic_data(csv_name, '../data')
    for csv_name in ('train.csv', 'test.csv')
]

In [3]:
# Fill missing Fare values with the median fare.
# The median is computed once from the training frame and reused for the
# test frame, so no statistic derived from the test set feeds into
# preprocessing and both frames are imputed with the same value.
fare_median = df_train['Fare'].median()
df_train = ps.fill_missing_values(df_train, 'Fare', value=fare_median)
df_test = ps.fill_missing_values(df_test, 'Fare', value=fare_median)

In [4]:
# Fill missing Embarked values with 'C' in both frames, then display the
# per-port row counts of the training frame.
df_train, df_test = [
    ps.fill_missing_values(frame, 'Embarked', 'C')
    for frame in (df_train, df_test)
]
df_train['Embarked'].value_counts()

Embarked
S    644
C    170
Q     77
Name: count, dtype: int64

In [5]:
# Add the ticket-count-based group feature to both frames, then display how
# many training rows fall into each TicketGroup value.
df_train, df_test = map(ps.label_ticket_groups, (df_train, df_test))
df_train['TicketGroup'].value_counts()

TicketGroup
1    596
2    295
Name: count, dtype: int64

In [6]:
# Derive an honorific (title) group from Name for both frames, then display
# the training-frame counts per group.
# NOTE(review): the recorded output lists no 'Mr' group and 558 'Master'
# rows - verify the title mapping inside Processing.assign_honorific_groups.
df_train, df_test = [
    ps.assign_honorific_groups(frame)
    for frame in (df_train, df_test)
]
df_train['Honorifics'].value_counts()

Honorifics
Master     558
Miss       184
Mrs        127
Unknown     13
Officer      5
Royalty      4
Name: count, dtype: int64

In [7]:
# Fill missing Age values with predictions from a random-forest regression
# model (as described by the helper's name and the original note).
# NOTE(review): the helper is invoked once per frame; presumably it fits a
# separate model on each frame - confirm that this is intended.
df_train, df_test = map(ps.predict_and_fill_age, (df_train, df_test))

In [8]:
# Compute the family size and assign a family label in both frames, then
# display the training-frame FamilySize distribution ordered by size.
df_train, df_test = [
    ps.assign_family_labels(frame)
    for frame in (df_train, df_test)
]
df_train['FamilySize'].value_counts().sort_index()

FamilySize
1     537
2     161
3     102
4      29
5      15
6      22
7      12
8       6
11      7
Name: count, dtype: int64

In [9]:
# Display how many training rows carry each FamilyLabel value, ordered by label.
df_train['FamilyLabel'].value_counts().sort_index()

FamilyLabel
0     13
1    586
2    292
Name: count, dtype: int64

In [11]:
# Parse the shared settings file; the result is indexed by key
# ('column_settings') when the model inputs are built.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's default locale encoding.
with open("../settings/parameters.yml", 'r', encoding='utf-8') as file:
    params_ = yaml.safe_load(file)

In [12]:
# Build the model inputs from the column lists in the settings file.
# The first selected training column is used as the target; the remaining
# columns are the features.
train_data = df_train[params_['column_settings']['train_columns']]
train_data = pd.get_dummies(train_data, dtype=int)
# Split by column *before* converting to NumPy so the target keeps its own
# dtype instead of being upcast to the common dtype of the whole frame.
X = train_data.iloc[:, 1:].to_numpy()
y = train_data.iloc[:, 0].to_numpy()

test_data = df_test[params_['column_settings']['test_columns']]
test_data = pd.get_dummies(test_data, dtype=int)
# One-hot encoding each frame separately can produce different column sets
# or orders (e.g. a category present in only one frame). Align the test
# frame to the training feature columns; dummies absent from test become 0.
test_data = test_data.reindex(columns=train_data.columns[1:], fill_value=0)

In [13]:
# Load the parameters through the modeling helper and print the
# random-forest section so the search settings are visible in the notebook.
# NOTE(review): this reads the same file that was already parsed into
# `params_` with yaml.safe_load; presumably both hold the same content -
# confirm, and consider loading the file only once.
params = rf.load_parameters("../settings/parameters.yml")
print(params['random_forest_config'])

{'n_estimators': [20, 21, 22, 23, 24, 25, 26, 27, 28, 29], 'max_depth': [3, 4, 5, 6, 7, 8, 9], 'random_state': 10, 'max_features': 'sqrt'}


In [14]:
# Train the model on the feature matrix X and target y using the loaded
# parameters. The returned object is later read for `best_params_` and
# `best_score_`, so it presumably is a fitted hyper-parameter search -
# confirm in Modeling_rf.train_model.
trained_model = rf.train_model(X, y, params)

In [15]:
# Model evaluation: pull the best parameter combination and its score from
# the trained model and print one per line.
best_params, best_score = trained_model.best_params_, trained_model.best_score_
print(
    f"Best Parameters: {best_params}",
    f"Best Score: {best_score}",
    sep="\n",
)

Best Parameters: {'classify__max_depth': 7, 'classify__n_estimators': 20}
Best Score: 0.8237952559300872


In [16]:
# Predict on the test data with the trained model.
# NOTE(review): the model was trained on a NumPy array while `test_data` is
# passed as a DataFrame; presumably rf.predict handles the conversion and
# relies on the column order matching the training features - confirm.
predictions = rf.predict(trained_model, test_data)

In [17]:
# Write the predictions out through the project helper (the recorded output
# reports that gender_submission.csv was updated).
# NOTE(review): 'randam_forest' looks like a typo for 'random_forest'; it is
# a runtime string whose use inside result_to_csv is not visible here, so
# check downstream consumers before renaming it.
rf.result_to_csv(predictions, 'randam_forest')

'gender_submission.csv updated with predictions'