In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import GridSearchCV # ハイパーパラメータチューニングと交差検証を自動で行うためのライブラリ
from sklearn.ensemble import RandomForestClassifier # ランダムフォレスト
import lightgbm as gbm # LightGBM
import xgboost as xgb # XGBoost
from sklearn.model_selection import train_test_split
# Display settings: show at most 20 DataFrame rows, and never abbreviate printed NumPy arrays.
pd.options.display.max_rows = 20
np.set_printoptions(threshold = np.inf)

In [2]:
def extract_alpha_from_prefix(cabins):
    """Reduce a raw cabin string to the letter prefix(es) of its cabin tokens.

    Every whitespace-separated token contributes its leading run of
    alphabetic characters (``"C23"`` -> ``"C"``); the pieces are concatenated
    in order.

    Parameters
    ----------
    cabins : str or missing value
        Raw cabin text such as ``"C85"``, ``"C23 C25 C27"`` or ``"F G73"``.

    Returns
    -------
    str or missing value
        * the input itself when ``pd.isna(cabins)`` is true;
        * a single letter when all collected letters are identical
          (``"C23 C25 C27"`` -> ``"C"``);
        * the full concatenation when the letters differ
          (``"F G73"`` -> ``"FG"``);
        * ``NaN`` when no token starts with a letter (``""``, ``"123"``).
    """
    if pd.isna(cabins):
        return cabins

    letters = []
    for token in cabins.split():
        for char in token:
            if not char.isalpha():
                break
            letters.append(char)
    alpha_part = "".join(letters)

    if not alpha_part:
        # No token started with a letter. Indexing ``alpha_part[0]`` here
        # used to raise IndexError; report the value as missing instead.
        return float("nan")

    # All letters identical -> collapse to one; otherwise keep the full sequence.
    if len(set(alpha_part)) == 1:
        return alpha_part[0]
    return alpha_part

In [3]:
def eda(df_all):
    """Return a copy of ``df_all`` with the engineered feature columns added.

    Added columns:

    * ``Title`` - the text between the first comma of ``Name`` and the next
      period, stripped of whitespace, with infrequent titles merged into the
      buckets ``'Miss/Mrs/Ms'`` and ``'Dr/Military/Noble/Clergy'``.
      ``'Miss'`` and ``'Mrs'`` themselves are left as separate categories.
    * ``Ticket_Frequency`` - number of rows of ``df_all`` that share the
      row's ``Ticket`` value.
    * ``Segment`` - ``Cabin`` reduced by ``extract_alpha_from_prefix``.
    * ``Family`` - ``SibSp + Parch``.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Must contain ``Name``, ``Ticket``, ``Cabin``, ``SibSp`` and ``Parch``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``df_all`` is not modified.
    """
    # Copy first so feature engineering never leaks into the caller's frame.
    df_all = df_all.copy()

    # Name -> Title. ``.str[1]`` yields NaN (instead of raising KeyError)
    # for inputs in which no name contains a comma.
    title = (
        df_all['Name']
        .str.split(',').str[1]
        .str.split('.').str[0]
        .str.strip()
    )

    # Very rare titles invite over-fitting, so similar ones are grouped together.
    rare_title_groups = {
        'Miss/Mrs/Ms': ['Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'],
        'Dr/Military/Noble/Clergy': ['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev'],
    }
    for bucket, members in rare_title_groups.items():
        title = title.replace(members, bucket)
    df_all['Title'] = title

    # Ticket -> Ticket_Frequency (how many rows carry the same ticket).
    df_all['Ticket_Frequency'] = df_all.groupby('Ticket')['Ticket'].transform('count')

    # Cabin -> Segment (letter prefix of the cabin tokens).
    df_all['Segment'] = df_all['Cabin'].apply(extract_alpha_from_prefix)

    # Combine related features: siblings/spouses plus parents/children.
    df_all['Family'] = df_all['SibSp'] + df_all['Parch']
    return df_all

In [4]:
def _label_encode_known(values, encoder, unknown_code = -1):
    """Encode ``values`` with a fitted LabelEncoder; labels it never saw become ``unknown_code``."""
    known = values.isin(encoder.classes_)
    codes = pd.Series(unknown_code, index = values.index, dtype = 'int64')
    if known.any():
        codes.loc[known] = encoder.transform(values.loc[known])
    return codes


def transform_data_with_eda(data, label_encoders = None, ct = None):
    """Impute, engineer features and encode categoricals into a numeric frame.

    Steps: fill missing ``Age``/``Fare`` (median) and ``Embarked`` (mode),
    run ``eda``, drop ``Name``/``Ticket``/``Cabin``/``SibSp``/``Parch``,
    label-encode ``Segment`` and one-hot encode ``Sex``/``Title``/``Embarked``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw passenger data. It is copied, never modified.
    label_encoders : dict[str, LabelEncoder] or None
        Encoders returned by an earlier call. When missing or empty, new
        encoders are fitted on ``data``.
    ct : ColumnTransformer or None
        One-hot transformer returned by an earlier call. When ``None`` a new
        one is fitted on ``data``.

    Returns
    -------
    (pandas.DataFrame, dict, ColumnTransformer)
        The transformed frame (one-hot columns first, then the remaining
        columns, original index preserved) and the encoders/transformer to
        pass back in when transforming further data.
    """
    # Work on a copy so the imputation below does not alter the caller's frame.
    data = data.copy()

    # Missing-value handling.
    # NOTE(review): the fill values are computed from the frame being
    # transformed, even when fitted encoders are supplied, so a test set is
    # imputed with its own median/mode rather than the training set's.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())

    # Feature engineering, then drop the raw columns it was derived from.
    data = eda(data)
    remove_columns = ['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
    data = data.drop(columns = remove_columns)
    data_columns = data.columns.tolist()
    data_index = data.index

    # Categorical -> numeric.
    # Label encoding (intended for categories with an inherent order).
    le_columns = ['Segment'] # 'Embarked'
    if not label_encoders:
        label_encoders = {}
        for column in le_columns:
            le = LabelEncoder()
            data[column] = le.fit_transform(data[column])
            label_encoders[column] = le
    else:
        for column in le_columns:
            # A bare ``transform`` raises on labels absent at fit time; map
            # them to -1 so this path tolerates unseen categories just like
            # the one-hot encoder below (handle_unknown = 'ignore').
            data[column] = _label_encode_known(data[column], label_encoders[column])

    # One-hot encoding (for categories without an inherent order).
    ohe_columns = ['Sex', 'Title', 'Embarked']
    if ct is None:
        ct = ColumnTransformer(
            transformers = [(
                'encoder',
                OneHotEncoder(handle_unknown = 'ignore', sparse_output = False),
                ohe_columns
            )],
            remainder = 'passthrough'
        )
        data = ct.fit_transform(data)
    else:
        data = ct.transform(data)

    # Rebuild a labelled DataFrame: the one-hot columns come first, followed
    # by the columns that went through ``remainder = 'passthrough'``.
    encoded_feature_names = ct.named_transformers_['encoder'].get_feature_names_out(ohe_columns)
    passthrough_columns = [col for col in data_columns if col not in ohe_columns]
    all_feature_names = list(encoded_feature_names) + passthrough_columns
    data = pd.DataFrame(data, columns = all_feature_names, index = data_index)

    # Restore integer dtype for identifier/target columns if they are present.
    int_columns = ['PassengerId', 'Survived']
    for col in int_columns:
        if col in data.columns:
            data[col] = data[col].astype(int)

    return data, label_encoders, ct

## 学習用

In [5]:
# Load the training data
train_data = pd.read_csv('./data/train.csv')

In [6]:
# Separate the target from the predictors (PassengerId is dropped together with it).
y = train_data['Survived']
X = train_data.drop(['Survived', 'PassengerId'], axis = 1)

# Fit the preprocessing on the full frame and keep the fitted encoders.
# NOTE(review): this happens before the train/test split, so imputation values
# and ticket frequencies are computed with the hold-out rows included.
X, le_encoders, ct = transform_data_with_eda(X)
print(X.columns)

# Drop one dummy column from each one-hot group.
remove_columns = ['Sex_male', 'Title_Mrs', 'Embarked_S']
X = X.drop(remove_columns, axis = 1)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

Index(['Sex_female', 'Sex_male', 'Title_Dr/Military/Noble/Clergy',
       'Title_Master', 'Title_Miss', 'Title_Miss/Mrs/Ms', 'Title_Mr',
       'Title_Mrs', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Pclass', 'Age',
       'Fare', 'Ticket_Frequency', 'Segment', 'Family'],
      dtype='object')


In [7]:
# Feature columns of X after the `remove_columns` entries have been dropped.
X.columns

Index(['Sex_female', 'Title_Dr/Military/Noble/Clergy', 'Title_Master',
       'Title_Miss', 'Title_Miss/Mrs/Ms', 'Title_Mr', 'Embarked_C',
       'Embarked_Q', 'Pclass', 'Age', 'Fare', 'Ticket_Frequency', 'Segment',
       'Family'],
      dtype='object')

In [8]:
# --- Feature scaling ---
# Learn mean/variance from the training split only, then apply the same
# scaling to both splits.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# --- Model training: hyper-parameter search ---
# Candidate values for each LightGBM hyper-parameter.
# NOTE: this is an exhaustive grid (25,920 combinations x 10 folds), so the
# search is expensive even with parallel workers.
param_grid_gbm = {
    'n_estimators': [100, 250, 500, 750, 1000],
    'max_depth': [3, 5, 7, -1],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [15, 31, 63],
    'reg_alpha': [0, 0.1, 1, 10],
    'reg_lambda': [0, 0.1, 1, 10],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'subsample': [0.6, 0.8, 1.0],
    # LightGBM ignores `subsample` unless the bagging frequency is non-zero.
    # It is listed here (single value, so the grid size is unchanged) so that
    # it becomes part of `best_params_` and is carried over to any model that
    # is rebuilt from `best_params_R`.
    'subsample_freq': [1],
}

# Configure the search: estimator, parameter candidates and cross-validation.
# cv = 10 means 10-fold cross-validation.
grid_search = GridSearchCV(
    # One thread per model; parallelism comes from the search's worker processes.
    estimator = gbm.LGBMClassifier(random_state = 0, verbose = -1, n_jobs = 1),
    # estimator = xgb.XGBClassifier(random_state = 0, use_label_encoder = False, eval_metric = 'logloss'),
    param_grid = param_grid_gbm,
    cv = 10,
    scoring = 'accuracy',
    n_jobs = -1,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Report the best parameter set and its cross-validated score.
best_params_R = grid_search.best_params_
print("Best Parameters:", best_params_R)
print("Best Score:", grid_search.best_score_)

# Result recorded earlier under the heading "Random forest" (not produced by
# the LightGBM grid above):
# Best Parameters: {'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
# Best Score: 0.8385367762128325



## 提出用

In [None]:
# Load the raw training and test sets.
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

# Preprocessing: fit the encoders on the training set, reuse them on the test set.
X_train = train_data.drop(columns = ['Survived', 'PassengerId'])
y_train = train_data['Survived']
X_test = test_data.drop(columns = ['PassengerId'])
X_train, le_encoders, ct = transform_data_with_eda(X_train)
X_test, _, _ = transform_data_with_eda(X_test, le_encoders, ct)
remove_columns = ['Sex_male', 'Title_Mrs', 'Embarked_S']
X_train = X_train.drop(columns = remove_columns)
X_test = X_test.drop(columns = remove_columns)

# The scaler and the model see plain arrays, so the column order must match.
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

# --- Feature scaling ---
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# --- Train the final model with the best parameters from the grid search ---
# `best_params_R` must already exist, i.e. the search cell has to be run first.
classifier = gbm.LGBMClassifier(random_state = 0, verbose = -1, **best_params_R)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
X_test_passengerid = test_data['PassengerId'].values

survived_passenger = pd.DataFrame({
    'PassengerId': X_test_passengerid,
    'Survived': y_pred
})
# The predictions come from LightGBM, so the file is named accordingly; the
# previous name ('submittion_randomforest2.csv') mislabelled the model and
# could overwrite an actual random-forest submission.
survived_passenger.to_csv('./data/submission_lightgbm.csv', index = False, encoding = 'utf-8')

