## 特徴量エンジニアリングが中心

In [1]:
import pandas as pd
import numpy as np
import warnings
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Seed passed as random_state to every model built in this notebook.
SEED = 1234

# Read the training and test tables (paths are relative to the notebook's directory).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [2]:
from preprocessing import Preprocessing

# Run the project's preprocessing over both frames. The results are bound back
# to df_train / df_test, so re-running this cell feeds already-processed frames
# into the preprocessor again.
# NOTE(review): the method name is spelled `preproces` -- confirm it matches
# the definition in the preprocessing module.
preprocessor = Preprocessing()
df_train, df_test = preprocessor.preproces(df_train, df_test)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# The estimator is kept as a class (not an instance); it is handed to the
# feature selector together with model_params.
forest = RandomForestClassifier
model_params = {"random_state": SEED}


def eval_func(true_y, pred_y):
    """Score predictions against the true labels using accuracy."""
    return accuracy_score(true_y, pred_y)


# Candidate feature columns offered to the forward feature selection.
select_columns = [
    'Fare',
    'Age_na',
    'Pclass_2', 'Pclass_3',
    'Sex_male',
    'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4', 'SibSp_5', 'SibSp_8',
    'Parch_1', 'Parch_2', 'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6',
    'Embarked_Q', 'Embarked_S',
    "Age",
]

In [4]:
from selectFeatures import ForwardFeatureSelection

# Name of the label column.
target = "Survived"

# Forward feature selection over the candidate columns, using the model class,
# its parameters and the scoring function defined in the previous cell.
FFS = ForwardFeatureSelection()
selection_args = (
    df_train,
    select_columns,
    target,
    forest,
    model_params,
    eval_func,
)
selected_features = FFS.select(*selection_args)

100%|██████████| 20/20 [00:05<00:00,  3.96it/s]
100%|██████████| 20/20 [00:04<00:00,  4.13it/s]
100%|██████████| 20/20 [00:04<00:00,  4.39it/s]
100%|██████████| 20/20 [00:04<00:00,  4.62it/s]
100%|██████████| 20/20 [00:04<00:00,  4.89it/s]
100%|██████████| 20/20 [00:03<00:00,  5.13it/s]

selected features: ['Sex_male', 'SibSp_3', 'SibSp_8', 'Pclass_3', 'Embarked_S']
accuracy score: 0.81





In [5]:
import optuna
# SEED is intentionally not re-assigned here: it is defined once in the first
# cell, so changing it there affects every model in the notebook.


def objective(trial):
    """Optuna objective for tuning the random forest.

    Samples ``n_estimators``, ``criterion`` and ``max_depth`` from the trial,
    fixes ``random_state`` to the notebook-wide SEED, and scores the resulting
    configuration with ``FFS.validation`` on ``df_train``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    The value returned by ``FFS.validation`` when called without ``df_test``.
    NOTE(review): presumably a scalar accuracy, since it is returned directly
    to Optuna -- confirm against selectFeatures.
    """
    model_cls = RandomForestClassifier
    model_params = {
        "n_estimators": trial.suggest_int('n_estimators', 50, 1000),
        "criterion": trial.suggest_categorical('criterion', ["gini", "entropy"]),
        "max_depth": trial.suggest_int('max_depth', 1, 100),
        "random_state": SEED,
    }

    def eval_func(true_y, pred_y):
        return accuracy_score(true_y, pred_y)

    # NOTE(review): the search is scored on all candidate columns
    # (select_columns); the selected_features list produced by the forward
    # selection is not used here -- confirm this is intended.
    return FFS.validation(
        df_train, select_columns, target, model_cls, model_params, eval_func
    )

In [6]:
# Hyperparameters obtained from the Optuna search further down in this notebook.
# NOTE(review): the recorded run of this cell ended with
# "ValueError: The truth value of a DataFrame is ambiguous". Presumably
# FFS.validation evaluates df_test in a boolean context (e.g. `if df_test:`);
# verify in selectFeatures and compare with `df_test is not None` there.
model_cls = RandomForestClassifier
model_params = {
    'n_estimators': 753,
    'criterion': 'entropy',
    'max_depth': 8,
    "random_state": SEED,
}


def eval_func(true_y, pred_y):
    """Score predictions against the true labels using accuracy."""
    return accuracy_score(true_y, pred_y)


# Validate on the training data and also request predictions for df_test.
metrics, pred_y = FFS.validation(
    df_train,
    select_columns,
    target,
    model_cls,
    model_params,
    eval_func,
    df_test=df_test,
    is_pred=True,
)
print(f'accuracy score: {metrics}')

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# Seed the sampler so the search, and therefore best_params, is reproducible
# across runs. TPESampler is the sampler Optuna uses by default for a
# single-objective study, so only the seeding changes.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=100)

print(f"best parametors: {study.best_params}")
print(f"best accuracy score: {study.best_value}")

# Only the sampled hyperparameters are stored here; random_state is added
# separately before the parameters are used to build a model.
model_params = study.best_params

In [None]:
# The tuned parameters do not include the seed, so pin it before building the model.
model_params["random_state"] = SEED
model_cls = RandomForestClassifier

eval_func = lambda true_y, pred_y: accuracy_score(true_y, pred_y)

# is_pred=True is passed to match the equivalent call made with the hard-coded
# hyperparameters earlier in the notebook; this call also unpacks predictions.
# NOTE(review): assumes is_pred is what makes FFS.validation return the
# (metrics, predictions) pair -- confirm against selectFeatures.
metrics, pred_y = FFS.validation(
    df_train, select_columns, target, model_cls, model_params, eval_func,
    df_test=df_test, is_pred=True
)
print(f'accuracy score: {metrics}')

In [None]:
# Display the hyperparameters currently bound to model_params.
model_params