## 特徴量エンジニアリングが中心

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Seed passed as `random_state` to the models below so runs are repeatable.
SEED = 1234

# Load the training and test CSVs; paths are relative to the working directory.
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

In [2]:
from preprocessing import Preprocessing

# Run the project's preprocessing on both frames and rebind them to the results.
# NOTE(review): the method name is spelled `preproces` -- confirm it matches the
# definition in preprocessing.py before renaming anything.
df_train, df_test = Preprocessing().preproces(df_train, df_test)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# The model is referenced as a class (not an instance); it is handed to the
# feature selector together with the parameter dict below.
forest = RandomForestClassifier
model_params = dict(random_state=SEED)


def eval_func(true_y, pred_y):
    """Score predictions with plain accuracy."""
    return accuracy_score(true_y, pred_y)


# Candidate columns offered to feature selection, grouped by source column.
select_columns = [
    'Fare', 'Age_na',
    'Pclass_2', 'Pclass_3',
    'Sex_male',
    'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4', 'SibSp_5', 'SibSp_8',
    'Parch_1', 'Parch_2', 'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6',
    'Embarked_Q', 'Embarked_S',
    "Age",
]

In [4]:
from selectFeatures import ForwardFeatureSelection

# Name of the label column.
target = "Survived"

# Run the project's ForwardFeatureSelection over `select_columns` with the
# model class, parameters and metric defined earlier. The returned
# `selected_features` is reused later as the feature list for validation
# and for the OLS summary.
FFS = ForwardFeatureSelection()
selected_features = FFS.select(
    df_train, select_columns, target, forest, model_params, eval_func
)

100%|██████████| 20/20 [00:05<00:00,  3.99it/s]
100%|██████████| 20/20 [00:04<00:00,  4.23it/s]
100%|██████████| 20/20 [00:04<00:00,  4.42it/s]
100%|██████████| 20/20 [00:04<00:00,  4.64it/s]
100%|██████████| 20/20 [00:04<00:00,  4.91it/s]
100%|██████████| 20/20 [00:03<00:00,  5.18it/s]

selected features: ['Sex_male', 'SibSp_3', 'SibSp_8', 'Pclass_3', 'Embarked_S']
accuracy score: 0.81





In [14]:
import optuna
# Same seed value as in the first cell, assigned again in this cell.
SEED = 1234

def objective(trial):
    """Optuna objective: validate a random forest built from suggested hyperparameters.

    Args:
        trial: Optuna trial used to draw `n_estimators`, `criterion` and
            `max_depth`.

    Returns:
        The value returned by `FFS.validation` for the already-selected
        features, scored with accuracy.
    """
    # Draw the hyperparameters first, in a fixed order, then assemble the dict.
    n_estimators = trial.suggest_int('n_estimators', 50, 1000)
    criterion = trial.suggest_categorical('criterion', ["gini", "entropy"])
    max_depth = trial.suggest_int('max_depth', 1, 100)
    params = dict(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        random_state=SEED,
    )

    def score(true_y, pred_y):
        # Thin wrapper so the metric keeps the (true_y, pred_y) signature.
        return accuracy_score(true_y, pred_y)

    return FFS.validation(
        df_train, selected_features, target, RandomForestClassifier, params, score
    )

In [None]:
# Seed the sampler so re-running this cell explores the same sequence of
# trials; with an unseeded sampler the search is not reproducible even though
# the models themselves use a fixed `random_state`.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=100)

# Report the best trial found by Optuna.
print(study.best_params)
print(study.best_value)

# `best_params` only contains the values drawn through `trial.suggest_*`, so
# the fixed `random_state` used inside `objective` is added back; otherwise a
# model built from `model_params` would not match the one that was scored.
model_params = {**study.best_params, "random_state": SEED}

[32m[I 2022-12-18 10:15:40,108][0m A new study created in memory with name: no-name-4a627ce9-0c0e-4fbe-b97a-5cab4075e63d[0m
[32m[I 2022-12-18 10:15:50,897][0m Trial 0 finished with value: 0.8114478114478114 and parameters: {'n_estimators': 629, 'criterion': 'entropy', 'max_depth': 61}. Best is trial 0 with value: 0.8114478114478114.[0m
[32m[I 2022-12-18 10:15:57,577][0m Trial 1 finished with value: 0.8114478114478114 and parameters: {'n_estimators': 394, 'criterion': 'gini', 'max_depth': 20}. Best is trial 0 with value: 0.8114478114478114.[0m
[32m[I 2022-12-18 10:16:11,168][0m Trial 2 finished with value: 0.8114478114478114 and parameters: {'n_estimators': 802, 'criterion': 'gini', 'max_depth': 90}. Best is trial 0 with value: 0.8114478114478114.[0m
[32m[I 2022-12-18 10:16:14,582][0m Trial 3 finished with value: 0.8114478114478114 and parameters: {'n_estimators': 196, 'criterion': 'gini', 'max_depth': 70}. Best is trial 0 with value: 0.8114478114478114.[0m
[32m[I 2022-1

In [None]:
# Hyperparameters that were optimized separately.
model_cls = RandomForestClassifier
model_params = dict(
    n_estimators=753,
    criterion='entropy',
    max_depth=8,
    random_state=SEED,
)


def eval_func(true_y, pred_y):
    """Return the accuracy of `pred_y` against `true_y`."""
    return accuracy_score(true_y, pred_y)


# With `df_test` supplied and `is_pred=True`, the call result is unpacked into
# a metric and a prediction array.
metrics, pred_y = FFS.validation(
    df_train,
    selected_features,
    target,
    model_cls,
    model_params,
    eval_func,
    df_test=df_test,
    is_pred=True,
)
print(f'accuracy score: {metrics}')

In [12]:
# Cast the predictions to 64-bit integers before writing them out.
pred_y = pred_y.astype(np.int64)

# Two-column table (PassengerId, Survived) saved without the index column.
output = pd.DataFrame(dict(PassengerId=df_test["PassengerId"], Survived=pred_y))
output.to_csv('result.csv', index=False, header=True)

In [13]:
# Show the submission DataFrame as the cell's result.
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
# Check how well the selected features explain the target

from statsmodels.stats.outliers_influence import OLSInfluence, variance_inflation_factor
import statsmodels.api as sm

def print_statsmodels(df, columns, target):
    """Fit an OLS regression of `target` on `columns` and print its summary.

    Args:
        df: DataFrame holding both the feature columns and the target column.
        columns: Column names used as regressors; a constant term is added.
        target: Name of the dependent-variable column.
    """
    # Design matrix with an intercept column added by statsmodels.
    design = sm.add_constant(df[columns])
    response = df[target]

    # Multiple linear regression, fitted in one step.
    result = sm.OLS(response, design).fit()

    print('summary = \n', result.summary())

# Print the OLS summary for the features chosen by forward selection.
print_statsmodels(df_train, selected_features, target)