<a href="https://colab.research.google.com/github/waseda-ken/data_analyze_kikagaku/blob/main/%E5%AE%9F%E8%A3%85%E3%83%86%E3%82%B9%E3%83%88.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# 1. Load the data set and encode the target label as an integer
#    ('fail' -> 0, 'success' -> 1).
data = pd.read_csv('bank.csv')
data = data.assign(result=data['result'].map({'fail': 0, 'success': 1}))
y = data['result']
x = data.drop(columns='result')

# 2. Split into train:val:test = 60:20:20 with random seed 0, stratified on
#    the label.
# First hold out 20% of all rows as the test set ...
x_train_val, x_test, y_train_val, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)
# ... then carve 25% of the remaining 80% (20% overall) out for validation.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val,
    y_train_val,
    test_size=0.25,
    random_state=0,
    stratify=y_train_val,
)

# 3. Preprocessing pipelines
numeric_features = ['age', 'balance', 'pdays', 'previous']
categorical_features = [
    'job', 'marital', 'education', 'default', 'housing', 'loan', 'poutcome',
]

# Numeric columns: fill missing values with the median, then standardize.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the literal "missing",
# then one-hot encode (categories unseen during fit are ignored).
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])

# 4. Model pipelines: the shared preprocessing followed by a classifier.
pipeline_lr = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', LogisticRegression(solver='liblinear', random_state=0)),
])
pipeline_rf = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(n_jobs=-1, random_state=0)),
])

# 5. Hyperparameter grids for the grid search
#    (the `clf__` prefix addresses the 'clf' step of each pipeline).
param_grid_lr = dict(
    clf__C=[0.01, 0.1, 1, 10],
    clf__class_weight=[None, 'balanced'],
)
param_grid_rf = dict(
    clf__n_estimators=[100, 200],
    clf__max_depth=[None, 10, 20],
    clf__class_weight=[None, 'balanced', 'balanced_subsample'],
)

# 6. Grid search with 5-fold cross-validation, selecting for the best F1.
#    Both searches are fitted on the training split only.
search_lr = GridSearchCV(
    estimator=pipeline_lr,
    param_grid=param_grid_lr,
    scoring='f1',
    cv=5,
    n_jobs=-1,
).fit(x_train, y_train)

search_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    scoring='f1',
    cv=5,
    n_jobs=-1,
).fit(x_train, y_train)

# 7. Score both tuned models on the validation split and keep the better one.
f1_lr = f1_score(y_val, search_lr.predict(x_val))
f1_rf = f1_score(y_val, search_rf.predict(x_val))

# The random forest is chosen only when it is strictly better, so a tie
# goes to logistic regression.
if f1_rf > f1_lr:
    best_model, model_name, best_f1 = (
        search_rf.best_estimator_, 'RandomForest', f1_rf
    )
else:
    best_model, model_name, best_f1 = (
        search_lr.best_estimator_, 'LogisticRegression', f1_lr
    )

print(f"選択モデル: {model_name} (検証 F1 = {best_f1:.4f})")

# 8. Refit the selected pipeline on train + validation data combined.
best_model.fit(x_train_val, y_train_val)

# 9. Predict on the inference data and convert 0/1 back to string labels.
pred_data = pd.read_csv('bank_pred.csv')
y_pred_num = best_model.predict(pred_data)
y_pred_label = ['fail' if pred != 1 else 'success' for pred in y_pred_num]


選択モデル: RandomForest (検証 F1 = 0.3799)


In [None]:
# 10. Write the predicted labels to submission.csv as a single 'pred'
#     column, without the row index.
output_df = pd.DataFrame(y_pred_label, columns=['pred'])
output_df.to_csv('submission.csv', index=False)

print("推論結果を submission.csv に保存しました。")

推論結果を submission.csv に保存しました。


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
# xgboost exports the scikit-learn wrapper as `XGBClassifier` (capital X);
# the lower-case spelling does not exist and raises ImportError.
from xgboost import XGBClassifier

# 1. Load the data set and encode the target label as an integer
#    ('fail' -> 0, 'success' -> 1).
data = pd.read_csv('bank.csv')
data['result'] = data['result'].map({'fail': 0, 'success': 1})
x = data.drop(columns=['result'])
y = data['result']

# 2. Split into train:val:test = 60:20:20 with random seed 0, stratified on
#    the label. 20% is held out as the test set first; 25% of the remaining
#    80% (20% overall) then becomes the validation set.
x_tv, x_test, y_tv, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0, stratify=y
)
x_train, x_val, y_train, y_val = train_test_split(
    x_tv, y_tv, test_size=0.25, random_state=0, stratify=y_tv
)

# 3. Preprocessing pipelines
num_feats = ['age', 'balance', 'pdays', 'previous']
cat_feats = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'poutcome']

# Numeric columns: median imputation followed by standardization.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Categorical columns: fill missing values with the literal "missing",
# then one-hot encode (categories unseen during fit are ignored).
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipe, num_feats),
    ('cat', cat_pipe, cat_feats)
])

# 4. XGBoost pipeline: the preprocessing above followed by the classifier.
#    `use_label_encoder` is intentionally not passed: the installed xgboost
#    ignores it and only emits a "Parameters ... are not used" warning.
pipe_xgb = Pipeline([
    ('pre', preprocessor),
    ('clf', XGBClassifier(
        eval_metric='logloss',
        random_state=0,
    ))
])

# Hyperparameter grid for the XGBoost pipeline (the `clf__` prefix addresses
# the 'clf' step). For scale_pos_weight, try both no re-weighting (1) and the
# negative/positive class ratio observed in the training split.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
param_grid = dict(
    clf__n_estimators=[100, 200],
    clf__max_depth=[3, 5],
    clf__learning_rate=[0.01, 0.1],
    clf__scale_pos_weight=[1, n_negative / n_positive],
)

# Grid search with 5-fold cross-validation, selecting for the best F1,
# fitted on the training split only.
search = GridSearchCV(
    estimator=pipe_xgb,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
search.fit(x_train, y_train)

# 5. Search for the decision threshold that maximizes F1 on the validation
#    split, using the predicted probability of the positive class.
probs_val = search.predict_proba(x_val)[:, 1]

# Start from the default 0.5 cut-off as the incumbent.
best_thr = 0.5
best_f1 = f1_score(y_val, (probs_val > best_thr).astype(int))

# Scan 81 evenly spaced thresholds in [0.1, 0.9]; the incumbent is replaced
# only by a strictly better F1, so ties keep the earlier threshold.
for thr in np.linspace(0.1, 0.9, 81):
    f1 = f1_score(y_val, (probs_val > thr).astype(int))
    if f1 <= best_f1:
        continue
    best_thr, best_f1 = thr, f1

print(f"Best params: {search.best_params_}")
print(f"Validation F1 @ threshold {best_thr:.2f} = {best_f1:.4f}")

# 6. Refit the best pipeline on train + validation data combined.
best_model = search.best_estimator_
best_model.fit(x_tv, y_tv)

# 7. Predict on the submission data, applying the threshold tuned above to
#    the positive-class probability.
df_pred = pd.read_csv('bank_pred.csv')
probs_test = best_model.predict_proba(df_pred)[:, 1]
y_test_pred = (probs_test > best_thr).astype(int)
labels = ['fail' if flag != 1 else 'success' for flag in y_test_pred]

# 8. Write the labels to submission.csv as a single 'pred' column,
#    without the row index.
submission = pd.Series(labels, name='pred')
submission.to_csv('submission.csv', index=False)
print("submission.csv を出力しました。")


Fitting 5 folds for each of 16 candidates, totalling 80 fits


Parameters: { "use_label_encoder" } are not used.



Best params: {'clf__learning_rate': 0.1, 'clf__max_depth': 3, 'clf__n_estimators': 100, 'clf__scale_pos_weight': np.float64(7.530805687203792)}
Validation F1 @ threshold 0.60 = 0.4000


Parameters: { "use_label_encoder" } are not used.



submission.csv を出力しました。
