# モデル学習を行う際の関数一覧

・LightGBM  
・RandomForest  
・CatBoost


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import KFold
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

import joblib

In [None]:
# LightGBM hyperparameters (intended for training with early stopping).
# "metric" is "rmse": LightGBM's documented metric list contains no "rmsle",
# and the LightGBM CV routine in this notebook monitors and reports RMSE.
# To optimise RMSLE instead, train on np.log1p(y) while keeping this metric.
params = {
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.1,
    "num_leaves": 16,
    # Deliberately large: acts as an upper bound when early stopping is used.
    "n_estimators": 100000,
    "importance_type": "gain",
    "random_state": 42,
}

def LGBM_cv(input_x, input_y, params, n_split=5):
    """Run shuffled K-fold cross-validation with LightGBM and early stopping.

    Args:
        input_x: Feature table. It is accessed through ``.iloc`` and
            ``.columns``, so a pandas DataFrame is expected.
        input_y: Target values aligned with ``input_x``; accessed through
            ``.iloc`` (e.g. a pandas Series).
        params: Keyword arguments forwarded to ``lgb.LGBMRegressor``.
        n_split: Number of folds.

    Returns:
        Tuple ``(models, imp, metrics, oof_preds)``:
        * ``models``: the fitted regressor of every fold, in fold order.
        * ``imp``: DataFrame with columns ``col``, ``imp`` and ``imp_std``
          (mean and standard deviation of the gain importance over folds).
        * ``metrics``: array whose rows are ``[fold, train RMSE, valid RMSE]``.
        * ``oof_preds``: out-of-fold predictions, one per row of ``input_x``.
    """
    splitter = KFold(n_splits=n_split, shuffle=True, random_state=42)
    fold_indices = list(splitter.split(input_x, input_y))

    models = []
    fold_scores = []
    importance_frames = []
    oof_preds = np.zeros(len(input_x))

    for nfold, (idx_tr, idx_va) in enumerate(fold_indices):
        print("-" * 20, f"Fold {nfold}", "-" * 20)
        X_tr, X_va = input_x.iloc[idx_tr], input_x.iloc[idx_va]
        y_tr, y_va = input_y.iloc[idx_tr], input_y.iloc[idx_va]

        print(X_tr.shape, y_tr.shape)
        print(X_va.shape, y_va.shape)

        # Stop after 50 rounds without improvement; log every 100 rounds.
        fit_callbacks = [
            lgb.early_stopping(stopping_rounds=50, verbose=True),
            lgb.log_evaluation(100),
        ]
        model = lgb.LGBMRegressor(**params)
        model.fit(
            X_tr,
            y_tr,
            eval_set=[(X_tr, y_tr), (X_va, y_va)],
            eval_metric="rmse",
            callbacks=fit_callbacks,
        )
        models.append(model)

        y_tr_pred = model.predict(X_tr)
        y_va_pred = model.predict(X_va)
        rmse_tr = np.sqrt(mean_squared_error(y_tr, y_tr_pred))
        rmse_va = np.sqrt(mean_squared_error(y_va, y_va_pred))
        fold_scores.append([nfold, rmse_tr, rmse_va])

        # Each row is predicted only by the model that did not train on it.
        oof_preds[idx_va] = y_va_pred

        importance_frames.append(pd.DataFrame({
            "col": X_tr.columns,
            "imp": model.booster_.feature_importance(importance_type='gain'),
            "nfold": nfold
        }))

    print("=" * 20, "CV Results", "=" * 20)

    metrics = np.array(fold_scores)
    print(metrics)
    train_rmse, valid_rmse = metrics[:, 1], metrics[:, 2]
    print(f"[cv] train: {train_rmse.mean():.5f}±{train_rmse.std():.5f}, "
          f"test: {valid_rmse.mean():.5f}±{valid_rmse.std():.5f}")

    # Collapse the per-fold importances into one mean/std row per feature.
    all_importances = pd.concat(importance_frames, axis=0, ignore_index=True)
    imp = (
        all_importances.groupby("col")["imp"]
        .agg(["mean", "std"])
        .reset_index()
        .rename(columns={"mean": "imp", "std": "imp_std"})
    )

    # To persist the fold models, uncomment:
    # joblib.dump(models, "lgbm_models.pkl")

    return models, imp, metrics, oof_preds

In [None]:
# CatBoost hyperparameters (intended for training with early stopping).
cat_params = dict(
    iterations=1000,
    learning_rate=0.1,
    l2_leaf_reg=3.0,
    verbose=100,
    random_state=42,
)

def cat_cv(input_x, input_y, params, n_split=5, cat_features=("Sex",)):
    """Run shuffled K-fold cross-validation with CatBoost and early stopping.

    Args:
        input_x: Feature table. It is accessed through ``.iloc`` and
            ``.columns``, so a pandas DataFrame is expected.
        input_y: Target values aligned with ``input_x``; accessed through
            ``.iloc`` (e.g. a pandas Series).
        params: Keyword arguments forwarded to ``CatBoostRegressor``.
        n_split: Number of folds.
        cat_features: Names of the categorical columns handed to CatBoost.
            The default ``("Sex",)`` reproduces the value that used to be
            hard-coded here; pass another sequence for other datasets, or
            ``None`` to pass no categorical feature list at all.

    Returns:
        Tuple ``(models, imp, metrics)``:
        * ``models``: the fitted regressor of every fold, in fold order.
        * ``imp``: DataFrame with columns ``col``, ``imp`` and ``imp_std``
          (mean and standard deviation of the ``PredictionValuesChange``
          importance over folds).
        * ``metrics``: array whose rows are ``[fold, train RMSE, valid RMSE]``.
    """
    metrics = []
    fold_importances = []
    models = []
    # Copy into a list so a tuple default (or any iterable) is accepted.
    cat_cols = list(cat_features) if cat_features is not None else None

    cv = list(KFold(n_splits=n_split,
                    shuffle=True,
                    random_state=42).split(input_x, input_y))

    for nfold, (idx_tr, idx_va) in enumerate(cv):
        print("-" * 20, f"Fold {nfold}", "-" * 20)
        X_tr, y_tr = input_x.iloc[idx_tr], input_y.iloc[idx_tr]
        X_va, y_va = input_x.iloc[idx_va], input_y.iloc[idx_va]
        print(X_tr.shape, y_tr.shape)
        print(X_va.shape, y_va.shape)

        model = CatBoostRegressor(**params)
        # Stop after 50 rounds without improvement on the validation fold and
        # keep the best iteration; progress is logged every 100 iterations.
        model.fit(
            X_tr,
            y_tr,
            eval_set=(X_va, y_va),
            early_stopping_rounds=50,
            use_best_model=True,
            cat_features=cat_cols,
            verbose=100
        )

        models.append(model)

        y_tr_pred = model.predict(X_tr)
        y_va_pred = model.predict(X_va)
        metric_train = np.sqrt(mean_squared_error(y_tr, y_tr_pred))
        metric_test = np.sqrt(mean_squared_error(y_va, y_va_pred))
        metrics.append([nfold, metric_train, metric_test])

        fold_importances.append(pd.DataFrame({
            "col": X_tr.columns,
            "imp": model.get_feature_importance(type='PredictionValuesChange'),
            "nfold": nfold
        }))

    print("=" * 20, "CV Results", "=" * 20)

    metrics = np.array(metrics)
    print(metrics)
    print(f"[cv] train: {metrics[:,1].mean():.5f}±{metrics[:,1].std():.5f}, "
          f"test: {metrics[:,2].mean():.5f}±{metrics[:,2].std():.5f}")

    # Collapse the per-fold importances into one mean/std row per feature.
    imp = pd.concat(fold_importances, axis=0, ignore_index=True)
    imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index()
    imp.columns = ["col", "imp", "imp_std"]

    return models, imp, metrics

In [4]:
# Random forest hyperparameters.
rf_params = dict(
    max_depth=10,
    criterion="squared_error",
    max_features="sqrt",
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=100,
    n_jobs=1,
    bootstrap=True,
    random_state=42,
)

def RF_cv(input_x, input_y, params, n_split=5):
    """Run shuffled K-fold cross-validation with a random forest regressor.

    Args:
        input_x: Feature table. It is accessed through ``.iloc`` and
            ``.columns``, so a pandas DataFrame is expected.
        input_y: Target values aligned with ``input_x``; accessed through
            ``.iloc`` (e.g. a pandas Series).
        params: Keyword arguments forwarded to ``RandomForestRegressor``.
        n_split: Number of folds.

    Returns:
        Tuple ``(models, imp, metrics)``:
        * ``models``: the fitted regressor of every fold, in fold order.
        * ``imp``: DataFrame with columns ``col``, ``imp`` and ``imp_std``
          (mean and standard deviation of ``feature_importances_`` over folds).
        * ``metrics``: array whose rows are ``[fold, train RMSE, valid RMSE]``.
    """
    splitter = KFold(n_splits=n_split, shuffle=True, random_state=42)
    fold_indices = list(splitter.split(input_x, input_y))

    models = []
    fold_scores = []
    importance_frames = []

    for nfold, (idx_tr, idx_va) in enumerate(fold_indices):
        print("-" * 20, f"Fold {nfold}", "-" * 20)
        X_tr, X_va = input_x.iloc[idx_tr], input_x.iloc[idx_va]
        y_tr, y_va = input_y.iloc[idx_tr], input_y.iloc[idx_va]

        print(X_tr.shape, y_tr.shape)
        print(X_va.shape, y_va.shape)

        model = RandomForestRegressor(**params)
        model.fit(X_tr, y_tr)
        models.append(model)

        y_tr_pred = model.predict(X_tr)
        y_va_pred = model.predict(X_va)
        rmse_tr = np.sqrt(mean_squared_error(y_tr, y_tr_pred))
        rmse_va = np.sqrt(mean_squared_error(y_va, y_va_pred))
        fold_scores.append([nfold, rmse_tr, rmse_va])

        importance_frames.append(pd.DataFrame({
            "col": X_tr.columns,
            "imp": model.feature_importances_,
            "nfold": nfold
        }))

    print("=" * 20, "CV Results", "=" * 20)

    metrics = np.array(fold_scores)
    print(metrics)
    train_rmse, valid_rmse = metrics[:, 1], metrics[:, 2]
    print(f"[cv] train: {train_rmse.mean():.5f}±{train_rmse.std():.5f}, "
          f"test: {valid_rmse.mean():.5f}±{valid_rmse.std():.5f}")

    # Collapse the per-fold importances into one mean/std row per feature.
    all_importances = pd.concat(importance_frames, axis=0, ignore_index=True)
    imp = (
        all_importances.groupby("col")["imp"]
        .agg(["mean", "std"])
        .reset_index()
        .rename(columns={"mean": "imp", "std": "imp_std"})
    )

    return models, imp, metrics

In [None]:
# Example run of the random-forest CV. X and y are not defined in the visible
# notebook cells and must already exist in the session.
models_rf, imp_rf, metrics_rf = RF_cv(X, y, rf_params)

# Display the feature importances, largest first
imp_rf.sort_values("imp", ascending=False, ignore_index=True)

以下、パラメーター設定なし

・CatBoost  
・ExtraTrees

In [None]:
def run_catboost_cv(input_x, input_y, n=5):
    """Cross-validate a CatBoost regressor with library-default hyperparameters.

    Args:
        input_x: Feature table. It is accessed through ``.iloc``, ``.shape``
            and ``.columns``, so a pandas DataFrame is expected.
        input_y: Target values aligned with ``input_x``; accessed through
            ``.iloc`` (e.g. a pandas Series).
        n: Number of shuffled K-fold splits.

    Returns:
        Tuple ``(models, metric, importance_df, oof_pred)``:
        * ``models``: the fitted regressor of every fold, in fold order.
        * ``metric``: dict with the fold-averaged ``"R2"``, ``"MAE"`` and
          ``"RMSE"`` on the validation folds.
        * ``importance_df``: DataFrame with columns ``feature`` and
          ``importance`` (fold average), sorted in descending order.
        * ``oof_pred``: out-of-fold predictions, one per row of ``input_y``.
    """
    # Imported locally: the notebook's first import cell only provides
    # mean_squared_error, so without this the function raises NameError
    # unless a later import cell has already been executed.
    from sklearn.metrics import mean_absolute_error, r2_score

    kf = KFold(n_splits=n, shuffle=True, random_state=42)
    oof_pred = np.zeros(len(input_y))
    models = []

    importances = np.zeros(input_x.shape[1])
    r2_list, mae_list, rmse_list = [], [], []

    for train_idx, valid_idx in kf.split(input_x):
        X_train, X_valid = input_x.iloc[train_idx], input_x.iloc[valid_idx]
        y_train, y_valid = input_y.iloc[train_idx], input_y.iloc[valid_idx]

        model = CatBoostRegressor(verbose=0, random_state=42)
        model.fit(X_train, y_train)
        pred = model.predict(X_valid)
        models.append(model)

        oof_pred[valid_idx] = pred
        # Summed here and divided by the number of folds below.
        importances += model.get_feature_importance()

        r2_list.append(r2_score(y_valid, pred))
        mae_list.append(mean_absolute_error(y_valid, pred))
        rmse_list.append(np.sqrt(mean_squared_error(y_valid, pred)))

    # Optional: train a final model on all data
    # final_model = CatBoostRegressor(verbose=0, random_state=42)
    # final_model.fit(input_x, input_y)

    metric = {
        "R2": np.mean(r2_list),
        "MAE": np.mean(mae_list),
        "RMSE": np.mean(rmse_list)
    }

    importance_df = pd.DataFrame({
        "feature": input_x.columns,
        "importance": importances / n  # average over folds
    }).sort_values(by="importance", ascending=False)

    return models, metric, importance_df, oof_pred

In [None]:
def run_extratrees_cv(input_x, input_y, n=5):
    """Cross-validate an ExtraTrees regressor with library-default hyperparameters.

    Args:
        input_x: Feature table. It is accessed through ``.iloc``, ``.shape``
            and ``.columns``, so a pandas DataFrame is expected.
        input_y: Target values aligned with ``input_x``; accessed through
            ``.iloc`` (e.g. a pandas Series).
        n: Number of shuffled K-fold splits.

    Returns:
        Tuple ``(models, metric, importance_df, oof_pred)``:
        * ``models``: the fitted regressor of every fold, in fold order.
        * ``metric``: dict with the fold-averaged ``"R2"``, ``"MAE"`` and
          ``"RMSE"`` on the validation folds.
        * ``importance_df``: DataFrame with columns ``feature`` and
          ``importance`` (fold average), sorted in descending order.
        * ``oof_pred``: out-of-fold predictions, one per row of ``input_y``.
    """
    # Imported locally: the notebook's first import cell only provides
    # mean_squared_error, so without this the function raises NameError
    # unless a later import cell has already been executed.
    from sklearn.metrics import mean_absolute_error, r2_score

    kf = KFold(n_splits=n, shuffle=True, random_state=42)
    oof_pred = np.zeros(len(input_y))

    importances = np.zeros(input_x.shape[1])
    r2_list, mae_list, rmse_list = [], [], []
    models = []

    for train_idx, valid_idx in kf.split(input_x):
        X_train, X_valid = input_x.iloc[train_idx], input_x.iloc[valid_idx]
        y_train, y_valid = input_y.iloc[train_idx], input_y.iloc[valid_idx]

        model = ExtraTreesRegressor(n_jobs=-1, random_state=42)
        model.fit(X_train, y_train)
        pred = model.predict(X_valid)
        models.append(model)

        oof_pred[valid_idx] = pred
        # Summed here and divided by the number of folds below.
        importances += model.feature_importances_

        r2_list.append(r2_score(y_valid, pred))
        mae_list.append(mean_absolute_error(y_valid, pred))
        rmse_list.append(np.sqrt(mean_squared_error(y_valid, pred)))

    # Optional: train a final model on all data
    # final_model = ExtraTreesRegressor(n_jobs=-1, random_state=42)
    # final_model.fit(input_x, input_y)

    metric = {
        "R2": np.mean(r2_list),
        "MAE": np.mean(mae_list),
        "RMSE": np.mean(rmse_list)
    }

    importance_df = pd.DataFrame({
        "feature": input_x.columns,
        "importance": importances / n  # average over folds
    }).sort_values(by="importance", ascending=False)

    return models, metric, importance_df, oof_pred


# StratifiedKFoldを使用

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd

1. 等幅Bin  
2. 等分位Bin  
Sturges’ Ruleに基づいて、データを分割

In [None]:
# Equal-width bins (the literal reading of Sturges' rule)
def compare_linear_models_cv_equal_width(input_x, input_y, seed=42):
    """Compare linear models with 5-fold CV stratified on equal-width target bins.

    The continuous target is cut into ``floor(log2(N) + 1)`` equal-width bins
    (Sturges' rule) and the bin label is used as the stratification key.
    Inside every fold both the features and the target are standardised with
    scalers fitted on the training part only, and predictions are transformed
    back to the original target scale before scoring.

    Args:
        input_x: Feature table; accessed through ``.iloc`` and ``.shape``, so
            a pandas DataFrame is expected.
        input_y: Target values; accessed through ``.iloc`` and ``.values``
            (e.g. a pandas Series).
        seed: Random state for the fold shuffling and for Ridge / Lasso.

    Returns:
        DataFrame with one row per model and the columns ``Model``, ``R2``,
        ``MAE`` and ``RMSE`` (each averaged over the 5 folds). The same table
        is also printed.
    """
    N = len(input_y)
    k_bins = int(np.floor(np.log2(N) + 1))  # Sturges' Rule

    # Equal-width binning (cut)
    y_bins = pd.cut(input_y, bins=k_bins, labels=False)

    # PLS was hard-coded to 5 components, which fails on inputs with fewer
    # than 5 feature columns; cap it at the number of features.
    n_pls_components = min(5, input_x.shape[1])

    # Model factories: a fresh, unfitted estimator is built for every fold.
    models_to_compare = {
        'LinearRegression': lambda: LinearRegression(),
        'Ridge': lambda: Ridge(alpha=1.0, random_state=seed),
        'Lasso': lambda: Lasso(alpha=0.01, random_state=seed, max_iter=10000),
        'PLS': lambda: PLSRegression(n_components=n_pls_components),
    }

    results = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    for model_name, model_func in models_to_compare.items():
        r2_scores, mae_scores, rmse_scores = [], [], []

        for train_idx, valid_idx in skf.split(input_x, y_bins):
            X_train, X_valid = input_x.iloc[train_idx], input_x.iloc[valid_idx]
            y_train, y_valid = input_y.iloc[train_idx], input_y.iloc[valid_idx]

            # Scaling: fit on the training part only, then apply to validation
            scaler_x = StandardScaler()
            scaler_y = StandardScaler()
            X_train_scaled = scaler_x.fit_transform(X_train)
            X_valid_scaled = scaler_x.transform(X_valid)
            y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).ravel()

            # Fit
            model = model_func()
            model.fit(X_train_scaled, y_train_scaled)

            # Predict and map back to the original target scale
            y_pred_scaled = model.predict(X_valid_scaled)
            y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

            # Score
            r2_scores.append(r2_score(y_valid, y_pred))
            mae_scores.append(mean_absolute_error(y_valid, y_pred))
            rmse_scores.append(np.sqrt(mean_squared_error(y_valid, y_pred)))

        results.append({
            "Model": model_name,
            "R2": np.mean(r2_scores),
            "MAE": np.mean(mae_scores),
            "RMSE": np.mean(rmse_scores)
        })

    result_df = pd.DataFrame(results)
    print("=== 等幅ビン (Sturges’ Rule) ===")
    print(result_df)
    return result_df

In [None]:
# Equal-frequency (quantile) bins: practical, folds tend to be split evenly
def compare_linear_models_cv_equal_freq(input_x, input_y, seed=42):
    """Compare linear models with 5-fold CV stratified on quantile target bins.

    The continuous target is cut into up to ``floor(log2(N) + 1)`` quantile
    bins (bin count from Sturges' rule; duplicate bin edges are dropped) and
    the bin label is used as the stratification key. Inside every fold both
    the features and the target are standardised with scalers fitted on the
    training part only, and predictions are transformed back to the original
    target scale before scoring.

    Args:
        input_x: Feature table; accessed through ``.iloc`` and ``.shape``, so
            a pandas DataFrame is expected.
        input_y: Target values; accessed through ``.iloc`` and ``.values``
            (e.g. a pandas Series).
        seed: Random state for the fold shuffling and for Ridge / Lasso.

    Returns:
        DataFrame with one row per model and the columns ``Model``, ``R2``,
        ``MAE`` and ``RMSE`` (each averaged over the 5 folds). The same table
        is also printed.
    """
    N = len(input_y)
    k_bins = int(np.floor(np.log2(N) + 1))  # Sturges' Rule

    # Equal-frequency binning (qcut)
    y_bins = pd.qcut(input_y, q=k_bins, labels=False, duplicates="drop")

    # PLS was hard-coded to 5 components, which fails on inputs with fewer
    # than 5 feature columns; cap it at the number of features.
    n_pls_components = min(5, input_x.shape[1])

    # Model factories: a fresh, unfitted estimator is built for every fold.
    models_to_compare = {
        'LinearRegression': lambda: LinearRegression(),
        'Ridge': lambda: Ridge(alpha=1.0, random_state=seed),
        'Lasso': lambda: Lasso(alpha=0.01, random_state=seed, max_iter=10000),
        'PLS': lambda: PLSRegression(n_components=n_pls_components),
    }

    results = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    for model_name, model_func in models_to_compare.items():
        r2_scores, mae_scores, rmse_scores = [], [], []

        for train_idx, valid_idx in skf.split(input_x, y_bins):
            X_train, X_valid = input_x.iloc[train_idx], input_x.iloc[valid_idx]
            y_train, y_valid = input_y.iloc[train_idx], input_y.iloc[valid_idx]

            # Scaling: fit on the training part only, then apply to validation
            scaler_x = StandardScaler()
            scaler_y = StandardScaler()
            X_train_scaled = scaler_x.fit_transform(X_train)
            X_valid_scaled = scaler_x.transform(X_valid)
            y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).ravel()

            # Fit
            model = model_func()
            model.fit(X_train_scaled, y_train_scaled)

            # Predict and map back to the original target scale
            y_pred_scaled = model.predict(X_valid_scaled)
            y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

            # Score
            r2_scores.append(r2_score(y_valid, y_pred))
            mae_scores.append(mean_absolute_error(y_valid, y_pred))
            rmse_scores.append(np.sqrt(mean_squared_error(y_valid, y_pred)))

        results.append({
            "Model": model_name,
            "R2": np.mean(r2_scores),
            "MAE": np.mean(mae_scores),
            "RMSE": np.mean(rmse_scores)
        })

    result_df = pd.DataFrame(results)
    print("=== 等分位ビン (Stratified用) ===")
    print(result_df)
    return result_df

In [5]:
# Export this notebook to HTML (IPython shell escape; runs only inside Jupyter/IPython).
!jupyter nbconvert --to html model_function.ipynb

[NbConvertApp] Converting notebook model_function.ipynb to html
[NbConvertApp] Writing 327527 bytes to model_function.html
