# 学習データの分割

In [2]:
import lightgbm
import numpy as np
import pandas as pd

In [3]:
# Read the raw train/test CSVs and the combined table, then split the
# combined table back into a training part and a test part by row position.
df_train = pd.read_csv('data/input/train.csv')
df_test = pd.read_csv('data/input/test.csv')
connection_df = pd.read_csv('data/input/pretreatment_data02.csv')

# The first len(df_train) rows are taken as training rows, the rest as test rows.
n_train_rows = len(df_train)
df_train_a = connection_df.iloc[:n_train_rows]
df_test_a = connection_df.iloc[n_train_rows:].drop("SalePrice", axis=1)

# Separate the target column from the feature columns.
y_train = df_train_a["SalePrice"]
X_train = df_train_a.drop(columns=["SalePrice"])
X_test = df_test_a

print(X_train.shape, y_train.shape, X_test.shape)

(1460, 201) (1460,) (1459, 201)


In [4]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def run_cv(model):
    """Evaluate ``model`` with 3-fold cross-validation and report the RMSE.

    The module-level ``X_train`` / ``y_train`` are split with a shuffled
    ``KFold`` (3 splits, seed 42). For every fold the model is fitted on the
    training part and scored on the held-out part with the root mean squared
    error. Each fold's RMSE, the list of all RMSEs and their mean are printed.

    :param model: estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
        refitted in place on every fold, so after the call it holds the fit
        from the last fold.
    :return: list with one fitted model per fold. Each entry is an
        independent deep copy taken right after that fold's fit.
    """
    import copy  # function-scope import keeps this cell self-contained

    cv = KFold(
        n_splits=3,
        random_state=42,
        shuffle=True
    )
    rmse_results = []
    models = []
    for trn_index, val_index in cv.split(X_train):
        # KFold yields positional indices, so rows must be selected with
        # .iloc; label-based lookup is only correct on a default RangeIndex.
        X_trn, X_val = X_train.iloc[trn_index], X_train.iloc[val_index]
        y_trn, y_val = y_train.iloc[trn_index], y_train.iloc[val_index]

        # Fit on the training part and predict the held-out part.
        model.fit(X_trn, y_trn)
        pred = model.predict(X_val)

        # Score this fold.
        rmse = np.sqrt(mean_squared_error(y_val, pred))
        print("RMSE:", rmse)
        rmse_results.append(rmse)
        # Store a snapshot: appending ``model`` itself would make every list
        # entry point at the same object, i.e. the last fold's fit.
        models.append(copy.deepcopy(model))

    print(rmse_results)
    print("Average:", np.mean(rmse_results))
    return models

# 予測モデルの作成

In [5]:
import lightgbm as lgb

# LightGBM training parameters (handed to lgb.train as its params dict).
lgb_params = dict(
    # Learning task and evaluation metric.
    objective="regression",
    metric="rmse",
    boosting_type="gbdt",
    # Tree size, step size and histogram resolution.
    num_leaves=5,
    learning_rate=0.05,
    max_bin=55,
    # Column / row subsampling and the seeds that make it repeatable.
    feature_fraction=0.2319,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction_seed=9,
    bagging_seed=9,
    # Leaf-level constraints.
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
    # Logging verbosity and early-stopping patience.
    verbose=-1,
    early_stopping_rounds=300,
)

In [7]:
# Build the K-fold splitter: 3 shuffled folds with a fixed seed.
cv = KFold(
    n_splits=3,
    random_state=42,
    shuffle=True
    )

# Per-fold RMSEs, per-fold boosters, and the running average of the
# test-set predictions across folds.
rmse_results = []
lgbm_models = []
test_preds = np.zeros(len(X_test))

for trn_index, val_index in cv.split(X_train, y_train):
    # KFold yields positional indices, so rows must be selected with .iloc;
    # label-based lookup is only correct on a default RangeIndex.
    X_trn, X_val = X_train.iloc[trn_index], X_train.iloc[val_index]
    y_trn, y_val = y_train.iloc[trn_index], y_train.iloc[val_index]

    train_lgb = lgb.Dataset(X_trn, y_trn)
    # Tie the validation set to the training set explicitly, as the LightGBM
    # docs recommend, so both are binned the same way.
    validation_lgb = lgb.Dataset(X_val, y_val, reference=train_lgb)

    model = lgb.train(
        lgb_params,
        train_lgb,
        num_boost_round=300,
        valid_sets=[train_lgb, validation_lgb],
        callbacks=[lgb.log_evaluation(period=100)],
        )

    # Score the held-out fold.
    pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    print("RMSE:", rmse)
    rmse_results.append(rmse)
    lgbm_models.append(model)

    # Each fold contributes 1 / n_splits of the final test prediction.
    test_preds += model.predict(X_test) / cv.n_splits

print(rmse_results)
print("Average:", np.mean(rmse_results))

[100]	training's rmse: 23938.6	valid_1's rmse: 31831.2
[200]	training's rmse: 19058.3	valid_1's rmse: 30123.8
[300]	training's rmse: 16659.4	valid_1's rmse: 29717.4
RMSE: 29674.169329916185
[100]	training's rmse: 21239.2	valid_1's rmse: 36277.5
[200]	training's rmse: 17152.2	valid_1's rmse: 36713.6
[300]	training's rmse: 15164.4	valid_1's rmse: 37528.9
RMSE: 35980.55296022618
[100]	training's rmse: 25449.2	valid_1's rmse: 24901.2
[200]	training's rmse: 20549.1	valid_1's rmse: 23384.9
[300]	training's rmse: 17715.9	valid_1's rmse: 23165.7
RMSE: 23160.18501527926
[np.float64(29674.169329916185), np.float64(35980.55296022618), np.float64(23160.18501527926)]
Average: 29604.969101807208


In [None]:
# Grid search over LightGBM hyper-parameters.
# `time` must be the stdlib module: `from datetime import time` binds the
# datetime.time class, which has no `.time()` function.
import time

from sklearn.model_selection import GridSearchCV

start = time.time()

# Candidate values for each hyper-parameter (scikit-learn style names).
cv_params = {
    'reg_alpha': [0.0001, 0.003, 0.1],
    'reg_lambda': [0.0001, 0.1],
    'num_leaves': [2, 3, 4, 6],
    'colsample_bytree': [0.4, 0.7, 1.0],
    'subsample': [0.4, 1.0],
    'subsample_freq': [0, 7],
    'min_child_samples': [0, 2, 5, 10]
}

# NOTE(review): this cell used `model`, `scoring`, `X`, `y` and `fit_params`
# without defining them, and the `model` left over from the K-fold cell is an
# lgb.Booster, which GridSearchCV cannot clone. The choices below are the
# reviewer's best reading of the intent; confirm before relying on them.
# lgb_params is deliberately not reused: it sets early_stopping_rounds (which
# needs an eval set) and feature/bagging fractions that alias grid keys.
grid_estimator = lgb.LGBMRegressor(
    objective="regression",
    random_state=42,
    verbose=-1,
)
# Negated RMSE, because GridSearchCV maximises the score.
scoring = "neg_root_mean_squared_error"

# Create the grid search instance (reuses the KFold splitter `cv`).
gridcv = GridSearchCV(grid_estimator, cv_params, cv=cv,
                      scoring=scoring, n_jobs=-1)
# Run the grid search (fits every parameter combination on every fold).
gridcv.fit(X_train, y_train)
# Keep and display the best parameters; best_score is the negated RMSE.
best_params = gridcv.best_params_
best_score = gridcv.best_score_
print(f'最適パラメータ {best_params}\nスコア {best_score}')
print(f'所要時間{time.time() - start}秒')

# データの提出

In [23]:
# Use the sample submission as a template and set its SalePrice column to
# the test predictions.
submission_template = pd.read_csv("data/input/sample_submission.csv")
submission = submission_template.assign(SalePrice=test_preds)

# Write the submission file without the index column.
submission.to_csv("data/output/submission240830_3.csv", index=False)