# 学習データの分割

In [1]:
import lightgbm
import numpy as np
import pandas as pd

In [2]:
# Load the raw train/test tables and the combined feature table.
df_train = pd.read_csv('data/input/train.csv')
df_test = pd.read_csv('data/input/test.csv')
connection_df = pd.read_csv('data/input/pretreatment_data02.csv')

# The first len(df_train) rows of the combined table become the training part
# and the remaining rows the test part.
# NOTE(review): presumably the combined table stacks train rows followed by
# test rows in their original order -- confirm against the preprocessing step.
n_train_rows = len(df_train)
df_train_a = connection_df.iloc[:n_train_rows]
df_test_a = connection_df.iloc[n_train_rows:].drop("SalePrice", axis=1)

# Separate the target column from the feature columns.
y_train = df_train_a["SalePrice"]
X_train = df_train_a.drop(columns=["SalePrice"])
X_test = df_test_a

print(X_train.shape, y_train.shape, X_test.shape)

(1460, 201) (1460,) (1459, 201)


In [3]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def run_cv(model):
    """Cross-validate ``model`` with 3 shuffled folds and report the RMSE.

    The notebook-level ``X_train`` / ``y_train`` are split with a seeded
    ``KFold``. On every fold the model is fitted on the training part, scored
    on the validation part with the root mean squared error, and the score is
    printed. After the last fold the list of scores and their mean are printed.

    :param model: estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
        refitted in place on every fold, so after the call it holds the fit of
        the last fold.
    :return: list with one independent fitted copy of the model per fold, in
        fold order.
    """
    import copy  # local import so this cell does not rely on another cell's imports

    cv = KFold(
        n_splits=3,
        random_state=42,
        shuffle=True
    )
    rmse_results = []
    models = []
    for trn_index, val_index in cv.split(X_train):
        # KFold yields positional row numbers, so select by position (.iloc)
        # rather than by index label (.loc / []).
        X_trn, X_val = X_train.iloc[trn_index], X_train.iloc[val_index]
        y_trn, y_val = y_train.iloc[trn_index], y_train.iloc[val_index]

        # Fit on the training part and predict the held-out part.
        model.fit(X_trn, y_trn)
        pred = model.predict(X_val)

        # Score the fold.
        rmse = np.sqrt(mean_squared_error(y_val, pred))
        print("RMSE:", rmse)
        rmse_results.append(rmse)
        # Store a snapshot: appending ``model`` itself would put the same
        # object in the list on every fold, leaving only the last fit.
        models.append(copy.deepcopy(model))

    print(rmse_results)
    print("Average:", np.mean(rmse_results))
    return models

# 予測モデルの作成

In [21]:
import lightgbm as lgb

# LightGBM training parameters passed to lgb.train for every fold.
lgb_params = dict(
    objective="regression",
    metric="rmse",
    boosting_type="gbdt",
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    early_stopping_rounds=300,
)

In [22]:
# KFold splitter: 3 shuffled folds with a fixed seed.
cv = KFold(
    n_splits=3,
    random_state=42,
    shuffle=True
    )

# Per-fold scores and boosters, plus the running average of the
# test-set predictions over all folds.
rmse_results = []
lgbm_models = []
test_preds = np.zeros(len(X_test))

for trn_index, val_index in cv.split(X_train, y_train):
    # KFold yields positional row numbers, so select by position (.iloc)
    # rather than by index label (.loc / []).
    X_trn, X_val = X_train.iloc[trn_index], X_train.iloc[val_index]
    y_trn, y_val = y_train.iloc[trn_index], y_train.iloc[val_index]

    train_lgb = lgb.Dataset(X_trn, y_trn)
    validation_lgb = lgb.Dataset(X_val, y_val)

    # Both sets are passed as valid_sets so the log reports the training and
    # the validation RMSE every 100 rounds.
    model = lgb.train(
        lgb_params,
        train_lgb, 
        num_boost_round=1000,
        valid_sets=[train_lgb, validation_lgb], 
        callbacks=[lgb.log_evaluation(period=100)],
        )

    # Score the fold on its held-out part.
    pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    print("RMSE:", rmse)
    rmse_results.append(rmse)
    lgbm_models.append(model)

    # Each fold contributes 1/n_splits of the final test prediction.
    test_preds += model.predict(X_test) / cv.n_splits

print(rmse_results)
print("Average:", np.mean(rmse_results))

[100]	training's rmse: 17520.6	valid_1's rmse: 30964.3
[200]	training's rmse: 12337.1	valid_1's rmse: 30572.1
[300]	training's rmse: 9395.33	valid_1's rmse: 30540.7
[400]	training's rmse: 7458.63	valid_1's rmse: 30462.3
RMSE: 30355.844566036656
[100]	training's rmse: 17288.7	valid_1's rmse: 33448.9
[200]	training's rmse: 11776.9	valid_1's rmse: 35220
[300]	training's rmse: 8678.65	valid_1's rmse: 36640.4
[400]	training's rmse: 6640.64	valid_1's rmse: 37745.2
RMSE: 33448.93602971436
[100]	training's rmse: 22190.1	valid_1's rmse: 23386.2
[200]	training's rmse: 16234.9	valid_1's rmse: 23436.5
[300]	training's rmse: 12537.8	valid_1's rmse: 23670
[400]	training's rmse: 9733.95	valid_1's rmse: 23979.7
RMSE: 23035.674672993588
[np.float64(30355.844566036656), np.float64(33448.93602971436), np.float64(23035.674672993588)]
Average: 28946.818422914865


# データの提出

In [23]:
# Start from the sample submission and fill SalePrice with the test predictions.
submission = pd.read_csv("data/input/sample_submission.csv")
submission = submission.assign(SalePrice=test_preds)

# Write the submission file without the index column.
submission_path = "data/output/submission240830_3.csv"
submission.to_csv(submission_path, index=False)