# 学習データの分割

In [31]:
# Split the combined frame back into its training and test parts
import pandas as pd

df_train = pd.read_csv('data/input/train.csv')
df_test = pd.read_csv('data/input/test.csv')
connection_df = pd.read_csv('data/input/pretreatment_data01.csv')

# The first len(df_train) rows of connection_df become train_df; the
# remaining rows become test_df, with the SalePrice column removed.
train_df = connection_df.iloc[:len(df_train)]
test_df = connection_df.iloc[len(df_train):].drop(columns=['SalePrice'])

# X_train: train_df without the SalePrice column
X_train = train_df.drop(columns=["SalePrice"])

# y_train: only the SalePrice column of train_df
y_train = train_df["SalePrice"]

# X_test: test_df as is (SalePrice was already dropped above)
X_test = test_df

# Print the shapes of the three pieces as a quick sanity check
print(*(part.shape for part in (X_train, y_train, X_test)))

In [32]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def run_cv(model):
    """Evaluate ``model`` with 3-fold cross-validation and report the RMSE.

    For each fold, ``model`` is fitted on the training part of the
    module-level ``X_train`` / ``y_train`` and scored on the held-out part.
    The RMSE of every fold, the list of all fold RMSEs and their mean are
    printed.

    :param model: estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
        refitted in place on every fold, so after the call it holds the fit
        of the last fold.
    :return: list with one fitted model per fold (independent snapshots of
        ``model`` taken right after each fit), in fold order.
    """
    # Imported locally so the function does not rely on another notebook
    # cell having brought these names into scope.
    from copy import deepcopy

    import numpy as np

    cv = KFold(n_splits=3, random_state=42, shuffle=True)
    rmse_results = []
    models = []

    for trn_index, val_index in cv.split(X_train):
        # KFold yields positional indices, so rows are selected with .iloc;
        # label-based lookup only works when the index happens to be 0..n-1.
        X_trn, X_val = X_train.iloc[trn_index], X_train.iloc[val_index]
        y_trn, y_val = y_train.iloc[trn_index], y_train.iloc[val_index]

        # Train the model on this fold
        model.fit(X_trn, y_trn)
        pred = model.predict(X_val)

        # Compute the model accuracy (RMSE) on the held-out part
        rmse = np.sqrt(mean_squared_error(y_val, pred))
        print("RMSE:", rmse)
        rmse_results.append(rmse)

        # Store a snapshot: appending ``model`` itself would leave the list
        # with several references to the same, last-fitted object.
        models.append(deepcopy(model))

    print(rmse_results)
    print("Average RMSE:", np.mean(rmse_results))
    return models

In [35]:
import lightgbm as lgb
import numpy as np

# LightGBM parameters: regression objective, RMSE as the reported metric
lgb_params = {
    "objective": "regression",
    "metric": "rmse",
}

cv = KFold(n_splits=3, random_state=42, shuffle=True)
rmse_results = []
lgbm_models = []
# Accumulator for the test-set predictions, averaged over the folds
test_preds = np.zeros(len(X_test))

for trn_index, val_index in cv.split(X_train, y_train):
    # KFold yields positional indices, so rows are selected with .iloc;
    # label-based lookup only works when the index happens to be 0..n-1.
    X_trn, X_val = X_train.iloc[trn_index], X_train.iloc[val_index]
    y_trn, y_val = y_train.iloc[trn_index], y_train.iloc[val_index]

    train_lgb = lgb.Dataset(X_trn, y_trn)
    validation_lgb = lgb.Dataset(X_val, y_val)
    # NOTE(review): no early-stopping callback is passed, so the validation
    # set appears to be used for metric reporting only and all 1000 rounds
    # are trained -- confirm this is intended.
    model = lgb.train(
        lgb_params,
        train_lgb,
        num_boost_round=1000,
        valid_sets=[train_lgb, validation_lgb],
    )

    # Score this fold on its held-out part
    pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    print("RMSE:", rmse)
    rmse_results.append(rmse)
    lgbm_models.append(model)

    # Each fold contributes 1 / n_splits of the final test prediction
    test_preds += model.predict(X_test) / cv.n_splits

print(rmse_results)
print("Average:", np.mean(rmse_results))

# データの提出

In [36]:
# Load the sample submission and replace its SalePrice column with the
# fold-averaged test predictions
submission = pd.read_csv("sample_submission.csv").assign(SalePrice=test_preds)

# Write the submission file (without the index column)
submission.to_csv("submission.csv", index=False)