In [None]:
# 関連ライブラリのimport
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
# Number of cross-validation folds
folds = 5
# Random seeds iterated over by the training loops (one full CV run per seed)
seeds = [42, 1234, 923, 628, 802]
# Run number embedded in the submission / CV-log file names
n = 27
# Flag meant to switch hyperparameter tuning on or off.
# NOTE(review): a function that is also called `param_tune` is defined further
# down in this notebook; once that cell runs, this name refers to the function
# and the flag value is lost.
param_tune = False

import pickle
import os
import gc
gc.enable()
import warnings
warnings.filterwarnings("ignore")
import time

import pandas as pd

from tqdm import tqdm_notebook as tqdm
from collections import defaultdict
import re
import matplotlib.pyplot as plt
import numpy as np

import matplotlib
import japanize_matplotlib
import seaborn as sns

import mojimoji
import itertools

In [None]:
# train = pd.read_csv("data/train_processed.csv")
# test = pd.read_csv("data/test_processed.csv")

In [None]:
# train = pd.read_csv("data/train_processed_add_groupby.csv")
# test = pd.read_csv("data/test_processed_add_groupby.csv")

In [None]:
# train = pd.read_csv("data/train_processed_add_geocoding.csv")
# test = pd.read_csv("data/test_processed_add_geocoding.csv")

In [None]:
# train = pd.read_csv("data/train_processed_add_groupby_geocoding.csv")
# test = pd.read_csv("data/test_processed_add_groupby_geocoding.csv")

In [None]:
# train = pd.read_csv("data/train_processed_add_geocoding_meshcode.csv")
# test = pd.read_csv("data/test_processed_add_geocoding_meshcode.csv")

In [None]:
# train = pd.read_csv("data/train_processed_add_geocoding_meshcode_crime.csv")
# test = pd.read_csv("data/test_processed_add_geocoding_meshcode_crime.csv")

In [None]:
# train = pd.read_csv("data/train_processed_add_geocoding_meshcode_crime_landprice.csv")
# test = pd.read_csv("data/test_processed_add_geocoding_meshcode_crime_landprice.csv")

In [None]:
# train = pd.read_csv("data/train_processed_add_geocoding_meshcode_crime_landprice_log_groupby.csv")
# test = pd.read_csv("data/test_processed_add_geocoding_meshcode_crime_landprice_log_groupby.csv")

In [None]:
# Load the processed train / test feature tables used by the rest of the notebook.
# (Judging by the file name they include the geocoding, mesh-code, crime,
# land-price, landmark, log and group-by features.)
train = pd.read_csv("data/train_processed_add_geocoding_meshcode_crime_landprice_ladmark_log_groupby.csv")
test = pd.read_csv("data/test_processed_add_geocoding_meshcode_crime_landprice_ladmark_log_groupby.csv")

In [None]:
# Columns removed from the training frame before modelling.
# Only `train` is trimmed here; the test frame is subset by feature name later.
exclusion = ["所在地", "アクセス", "番地", "tokens", "丁目", "町"]
train = train.drop(columns=exclusion)

In [None]:
def param_tune(data, target):
    """
    Search LightGBM hyperparameters with a cross-validated tuner.

    Leftover from an attempt to run parameter tuning inside this notebook.

    Parameters
    ----------
    data : feature matrix handed to ``lgb.Dataset``.
    target : labels handed to ``lgb.Dataset``.

    Returns
    -------
    The tuner's ``best_params``.

    NOTE(review): ``LightGBMTunerCV`` is shipped by Optuna's LightGBM
    integration rather than by the ``lightgbm`` package, so with
    ``import lightgbm as lgb`` the attribute lookup below is expected to fail.
    Importing ``lgb`` from the Optuna integration instead would make it
    available -- confirm which import was intended before calling this.
    """
    dtrain = lgb.Dataset(data, label=target)

    params = {
        'objective': "regression",
        'metric':"rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }

    tuner = lgb.LightGBMTunerCV(
        params,
        dtrain,
        folds=KFold(n_splits=5),
        # The callback factories are taken from the `lgb` namespace; the bare
        # names `early_stopping` / `log_evaluation` are not imported anywhere
        # in this notebook.
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)],
        optuna_seed=42,
    )

    tuner.run()

    return tuner.best_params

In [None]:
# Define the LightGBM model as a function
def fit_lgb(X_fit, y_fit, X_val, y_val, counter, feats, cat_col_indices, seed=42):
    """
    Train one LightGBM regressor on a single fold (parameters of the overall model).

    Parameters
    ----------
    X_fit, y_fit : training features and target for this fold.
    X_val, y_val : validation features and target; used as the evaluation set
        for early stopping and to produce the returned predictions.
    counter : zero-based fold index; stored as ``counter + 1`` in the
        importance table.
    feats : feature names, stored in the ``feature`` column of the importance
        table (expected to follow the column order of ``X_fit``).
    cat_col_indices : forwarded to LightGBM as ``categorical_feature``.
    seed : value of the model's ``seed`` parameter.

    Returns
    -------
    (model, cv_pred, fold_importance_df) : the fitted regressor, its predictions
    on ``X_val`` and a DataFrame with columns ``feature``, ``importance``, ``fold``.
    """

    lightgbm_params = {
        'objective': "regression",
        'metric': "rmse",
        'learning_rate': 0.01,  # learning rate
        'max_depth': -1,
        'seed': seed,  # random seed
        'n_estimators': 100000,  # upper bound on boosting rounds; early stopping ends training
        'n_jobs': -1,
        'feature_pre_filter': False,
        'lambda_l1': 6.984886757774584,
        'lambda_l2': 1.2439642430435591e-05,
        'num_leaves': 19,
        'feature_fraction': 0.516,
        'bagging_fraction': 1.0,
        'bagging_freq': 0,
        'min_child_samples': 20,
    }

    model = lgb.LGBMRegressor(**lightgbm_params)

    # Early stopping and logging are configured through callbacks: the
    # `early_stopping_rounds` / `verbose` keyword arguments of fit() were
    # deprecated in LightGBM 3.3 and removed in 4.0.
    model.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        categorical_feature=cat_col_indices,
        callbacks=[
            lgb.early_stopping(500),   # stop after 500 rounds without validation improvement
            lgb.log_evaluation(1000),  # log the validation metric every 1000 iterations
        ],
    )

    # Predictions on the validation set
    cv_pred = model.predict(X_val)

    # Keep the feature importances so they can be inspected later
    fold_importance_df = pd.DataFrame({
        "feature": feats,
        "importance": model.feature_importances_,
    })
    fold_importance_df["fold"] = counter + 1

    return model, cv_pred, fold_importance_df

In [None]:
# ---- Setup shared by every seed (computed once, not once per seed) ----
# Target variable
train_y = np.array(train['賃料'])
# id cannot be used as a feature, so it is kept aside
train_ids = np.array(train['id'])
# Every column except id and the target is a feature
feats = [col for col in train.columns if col not in ['id','賃料']]

# Categorical features are handed to LightGBM as positional indices into the
# feature matrix (not as column names).
categorical_feats_name = ['方角','建物構造','区', "railway", "公示価格_bin"]
categorical_feats_indices = [feats.index(col) for col in categorical_feats_name]

# Materialise the feature matrix once instead of twice per fold
train_X = train[feats].values

# Hyperparameters are fixed inside fit_lgb; no tuning is run in this cell.
models = []
for seed in seeds:
    # Out-of-fold predictions for this seed, used for the RMSE over the whole training set
    cv_preds = np.zeros(train.shape[0])
    # Fold assignment stays fixed (random_state=42) for every seed; only the model seed varies.
    # NOTE(review): this stratifies on the raw rent values -- confirm that is intended
    # for a regression target.
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    # Per-fold feature importance tables
    feature_importances = []
    t0 = time.time()
    for counter, (fit_idx, val_idx) in enumerate(kf.split(train_ids, train_y)):
        print('\nseed {} Fold {}'.format(seed, counter+1))
        # training set
        X_fit, y_fit = train_X[fit_idx], train_y[fit_idx]
        # validation set
        X_val, y_val = train_X[val_idx], train_y[val_idx]

        # Pass the loop seed through; otherwise fit_lgb falls back to its default
        # seed and every seed iteration trains the same models.
        model, cv_pred, fold_importance_df = fit_lgb(
            X_fit, y_fit, X_val, y_val, counter, feats, categorical_feats_indices, seed=seed)
        # Store the validation predictions at the matching row positions
        cv_preds[val_idx] += cv_pred
        models.append(model)
        feature_importances.append(fold_importance_df)
        del X_fit, X_val, y_fit, y_val
        gc.collect()

    # RMSE over the whole training set (out-of-fold)
    rmse = np.sqrt(mean_squared_error(train_y, cv_preds))
    print('Trainデータ全体のRMSE: {:.5f}'.format(rmse))

    # Plot the feature importances
    feature_importance_df = pd.concat(feature_importances)
    mean_importance = feature_importance_df[["feature", "importance"]].groupby('feature').mean()
    feature_importance_df['mean_importance'] = feature_importance_df['feature'].map(mean_importance['importance'])
    plt.figure(figsize=(8,24), facecolor='w')
    # Each feature has one row per fold, so `folds * 50` rows cover the top 50 features
    sns.barplot(x='importance', y='feature',
                data=feature_importance_df.sort_values('mean_importance', ascending=False).iloc[:folds * 50])
    plt.tight_layout()
    plt.show()

    t1 = time.time()
    elapsed_time = t1-t0
    print("Elapsed time：{:.2f} sec".format(elapsed_time))

In [None]:
t0 = time.time()
# The submission needs the ids, so keep them aside
test_ids = test["id"]

# The test feature matrix is the same for every model, so build it once
test_X = test[feats].values

test_preds = np.zeros(test.shape[0])
# Average the predictions of all trained models (ensemble)
for model in tqdm(models):
    test_preds += model.predict(test_X)/len(models)

submission = pd.DataFrame({"id":test_ids,
                           "賃料":test_preds})

# Make sure the output directories exist so results are not lost to a missing
# folder after a long training run.
os.makedirs("./submission_log", exist_ok=True)
os.makedirs("./cv_log", exist_ok=True)

# Write in the submission format (no header row, no index)
submission.to_csv(f"./submission_log/submission_lgbm_{n}.csv", sep=",", index=False, header=False)

t1 = time.time()
elapsed_time = t1-t0
print("Elapsed time：{:.2f} sec".format(elapsed_time))

# Log the out-of-fold predictions next to the true target with the absolute error.
# `cv_preds` is re-created for every seed in the training loop, so this reflects
# the last seed only.
cv_pred_df = pd.DataFrame([cv_preds, train_y]).T
cv_pred_df.columns = ["cv_score", "true"]
cv_pred_df["error"] = abs(cv_pred_df["cv_score"] - cv_pred_df["true"])
cv_pred_df.to_csv(f"./cv_log/cv_error_{n}.csv", index=False)

In [None]:
# Define the LightGBM model as a function
def fit_lgb_2(X_fit, y_fit, X_val, y_val, counter, feats, cat_col_indices, seed=42):
    """
    Train one LightGBM regressor on a single fold (parameters of the model for ordinary listings).

    Parameters
    ----------
    X_fit, y_fit : training features and target for this fold.
    X_val, y_val : validation features and target; used as the evaluation set
        for early stopping and to produce the returned predictions.
    counter : zero-based fold index; stored as ``counter + 1`` in the
        importance table.
    feats : feature names, stored in the ``feature`` column of the importance
        table (expected to follow the column order of ``X_fit``).
    cat_col_indices : forwarded to LightGBM as ``categorical_feature``.
    seed : value of the model's ``seed`` parameter.

    Returns
    -------
    (model, cv_pred, fold_importance_df) : the fitted regressor, its predictions
    on ``X_val`` and a DataFrame with columns ``feature``, ``importance``, ``fold``.
    """

    lightgbm_params = {
        'objective': "regression",
        'metric': "rmse",
        'learning_rate': 0.01,  # learning rate
        'max_depth': -1,
        'seed': seed,  # random seed
        'n_estimators': 100000,  # upper bound on boosting rounds; early stopping ends training
        'n_jobs': -1,
        'feature_pre_filter': False,
        'lambda_l1': 0.0015287350854341264,
        'lambda_l2': 8.582661621678263e-05,
        'num_leaves': 31,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.6591670111857015,
        'bagging_freq': 3,
        'min_child_samples': 20,
    }

    model = lgb.LGBMRegressor(**lightgbm_params)

    # Early stopping and logging are configured through callbacks: the
    # `early_stopping_rounds` / `verbose` keyword arguments of fit() were
    # deprecated in LightGBM 3.3 and removed in 4.0.
    model.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        categorical_feature=cat_col_indices,
        callbacks=[
            lgb.early_stopping(500),   # stop after 500 rounds without validation improvement
            lgb.log_evaluation(1000),  # log the validation metric every 1000 iterations
        ],
    )

    # Predictions on the validation set
    cv_pred = model.predict(X_val)

    # Keep the feature importances so they can be inspected later
    fold_importance_df = pd.DataFrame({
        "feature": feats,
        "importance": model.feature_importances_,
    })
    fold_importance_df["fold"] = counter + 1

    return model, cv_pred, fold_importance_df

In [None]:
# Keep only the rows whose rent is below 500,000 for the second model.
# Note: this overwrites `train`, so cells above see the filtered frame if re-run.
train = train.loc[train["賃料"] < 500000]

In [None]:
# ---- Setup shared by every seed (computed once, not once per seed) ----
# Target variable
train_y = np.array(train['賃料'])
# id cannot be used as a feature, so it is kept aside
train_ids = np.array(train['id'])
# Every column except id and the target is a feature
feats = [col for col in train.columns if col not in ['id','賃料']]

# Categorical features are handed to LightGBM as positional indices into the
# feature matrix (not as column names).
categorical_feats_name = ['方角','建物構造','区', "railway", "公示価格_bin"]
categorical_feats_indices = [feats.index(col) for col in categorical_feats_name]

# Materialise the feature matrix once instead of twice per fold
train_X = train[feats].values

# Hyperparameters are fixed inside fit_lgb_2; no tuning is run in this cell.
models = []
for seed in seeds:
    # Out-of-fold predictions for this seed, used for the RMSE over the whole training set
    cv_preds = np.zeros(train.shape[0])
    # Fold assignment stays fixed (random_state=42) for every seed; only the model seed varies.
    # NOTE(review): this stratifies on the raw rent values -- confirm that is intended
    # for a regression target.
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    # Per-fold feature importance tables
    feature_importances = []
    t0 = time.time()
    for counter, (fit_idx, val_idx) in enumerate(kf.split(train_ids, train_y)):
        print('\nseed {} Fold {}'.format(seed, counter+1))
        # training set
        X_fit, y_fit = train_X[fit_idx], train_y[fit_idx]
        # validation set
        X_val, y_val = train_X[val_idx], train_y[val_idx]

        # Use fit_lgb_2, the parameter set defined for this model, and pass the loop
        # seed through so each seed iteration trains different models.
        model, cv_pred, fold_importance_df = fit_lgb_2(
            X_fit, y_fit, X_val, y_val, counter, feats, categorical_feats_indices, seed=seed)
        # Store the validation predictions at the matching row positions
        cv_preds[val_idx] += cv_pred
        models.append(model)
        feature_importances.append(fold_importance_df)
        del X_fit, X_val, y_fit, y_val
        gc.collect()

    # RMSE over the whole training set (out-of-fold)
    rmse = np.sqrt(mean_squared_error(train_y, cv_preds))
    print('Trainデータ全体のRMSE: {:.5f}'.format(rmse))

    # Plot the feature importances
    feature_importance_df = pd.concat(feature_importances)
    mean_importance = feature_importance_df[["feature", "importance"]].groupby('feature').mean()
    feature_importance_df['mean_importance'] = feature_importance_df['feature'].map(mean_importance['importance'])
    plt.figure(figsize=(8,24), facecolor='w')
    # Each feature has one row per fold, so `folds * 50` rows cover the top 50 features
    sns.barplot(x='importance', y='feature',
                data=feature_importance_df.sort_values('mean_importance', ascending=False).iloc[:folds * 50])
    plt.tight_layout()
    plt.show()

    t1 = time.time()
    elapsed_time = t1-t0
    print("Elapsed time：{:.2f} sec".format(elapsed_time))

In [None]:
t0 = time.time()
# The submission needs the ids, so keep them aside
test_ids = test["id"]

# The test feature matrix is the same for every model, so build it once
test_X = test[feats].values

test_preds = np.zeros(test.shape[0])
# Average the predictions of all trained models (ensemble)
for model in tqdm(models):
    test_preds += model.predict(test_X)/len(models)

submission = pd.DataFrame({"id":test_ids,
                           "賃料":test_preds})

# Make sure the output directories exist so results are not lost to a missing
# folder after a long training run.
os.makedirs("./submission_log", exist_ok=True)
os.makedirs("./cv_log", exist_ok=True)

# Write in the submission format (no header row, no index)
submission.to_csv(f"./submission_log/submission_lgbm_{n}_normal.csv", sep=",", index=False, header=False)

t1 = time.time()
elapsed_time = t1-t0
print("Elapsed time：{:.2f} sec".format(elapsed_time))

# Log the out-of-fold predictions next to the true target with the absolute error.
# `cv_preds` is re-created for every seed in the training loop, so this reflects
# the last seed only.
cv_pred_df = pd.DataFrame([cv_preds, train_y]).T
cv_pred_df.columns = ["cv_score", "true"]
cv_pred_df["error"] = abs(cv_pred_df["cv_score"] - cv_pred_df["true"])
cv_pred_df.to_csv(f"./cv_log/cv_error_{n}_normal.csv", index=False)