In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_log_error as msle
from datetime import datetime, timedelta

In [2]:
df =  pd.read_csv('trainofbike.csv')

In [None]:
#https://amalog.hateblo.jp/entry/hyper-parameter-search

In [3]:
#特徴量を追加しています
df.loc[:, "hour"] = pd.DatetimeIndex(df["datetime"]).hour
df.loc[:, "month"] = pd.DatetimeIndex(df["datetime"]).month
pd.DatetimeIndex(df["datetime"]).weekday

Int64Index([5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
            ...
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
           dtype='int64', name='datetime', length=10886)

In [4]:
#for分で回すためにデータのサイズを取得しています
indice = len(df.index)

In [5]:
#for文で１行ずつ代入しています
for i in range(indice):
    if i == 0:
        continue
    workingday_value = df.loc[i, "workingday"]
    hour_value = df.loc[i, "hour"]
    df_tmp = df.loc[:i - 1].groupby(["workingday", "hour"]).mean()
    try:
        df.loc[i, "mean"] = df_tmp.loc[(workingday_value, hour_value)]["count"]
  #初めてworkingday=1になったタイミングでkeyerrorが出るので、それを無視させています
    except KeyError:
        continue

In [6]:
#１日目とworkingdayが最初に切り替わるタイミングでNaNが出るのでdrop
df_ = df.dropna()

In [7]:
datasize = np.arange(len(df_.index))
ind_train, ind_test = train_test_split(datasize, test_size=1000, random_state=1)
df_train = df_.iloc[ind_train].reset_index()
df_test = df_.iloc[ind_test].reset_index()

In [8]:
#そのままサーチ
features = ['holiday',
            'workingday',
            'weather',
            'temp',
            'atemp',
            'humidity',
            'windspeed',
            'mean',
            'hour',
            'month']
X_train = df_train[features]
y_train = df_train["count"]
X_test = df_test[features]
y_test = df_test["count"]

In [None]:
#対数変換用サーチ
X_train = df_train[features]
y_train = df_train["count"]
X_test = df_test[features]
y_test = df_test["count"]
y_train_log = np.log1p(y_train + 1)
y_test_log = np.log1p(y_test + 1)

In [10]:
#対数変換をしていない
%%time
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
#合計45万通りの組み合せすべて検証は多すぎる(1000とおり)(4500)(20000)(3000)(1500)
cv_params ={'max_depth':[9,10,11],
            'min_child_weight':[1,2],
            'subsample':[0.1, 0.2, 0.3],
            'colsample_bytree':[0.7, 0.8, 0.9],
            'reg_alpha':[1e-2, 0.1, 1],
            'n_estimators':[1000, 2000],       #earlystop
            'reg_lambda':[1, 10, 100],
            'learning_rate':[0.1]
           }

model = lgb.LGBMRegressor(silent=False,n_jobs=-1)
model_grid = GridSearchCV(model, cv_params, cv=5, n_jobs=-1)
model_grid.fit(X_train,
                y_train,
                early_stopping_rounds=50,
                eval_set=[(X_test, y_test)],
                eval_metric='rmsle',
                verbose=0)
print('optimal_parameters\n', model_grid.best_params_)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 518
[LightGBM] [Info] Number of data points in the train set: 9838, number of used features: 10
[LightGBM] [Info] Start training from score 191.585485
optimal_parameters
 {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 1000, 'reg_alpha': 0.1, 'reg_lambda': 10, 'subsample': 0.1}
Wall time: 12min 55s


In [29]:
#対数変換した
%%time
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
#合計45万通りの組み合せすべて検証は多すぎる((6561)（4600）
cv_params ={'max_depth':[10,11],
            'min_child_weight':[1,2],
            'subsample':[0.5, 0.6, 0.7, 0.8],
            'colsample_bytree':[0.6, 0.7, 0.8, 0.9],
            'reg_alpha':[0.01, 0.1, 1,10],
            'n_estimators':[1000, 2000],       #earlystop
            'reg_lambda':[0.01, 0.01, 0.1],
           }

model = lgb.LGBMRegressor(silent=False,n_jobs=-1)
model_grid = GridSearchCV(model, cv_params, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
model_grid.fit(X_train,
                y_train_log,
                early_stopping_rounds=50,
                eval_set=[(X_test, y_test_log)],
                eval_metric='rmse',
                verbose=0)
print('optimal_parameters\n', model_grid.best_params_)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 518
[LightGBM] [Info] Number of data points in the train set: 9838, number of used features: 10
[LightGBM] [Info] Start training from score 4.625336
optimal_parameters
 {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 1000, 'reg_alpha': 0.01, 'reg_lambda': 0.1, 'subsample': 0.5}
Wall time: 23min 48s


'\n                          max_depth = 11,           \n                          min_child_weight = 1,     \n                          subsample = 0.7,           \n                          colsample_bytree = 0.8,   \n                          reg_alpha = 1,           \n                          n_estimators = 1000,     \n                          reg_lambda = 0.01,           \n                          learning_rate = 0.1        \n                            \n'

In [32]:
model = lgb.LGBMRegressor(silent = False,           
                          n_jobs = -1,
                          max_depth = 10,           
                          min_child_weight = 1,     
                          subsample = 0.5,           
                          colsample_bytree = 0.6,   
                          reg_alpha = 0.01,           
                          n_estimators = 1000,     
                          reg_lambda = 0.1,           
                          learning_rate = 0.1        
                          )
model.fit(X_train, y_train_log)
y_pred_test = model.predict(X_test)
y_pred = np.exp(y_pred_test) - 1
np.sqrt(msle(y_test, y_pred))

"""
                          max_depth = 11,           
                          min_child_weight = 1,     
                          subsample = 0.7,           
                          colsample_bytree = 0.8,   
                          reg_alpha = 1,           
                          n_estimators = 1000,     
                          reg_lambda = 0.01,           
                          learning_rate = 0.1        
                            
"""

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 518
[LightGBM] [Info] Number of data points in the train set: 9838, number of used features: 10
[LightGBM] [Info] Start training from score 4.625336


0.3150263742246829

In [25]:
model = lgb.LGBMRegressor()
model.fit(X_train, y_train_log)
y_pred_test = model.predict(X_test)
y_pred = np.exp(y_pred_test) - 1
np.sqrt(msle(y_test, y_pred))

0.3225297286901196

In [16]:
#対数変換に対応していないものしかない
#回帰する
import lightgbm as lgb
kf = KFold(n_splits=9, shuffle=True, random_state=1).split(X_train, y_train)
kf_ = list(kf)
model = lgb.LGBMRegressor(silent = False,           
                          n_jobs = -1,
                          max_depth = 10,           
                          min_child_weight = 2,     
                          subsample = 0.9,           
                          colsample_bytree = 1.0,   
                          reg_alpha = 100,           
                          n_estimators = 1000,     
                          reg_lambda = 0.1,           
                          learning_rate = 0.1        
                          )
scores =[]
score = 0

diffs = []
for train, valid in kf_:
    model.fit(X_train.loc[train],
              y_train.loc[train],
              early_stopping_rounds=50,
              eval_set=[(X_train.loc[valid], y_train.loc[valid])],
              eval_metric='rmse',
              verbose=0
              )
    prediction = model.predict(X_train.loc[valid])
    prediction[prediction < 0] = 0
    score = np.sqrt(msle(y_train.loc[valid], prediction))
    scores.append(score)
    y_true = y_train.loc[valid]
    df_diff = prediction - y_true
    plus = df_diff[df_diff > 0].sum()
    equal = df_diff[df_diff == 0].sum()
    minus = df_diff[df_diff < 0].sum()
    _ = [plus, equal, minus]
    diffs.append(_)
mean_score = np.mean(scores) 
diff_mean = np.mean(np.array(diffs), axis=0)
print(f"mean_score:{mean_score}\n")
print(f"plus={diff_mean[0]}")
print(f"equal={diff_mean[1]}")
print(f"minus={diff_mean[2]}")

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 517
[LightGBM] [Info] Number of data points in the train set: 8744, number of used features: 10
[LightGBM] [Info] Start training from score 191.858188
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 517
[LightGBM] [Info] Number of data points in the train set: 8745, number of used features: 10
[LightGBM] [Info] Start training from score 191.543282
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 518
[LightGBM] [Info] Number of data points in the train set: 8745, number of used features: 10
[LightGBM] [Info] Start training from score 191.574614
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 516
[LightGBM] [Info] Number of data points in the train set: 8745, num

In [None]:
#tuning
train0.39   test0.38
#tuningなし
train0.36   test0.34
#default
train0.36   test0.37
#logとってないから、おそらくRSME計算になっているのかも

In [None]:
#logとってやってみる
#default
test 0.32
#tuning
test 0.31

In [None]:
#対数変換した
%%time
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
#合計45万通りの組み合せすべて検証は多すぎる((6561)（4600）
cv_params ={'reg_alpha':[0.01, 0.1, 1,10],
            'reg_lambda':[0.01, 0.01, 0.1],
            'num_leaves':[2~131072],
            'colsample_bytree':[0.6, 0.7, 0.8, 0.9],
            'subsample':[0.5, 0.6, 0.7, 0.8],
           }

model = lgb.LGBMRegressor(silent=False,n_jobs=-1)
model_grid = GridSearchCV(model, cv_params, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
model_grid.fit(X_train,
                y_train_log,
                early_stopping_rounds=50,
                eval_set=[(X_test, y_test_log)],
                eval_metric='rmse',
                verbose=0)
print('optimal_parameters\n', model_grid.best_params_)

In [None]:
#https://qiita.com/c60evaporator/items/351188110f328ff921b9

In [None]:
%%time
#没　そもそもRMSLEがなかった。
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb
#探索空間（パラメータ候補）を定義する。（合計45万通りの組み合せ）
cv_params ={'max_depth':[10],
            'min_child_weight':[1,2,3,4,5],
            'subsample':[i/10.0 for i in range(6,11)],
            'colsample_bytree':[0.7],
            'reg_alpha':[1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1],
            'n_estimators':[3000],
            'reg_lambda':[1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1],
            'learning_rate':[0.1]
            }
model = lgb.LGBMRegressor(silent=False,n_jobs=-1)
model_rand = RandomizedSearchCV(model, cv_params, n_iter=200, cv=5, n_jobs=-1)
model_rand.fit(X_train,
               y_train,
               early_stopping_rounds=50,
               eval_set=[(X_test, y_test)],
               eval_metric='rmse',
               verbose=0
              )
print('optimal_parameters\n', model_rand.best_params_)
"""
optimal_parameters
 {'subsample': 0.6, 'reg_lambda': 0.0001, 'reg_alpha': 1e-05, 'n_estimators': 2000, 'min_child_weight': 3, 'max_depth': 8, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
    
 {'subsample': 1.0, 'reg_lambda': 1e-05, 'reg_alpha': 0.0001, 'n_estimators': 2000, 'min_child_weight': 5, 'max_depth': 7, 'learning_rate': 0.1, 'colsample_bytree': 0.7}

 {'subsample': 0.9, 'reg_lambda': 1e-05, 'reg_alpha': 1e-05, 'n_estimators': 2000, 'min_child_weight': 1, 'max_depth': 7, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
    
 {'subsample': 0.9, 'reg_lambda': 1, 'reg_alpha': 1, 'n_estimators': 2000, 'min_child_weight': 2, 'max_depth': 8, 'learning_rate': 0.1, 'colsample_bytree': 0.7}

 {'subsample': 0.7, 'reg_lambda': 0.001, 'reg_alpha': 0.001, 'n_estimators': 2000, 'min_child_weight': 3, 'max_depth': 7, 'learning_rate': 0.1, 'colsample_bytree': 0.8}

調整

{'subsample': 0.6, 'reg_lambda': 0.001, 'reg_alpha': 1e-06, 'n_estimators': 2000, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.7}

 {'subsample': 1.0, 'reg_lambda': 0.01, 'reg_alpha': 0.01, 'n_estimators': 3000, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
 
  {'subsample': 1.0, 'reg_lambda': 0.01, 'reg_alpha': 0.01, 'n_estimators': 3000, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
  
   {'subsample': 0.7, 'reg_lambda': 0.01, 'reg_alpha': 1e-06, 'n_estimators': 3000, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.7}

{'subsample': 0.9, 'reg_lambda': 0.001, 'reg_alpha': 1e-05, 'n_estimators': 2000, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.7}

"""