In [None]:
#https://mathmatical22.xyz/
#https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
#https://qiita.com/c60evaporator/items/351188110f328ff921b9

In [None]:
from math import sqrt
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_log_error,mean_squared_error
from sklearn.model_selection import KFold
import lightgbm as lgb

import warnings
# Disable pandas' chained-assignment check (no SettingWithCopyWarning is raised).
pd.options.mode.chained_assignment = None
# Suppress every DeprecationWarning for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib  inline

In [None]:
# Load the train / test CSV files, parsing the `datetime` column as timestamps.
train, test = (
    pd.read_csv(csv_path, parse_dates=['datetime'])
    for csv_path in ('trainofbike.csv', 'testofbike.csv')
)

In [None]:
# Expand `datetime` into calendar features, added in place to both frames.
import datetime as dt  # alias is not used in this cell; the `.dt` below is the pandas accessor
for frame in (train, test):
    # Column order matters downstream: day, hour, weekday, year, month.
    for part in ('day', 'hour', 'weekday', 'year', 'month'):
        frame[part] = getattr(frame.datetime.dt, part)

In [None]:
# Baseline feature set (including the datetime-derived columns).
# Target: the total rental count.
train_y = train['count']
# Training features: everything except the timestamp and the three target columns.
train_0 = train.drop(['datetime', 'casual', 'registered', 'count'], axis=1)
# Test features; the timestamp is kept separately for the submission file.
test_datetime = test['datetime']
test_0 = test.drop('datetime', axis=1)

# LightGBMで普通に予測(グラフ化)

In [None]:
# 10-fold cross-validation of a default LGBMRegressor on the raw `count` target.
# For each fold: print the RMSLE and the summed signed errors, then plot the
# hourly mean of the actual values against the hourly mean of the predictions.
kf_ = list(KFold(n_splits=10, shuffle=True, random_state=1).split(train_0, train_y))

for cnt, (train_idx, valid_idx) in enumerate(kf_, start=1):
    # KFold yields positional row numbers, so rows are selected with .iloc
    # (.loc only picks the same rows while the index is the default 0..n-1).
    X_fit, y_fit = train_0.iloc[train_idx], train_y.iloc[train_idx]
    X_val, y_val = train_0.iloc[valid_idx], train_y.iloc[valid_idx]

    # No eval_set is passed: without early stopping it does not influence the
    # fitted model, and LightGBM has no built-in 'rmsle' metric. The RMSLE is
    # computed explicitly below with scikit-learn instead.
    model = lgb.LGBMRegressor()
    model.fit(X_fit, y_fit)

    # The regressor can return negative values; mean_squared_log_error does not
    # accept them, so clip at zero.
    y_pred = np.clip(model.predict(X_val), 0, None)

    print(f"Fold:{cnt}")
    print('RMSLE:', np.sqrt(mean_squared_log_error(y_val, y_pred)))
    diff = y_val - y_pred
    print(diff[diff<0].sum())   # summed error where the prediction is too high
    print(diff[diff>0].sum())   # summed error where the prediction is too low

    # Hourly means of the actual and predicted values on the validation rows.
    hourly = pd.DataFrame({'hour': X_val['hour'].to_numpy(),
                           'count': y_val.to_numpy(),
                           'predict': y_pred}).groupby('hour').mean()

    fig, ax = plt.subplots(figsize=(15,5))
    sns.lineplot(data=hourly[['count']], palette='Oranges', ax=ax)
    sns.lineplot(data=hourly[['predict']], palette='Blues', ax=ax)
    ax.set(title=f"Fold {cnt}: hourly mean, actual vs. predicted", ylabel='count')
    plt.show()

In [None]:
# Fit a default LGBMRegressor on the full training set and predict the test set.
model = lgb.LGBMRegressor()
model.fit(train_0, train_y)
# Negative predictions are replaced by zero.
y_pred_test = np.clip(model.predict(test_0), 0, None)
# Submission frame: one row per test timestamp.
sub0 = pd.DataFrame({'datetime': test_datetime, 'count': y_pred_test})
sub0

In [None]:
# Write the submission (datetime, count) without the index column.
sub0.to_csv(path_or_buf='bikeLGBM.csv', index=False)

# casual registeredを分ける

In [None]:
# Separate targets for the two rider groups.
train_casual, train_regis = train['casual'], train['registered']

In [None]:
# casual
# 10-fold cross-validation of a default LGBMRegressor on the raw `casual` target.
# For each fold: print the RMSLE and the summed signed errors, then plot the
# hourly mean of the actual values against the hourly mean of the predictions.
kf_ = list(KFold(n_splits=10, shuffle=True, random_state=1).split(train_0, train_casual))

for cnt, (train_idx, valid_idx) in enumerate(kf_, start=1):
    # KFold yields positional row numbers, so rows are selected with .iloc
    # (.loc only picks the same rows while the index is the default 0..n-1).
    X_fit, y_fit = train_0.iloc[train_idx], train_casual.iloc[train_idx]
    X_val, y_val = train_0.iloc[valid_idx], train_casual.iloc[valid_idx]

    # No eval_set is passed: without early stopping it does not influence the
    # fitted model, and LightGBM has no built-in 'rmsle' metric. The RMSLE is
    # computed explicitly below with scikit-learn instead.
    model = lgb.LGBMRegressor()
    model.fit(X_fit, y_fit)

    # The regressor can return negative values; mean_squared_log_error does not
    # accept them, so clip at zero.
    y_pred = np.clip(model.predict(X_val), 0, None)

    print(f"Fold:{cnt}")
    print('RMSLE:', np.sqrt(mean_squared_log_error(y_val, y_pred)))
    diff = y_val - y_pred
    print(diff[diff<0].sum())   # summed error where the prediction is too high
    print(diff[diff>0].sum())   # summed error where the prediction is too low

    # Hourly means of the actual and predicted values on the validation rows.
    hourly = pd.DataFrame({'hour': X_val['hour'].to_numpy(),
                           'casual': y_val.to_numpy(),
                           'predict': y_pred}).groupby('hour').mean()

    fig, ax = plt.subplots(figsize=(15,5))
    sns.lineplot(data=hourly[['casual']], palette='Oranges', ax=ax)
    sns.lineplot(data=hourly[['predict']], palette='Blues', ax=ax)
    ax.set(title=f"Fold {cnt}: hourly mean, actual vs. predicted", ylabel='casual')
    plt.show()

In [None]:
# registered
# 10-fold cross-validation of a default LGBMRegressor on the raw `registered` target.
# For each fold: print the RMSLE and the summed signed errors, then plot the
# hourly mean of the actual values against the hourly mean of the predictions.
kf_ = list(KFold(n_splits=10, shuffle=True, random_state=1).split(train_0, train_regis))

for cnt, (train_idx, valid_idx) in enumerate(kf_, start=1):
    # KFold yields positional row numbers, so rows are selected with .iloc
    # (.loc only picks the same rows while the index is the default 0..n-1).
    X_fit, y_fit = train_0.iloc[train_idx], train_regis.iloc[train_idx]
    X_val, y_val = train_0.iloc[valid_idx], train_regis.iloc[valid_idx]

    # No eval_set is passed: without early stopping it does not influence the
    # fitted model, and LightGBM has no built-in 'rmsle' metric. The RMSLE is
    # computed explicitly below with scikit-learn instead.
    model = lgb.LGBMRegressor()
    model.fit(X_fit, y_fit)

    # The regressor can return negative values; mean_squared_log_error does not
    # accept them, so clip at zero.
    y_pred = np.clip(model.predict(X_val), 0, None)

    print(f"Fold:{cnt}")
    print('RMSLE:', np.sqrt(mean_squared_log_error(y_val, y_pred)))
    diff = y_val - y_pred
    print(diff[diff<0].sum())   # summed error where the prediction is too high
    print(diff[diff>0].sum())   # summed error where the prediction is too low

    # Hourly means of the actual and predicted values on the validation rows.
    hourly = pd.DataFrame({'hour': X_val['hour'].to_numpy(),
                           'registered': y_val.to_numpy(),
                           'predict': y_pred}).groupby('hour').mean()

    fig, ax = plt.subplots(figsize=(15,5))
    sns.lineplot(data=hourly[['registered']], palette='Oranges', ax=ax)
    sns.lineplot(data=hourly[['predict']], palette='Blues', ax=ax)
    ax.set(title=f"Fold {cnt}: hourly mean, actual vs. predicted", ylabel='registered')
    plt.show()

In [None]:
# casual + registered
# Fit one default LGBMRegressor per rider group on the full training set,
# zero out negative predictions, and add the two components.
component_preds = []
for target in (train_casual, train_regis):
    model = lgb.LGBMRegressor()
    model.fit(train_0, target)
    component_preds.append(np.clip(model.predict(test_0), 0, None))
y_pred_casual, y_pred_regis = component_preds

y_pred_test = y_pred_casual + y_pred_regis

# Submission frame: one row per test timestamp.
sub0 = pd.DataFrame({'datetime': test_datetime, 'count': y_pred_test})
sub0

In [None]:
# Write the casual + registered submission without the index column.
# Kaggle score noted by the author: 0.491
sub0.to_csv(path_or_buf='casuregi.csv', index=False)

# count log を取る

In [None]:
# Log-scale target. log1p already adds 1 to its argument, so this evaluates to
# log(count + 2); its exact inverse is expm1(p) - 1.
train_y_log = np.log1p(train_y.add(1))

In [None]:
# 10-fold cross-validation of a default LGBMRegressor trained on the log-scale
# `count` target; scores and plots are reported on the original scale.
kf_ = list(KFold(n_splits=10, shuffle=True, random_state=1).split(train_0, train_y_log))

for cnt, (train_idx, valid_idx) in enumerate(kf_, start=1):
    # KFold yields positional row numbers, so rows are selected with .iloc
    # (.loc only picks the same rows while the index is the default 0..n-1).
    X_fit, y_fit_log = train_0.iloc[train_idx], train_y_log.iloc[train_idx]
    X_val, y_val = train_0.iloc[valid_idx], train_y.iloc[valid_idx]

    # No eval_set is passed: without early stopping it does not influence the
    # fitted model, and LightGBM has no built-in 'rmsle' metric. The RMSLE is
    # computed explicitly below with scikit-learn instead.
    model = lgb.LGBMRegressor()
    model.fit(X_fit, y_fit_log)

    y_pred_log = model.predict(X_val)
    # train_y_log is log1p(count + 1): expm1 undoes log1p and the extra "- 1"
    # undoes the +1 shift (plain exp(p) - 1 would leave every prediction 1 too
    # high). Clip at zero because mean_squared_log_error rejects negatives.
    y_pred = np.clip(np.expm1(y_pred_log) - 1, 0, None)

    print(f"Fold:{cnt}")
    print('RMSLE:', np.sqrt(mean_squared_log_error(y_val, y_pred)))
    diff = y_val - y_pred
    print(diff[diff<0].sum())   # summed error where the prediction is too high
    print(diff[diff>0].sum())   # summed error where the prediction is too low

    # Hourly means of the actual and predicted values on the validation rows.
    hourly = pd.DataFrame({'hour': X_val['hour'].to_numpy(),
                           'count': y_val.to_numpy(),
                           'predict': y_pred}).groupby('hour').mean()

    fig, ax = plt.subplots(figsize=(15,5))
    sns.lineplot(data=hourly[['count']], palette='Oranges', ax=ax)
    sns.lineplot(data=hourly[['predict']], palette='Blues', ax=ax)
    ax.set(title=f"Fold {cnt}: hourly mean, actual vs. predicted", ylabel='count')
    plt.show()

In [None]:
# Fit on the log-scale `count` target and predict the test set on the original scale.
model = lgb.LGBMRegressor()
model.fit(train_0, train_y_log)
y_pred_log = model.predict(test_0)
# train_y_log is log1p(count + 1): expm1 undoes log1p and the extra "- 1" undoes
# the +1 shift (plain exp(p) - 1 would leave every prediction 1 too high).
# Counts cannot be negative, so clip at zero.
y_pred_test = np.clip(np.expm1(y_pred_log) - 1, 0, None)
# Submission frame: one row per test timestamp.
sub0 = pd.concat([test_datetime, pd.Series(y_pred_test)], axis=1)
sub0.columns = ['datetime','count']
sub0

In [None]:
# Write the log-target submission without the index column.
sub0.to_csv(path_or_buf='bikeLGBM_log.csv', index=False)

# casual registered で分ける and   log を取る

In [None]:
# Per-group targets and their log-scale versions.
train_casual, train_regis = train['casual'], train['registered']
# log1p already adds 1 to its argument, so each of these evaluates to
# log(value + 2); the exact inverse is expm1(p) - 1.
train_casual_log, train_regis_log = (
    np.log1p(group.add(1)) for group in (train_casual, train_regis)
)

In [None]:
# casual log
# 10-fold cross-validation of a default LGBMRegressor trained on the log-scale
# `casual` target; scores and plots are reported on the original scale.
kf_ = list(KFold(n_splits=10, shuffle=True, random_state=1).split(train_0, train_casual_log))

for cnt, (train_idx, valid_idx) in enumerate(kf_, start=1):
    # KFold yields positional row numbers, so rows are selected with .iloc
    # (.loc only picks the same rows while the index is the default 0..n-1).
    X_fit, y_fit_log = train_0.iloc[train_idx], train_casual_log.iloc[train_idx]
    X_val, y_val = train_0.iloc[valid_idx], train_casual.iloc[valid_idx]

    # No eval_set is passed: without early stopping it does not influence the
    # fitted model, and LightGBM has no built-in 'rmsle' metric. The RMSLE is
    # computed explicitly below with scikit-learn instead.
    model = lgb.LGBMRegressor()
    model.fit(X_fit, y_fit_log)

    y_pred_log = model.predict(X_val)
    # train_casual_log is log1p(casual + 1): expm1 undoes log1p and the extra
    # "- 1" undoes the +1 shift (plain exp(p) - 1 would leave every prediction
    # 1 too high). Clip at zero because mean_squared_log_error rejects negatives.
    y_pred = np.clip(np.expm1(y_pred_log) - 1, 0, None)

    print(f"Fold:{cnt}")
    print('RMSLE:', np.sqrt(mean_squared_log_error(y_val, y_pred)))
    diff = y_val - y_pred
    print(diff[diff<0].sum())   # summed error where the prediction is too high
    print(diff[diff>0].sum())   # summed error where the prediction is too low

    # Hourly means of the actual and predicted values on the validation rows.
    hourly = pd.DataFrame({'hour': X_val['hour'].to_numpy(),
                           'casual': y_val.to_numpy(),
                           'predict': y_pred}).groupby('hour').mean()

    fig, ax = plt.subplots(figsize=(15,5))
    sns.lineplot(data=hourly[['casual']], palette='Oranges', ax=ax)
    sns.lineplot(data=hourly[['predict']], palette='Blues', ax=ax)
    ax.set(title=f"Fold {cnt}: hourly mean, actual vs. predicted", ylabel='casual')
    plt.show()

In [None]:
# registered log
# 10-fold cross-validation of a default LGBMRegressor trained on the log-scale
# `registered` target; scores and plots are reported on the original scale.
kf_ = list(KFold(n_splits=10, shuffle=True, random_state=1).split(train_0, train_regis_log))

for cnt, (train_idx, valid_idx) in enumerate(kf_, start=1):
    # KFold yields positional row numbers, so rows are selected with .iloc
    # (.loc only picks the same rows while the index is the default 0..n-1).
    X_fit, y_fit_log = train_0.iloc[train_idx], train_regis_log.iloc[train_idx]
    X_val, y_val = train_0.iloc[valid_idx], train_regis.iloc[valid_idx]

    # No eval_set is passed: without early stopping it does not influence the
    # fitted model, and LightGBM has no built-in 'rmsle' metric. The RMSLE is
    # computed explicitly below with scikit-learn instead.
    model = lgb.LGBMRegressor()
    model.fit(X_fit, y_fit_log)

    y_pred_log = model.predict(X_val)
    # train_regis_log is log1p(registered + 1): expm1 undoes log1p and the extra
    # "- 1" undoes the +1 shift (plain exp(p) - 1 would leave every prediction
    # 1 too high). Clip at zero because mean_squared_log_error rejects negatives.
    y_pred = np.clip(np.expm1(y_pred_log) - 1, 0, None)

    print(f"Fold:{cnt}")
    print('RMSLE:', np.sqrt(mean_squared_log_error(y_val, y_pred)))
    diff = y_val - y_pred
    print(diff[diff<0].sum())   # summed error where the prediction is too high
    print(diff[diff>0].sum())   # summed error where the prediction is too low

    # Hourly means of the actual and predicted values on the validation rows.
    hourly = pd.DataFrame({'hour': X_val['hour'].to_numpy(),
                           'registered': y_val.to_numpy(),
                           'predict': y_pred}).groupby('hour').mean()

    fig, ax = plt.subplots(figsize=(15,5))
    sns.lineplot(data=hourly[['registered']], palette='Oranges', ax=ax)
    sns.lineplot(data=hourly[['predict']], palette='Blues', ax=ax)
    ax.set(title=f"Fold {cnt}: hourly mean, actual vs. predicted", ylabel='registered')
    plt.show()

In [None]:
# casual + registered log
# Fit one default LGBMRegressor per rider group on its log-scale target, map the
# predictions back to the original scale, and add the two components.
#
# Both log targets are log1p(value + 1): expm1 undoes log1p and the extra "- 1"
# undoes the +1 shift. Plain exp(p) - 1 would leave each component 1 too high,
# i.e. the summed count 2 too high. Counts cannot be negative, so clip at zero.
model = lgb.LGBMRegressor()
model.fit(train_0, train_casual_log)
y_pred_casual = model.predict(test_0)
y_pred_casual_ = np.clip(np.expm1(y_pred_casual) - 1, 0, None)

model = lgb.LGBMRegressor()
model.fit(train_0, train_regis_log)
y_pred_regis = model.predict(test_0)
y_pred_regis_ = np.clip(np.expm1(y_pred_regis) - 1, 0, None)

y_pred_test = y_pred_casual_ + y_pred_regis_

# Submission frame: one row per test timestamp.
sub0 = pd.concat([test_datetime, pd.Series(y_pred_test)], axis=1)
sub0.columns = ['datetime','count']
sub0

In [None]:
# Write the casual + registered (log target) submission without the index column.
# Kaggle score noted by the author: 0.420
sub0.to_csv(path_or_buf='casuregi_log.csv', index=False)

# パラメータチューニング

## まずデフォルトのスコア

In [None]:
# Baseline RMSLE of a default LGBMRegressor trained on the log-scale target.
# NOTE(review): X_train, X_test, y_train_log and y_test are not defined in any
# visible cell - presumably they come from a hold-out split cell that was
# removed. Confirm and restore it, otherwise this fails on Restart & Run All.
model = lgb.LGBMRegressor()
model.fit(X_train, y_train_log)
y_pred_test = model.predict(X_test)
# NOTE(review): assumes y_train_log = log1p(y_train); if it was built as
# log1p(y_train + 1) the inverse must be exp(p) - 2 - confirm.
y_pred = np.exp(y_pred_test) - 1
# `msle` is never defined in this notebook; mean_squared_log_error is the
# function imported from sklearn.metrics in the import cell.
np.sqrt(mean_squared_log_error(y_test, y_pred))

## GridSearchCV

In [None]:
#対数変換をしていない
%%time
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
#合計45万通りの組み合せすべて検証は多すぎる(1000とおり)(4500)(20000)(3000)(1500)
cv_params ={'max_depth':[9,10,11],
            'min_child_weight':[1,2],
            'subsample':[0.1, 0.2, 0.3],
            'colsample_bytree':[0.7, 0.8, 0.9],
            'reg_alpha':[1e-2, 0.1, 1],
            'n_estimators':[1000, 2000],       #earlystop
            'reg_lambda':[1, 10, 100],
            'learning_rate':[0.1]
           }

model = lgb.LGBMRegressor(silent=False,n_jobs=-1)
model_grid = GridSearchCV(model, cv_params, cv=5, n_jobs=-1)
model_grid.fit(X_train,
                y_train,
                early_stopping_rounds=50,
                eval_set=[(X_test, y_test)],
                eval_metric='rmsle',
                verbose=0)
print('optimal_parameters\n', model_grid.best_params_)

In [None]:
#対数変換した
%%time
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
#合計45万通りの組み合せすべて検証は多すぎる((6561)（4600）
cv_params ={'max_depth':[10,11],
            'min_child_weight':[1,2],
            'subsample':[0.5, 0.6, 0.7, 0.8],
            'colsample_bytree':[0.6, 0.7, 0.8, 0.9],
            'reg_alpha':[0.01, 0.1, 1,10],
            'n_estimators':[1000, 2000],       #earlystop
            'reg_lambda':[0.01, 0.01, 0.1],
           }

model = lgb.LGBMRegressor(silent=False,n_jobs=-1)
model_grid = GridSearchCV(model, cv_params, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
model_grid.fit(X_train,
                y_train_log,
                early_stopping_rounds=50,
                eval_set=[(X_test, y_test_log)],
                eval_metric='rmse',
                verbose=0)
print('optimal_parameters\n', model_grid.best_params_)

## RandomizedSearchCV

In [None]:
%%time
#対数変換していない
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb
#探索空間（パラメータ候補）を定義する。（合計45万通りの組み合せ）
cv_params ={'max_depth':[10],
            'min_child_weight':[1,2,3,4,5],
            'subsample':[i/10.0 for i in range(6,11)],
            'colsample_bytree':[0.7],
            'reg_alpha':[1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1],
            'n_estimators':[3000],
            'reg_lambda':[1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1],
            'learning_rate':[0.1]
            }
model = lgb.LGBMRegressor(silent=False,n_jobs=-1)
model_rand = RandomizedSearchCV(model, cv_params, n_iter=200, cv=5, n_jobs=-1)
model_rand.fit(X_train,
               y_train,
               early_stopping_rounds=50,
               eval_set=[(X_test, y_test)],
               eval_metric='rmse',
               verbose=0
              )
print('optimal_parameters\n', model_rand.best_params_)

## 確認

In [None]:
# Re-fit with a fixed parameter set on the log-scale target and report the
# hold-out RMSLE.
# NOTE(review): X_train, X_test, y_train_log and y_test are not defined in any
# visible cell - confirm where the hold-out split is created.
# NOTE(review): `silent=` assumes a LightGBM release older than 4.0 - confirm.
model = lgb.LGBMRegressor(silent = False,
                          n_jobs = -1,
                          max_depth = 10,
                          min_child_weight = 1,
                          subsample = 0.5,
                          colsample_bytree = 0.6,
                          reg_alpha = 0.01,
                          n_estimators = 1000,
                          reg_lambda = 0.1,
                          learning_rate = 0.1
                          )
model.fit(X_train, y_train_log)
y_pred_test = model.predict(X_test)
# NOTE(review): assumes y_train_log = log1p(y_train); if it was built as
# log1p(y_train + 1) the inverse must be exp(p) - 2 - confirm.
y_pred = np.exp(y_pred_test) - 1
# `msle` is never defined in this notebook; mean_squared_log_error is the
# function imported from sklearn.metrics in the import cell.
np.sqrt(mean_squared_log_error(y_test, y_pred))

# その他

In [None]:
# Only the raw (non-log) target is handled in this cell.
# 9-fold cross-validated regression with a fixed parameter set: reports the mean
# RMSLE and the mean summed over- / under-prediction across folds.
import lightgbm as lgb
# NOTE(review): X_train / y_train are not defined in any visible cell - confirm
# where the hold-out split is created.
kf_ = list(KFold(n_splits=9, shuffle=True, random_state=1).split(X_train, y_train))
# NOTE(review): `silent=` and the `early_stopping_rounds=` / `verbose=` fit
# arguments assume a LightGBM release older than 4.0 - confirm.
model = lgb.LGBMRegressor(silent = False,
                          n_jobs = -1,
                          max_depth = 10,
                          min_child_weight = 2,
                          subsample = 0.9,
                          colsample_bytree = 1.0,
                          reg_alpha = 100,
                          n_estimators = 1000,
                          reg_lambda = 0.1,
                          learning_rate = 0.1
                          )
scores = []
diffs = []

# The loop variables must not be called `train`: that would overwrite the
# `train` DataFrame and break every earlier cell on re-run.
for train_idx, valid_idx in kf_:
    # KFold yields positional row numbers, so rows are selected with .iloc;
    # .loc would look the numbers up as index labels.
    X_fit, y_fit = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_true = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

    model.fit(X_fit,
              y_fit,
              early_stopping_rounds=50,
              eval_set=[(X_val, y_true)],
              eval_metric='rmse',
              verbose=0
              )
    # mean_squared_log_error rejects negative predictions, so clip at zero.
    prediction = np.clip(model.predict(X_val), 0, None)
    # `msle` is never defined in this notebook; mean_squared_log_error is the
    # function imported from sklearn.metrics in the import cell.
    score = np.sqrt(mean_squared_log_error(y_true, prediction))
    scores.append(score)

    df_diff = prediction - y_true
    plus = df_diff[df_diff > 0].sum()     # summed over-prediction
    equal = df_diff[df_diff == 0].sum()   # sum of zero differences, so always 0
    minus = df_diff[df_diff < 0].sum()    # summed under-prediction (negative)
    diffs.append([plus, equal, minus])

mean_score = np.mean(scores)
diff_mean = np.mean(np.array(diffs), axis=0)
print(f"mean_score:{mean_score}\n")
print(f"plus={diff_mean[0]}")
print(f"equal={diff_mean[1]}")
print(f"minus={diff_mean[2]}")