<div class="alert alert-block" style="border: 1px solid #455A64;background-color:#ECEFF1;">
본 자료 및 영상 컨텐츠는 저작권법 제25조 2항에 의해 보호를 받습니다. 본 컨텐츠 및 컨텐츠 일부 문구등을 외부에 공개, 게시하는 것을 금지합니다. 특히 자료에 대해서는 저작권법을 엄격하게 적용하겠습니다.
</div>

### 0. Get data

### 1. train/test 데이터 임포트

In [None]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor 
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

import string
import warnings
import missingno
warnings.filterwarnings('ignore')

In [None]:
# Read the train and test CSVs, then stack them into one frame with a fresh
# 0..n-1 index so feature engineering can be applied to both parts at once.
df_train = pd.read_csv('bikesharing/train.csv')
df_test = pd.read_csv('bikesharing/test.csv')
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
def split_df(df, n_train=10886):
    """Split the combined frame back into its train and test parts.

    The first ``n_train`` rows are returned as the training part and the
    remaining rows as the test part. The default matches the row at which
    the test slice has always started (10886); the previous implementation
    ended the training slice at 10885 and therefore silently dropped the
    last training row.

    Args:
        df: Sliceable container holding the train rows followed by the
            test rows (a DataFrame in this notebook).
        n_train: Number of leading rows that belong to the training part.

    Returns:
        A ``(train, test)`` tuple of slices of ``df``.
    """
    return df[:n_train], df[n_train:]

### RMSLE 기반 예측을 위한 log 필드 추가
> RMSLE 가 log 로 계산되므로, 예측값 또한 log 값으로 계산되도록 하는 편이 보다 RMSLE 성능에 도움을 줌

In [None]:
# Add a log(x + 1) version of each target column; the models are later fitted
# on the *_log columns and their predictions mapped back with exp(x) - 1.
for log_column, source_column in (
    ('casual_log', 'casual'),
    ('registered_log', 'registered'),
    ('count_log', 'count'),
):
    df_all[log_column] = np.log(df_all[source_column] + 1)

### 시간 필드 추가

In [None]:
# Parse the timestamp column once and use it as the frame's index.
dt = pd.DatetimeIndex(df_all['datetime'])
df_all.set_index(dt, inplace=True)

# Calendar features derived from the timestamp.
df_all['date'] = dt.date
df_all['day'] = dt.day
df_all['month'] = dt.month
df_all['year'] = dt.year
df_all['hour'] = dt.hour
df_all['dow'] = dt.dayofweek
# `DatetimeIndex.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
# `isocalendar().week` is the documented replacement. It is converted to a
# plain int64 array so the column is assigned positionally (like the other
# features above) and keeps a NumPy integer dtype.
df_all['woy'] = dt.isocalendar().week.to_numpy(dtype='int64')

### peak 타임 필드 추가

In [None]:
def func(df_data):
    """Return a peak-time level (0-4) for one row.

    Working days (``workingday == 1``): hours 8, 17 and 18 give 4; hours 7,
    16 and 19 give 3. Other days: hours 12-16 give 2; the remaining hours in
    10-19 give 1. Everything else gives 0.

    Args:
        df_data: Row-like mapping with 'workingday' and 'hour' entries.

    Returns:
        The integer peak level.
    """
    is_workingday = df_data['workingday'] == 1
    hour = df_data['hour']

    if is_workingday:
        if hour in (8, 17, 18):
            return 4
        if hour in (7, 16, 19):
            return 3
        return 0

    if 12 <= hour <= 16:
        return 2
    if 10 <= hour <= 19:
        return 1
    return 0

# axis=0 or 'index': apply the function to each column; axis=1 or 'columns': apply it to each row
df_all['peak'] = df_all.apply(func, axis=1)

In [None]:
def func(df_data):
    """Return the holiday flag for one row, forcing Dec 24 and Dec 31 to 1.

    Updated 2021-10-22: the lecture video applies the 24th through the 31st,
    but experiments scored better when only the 24th and the 31st were
    treated as holidays, so only those two days are overridden.

    The previous condition ``day >= 24 or day <= 31`` was true for every
    day, which flagged all of December as a holiday.

    Args:
        df_data: Row-like mapping with 'month', 'day' and 'holiday' entries.

    Returns:
        1 for December 24 and December 31, otherwise the row's existing
        'holiday' value.
    """
    if df_data['month'] == 12 and df_data['day'] in (24, 31):
        return 1
    return df_data['holiday']

# Row-wise pass (axis=1): overwrite 'holiday' with the value func returns for each row.
df_all['holiday'] = df_all.apply(func, axis=1)

In [None]:
def func(df_data):
    """Return the working-day flag for one row, forcing Dec 24 and Dec 31 to 0.

    Updated 2021-10-22: the lecture video applies the 24th through the 31st,
    but experiments scored better when only the 24th and the 31st were
    overridden, so only those two days are handled here.

    Args:
        df_data: Row-like mapping with 'month', 'day' and 'workingday' entries.

    Returns:
        0 for December 24 and December 31, otherwise the row's existing
        'workingday' value.
    """
    is_overridden_day = df_data['month'] == 12 and df_data['day'] in (24, 31)
    return 0 if is_overridden_day else df_data['workingday']

# Row-wise pass (axis=1): overwrite 'workingday' with the value func returns for each row.
df_all['workingday'] = df_all.apply(func, axis=1)

### 온도, 풍속, 습도, 날씨 기반 fit & humid 필드 추가

In [None]:
def func(df_data):
    """Return 1 when weather, wind speed and temperature are all in range.

    The row qualifies when ``weather <= 2``, ``windspeed <= 20`` and
    ``15 < temp <= 35``; any other row yields 0.

    Args:
        df_data: Row-like mapping with 'weather', 'windspeed' and 'temp'.

    Returns:
        1 if every threshold is met, otherwise 0.
    """
    weather_ok = df_data['weather'] <= 2 and df_data['windspeed'] <= 20
    if not weather_ok:
        return 0
    return 1 if 15 < df_data['temp'] <= 35 else 0

# Row-wise pass (axis=1): store the 0/1 flag returned by func in a new 'fit' column.
df_all['fit'] = df_all.apply(func, axis=1)

In [None]:
def func(df_data):
    """Return 1 when the row's humidity is 70 or higher, otherwise 0.

    Args:
        df_data: Row-like mapping with a 'humidity' entry.

    Returns:
        The integer flag 1 or 0.
    """
    return int(df_data['humidity'] >= 70)

# Row-wise pass (axis=1): store the 0/1 flag returned by func in a new 'humid' column.
df_all['humid'] = df_all.apply(func, axis=1)

### Metric

In [None]:
from sklearn.metrics import make_scorer

def get_rmsle(y_actual, y_pred):
    """Compute the root mean squared logarithmic error.

    Both inputs are shifted by one before taking the natural log, the
    element-wise differences are squared and averaged, and the square root
    of that mean is returned.

    Args:
        y_actual: Array-like of true values supporting ``+ 1`` (NumPy array
            or pandas Series).
        y_pred: Array-like of predicted values of the same length.

    Returns:
        The RMSLE as a float scalar.
    """
    log_pred = np.log(y_pred + 1)
    log_actual = np.log(y_actual + 1)
    squared_log_error = np.square(log_pred - log_actual)
    return np.sqrt(squared_log_error.mean())

### Model Evaluation
- 참고: np.log() 와 np.exp() 는 서로 역함수 (예: np.exp(np.log(x)) 는 x 로 되돌아감)
- 수학적인 부분보다, 다음과 같이 코드로 바로 이해하기로 함

In [None]:
def predict_bikecount(model, select_columns):
    """Predict total counts for the test rows as casual + registered.

    The combined frame ``df_all`` is split with ``split_df``. The same
    ``model`` object is fitted on the 'casual_log' target and used to
    predict, then fitted again on the 'registered_log' target and used to
    predict; each prediction is taken before the next fit. Both predictions
    are mapped back from log space with ``exp(x) - 1`` and summed.

    Args:
        model: Estimator exposing ``fit(X, y)`` (returning a fitted
            estimator) and ``predict(X)``.
        select_columns: Feature column names to select from the frame.

    Returns:
        Element-wise sum of the casual and registered predictions.
    """
    train_part, test_part = split_df(df_all)
    train_features = train_part[select_columns]
    test_features = test_part[select_columns]

    per_target_predictions = []
    for log_target in ('casual_log', 'registered_log'):
        fitted = model.fit(train_features, train_part[log_target])
        # Invert the log(x + 1) transform applied to the targets.
        per_target_predictions.append(np.exp(fitted.predict(test_features)) - 1)

    casual_pred, registered_pred = per_target_predictions
    return casual_pred + registered_pred

### Model Evaluation Test: Random Forest Regressor

In [None]:
# Re-split the engineered frame into its train and test parts.
df_train, df_test = split_df(df_all)
# Feature columns used for the random-forest model.
randomforest_columns = [
    'weather', 'temp', 'atemp', 'windspeed', 'workingday', 'season', 'holiday', 'hour', 'dow', 
    'humid', 'woy', 'peak'
]
X_train = df_train[randomforest_columns].copy()
# NOTE(review): the grid search is tuned on the raw 'count' target, while
# predict_bikecount() fits on 'casual_log' / 'registered_log' - confirm this is intended.
y_train = df_train['count']
# get_rmsle is a loss (lower is better); greater_is_better=False makes scikit-learn sign-flip it.
rmsle_scorer = make_scorer(get_rmsle, greater_is_better=False)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each random-forest hyperparameter.
n_estimators = [800, 1000, 1200]
max_depth = [10, 12, 15]
min_samples_split = [4, 5, 6]
min_samples_leaf = [4, 5, 6]

hyperparams = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

# Exhaustive 5-fold search over the grid, scored with the RMSLE scorer,
# using all available cores.
rf_grid = GridSearchCV(
    RandomForestRegressor(),
    hyperparams,
    scoring=rmsle_scorer,
    cv=5,
    n_jobs=-1,
    verbose=True,
)
rf_grid.fit(X_train, y_train)
print(rf_grid.best_params_)

In [None]:
# Take the best forest found by the grid search and predict total counts for the test rows.
rf_model = rf_grid.best_estimator_
ml_pred = predict_bikecount(rf_model, randomforest_columns)
# NOTE(review): df_test is a slice returned by split_df; writing a column into it
# may trigger pandas' chained-assignment warning (warnings are silenced above) - confirm.
df_test['count'] = ml_pred
# Keep only the two columns written to the submission file.
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submission_rf.csv', header=True, index=False)
!kaggle competitions submit -c bike-sharing-demand -f submission_rf.csv -m "Message"

### Model Evaluation Test: XGBoost Regressor

In [None]:
# Re-split the engineered frame into its train and test parts.
df_train, df_test = split_df(df_all)
# Feature columns used for the XGBoost model.
xgboost_columns = [
    'weather', 'temp', 'atemp', 'windspeed', 'workingday', 'season', 'holiday', 'hour', 'dow',
    'humidity', 'fit', 'year'
]
X_train = df_train[xgboost_columns].copy()
# NOTE(review): the grid search is tuned on the raw 'count' target, while
# predict_bikecount() fits on 'casual_log' / 'registered_log' - confirm this is intended.
y_train = df_train['count']
# get_rmsle is a loss (lower is better); greater_is_better=False makes scikit-learn sign-flip it.
rmsle_scorer = make_scorer(get_rmsle, greater_is_better=False)

In [None]:
from xgboost import XGBRegressor # 회귀트리 모델
from sklearn.model_selection import GridSearchCV

# Candidate values for each XGBoost hyperparameter ('nthread' is fixed at 4).
hyperparams = dict(
    nthread=[4],
    learning_rate=[0.02, 0.05, 0.1],
    max_depth=[4, 5],
    min_child_weight=[4, 5],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.7, 0.8],
    n_estimators=[500, 750, 1000],
)

# Exhaustive 5-fold search over the grid, scored with the RMSLE scorer,
# using all available cores.
xgb_grid = GridSearchCV(
    XGBRegressor(),
    hyperparams,
    scoring=rmsle_scorer,
    cv=5,
    n_jobs=-1,
    verbose=True,
)
xgb_grid.fit(X_train, y_train)
print(xgb_grid.best_params_)

In [None]:
# Take the best XGBoost model found by the grid search and predict total counts for the test rows.
xgb_model = xgb_grid.best_estimator_
ml_pred = predict_bikecount(xgb_model, xgboost_columns)
# NOTE(review): df_test is a slice returned by split_df; writing a column into it
# may trigger pandas' chained-assignment warning (warnings are silenced above) - confirm.
df_test['count'] = ml_pred
# Keep only the two columns written to the submission file.
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submission_xg.csv', header=True, index=False)
!kaggle competitions submit -c bike-sharing-demand -f submission_xg.csv -m "Message"

### Model Evaluation Test: Gradient Boosting Regressor

In [None]:
# Re-split the engineered frame into its train and test parts.
df_train, df_test = split_df(df_all)
# Feature columns used for the gradient-boosting model.
gradientboost_columns = [
    'weather', 'temp', 'atemp', 'windspeed', 'workingday', 'season', 'holiday', 'hour', 'dow',
    'humidity', 'fit', 'year'
]
X_train = df_train[gradientboost_columns].copy()
# NOTE(review): the grid search is tuned on the raw 'count' target, while
# predict_bikecount() fits on 'casual_log' / 'registered_log' - confirm this is intended.
y_train = df_train['count']
# get_rmsle is a loss (lower is better); greater_is_better=False makes scikit-learn sign-flip it.
rmsle_scorer = make_scorer(get_rmsle, greater_is_better=False)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each gradient-boosting hyperparameter.
n_estimators = [200, 500, 750]
max_depth = [4, 5, 6]
min_samples_leaf = [12, 15]
learning_rate = [0.02, 0.05, 0.1]
subsample = [0.6, 0.7, 0.8]

hyperparams = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    learning_rate=learning_rate,
    subsample=subsample,
)

# Exhaustive 5-fold search over the grid, scored with the RMSLE scorer,
# using all available cores.
gb_grid = GridSearchCV(
    GradientBoostingRegressor(),
    hyperparams,
    scoring=rmsle_scorer,
    cv=5,
    n_jobs=-1,
    verbose=True,
)
gb_grid.fit(X_train, y_train)
print(gb_grid.best_params_)

In [None]:
# Take the best gradient-boosting model found by the grid search and predict total counts for the test rows.
gb_model = gb_grid.best_estimator_
ml_pred = predict_bikecount(gb_model, gradientboost_columns)
# NOTE(review): df_test is a slice returned by split_df; writing a column into it
# may trigger pandas' chained-assignment warning (warnings are silenced above) - confirm.
df_test['count'] = ml_pred
# Keep only the two columns written to the submission file.
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submission_gb.csv', header=True, index=False)
!kaggle competitions submit -c bike-sharing-demand -f submission_gb.csv -m "Message"

### Random Forest + Gradient Boost

In [None]:
# Feature lists for the blended models; these repeat the lists defined in the
# individual random-forest and gradient-boosting sections of this file.
randomforest_columns = [
    'weather', 'temp', 'atemp', 'windspeed', 'workingday', 'season', 'holiday', 'hour', 'dow', 
    'humid', 'woy', 'peak'
]
gradientboost_columns = [
    'weather', 'temp', 'atemp', 'windspeed', 'workingday', 'season', 'holiday', 'hour', 'dow',
    'humidity', 'fit', 'year'
]

In [None]:
# Reuse the best estimators found by the two grid searches.
randomforest_model = rf_grid.best_estimator_
gradientboost_model = gb_grid.best_estimator_

randomforest_pred = predict_bikecount(randomforest_model, randomforest_columns)
gradientboost_pred = predict_bikecount(gradientboost_model, gradientboost_columns)
# Weighted blend (20% random forest, 80% gradient boosting), rounded to whole counts.
y_pred = np.round(0.2 * randomforest_pred + 0.8 * gradientboost_pred)
# NOTE(review): df_test is not re-split in this cell; it is whatever an earlier cell left in scope - confirm.
df_test['count'] = y_pred
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submissions_rf_gb.csv', header=True, index=False)
!kaggle competitions submit -c bike-sharing-demand -f submissions_rf_gb.csv -m "Message"

### Random Forest + XGBoost 

In [None]:
# Feature lists for the blended models; these repeat the lists defined in the
# individual random-forest and XGBoost sections of this file.
randomforest_columns = [
    'weather', 'temp', 'atemp', 'windspeed', 'workingday', 'season', 'holiday', 'hour', 'dow', 
    'humid', 'woy', 'peak'
]
xgboost_columns = [
    'weather', 'temp', 'atemp', 'windspeed', 'workingday', 'season', 'holiday', 'hour', 'dow',
    'humidity', 'fit', 'year'
]

In [None]:
# Reuse the best estimators found by the two grid searches.
randomforest_model = rf_grid.best_estimator_
xgb_model = xgb_grid.best_estimator_

randomforest_pred = predict_bikecount(randomforest_model, randomforest_columns)
xgboost_pred = predict_bikecount(xgb_model, xgboost_columns)
# Weighted blend (20% random forest, 80% XGBoost), rounded to whole counts.
y_pred = np.round(0.2 * randomforest_pred + 0.8 * xgboost_pred)
# NOTE(review): df_test is not re-split in this cell; it is whatever an earlier cell left in scope - confirm.
df_test['count'] = y_pred
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submissions_rf_xg.csv', header=True, index=False)
!kaggle competitions submit -c bike-sharing-demand -f submissions_rf_xg.csv -m "Message"

<div class="alert alert-block" style="border: 1px solid #455A64;background-color:#ECEFF1;">
본 자료 및 영상 컨텐츠는 저작권법 제25조 2항에 의해 보호를 받습니다. 본 컨텐츠 및 컨텐츠 일부 문구등을 외부에 공개, 게시하는 것을 금지합니다. 특히 자료에 대해서는 저작권법을 엄격하게 적용하겠습니다.
</div>