# Import Dataset

In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
%matplotlib inline 

In [16]:
X = pd.read_csv('X_ver1.csv')
Y = pd.read_csv('Y.csv')

In [17]:
X = X.iloc[:,1:]
Y = Y.iloc[:,1:]

# Modeling

In [13]:
import time

In [26]:
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [18]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [19]:
X = np.array(x_train)
y = np.array(y_train)

In [23]:
names = ['RandomForest', 'LGBM', 'XGB']
reg_list = [RandomForestRegressor(random_state=42), 
            lgb.LGBMRegressor(random_state=42), 
            xgb.XGBRegressor(random_state=42) ]

In [24]:
for name, reg in zip(names, reg_list):
    start = time.time()
    reg.fit(X,y)
    print('---- {} ----'.format(name))    
    print('cv score : ', cross_val_score(reg,X,y, cv=5).mean())
    print('time spent : ',time.time()-start)
    print('-----------------'.format(name))    

---- RandomForest ----
cv score :  0.6434664125561793
time spent :  194.63353967666626
-----------
---- LGBM ----
cv score :  0.6531707102937545
time spent :  12.887619018554688
-----------
---- XGB ----
cv score :  0.5029248931853489
time spent :  1068.7486505508423
-----------


# Hyperparameter Tuning

In [27]:
lgbm_params ={'max_depth': np.arange(3, 30),
             'num_leaves': np.arange(10, 100), 
             'learning_rate': [ 0.01, 0.05, 0.01, 0.001],
             'min_child_samples': randint(2, 30),
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': np.linspace(0.6, 0.9, 30, endpoint=True), 
             'colsample_bytree': np.linspace(0.1, 0.8, 100, endpoint=True),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
             'n_estimators': np.arange(100, 400)}

In [30]:
def hypertuning_rscv(est, p_distr, nbr_iter,X,y):
    rdmsearch = RandomizedSearchCV(est, param_distributions=p_distr, n_jobs=-1, n_iter=nbr_iter, cv=5, random_state=0)    
    rdmsearch.fit(X,y)
    ht_params = rdmsearch.best_params_
    ht_score = rdmsearch.best_score_
    return ht_params, ht_score

In [33]:
lgbm_reg= lgb.LGBMRegressor(random_state=42)
best_params = hypertuning_rscv(lgbm_reg,lgbm_params,30,X,y)

In [35]:
lgbm_reg.set_params(**best_params[0])
cross_val_score(lgbm_reg, X, y, cv=5).mean()

0.6768423903841061

In [37]:
tuned_lgbm = lgb.LGBMRegressor(random_state=42, **best_params[0])
tuned_lgbm.fit(X,y)

# Evaluation

In [None]:
act = y.copy()
act = act.flatten()

In [76]:
def MAPE(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true))

## Train

In [78]:
train_pred = tuned_lgbm.predict(X)
train_act = y.copy()
train_act = train_act.flatten()

In [79]:
MAPE(train_act,train_pred)

7.906285100464103

## Test

In [65]:
test_X = np.array(x_test)
test_y = np.array(y_test)

In [66]:
test_pred = tuned_lgbm.predict(test_X)
test_act = test_y.copy()
test_act = test_act.flatten()

In [67]:
MAPE(test_act,test_pred)

9.170466935888188

In [80]:
test_act.round(-4)[0:10]

array([18220000.,  7710000., 22580000.,  7440000., 13560000., 16640000.,
       27990000., 31180000., 41840000., 24380000.])

In [81]:
test_pred.round(-4)[0:10]

array([17870000.,  5500000., 29810000.,  7520000., 15970000., 23230000.,
       15370000., 39280000., 20220000., 18850000.])