# Data Model - Regression

## Initial Setup for XGBoost

In [1]:
#installation for xgboost
!pip install xgboost 



## Import Libraries

In [2]:
#Importing libraries: data handling, scikit-learn models/metrics, XGBoost and plotting
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
import matplotlib.pyplot as plt
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error
from pandas.plotting import register_matplotlib_converters
# Register pandas' date/time converters with matplotlib
# (presumably for plotting against the datetime index -- no plots are visible in this section).
register_matplotlib_converters()
from sklearn import set_config
# Make estimator reprs list every parameter, not only the ones changed from their defaults.
set_config(print_changed_only=False)


## Loading dataset 

In [3]:
#Loading Data
# Read the merged training CSV, parse 'Date' as a datetime and move it from
# the columns into the index in a single chained expression.
train_final = (
    pd.read_csv(r'D:\Jupyter\Merged\train_final.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y-%m-%d'))
    .set_index('Date')
)

# Preview the first rows
train_final.head()

Unnamed: 0_level_0,Store,Dept,Weekly_Sales,IsHoliday,Type,Size,Year,Week
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-02-05,1,1,24924.5,0,3,151315,2010,5
2010-02-05,1,2,50605.27,0,3,151315,2010,5
2010-02-05,1,3,13740.12,0,3,151315,2010,5
2010-02-05,1,4,39954.04,0,3,151315,2010,5
2010-02-05,1,5,32229.38,0,3,151315,2010,5


## Splitting into Training and Test Sets

In [4]:
# Model inputs and the regression target
feature_cols = ['Store', 'Dept', 'IsHoliday', 'Type', 'Size', 'Year', 'Week']
target_col = 'Weekly_Sales'

# shuffle=False keeps the row order: the first 70% of rows become the
# training set and the remaining rows the test set.
train_data, test_data = train_test_split(train_final, train_size=0.7, shuffle=False)

X_train, y_train = train_data[feature_cols], train_data[target_col]
X_test, y_test = test_data[feature_cols], test_data[target_col]

# Sanity check of the resulting shapes
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((295099, 7), (295099,), (126471, 7), (126471,))

## Decision Tree Modeling

In [5]:
#Define Decision Tree
model = DecisionTreeRegressor(random_state=1)

#Setting parameters to test on Decision Tree
param1 = [
    {'min_samples_leaf': range(1,51,2),
     'min_samples_split':range(2,100,2),
     'max_depth':range(5,1000,5)}
]

#Perform Randomized Search CV
# random_state fixes which candidates are sampled from param1, so the search
# (and therefore best_param) is reproducible after Restart & Run All.
# The variable keeps the name `grid_search` although this is a randomized search.
# NOTE(review): the data is split chronologically (shuffle=False) but cv=3 uses
# the default K-fold splitter -- consider a time-aware splitter; confirm intent.
grid_search = RandomizedSearchCV(model, param1, cv = 3, verbose = 3,
                                 n_jobs = -1, random_state = 1)
grid_search.fit(X_train, y_train)
results = grid_search.cv_results_
best_param=grid_search.best_params_

#Display the best result
best_param

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    6.5s finished


{'min_samples_split': 54, 'min_samples_leaf': 25, 'max_depth': 440}

In [6]:
#Decision Tree Regressor with hyperparameters hard-coded from the recorded
#randomized-search output
dt_params = {
    'min_samples_split': 54,
    'min_samples_leaf': 25,
    'max_depth': 440,
}
# fit() returns the estimator itself, so construction and fitting can be chained
dt = DecisionTreeRegressor(random_state=42, **dt_params).fit(X_train, y_train)

#Predict weekly sales for the held-out rows
y_pred_dt = dt.predict(X_test)

#Display the settings
print(dt)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=440,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=25, min_samples_split=54,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=42, splitter='best')


## Random Forest Regressor Modeling

In [7]:
#Define Random Forest Regressor
model = RandomForestRegressor(random_state = 42)

#Setting parameters to test on Random Forest Regressor Model
params={
 "n_estimators"     : range(100,300,100) ,
 "max_depth"        : [5,25,50,100,200],
 "min_samples_split": [2,5,8,10,15,20],
 "min_samples_leaf" : [1,2,5,8,10]
}

#Perform Randomized Search CV
# random_state fixes which candidates are sampled from params, so the search
# (and therefore best_param) is reproducible after Restart & Run All.
# The variable keeps the name `grid_search` although this is a randomized search.
grid_search = RandomizedSearchCV(model, params, cv = 3, verbose = 3,
                                 n_jobs = -1, random_state = 42)
grid_search.fit(X_train, y_train)
results = grid_search.cv_results_
best_param=grid_search.best_params_

#Display the best result
best_param

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  7.6min finished


{'n_estimators': 200,
 'min_samples_split': 15,
 'min_samples_leaf': 8,
 'max_depth': 200}

In [8]:
#Random Forest Regressor modeling
# Hyperparameters are hard-coded from the recorded randomized-search output.
# random_state=42 matches the estimator used in the search and makes the
# bootstrap sampling -- and therefore y_pred_rfr -- reproducible across runs.
rfr = RandomForestRegressor(n_estimators=200, min_samples_split=15,
                            min_samples_leaf=8, max_depth=200,
                            random_state=42)
rfr.fit(X_train,y_train)

#Predict weekly sales for the held-out rows
y_pred_rfr=rfr.predict(X_test)

## XGBoost Modeling

In [9]:
#Define XGBoost
model = xgb.XGBRegressor(random_state = 1)

#Setting parameters to test on XGBoost
params={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [5,25,50,100,200,500,1000],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]   
}

#Perform Randomized Search CV
# random_state fixes which candidates are sampled from params, so the search
# (and therefore best_param) is reproducible after Restart & Run All.
# The variable keeps the name `grid_search` although this is a randomized search.
grid_search = RandomizedSearchCV(model, params, cv = 3, verbose = 3,
                                 n_jobs = -1, random_state = 1)
grid_search.fit(X_train, y_train)
results = grid_search.cv_results_
best_param=grid_search.best_params_

#Display the best result
best_param

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  3.0min finished


{'min_child_weight': 5,
 'max_depth': 25,
 'learning_rate': 0.05,
 'gamma': 0.1,
 'colsample_bytree': 0.7}

In [10]:
#XGBoost modeling
# Hyperparameters are hard-coded from the recorded randomized-search output.
# The fitted model is bound to `xgb_reg`, not `xgb`: rebinding `xgb` would shadow
# the imported xgboost module, so re-running this cell (or the search cell)
# would fail with AttributeError on `xgb.XGBRegressor`.
# NOTE(review): any later cell that used `xgb` as the fitted model must use `xgb_reg`.
xgb_reg = xgb.XGBRegressor(min_child_weight=5, max_depth=25, learning_rate=0.05,
                           gamma=0.1, colsample_bytree=0.7)
xgb_reg.fit(X_train,y_train)

#Predict weekly sales for the held-out rows
y_pred_xgb=xgb_reg.predict(X_test)