# <center>**Importing Required Libraries**</center>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
data = pd.read_csv('CLEANED_METRO_INTERSTATE_TRAFFIC.csv',index_col='date_time')

In [3]:
data.head()

Unnamed: 0_level_0,holiday,temp,clouds_all,weather_main,traffic_volume,year,month,weekday,hour,day_part
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2012-10-02 09:00:00,,288.28,40,Clouds,5545,2012,10,1,9,morning
2012-10-02 10:00:00,,289.36,75,Clouds,4516,2012,10,1,10,morning
2012-10-02 11:00:00,,289.58,90,Clouds,4767,2012,10,1,11,morning
2012-10-02 12:00:00,,290.13,90,Clouds,5026,2012,10,1,12,morning
2012-10-02 13:00:00,,291.14,75,Clouds,4918,2012,10,1,13,afternoon


# <center>**Splitting the data into independent and target feature**</center>

In [4]:
x = data.drop(['day_part', 'traffic_volume'], axis = 1)

In [5]:
x.head()

Unnamed: 0_level_0,holiday,temp,clouds_all,weather_main,year,month,weekday,hour
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-10-02 09:00:00,,288.28,40,Clouds,2012,10,1,9
2012-10-02 10:00:00,,289.36,75,Clouds,2012,10,1,10
2012-10-02 11:00:00,,289.58,90,Clouds,2012,10,1,11
2012-10-02 12:00:00,,290.13,90,Clouds,2012,10,1,12
2012-10-02 13:00:00,,291.14,75,Clouds,2012,10,1,13


In [6]:
y = data['traffic_volume']

In [7]:
y.head()

date_time
2012-10-02 09:00:00    5545
2012-10-02 10:00:00    4516
2012-10-02 11:00:00    4767
2012-10-02 12:00:00    5026
2012-10-02 13:00:00    4918
Name: traffic_volume, dtype: int64

# <center>**Label Encoding - Independent Features**</center>

In [8]:
from sklearn.preprocessing  import LabelEncoder

In [9]:
le = LabelEncoder()

In [10]:
for column in x.columns:
    if x[column].dtypes == 'object':
        x[column] = le.fit_transform(x[column])

In [11]:
x.head()

Unnamed: 0_level_0,holiday,temp,clouds_all,weather_main,year,month,weekday,hour
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-10-02 09:00:00,0,288.28,40,1,2012,10,1,9
2012-10-02 10:00:00,0,289.36,75,1,2012,10,1,10
2012-10-02 11:00:00,0,289.58,90,1,2012,10,1,11
2012-10-02 12:00:00,0,290.13,90,1,2012,10,1,12
2012-10-02 13:00:00,0,291.14,75,1,2012,10,1,13


In [12]:
l1 = sorted(data['weather_main'].unique())

In [13]:
l1

['Clear',
 'Clouds',
 'Drizzle',
 'Fog',
 'Haze',
 'Mist',
 'Rain',
 'Smoke',
 'Snow',
 'Squall',
 'Thunderstorm']

In [14]:
l2 = list(range(len(data['weather_main'].unique())))

In [15]:
l2

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [16]:
pd.DataFrame(list(zip(l1,l2)),columns=['Weather', 'Assigned Value'], index=None)

Unnamed: 0,Weather,Assigned Value
0,Clear,0
1,Clouds,1
2,Drizzle,2
3,Fog,3
4,Haze,4
5,Mist,5
6,Rain,6
7,Smoke,7
8,Snow,8
9,Squall,9


# <center>**Train Test Split**</center>

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=22)

In [19]:
x_train.head()

Unnamed: 0_level_0,holiday,temp,clouds_all,weather_main,year,month,weekday,hour
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-09-12 10:00:00,0,294.49,20,6,2018,9,2,10
2017-05-18 08:00:00,0,281.83,90,6,2017,5,3,8
2013-01-14 07:00:00,0,256.34,90,1,2013,1,0,7
2018-02-24 20:00:00,0,271.35,90,3,2018,2,5,20
2017-07-05 08:00:00,0,295.65,1,6,2017,7,2,8


# <center>**Scaling the Data**</center>

In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
scale = StandardScaler()

In [22]:
scale.fit(x_train)

StandardScaler()

In [23]:
x_train_scaled = scale.transform(x_train)

In [24]:
x_test_scaled = scale.transform(x_test)

In [25]:
x_train_scaled

array([[-0.03823196,  1.04018578, -0.7465836 , ...,  0.73070247,
        -0.48926673, -0.19924476],
       [-0.03823196,  0.04426294,  1.0468734 , ..., -0.44610515,
         0.00873717, -0.48693325],
       [-0.03823196, -1.96095611,  1.0468734 , ..., -1.62291278,
        -1.48527452, -0.6307775 ],
       ...,
       [-0.03823196, -0.85175295, -1.23337907, ..., -1.03450896,
        -0.98727062, -0.48693325],
       [-0.03823196, -0.65429905,  1.0468734 , ..., -1.62291278,
        -0.98727062, -1.63768722],
       [-0.03823196,  0.29521033, -1.23337907, ..., -0.44610515,
        -0.48926673, -0.77462174]])

# <center>**Importing required modules for training**</center>

In [26]:
from sklearn.linear_model import LinearRegression,Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV

# <center>**Model Training**</center>

In [27]:
def train_model(models,model_names, x_train_scaled, x_test_scaled, y_train, y_test):
    model_res = []
    perc_res = []
    for model in range(len(models)):
        regressor = models[model]
        regressor.fit(x_train_scaled, y_train)
        y_pred = regressor.predict(x_test_scaled)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        perc = r2*100
        perc_res.append(perc)
        model_res.append(r2)
        print(f"{'-'*30} {model_names[model]} {'-'*30}")
        print(f'Mean Absolute Error : {mae}')
        print(f'R2 Score : {r2}')
        print(f'Percentage(r2) : {perc}')

        print('*'*80)
        
    result_df = pd.DataFrame(list(zip(model_names, model_res, perc_res)), columns = ['Model', 'R2_Score', 'Accuracy Percentage'])
    return result_df 

In [28]:
models = [LinearRegression(), Ridge(), Lasso(), ElasticNet(), SVR(), XGBRegressor(), RandomForestRegressor(), KNeighborsRegressor(), DecisionTreeRegressor()]
model_names = ['Linear Regression', 'Ridge Regression', 'Lasso Regression', 'ElasticNet Regression', 'SVR', 'XGB Regressor', 'Random Forest Regressor', 'K Neighbors Regressor', 'Decision Tree Regressor']


In [29]:
model_res = train_model(models,model_names, x_train_scaled, x_test_scaled, y_train, y_test)

------------------------------ Linear Regression ------------------------------
Mean Absolute Error : 1598.8662665639945
R2 Score : 0.161249167099626
Percentage(r2) : 16.1249167099626
********************************************************************************
------------------------------ Ridge Regression ------------------------------
Mean Absolute Error : 1598.8689334136534
R2 Score : 0.16124919412795125
Percentage(r2) : 16.124919412795123
********************************************************************************
------------------------------ Lasso Regression ------------------------------
Mean Absolute Error : 1599.1426996196676
R2 Score : 0.16126511261963783
Percentage(r2) : 16.126511261963785
********************************************************************************
------------------------------ ElasticNet Regression ------------------------------
Mean Absolute Error : 1634.1034368543083
R2 Score : 0.14421735416111037
Percentage(r2) : 14.421735416111037
*******

In [30]:
model_res

Unnamed: 0,Model,R2_Score,Accuracy Percentage
0,Linear Regression,0.161249,16.124917
1,Ridge Regression,0.161249,16.124919
2,Lasso Regression,0.161265,16.126511
3,ElasticNet Regression,0.144217,14.421735
4,SVR,0.247513,24.751277
5,XGB Regressor,0.957241,95.724063
6,Random Forest Regressor,0.952653,95.26527
7,K Neighbors Regressor,0.828779,82.877933
8,Decision Tree Regressor,0.911242,91.124181


- Here, XGB Regressor and Random Forest Regressor are the two most accurate models with 95.72% and 95.31% accuracy respectively.
- So, we will tune hyperparameters for these algorithms and use the best model for prediction.

# <center>**HyperParameter Tuning**</center>

We will use RandomizedSearchCV for hyperparameter tuning.

#### 1. Tuning XGB Regressor

In [31]:
params_grid = [{'learning_rate' : [0.1,0.2,0.3,0.35],
             'max_depth' : [6,8,9],
             'gamma': [0.1, 0.3, 0.4, 0.5 ]
}]


In [32]:
xgb = XGBRegressor()

In [33]:
rcv = RandomizedSearchCV(estimator = xgb, param_distributions=params_grid, n_iter=100, cv= 5, verbose = 2)

In [35]:
rcv.fit(x_train_scaled, y_train)



Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END ..........gamma=0.1, learning_rate=0.1, max_depth=6; total time=   3.1s
[CV] END ..........gamma=0.1, learning_rate=0.1, max_depth=6; total time=   2.6s
[CV] END ..........gamma=0.1, learning_rate=0.1, max_depth=6; total time=   2.9s
[CV] END ..........gamma=0.1, learning_rate=0.1, max_depth=6; total time=   2.7s
[CV] END ..........gamma=0.1, learning_rate=0.1, max_depth=6; total time=   2.9s
[CV] END ..........gamma=0.1, learning_rate=0.1, max_depth=8; total time=   4.9s
[CV] END ..........gamma=0.1, learning_rate=0.1, max_depth=8; total time=   4.6s
[CV] END ..........gamma=0.1, learning_rate=0.1, max_depth=8; total time=   5.1s
[CV] END ..........gamma=0.1, learning_rate=0.1, max_depth=8; total time=   3.8s
[CV] END ..........gamma=0.1, learning_rate=0.1, max_depth=8; total time=   4.2s
[CV] END ..........gamma=0.1, learning_rate=0.1, max_depth=9; total time=   4.8s
[CV] END ..........gamma=0.1, learning_rate=0.1

RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          callbacks=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          early_stopping_rounds=None,
                                          enable_categorical=False,
                                          eval_metric=None, gamma=None,
                                          gpu_id=None, grow_policy=None,
                                          importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=None, max_bin=None,
                                          m...
                                          max_delta_step=None, max_depth=None,
                           

In [None]:
rcv.best_params_

{'max_depth': 9, 'learning_rate': 0.2, 'gamma': 0.3}

In [None]:
xgb_best_estimator = rcv.best_estimator_

In [None]:
y_pred_xgb = xgb_best_estimator.predict(x_test_scaled)

In [None]:
mae = mean_absolute_error(y_test, y_pred_xgb)
r2 = r2_score(y_test, y_pred_xgb)
print('XG Boost Regressor Tuned!')
print('R2 Score : ', r2)
print('Mean Absolute Error : ', mae)

XG Boost Regressor Tuned!
R2 Score :  0.961895302866642
Mean Absolute Error :  219.60258204936318


#### 2. Random Forest Regressor

In [None]:
grid_params2 = [{
             'max_depth': [10,30,50,70,100],
             'min_samples_leaf': [1, 2, 4],
             'min_samples_split': [2, 5, 10]
}]

In [None]:
rfr = RandomForestRegressor()

In [None]:
rcv1 = RandomizedSearchCV(estimator=rfr, param_distributions=grid_params2, cv= 5, n_iter=100, verbose=2)

In [None]:
rcv1.fit(x_train_scaled,y_train)



Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2; total time=   6.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2; total time=   6.8s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2; total time=   7.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2; total time=   6.8s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2; total time=   7.4s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5; total time=   6.7s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5; total time=   6.8s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5; total time=   8.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5; total time=   6.6s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5; total time=   6.7s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=10; total time=   6.7s
[CV] END max_depth=10, m

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   param_distributions=[{'max_depth': [10, 30, 50, 70, 100],
                                         'min_samples_leaf': [1, 2, 4],
                                         'min_samples_split': [2, 5, 10]}],
                   verbose=2)

In [None]:
rcv1.best_params_

{'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 50}

In [None]:
rfr_best_estimator = rcv1.best_estimator_

In [None]:
y_pred_rfr = rfr_best_estimator.predict(x_test_scaled)

In [None]:
mae = mean_absolute_error(y_test, y_pred_rfr)
r2 = r2_score(y_test, y_pred_rfr)
print('Random Forest Regressor Tuned!')
print('R2 Score : ', r2)
print('Mean Absolute Error : ', mae)

Random Forest Regressor Tuned!
R2 Score :  0.9530012824347245
Mean Absolute Error :  234.74038918381297


- Results of Random Forest Regressor
  - R2 Score - 0.95300
  - Mean Absolute Error - 234.74038

- Results of XG Boost Regressor
  - R2 Score - 0.96189
  - Mean Absolute Error - 219.60258

##### **As we see XB Boost Algorithm has outperformed every other algorithms, so we will dump this model for prediction.**

# <center>**Saving the Model**</center>

In [None]:
import pickle

In [None]:
pickle.dump(xgb_best_estimator, open('best_model.pkl', 'wb'))

In [None]:
best_model = pickle.load(open('best_model.pkl', 'rb'))

In [None]:
best_model.predict(x_test)

array([1072.5189 , 1386.5486 ,  998.05225, ..., 1347.6932 , 1347.6932 ,
       1172.7191 ], dtype=float32)