In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor

In [26]:
filename='/content/drive/MyDrive/Coding Dojo/Week 5/food-sales-predictions'
df=pd.read_csv(filename)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                8523 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                8523 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [28]:
df.shape

(8523, 12)

In [29]:
# deal with categorical variable 
fat={'Low Fat':0,'Regular':1}
df['Item_Fat_Content']=df['Item_Fat_Content'].map(fat)

In [30]:
size={'Small':0,'Medium':1,'High':2}
df['Outlet_Size']=df['Outlet_Size'].map(size)

In [31]:
tier={'Tier 1':0,'Tier 2':1,'Tier 3':2}
df['Outlet_Location_Type']=df['Outlet_Location_Type'].map(tier)

In [32]:
# create dummy variables for Item Type and Outlet Type
dum_columns=['Item_Type','Outlet_Type']
df_reg=pd.get_dummies(df, columns=dum_columns ,drop_first=True)

In [33]:
# Drop two columns that are not associated with outlet sales
df_reg.drop(columns=['Item_Identifier','Outlet_Identifier'],inplace=True)

In [34]:
df_reg.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Item_Outlet_Sales,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,9.3,0,0.016047,249.8092,1999,1,0,3735.138,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,5.92,1,0.019278,48.2692,2009,1,2,443.4228,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
2,17.5,0,0.01676,141.618,1999,1,0,2097.27,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
3,19.2,1,0.0,182.095,1998,0,2,732.38,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,8.93,0,0.0,53.8614,1987,2,2,994.7052,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0


In [35]:
# Train test split
X=df_reg.drop(columns='Item_Outlet_Sales')
y=df['Item_Outlet_Sales']
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=3)

In [36]:
# create bagged tress model
bt=BaggingRegressor(random_state=42)
bt.fit(X_train,y_train)
bt.score(X_train,y_train)

0.9130181677417296

In [37]:
bt.score(X_test,y_test)

0.5267271101009705

In [38]:
# Create Random Forest model
rf=RandomForestRegressor()
rf.fit(X_train,y_train)
rf.score(X_train,y_train)

0.9358298787897408

In [39]:
rf.score(X_test,y_test)

0.5726390117320227

In [52]:
# Optimizing hyperparameters using RandomisedSearchCV. Referenced'https://www.youtube.com/watch?v=SctFnD_puQI'
n_estimators=[int(x) for x in np.arange(start=10,stop=150,step=10)]
max_features=['auto','sqrt']
max_depth=[2,4,6,8]
min_samples_split=[2,5,7,9]
min_samples_leaf=[1,2]
bootstrap=[True,False]

In [53]:
param_grid={'n_estimators':n_estimators,'max_features':max_features,'max_depth':max_depth,'min_samples_split':min_samples_split,'min_samples_leaf':min_samples_leaf,'bootstrap':bootstrap}

In [54]:
from sklearn.model_selection import RandomizedSearchCV
randomgrid=RandomizedSearchCV(estimator=rf,param_distributions=param_grid,cv=10,verbose=2,n_jobs=4)

In [55]:
randomgrid.fit(X_train,y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   13.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   46.6s finished


RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                             

In [56]:
randomgrid.best_params_

{'bootstrap': True,
 'max_depth': 6,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 40}

In [57]:
randomgrid.score(X_test,y_test)

0.6207238328748141