In [97]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [98]:
train_df = pd.read_csv("train.csv")
train_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [99]:
test_df = pd.read_csv("test.csv")
test_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [100]:
train_df['type']='train'
test_df['type']='test'

data = pd.concat([train_df, test_df],ignore_index=True)
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,type
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,train
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,train
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,train
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train


# Preprocessing

Replace 'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'

In [101]:
data['Item_Fat_Content'] = data['Item_Fat_Content'].replace({'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular' })
data['Item_Fat_Content'].value_counts()

Low Fat    9185
Regular    5019
Name: Item_Fat_Content, dtype: int64

**"Item_Identifier" - 1st 2 letter maps to item_type**

In [102]:
data[['Item_Type','Item_Identifier']]

Unnamed: 0,Item_Type,Item_Identifier
0,Dairy,FDA15
1,Soft Drinks,DRC01
2,Meat,FDN15
3,Fruits and Vegetables,FDX07
4,Household,NCD19
...,...,...
14199,Snack Foods,FDB58
14200,Starchy Foods,FDD47
14201,Health and Hygiene,NCO17
14202,Canned,FDJ26


In [103]:
#Get the first two characters of ID:
data['Item_Type_Combined'] = data['Item_Identifier'].apply(lambda x: x[0:2])
#Rename them to more intuitive categories:
data['Item_Type_Combined'] = data['Item_Type_Combined'].map({'FD':'Food', 'NC':'Non-Consumable', 'DR':'Drinks'})
data['Item_Type_Combined'].value_counts()

Food              10201
Non-Consumable     2686
Drinks             1317
Name: Item_Type_Combined, dtype: int64

In [104]:
data.loc[data['Item_Type_Combined'] == 'Non-Consumable', 'Item_Fat_Content'] = "Non-Edible"

The data scientists at BigMart have collected 2013 sales data for 1559 products across 10 stores in different cities. Also, certain attributes of each product and store have been defined. The aim is to build a predictive model and predict the sales of each product at a particular outlet.

In [105]:
data['outlet_years'] = 2013 - data['Outlet_Establishment_Year']

In [106]:
data.isna().sum()

Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales            5681
type                            0
Item_Type_Combined              0
outlet_years                    0
dtype: int64

In [107]:
outlet_size_mode_pt = data.pivot_table(values="Outlet_Size", columns="Outlet_Type", aggfunc=lambda x: x.mode())
outlet_size_mode_pt

Outlet_Type,Grocery Store,Supermarket Type1,Supermarket Type2,Supermarket Type3
Outlet_Size,Small,Small,Medium,Medium


In [108]:
missing_value = data['Outlet_Size'].isnull()
data.loc[missing_value, 'Outlet_Size'] = data.loc[missing_value, 'Outlet_Type'].apply(lambda x: outlet_size_mode_pt[x].Outlet_Size)
data.isna().sum()

Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                     0
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales            5681
type                            0
Item_Type_Combined              0
outlet_years                    0
dtype: int64

In [109]:
data.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
type                          object
Item_Type_Combined            object
outlet_years                   int64
dtype: object

In [110]:
group = data.groupby(["Item_Identifier"]).mean()
group

Unnamed: 0_level_0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales,outlet_years
Item_Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DRA12,11.600,0.034938,141.682067,1998.666667,1843.600200,14.333333
DRA24,19.350,0.045646,164.016800,1997.300000,2246.218971,15.700000
DRA59,8.270,0.133384,184.982400,1997.300000,2614.430150,15.700000
DRB01,7.390,0.079736,189.890500,1996.625000,1518.024000,16.375000
DRB13,6.115,0.006799,189.797444,1998.666667,2428.838400,14.333333
...,...,...,...,...,...,...
NCZ30,6.590,0.027302,121.009800,1997.300000,1807.647000,15.700000
NCZ41,19.850,0.056396,125.757900,1998.750000,1827.487840,14.250000
NCZ42,10.500,0.011015,236.954800,1997.300000,3839.801760,15.700000
NCZ53,9.600,0.026330,188.554733,1998.666667,3014.742400,14.333333


In [111]:
group.loc[group.index == 'DRA12', 'Item_Weight'].values
#.loc[group['Item_Identifier'].isin(num1_range), 'Item_Weight']

array([11.6])

In [112]:
empty_weight = data['Item_Weight'].isna()
avg_weight = data.groupby(["Item_Identifier"]).mean()
data.loc[empty_weight, 'Item_Weight'] = data.loc[empty_weight,"Item_Identifier"].apply(lambda x: avg_weight.loc[group.index == x, 'Item_Weight'].values[0])

In [113]:
data.isna().sum()

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                     0
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales            5681
type                            0
Item_Type_Combined              0
outlet_years                    0
dtype: int64

In [114]:
data = pd.get_dummies(data, columns=['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Outlet_Type',
                              'Item_Type_Combined','Outlet_Identifier'])

In [115]:
data.dtypes

Item_Identifier                       object
Item_Weight                          float64
Item_Visibility                      float64
Item_Type                             object
Item_MRP                             float64
Outlet_Establishment_Year              int64
Item_Outlet_Sales                    float64
type                                  object
outlet_years                           int64
Item_Fat_Content_Low Fat               uint8
Item_Fat_Content_Non-Edible            uint8
Item_Fat_Content_Regular               uint8
Outlet_Location_Type_Tier 1            uint8
Outlet_Location_Type_Tier 2            uint8
Outlet_Location_Type_Tier 3            uint8
Outlet_Size_High                       uint8
Outlet_Size_Medium                     uint8
Outlet_Size_Small                      uint8
Outlet_Type_Grocery Store              uint8
Outlet_Type_Supermarket Type1          uint8
Outlet_Type_Supermarket Type2          uint8
Outlet_Type_Supermarket Type3          uint8
Item_Type_

In [116]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales,type,outlet_years,Item_Fat_Content_Low Fat,...,Outlet_Identifier_OUT010,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049
0,FDA15,9.3,0.016047,Dairy,249.8092,1999,3735.138,train,14,1,...,0,0,0,0,0,0,0,0,0,1
1,DRC01,5.92,0.019278,Soft Drinks,48.2692,2009,443.4228,train,4,0,...,0,0,0,1,0,0,0,0,0,0
2,FDN15,17.5,0.01676,Meat,141.618,1999,2097.27,train,14,1,...,0,0,0,0,0,0,0,0,0,1
3,FDX07,19.2,0.0,Fruits and Vegetables,182.095,1998,732.38,train,15,0,...,1,0,0,0,0,0,0,0,0,0
4,NCD19,8.93,0.0,Household,53.8614,1987,994.7052,train,26,0,...,0,1,0,0,0,0,0,0,0,0


In [117]:
#Drop the columns which have been converted to different types:
# Remove Identifiers
data.drop(['Item_Type','Outlet_Establishment_Year', 'Item_Identifier', 'outlet_years'],axis=1,inplace=True)

In [118]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
features = ["Item_Weight", "Item_Visibility", "Item_MRP"]
data[features]=sc.fit_transform(data[features])

In [119]:
data.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Outlet_Sales,type,Item_Fat_Content_Low Fat,Item_Fat_Content_Non-Edible,Item_Fat_Content_Regular,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,...,Outlet_Identifier_OUT010,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049
0,-0.751014,-0.969852,1.752511,3735.138,train,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,-1.477653,-0.907063,-1.493696,443.4228,train,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1.011839,-0.956,0.009874,2097.27,train,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,1.377308,-1.281712,0.661838,732.38,train,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4,-0.830557,-1.281712,-1.403623,994.7052,train,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [120]:
#Divide into test and train:
train = data.loc[data['type']=="train"]
test = data.loc[data['type']=="test"]

#Drop unnecessary columns:
test.drop(['Item_Outlet_Sales','type'],axis=1,inplace=True)
train.drop(['type'],axis=1,inplace=True)

#Export files as modified versions:
train.to_csv("train_modified.csv",index=False)
test.to_csv("test_modified.csv",index=False)

In [121]:
train.isna().sum()

Item_Weight                          0
Item_Visibility                      0
Item_MRP                             0
Item_Outlet_Sales                    0
Item_Fat_Content_Low Fat             0
Item_Fat_Content_Non-Edible          0
Item_Fat_Content_Regular             0
Outlet_Location_Type_Tier 1          0
Outlet_Location_Type_Tier 2          0
Outlet_Location_Type_Tier 3          0
Outlet_Size_High                     0
Outlet_Size_Medium                   0
Outlet_Size_Small                    0
Outlet_Type_Grocery Store            0
Outlet_Type_Supermarket Type1        0
Outlet_Type_Supermarket Type2        0
Outlet_Type_Supermarket Type3        0
Item_Type_Combined_Drinks            0
Item_Type_Combined_Food              0
Item_Type_Combined_Non-Consumable    0
Outlet_Identifier_OUT010             0
Outlet_Identifier_OUT013             0
Outlet_Identifier_OUT017             0
Outlet_Identifier_OUT018             0
Outlet_Identifier_OUT019             0
Outlet_Identifier_OUT027 

In [122]:
features = train.drop('Item_Outlet_Sales', axis=1)
target = train['Item_Outlet_Sales']
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(features,target,test_size=0.2,random_state=42)

In [123]:
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb
np.random.seed(0)

d1 = np.random.randint(2, size=(100, 9))
d2 = np.random.randint(3, size=(100, 9))
d3 = np.random.randint(4, size=(100, 9))

Y = np.random.randint(7, size=(100,))
X = np.column_stack([d1, d2, d3])

rs_params = {
        'bagging_fraction': (0.5, 0.8),
        'bagging_frequency': (5, 8),
        'feature_fraction': (0.5, 0.8),
        'max_depth': (10, 13),
        'min_data_in_leaf': (90, 120),
        'num_leaves': (1200, 1550)
}

# Initialize a RandomizedSearchCV object using 5-fold CV-
rs_cv = RandomizedSearchCV(estimator=LGBMRegressor(), param_distributions=rs_params, cv = 5, n_iter=100,verbose=1)

# Train on training data-
rs_cv.fit(features, target,verbose=1)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


RandomizedSearchCV(cv=5, estimator=LGBMRegressor(), n_iter=100,
                   param_distributions={'bagging_fraction': (0.5, 0.8),
                                        'bagging_frequency': (5, 8),
                                        'feature_fraction': (0.5, 0.8),
                                        'max_depth': (10, 13),
                                        'min_data_in_leaf': (90, 120),
                                        'num_leaves': (1200, 1550)},
                   verbose=1)

In [125]:
from sklearn.metrics import mean_squared_error

test_predict = rs_cv.best_estimator_.predict(X_test)
test_predict[test_predict < 33] = 33

mse = mean_squared_error(y_test, test_predict)
rmse = mse**(0.5)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % rmse)

MSE: 884208.55
RMSE: 940.32


In [126]:
predicted_value = rs_cv.best_estimator_.predict(test)

In [127]:
predicted_value

array([1336.03859216, 1334.61111466,  690.88535055, ..., 1723.79532535,
       3915.85914099, 1464.85816436])

In [128]:
np.round(train_df['Item_Outlet_Sales'].min())

33.0

In [129]:
predicted_value[predicted_value < 33] = 33

In [130]:
predicted_value.min()

33.0

In [136]:
test_df['Item_Outlet_Sales'] = np.round(predicted_value, decimals=0)

In [137]:
test_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,type,Item_Outlet_Sales
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1,test,1336.0
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1,test,1335.0
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store,test,691.0
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1,test,2310.0
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3,test,5947.0


In [133]:
test_df[['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']].to_csv("submission.csv", index=False)

In [38]:
! pip install catboost

Collecting catboost
  Downloading catboost-1.1.1-cp39-none-win_amd64.whl (74.0 MB)
     ---------------------------------------- 74.0/74.0 MB 6.3 MB/s eta 0:00:00
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
     ---------------------------------------- 47.0/47.0 kB ? eta 0:00:00
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.1.1 graphviz-0.20.1


In [138]:
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
from scipy.stats import randint


model = CatBoostRegressor()
parameters = {'depth' : [6,8,10],
              'learning_rate' : np.linspace(0,0.2,5),
              'n_estimators':[100, 200, 300],
              }

grid = GridSearchCV(estimator=model, param_grid = parameters, cv = 5, n_jobs=-1)
grid.fit(features, target)

0:	learn: 1658.5492625	total: 2.98ms	remaining: 295ms
1:	learn: 1613.7920089	total: 7.26ms	remaining: 356ms
2:	learn: 1575.4058358	total: 10.8ms	remaining: 351ms
3:	learn: 1536.3774282	total: 14.6ms	remaining: 350ms
4:	learn: 1502.4942362	total: 18ms	remaining: 342ms
5:	learn: 1473.3808527	total: 20.8ms	remaining: 326ms
6:	learn: 1442.1796565	total: 24.4ms	remaining: 325ms
7:	learn: 1412.6529783	total: 27.7ms	remaining: 319ms
8:	learn: 1385.2538741	total: 31.1ms	remaining: 315ms
9:	learn: 1360.1457659	total: 34.3ms	remaining: 309ms
10:	learn: 1340.7795411	total: 36.6ms	remaining: 296ms
11:	learn: 1318.8507370	total: 40.6ms	remaining: 298ms
12:	learn: 1298.6789981	total: 60.3ms	remaining: 403ms
13:	learn: 1280.9269252	total: 63.9ms	remaining: 392ms
14:	learn: 1264.2926834	total: 67.5ms	remaining: 383ms
15:	learn: 1247.8418131	total: 71.2ms	remaining: 374ms
16:	learn: 1234.1749541	total: 74.5ms	remaining: 364ms
17:	learn: 1220.3074015	total: 77.7ms	remaining: 354ms
18:	learn: 1207.259308

GridSearchCV(cv=5,
             estimator=<catboost.core.CatBoostRegressor object at 0x00000144726DFAF0>,
             n_jobs=-1,
             param_grid={'depth': [6, 8, 10],
                         'learning_rate': array([0.  , 0.05, 0.1 , 0.15, 0.2 ]),
                         'n_estimators': [100, 200, 300]})

In [139]:
y_pred = grid.best_estimator_.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse**(0.5)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % rmse)

MSE: 1000159.98
RMSE: 1000.08


In [140]:
#grid.best_estimator_.predict

In [141]:
predicted_value = grid.best_estimator_.predict(test)

In [142]:
predicted_value

array([1705.84975475, 1386.73802784,  699.14154706, ..., 1858.07427751,
       3596.1044803 , 1225.24276693])

In [143]:
np.round(train_df['Item_Outlet_Sales'].min())

33.0

In [144]:
predicted_value[predicted_value < 33] = 33

In [145]:
predicted_value.min()

78.39122613187465

In [146]:
test_df['Item_Outlet_Sales'] = np.round(predicted_value, decimals=0)

In [147]:
test_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,type,Item_Outlet_Sales
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1,test,1706.0
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1,test,1387.0
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store,test,699.0
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1,test,2533.0
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3,test,6009.0


In [148]:
test_df[['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']].to_csv("submission.csv", index=False)