# About Practice Problem: Big Mart Sales III

https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/

The data scientists at BigMart have collected 2013 sales data for 1559 products across 10 stores in different cities. Also, certain attributes of each product and store have been defined. The aim is to build a predictive model and find out the sales of each product at a particular store.

Using this model, BigMart will try to understand the properties of products and stores which play a key role in increasing sales.

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../../hand_made_stuff')
from handmadestuff import fit_predict_report
import statsmodels.formula.api as sm

# Importing data
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# # Dropping rows containing NaN values
# loan_df = loan_df.dropna().iloc[:, 1:]

# Display how the data looks
train_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [2]:
# Counting an amount of missing values
train_df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [3]:
train_df[train_df.isna().any(axis=1)].head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
7,FDP10,,Low Fat,0.12747,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636
8,FDH17,16.2,Regular,0.016687,Frozen Foods,96.9726,OUT045,2002,,Tier 2,Supermarket Type1,1076.5986
9,FDU28,19.2,Regular,0.09445,Frozen Foods,187.8214,OUT017,2007,,Tier 2,Supermarket Type1,4710.535
18,DRI11,,Low Fat,0.034238,Hard Drinks,113.2834,OUT027,1985,Medium,Tier 3,Supermarket Type3,2303.668


In [4]:
train_df[train_df.Outlet_Type == 'Grocery Store'].head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
23,FDC37,,Low Fat,0.057557,Baking Goods,107.6938,OUT019,1985,Small,Tier 1,Grocery Store,214.3876
28,FDE51,5.925,Regular,0.161467,Dairy,45.5086,OUT010,1998,,Tier 3,Grocery Store,178.4344
29,FDC14,,Regular,0.072222,Canned,43.6454,OUT019,1985,Small,Tier 1,Grocery Store,125.8362
30,FDV38,19.25,Low Fat,0.170349,Dairy,55.7956,OUT010,1998,,Tier 3,Grocery Store,163.7868


In [5]:
train_df[train_df.Outlet_Type == 'Grocery Store'].Outlet_Size.unique()

array([nan, 'Small'], dtype=object)

In [6]:
train_df.loc[train_df.Outlet_Type == 'Grocery Store', 'Outlet_Size'] = 'Small'

In [7]:
train_df[train_df.Outlet_Type == 'Grocery Store'].head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,732.38
23,FDC37,,Low Fat,0.057557,Baking Goods,107.6938,OUT019,1985,Small,Tier 1,Grocery Store,214.3876
28,FDE51,5.925,Regular,0.161467,Dairy,45.5086,OUT010,1998,Small,Tier 3,Grocery Store,178.4344
29,FDC14,,Regular,0.072222,Canned,43.6454,OUT019,1985,Small,Tier 1,Grocery Store,125.8362
30,FDV38,19.25,Low Fat,0.170349,Dairy,55.7956,OUT010,1998,Small,Tier 3,Grocery Store,163.7868


In [8]:
train_df[train_df.Outlet_Type == 'Grocery Store'].Outlet_Size.unique()

array(['Small'], dtype=object)

In [9]:
train_df[train_df.isna().any(axis=1)].shape

(3318, 12)

In [10]:
(train_df[(train_df.Outlet_Type == 'Supermarket Type1') & (train_df.Outlet_Location_Type == 'Tier 2')]
 .Outlet_Size
 .unique())

array([nan, 'Small'], dtype=object)

In [11]:
train_df.loc[(train_df.Outlet_Type == 'Supermarket Type1') & (train_df.Outlet_Location_Type == 'Tier 2'), 'Outlet_Size'] = 'Small'

In [12]:
(train_df[(train_df.Outlet_Type == 'Supermarket Type1') & (train_df.Outlet_Location_Type == 'Tier 2')]
 .Outlet_Size
 .unique())

array(['Small'], dtype=object)

In [13]:
train_df[train_df.isna().any(axis=1)].shape

(1463, 12)

In [14]:
(train_df[(train_df.Outlet_Type == 'Supermarket Type1') & (train_df.Outlet_Location_Type == 'Tier 2')]
 .Outlet_Size
 .unique())

array(['Small'], dtype=object)

In [15]:
(train_df[(train_df.Outlet_Type == 'Supermarket Type2')]
 .Outlet_Size
 .unique())

array(['Medium'], dtype=object)

In [16]:
(train_df[(train_df.Outlet_Type == 'Supermarket Type3')]
 .Outlet_Size
 .unique())

array(['Medium'], dtype=object)

In [17]:
train_df[train_df.Item_Weight.isna()].head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
7,FDP10,,Low Fat,0.12747,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636
18,DRI11,,Low Fat,0.034238,Hard Drinks,113.2834,OUT027,1985,Medium,Tier 3,Supermarket Type3,2303.668
21,FDW12,,Regular,0.0354,Baking Goods,144.5444,OUT027,1985,Medium,Tier 3,Supermarket Type3,4064.0432
23,FDC37,,Low Fat,0.057557,Baking Goods,107.6938,OUT019,1985,Small,Tier 1,Grocery Store,214.3876
29,FDC14,,Regular,0.072222,Canned,43.6454,OUT019,1985,Small,Tier 1,Grocery Store,125.8362


In [18]:
train_df.Item_Weight = train_df.Item_Weight.fillna(method='ffill')

In [19]:
train_df.Item_Fat_Content.nunique()

5

In [20]:
train_df.Item_Type.nunique()

16

In [21]:
train_df.Outlet_Identifier.nunique()

10

In [22]:
train_df.Outlet_Size.nunique()

3

In [23]:
train_df.Outlet_Location_Type.nunique()

3

In [24]:
train_df.Outlet_Type.nunique()

4

In [25]:
train_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [26]:
X_train = train_df.iloc[:, 1:-1].values
y_train = train_df.iloc[:, -1].values

In [27]:
X_train[0]

array([9.3, 'Low Fat', 0.016047301, 'Dairy', 249.8092, 'OUT049', 1999,
       'Medium', 'Tier 1', 'Supermarket Type1'], dtype=object)

In [28]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encoding the source values
labelencoder_X = LabelEncoder()
categorical_sources = [1, 3, 5, 7, 8, 9]
for i in categorical_sources:
    X_train[:, i] = labelencoder_X.fit_transform(X_train[:, i])

In [29]:
# Splitting the source values into the categorical and the continuous variables
X_1 = X_train[:, categorical_sources]
X_2 = X_train[:, [item for item in list(range(len(X_train[0]))) if item not in categorical_sources]]

In [30]:
# Hot encoding the categorical values
onehotencoder = OneHotEncoder()

In [31]:
categorical_sources.remove(7)

In [32]:
X_new = X_2
for i in range(len(categorical_sources)):
    # Encoding the fat content column values
    X_1_current_encoded = onehotencoder.fit_transform(X_1[:, i].reshape(-1, 1)).toarray()
    # Avoiding the dummy variable trap
    X_1_current_encoded = X_1_current_encoded[:, :-1]
    X_new = np.concatenate((X_new, X_1_current_encoded), axis = 1)
    # Displaying the data
    print(len(X_new[0]))

8
23
32
34
36


In [33]:
# Scaling the data
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_scaled = sc_X.fit_transform(X_new)



In [34]:
sc_y = StandardScaler()
y_scaled = sc_y.fit_transform(y_train.reshape(-1, 1))

In [35]:
X_scaled = np.append(np.ones((X_scaled.shape[0], 1)), X_scaled, axis = 1)
regressor_OLS = sm.OLS(y_scaled, X_scaled).fit()
print(regressor_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.564
Model:                            OLS   Adj. R-squared:                  0.562
Method:                 Least Squares   F-statistic:                     353.8
Date:                Sun, 25 Nov 2018   Prob (F-statistic):               0.00
Time:                        00:35:15   Log-Likelihood:                -8559.8
No. Observations:                8523   AIC:                         1.718e+04
Df Residuals:                    8491   BIC:                         1.741e+04
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0036      0.009      0.416      0.6

In [36]:
import statsmodels.formula.api as sm
def backwardElimination(x, y, SL):
    numVars = len(x[0])
    temp = np.zeros(x.shape)
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        adjR_before = regressor_OLS.rsquared_adj.astype(float)
        if maxVar > SL:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    temp[:,j] = x[:, j]
                    x = np.delete(x, j, 1)
                    tmp_regressor = sm.OLS(y, x).fit()
                    adjR_after = tmp_regressor.rsquared_adj.astype(float)
                    if (adjR_before >= adjR_after):
                        x_rollback = np.hstack((x, temp[:,[0,j]]))
                        x_rollback = np.delete(x_rollback, j, 1)
                        print (regressor_OLS.summary())
                        return x_rollback
                    else:
                        continue
    regressor_OLS.summary()
    return x

In [37]:
SL = 0.05
X_modeled = backwardElimination(X_scaled, y_scaled, SL)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.564
Model:                            OLS   Adj. R-squared:                  0.562
Method:                 Least Squares   F-statistic:                     378.6
Date:                Sun, 25 Nov 2018   Prob (F-statistic):               0.00
Time:                        00:35:17   Log-Likelihood:                -8557.6
No. Observations:                8523   AIC:                         1.718e+04
Df Residuals:                    8493   BIC:                         1.739e+04
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.206e-15      0.007   1.68e-13      1.0

In [38]:
# Splitting the dataset into train and test datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_modeled, y_scaled, test_size = 0.2,
                                                    random_state = 0)

In [39]:
# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [40]:
# Predicting the test set results
y_pred = regressor.predict(X_test)

In [41]:
def calculate_rmse(y_test, y_pred):
    rmse = 0
    y_test, y_pred = y_test.ravel(), y_pred.ravel()
    for i in range(len(y_test)):
        rmse += (y_test[i] - y_pred[i]) ** 2
    return (rmse / len(y_test)) ** (1/2)

In [42]:
calculate_rmse(y_test, y_pred)

0.6659160147309358

In [43]:
# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor_svr = SVR()
regressor_svr.fit(X_train, y_train.ravel())

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [44]:
# Predicting the test set results
y_pred_svr = regressor_svr.predict(X_test)

In [45]:
calculate_rmse(y_test, y_pred_svr)

0.6496713232999762

In [46]:
# Fitting the Regression Model to the dataset
from sklearn.ensemble import RandomForestRegressor
regressor_rf = RandomForestRegressor(n_estimators = 500, 
                                     random_state = 0)
regressor_rf.fit(X_train, y_train.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [47]:
# Predicting the test set results
y_pred_rf = regressor_rf.predict(X_test)

In [48]:
calculate_rmse(y_test, y_pred_rf)

0.6633598996291715