In [50]:
import pandas as pd
import numpy as np

train_data= pd.read_csv('train_big_mart_sales_prediction.csv')
test_data= pd.read_csv('test_big_mart_sales_prediction.csv')

# As part of data preprocessing you need to manually validate if any of the fields (both test & train data) have blanks or NaN values. 
# Below are ways to deal with them.

In [2]:
print(train_data.shape)

(8523, 12)


In [6]:
print(train_data.head(10))

  Item_Identifier  Item_Weight Item_Fat_Content  Item_Visibility  \
0           FDA15        9.300          Low Fat         0.016047   
1           DRC01        5.920          Regular         0.019278   
2           FDN15       17.500          Low Fat         0.016760   
3           FDX07       19.200          Regular         0.000000   
4           NCD19        8.930          Low Fat         0.000000   
5           FDP36       10.395          Regular         0.000000   
6           FDO10       13.650          Regular         0.012741   
7           FDP10          NaN          Low Fat         0.127470   
8           FDH17       16.200          Regular         0.016687   
9           FDU28       19.200          Regular         0.094450   

               Item_Type  Item_MRP Outlet_Identifier  \
0                  Dairy  249.8092            OUT049   
1            Soft Drinks   48.2692            OUT018   
2                   Meat  141.6180            OUT049   
3  Fruits and Vegetables  1

In [6]:
td = train_data.isnull()
# In Pandas, isnull().sum() is used to count the number of missing values (NaNs) in each column of a DataFrame. 
# The isnull() method returns a DataFrame of boolean values, where True indicates a missing value and False indicates a non-missing value. 
# The sum() method then sums the True values in each column, effectively counting the number of missing values. 
# For instance, if you have a DataFrame named df, you can use df.isnull().sum() to get the count of missing values for each column. 
# The result will be a Pandas Series with column names as the index and the number of missing values as the values.
print(td)
print(td.sum())

      Item_Identifier  Item_Weight  Item_Fat_Content  Item_Visibility  \
0               False        False             False            False   
1               False        False             False            False   
2               False        False             False            False   
3               False        False             False            False   
4               False        False             False            False   
...               ...          ...               ...              ...   
8518            False        False             False            False   
8519            False        False             False            False   
8520            False        False             False            False   
8521            False        False             False            False   
8522            False        False             False            False   

      Item_Type  Item_MRP  Outlet_Identifier  Outlet_Establishment_Year  \
0         False     False              False    

In [7]:
print(test_data.shape)

(5681, 11)


In [8]:
td = test_data.isnull()
# In Pandas, isnull().sum() is used to count the number of missing values (NaNs) in each column of a DataFrame. 
# The isnull() method returns a DataFrame of boolean values, where True indicates a missing value and False indicates a non-missing value. 
# The sum() method then sums the True values in each column, effectively counting the number of missing values. 
# For instance, if you have a DataFrame named df, you can use df.isnull().sum() to get the count of missing values for each column. 
# The result will be a Pandas Series with column names as the index and the number of missing values as the values.
print(td)
print(td.sum())

      Item_Identifier  Item_Weight  Item_Fat_Content  Item_Visibility  \
0               False        False             False            False   
1               False        False             False            False   
2               False        False             False            False   
3               False        False             False            False   
4               False         True             False            False   
...               ...          ...               ...              ...   
5676            False        False             False            False   
5677            False        False             False            False   
5678            False        False             False            False   
5679            False        False             False            False   
5680            False        False             False            False   

      Item_Type  Item_MRP  Outlet_Identifier  Outlet_Establishment_Year  \
0         False     False              False    

In [51]:
# Data contains missing values in Item_Weight and Outlet_Size for both test_data & train_data. Let's try to fill missing values in it.
# Pandas DataFrame.loc attribute accesses a group of rows and columns by label(s) or a boolean array in the given Pandas DataFrame.
# trying to analyze the train_data to check NaN for Item_Identifier=='FDP10'
print(train_data.loc[:,['Item_Identifier','Item_Weight']][train_data.Item_Identifier=='FDP10'])

     Item_Identifier  Item_Weight
7              FDP10          NaN
585            FDP10         19.0
2623           FDP10         19.0
3382           FDP10         19.0
4585           FDP10         19.0
6087           FDP10         19.0
7883           FDP10         19.0


In [52]:
# A close look reveals Item_Weight can be identified from Unique Item_Identifier, as seen for FDP10 value above.
# One of the way to create Pandas DataFrame is by using zip() function. You can use the lists to create lists of tuples and create a dictionary 
# from it. Then, this dictionary can be used to construct a dataframe. zip() function creates the objects and that can be used to produce single 
# item at a time. This function can create pandas DataFrames by merging two lists.

item_weight ={}
for i, j in zip(train_data.Item_Identifier,train_data.Item_Weight):
    if np.isnan(j): # np.isnan(j) is a NumPy function that checks if the input j is a "Not a Number" (NaN) value. It returns a boolean value: True if j is NaN, and False otherwise. 
        continue
    else:
        item_weight[i]=j
#print(item_weight)

new_item_weight=[]
for i, j in zip(train_data.Item_Identifier,train_data.Item_Weight): # train data
    if np.isnan(j):
        if i in item_weight:
            new_item_weight.append(item_weight[i])
        else:
            new_item_weight.append(j)
    else:
        new_item_weight.append(j)
train_data.Item_Weight=new_item_weight

new_item_weight=[]
for i, j in zip(test_data.Item_Identifier,test_data.Item_Weight): # test data
    if np.isnan(j):
        if i in item_weight:
            new_item_weight.append(item_weight[i])
        else:
            new_item_weight.append(j)
    else:
        new_item_weight.append(j)
test_data.Item_Weight=new_item_weight

In [11]:
td = train_data.isnull()
# In Pandas, isnull().sum() is used to count the number of missing values (NaNs) in each column of a DataFrame. 
# The isnull() method returns a DataFrame of boolean values, where True indicates a missing value and False indicates a non-missing value. 
# The sum() method then sums the True values in each column, effectively counting the number of missing values. 
# For instance, if you have a DataFrame named df, you can use df.isnull().sum() to get the count of missing values for each column. 
# The result will be a Pandas Series with column names as the index and the number of missing values as the values.
print(td)
print(td.sum())

      Item_Identifier  Item_Weight  Item_Fat_Content  Item_Visibility  \
0               False        False             False            False   
1               False        False             False            False   
2               False        False             False            False   
3               False        False             False            False   
4               False        False             False            False   
...               ...          ...               ...              ...   
8518            False        False             False            False   
8519            False        False             False            False   
8520            False        False             False            False   
8521            False        False             False            False   
8522            False        False             False            False   

      Item_Type  Item_MRP  Outlet_Identifier  Outlet_Establishment_Year  \
0         False     False              False    

In [12]:
td = test_data.isnull()
# In Pandas, isnull().sum() is used to count the number of missing values (NaNs) in each column of a DataFrame. 
# The isnull() method returns a DataFrame of boolean values, where True indicates a missing value and False indicates a non-missing value. 
# The sum() method then sums the True values in each column, effectively counting the number of missing values. 
# For instance, if you have a DataFrame named df, you can use df.isnull().sum() to get the count of missing values for each column. 
# The result will be a Pandas Series with column names as the index and the number of missing values as the values.
print(td)
print(td.sum())

      Item_Identifier  Item_Weight  Item_Fat_Content  Item_Visibility  \
0               False        False             False            False   
1               False        False             False            False   
2               False        False             False            False   
3               False        False             False            False   
4               False        False             False            False   
...               ...          ...               ...              ...   
5676            False        False             False            False   
5677            False        False             False            False   
5678            False        False             False            False   
5679            False        False             False            False   
5680            False        False             False            False   

      Item_Type  Item_MRP  Outlet_Identifier  Outlet_Establishment_Year  \
0         False     False              False    

In [53]:
# There are still some missing values in both train and test_data. 
# We can simply remove the row from training data but fill missing value in test_data by median weight of train data

train_data= train_data[train_data.Item_Weight.notnull()]
print("Train Data shape after removal ")
print(train_data.shape)

Train Data shape after removal 
(8519, 12)


In [54]:
# Finding the median and then adding that for test data

m = train_data.Item_Weight.median()
print(m)
test_data.Item_Weight.fillna(m, inplace=True)

12.65


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data.Item_Weight.fillna(m, inplace=True)


In [9]:
print(train_data.isnull().sum())

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64


In [10]:
print(test_data.isnull().sum())

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64


In [55]:
# So Item_Weight is no longer NaN value. Now let's check for Outlet_Size
# pandas.crosstab() function in Python is used to compute a cross-tabulation (contingency table) of two or more categorical variables. 
# By default, computes a frequency table of the factors unless an array of values and an aggregation function are passed. 
# It also supports aggregation when additional data and a custom function are provided. 
# This function is ideal for summarizing relationships between categories and analyzing patterns in datasets

ct = pd.crosstab(train_data['Outlet_Size'], train_data['Outlet_Type'])
print(ct)

Outlet_Type  Grocery Store  Supermarket Type1  Supermarket Type2  \
Outlet_Size                                                        
High                     0                932                  0   
Medium                   0                930                928   
Small                  527               1860                  0   

Outlet_Type  Supermarket Type3  
Outlet_Size                     
High                         0  
Medium                     932  
Small                        0  


In [56]:
# all 'grocery store' have small outlet and all supermarket type 2 and type 3 outlet are medium size.

ct = pd.crosstab(train_data['Outlet_Size'], train_data['Outlet_Establishment_Year'])
print(ct)

Outlet_Establishment_Year  1985  1987  1997  1999  2004  2009
Outlet_Size                                                  
High                          0   932     0     0     0     0
Medium                      932     0     0   930     0   928
Small                       527     0   930     0   930     0


In [57]:
ct = pd.crosstab(train_data['Outlet_Size'], train_data['Outlet_Location_Type'])
print(ct)

Outlet_Location_Type  Tier 1  Tier 2  Tier 3
Outlet_Size                                 
High                       0       0     932
Medium                   930       0    1860
Small                   1457     930       0


In [58]:
# All Tier 2 Outlet_Location_Type are small outlet size. And Tier 1 location do not have high outlet size. 
# so we can fill 'small' outlet size as it is more in number. Similary we can use year variable to impute values of outlet size.
# The condition for the blank Outlet Size could be put up as follows, given the above tables:
# 1. if the Outlet_Type = Type 2 or Type 3, the Outlet_Size must be Medium.
# 2. if the Outlet_Type =  Grocery Store, the Outlet_Size must be Small.
# 3. if the Outlet_Establishment_Year =  1987, the Outlet_Size must be High
# 4. if the Outlet_Establishment_Year =  1997 or 2004, the Outlet_Size must be Small
# 5. if the Outlet_Establishment_Year =  1999 or 2009, the Outlet_Size must be Medium
# 6. if the Outlet_Location_Type = Tier 2, the Outlet_Size must be Small
# 7. if the Outlet_Location_Type = Tier 1, the Outlet_Size must be Small -- since more stores are Small
# 8. if the Outlet_Location_Type = Tier 3, the Outlet_Size must be Medium -- since more stores are Medium

outlet_size =[]
for loc_type,size,outlet_type,year in zip(train_data.Outlet_Location_Type,train_data.Outlet_Size,train_data.Outlet_Type,train_data.Outlet_Establishment_Year):
    if type(size)==float:
        if outlet_type=='Supermarket Type3' or outlet_type=='Supermarket Type2':
            outlet_size.append('Medium')
            continue
        elif outlet_type=='Grocery Store':
            outlet_size.append('Small')
            continue
        elif year==1987:
            outlet_size.append('High')
            continue
        elif year==1997 or year==2004:
            outlet_size.append('Small')
            continue
        elif year==1999 or year==2009:
            outlet_size.append('Small')
            continue
        elif loc_type=='Tier 2':
            outlet_size.append('Small')
            continue 
        elif loc_type=='Tier 3':
            outlet_size.append('Medium')
            continue 
        elif loc_type=='Tier 1':
            outlet_size.append('Small')
            continue
    else:
        outlet_size.append(size)
train_data.Outlet_Size= outlet_size

In [21]:
print(train_data.isnull().sum())

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64


In [59]:
# Repeating the above for test data

outlet_size =[]
for loc_type,size,outlet_type,year in zip(test_data.Outlet_Location_Type,test_data.Outlet_Size,test_data.Outlet_Type,test_data.Outlet_Establishment_Year):
    if type(size)==float:
        if outlet_type=='Supermarket Type3' or outlet_type=='Supermarket Type2':
            outlet_size.append('Medium')
            continue
        elif outlet_type=='Grocery Store':
            outlet_size.append('Small')
            continue
        elif year==1987:
            outlet_size.append('High')
            continue
        elif year==1997 or year==2004:
            outlet_size.append('Small')
            continue
        elif year==1999 or year==2009:
            outlet_size.append('Small')
            continue
        elif loc_type=='Tier 2':
            outlet_size.append('Small')
            continue 
        elif loc_type=='Tier 3':
            outlet_size.append('Medium')
            continue 
        elif loc_type=='Tier 1':
            outlet_size.append('Small')
            continue
    else:
        outlet_size.append(size)
test_data.Outlet_Size= outlet_size
print(test_data.isnull().sum())

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64


In [60]:
# To get the column names of a Pandas DataFrame that have the data type 'object', the .select_dtypes() method can be used. 
# This method allows selecting columns based on their data type. By specifying include='object', it will return a DataFrame containing only 
# the columns with the 'object' data type. Then, accessing the .columns attribute of the resulting DataFrame provides an Index object containing 
# the names of those columns.
# The select_dtypes() method returns a new DataFrame that includes/excludes columns of the specified dtype(s).
# Use the include parameter to specify the included columns, or use the exclude parameter to specify which columns to exclude

z = train_data.select_dtypes(include = 'object').columns
print(z)
for column in z:
    x=train_data.loc[:,column].nunique() # The nunique() method returns the number of unique values for each column. #By specifying the column axis (axis='columns'), the nunique() method searches column-wise and returns the number of unique values for each row.
    print('for column {} no. of unique values {}'.format(column,x))

Index(['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
       'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'],
      dtype='object')
for column Item_Identifier no. of unique values 1555
for column Item_Fat_Content no. of unique values 5
for column Item_Type no. of unique values 16
for column Outlet_Identifier no. of unique values 10
for column Outlet_Size no. of unique values 3
for column Outlet_Location_Type no. of unique values 3
for column Outlet_Type no. of unique values 4


In [61]:
from sklearn.preprocessing import LabelEncoder

# Pandas append function is used to add rows of other dataframes to end of existing dataframe, returning a new dataframe object.
# sklearn LabelEncoder will be used for Item_Identifier to convert all the unique values to single number. 
# This does not add new columns as in case of get_dummies.

le = LabelEncoder()
tdii = train_data.Item_Identifier
teii = test_data.Item_Identifier
le.fit(pd.concat([tdii, teii]))
train_data.Item_Identifier=le.transform(train_data.Item_Identifier).astype(int)
test_data.Item_Identifier=le.transform(test_data.Item_Identifier).astype(int)

In [62]:
# The get_dummies() method returns a DataFrame where the value in the input becomes a separate column filled with binary values (1s and 0s), 
# indicating the presence or absence of that value in each row of the original data.

train_data= pd.get_dummies(train_data)
print(train_data.head())

   Item_Identifier  Item_Weight  Item_Visibility  Item_MRP  \
0              156         9.30         0.016047  249.8092   
1                8         5.92         0.019278   48.2692   
2              662        17.50         0.016760  141.6180   
3             1121        19.20         0.000000  182.0950   
4             1297         8.93         0.000000   53.8614   

   Outlet_Establishment_Year  Item_Outlet_Sales  Item_Fat_Content_LF  \
0                       1999          3735.1380                False   
1                       2009           443.4228                False   
2                       1999          2097.2700                False   
3                       1998           732.3800                False   
4                       1987           994.7052                False   

   Item_Fat_Content_Low Fat  Item_Fat_Content_Regular  \
0                      True                     False   
1                     False                      True   
2                      

In [26]:
print(train_data.columns)

Index(['Item_Identifier', 'Item_Weight', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year', 'Item_Outlet_Sales', 'Item_Fat_Content_LF',
       'Item_Fat_Content_Low Fat', 'Item_Fat_Content_Regular',
       'Item_Fat_Content_low fat', 'Item_Fat_Content_reg',
       'Item_Type_Baking Goods', 'Item_Type_Breads', 'Item_Type_Breakfast',
       'Item_Type_Canned', 'Item_Type_Dairy', 'Item_Type_Frozen Foods',
       'Item_Type_Fruits and Vegetables', 'Item_Type_Hard Drinks',
       'Item_Type_Health and Hygiene', 'Item_Type_Household', 'Item_Type_Meat',
       'Item_Type_Others', 'Item_Type_Seafood', 'Item_Type_Snack Foods',
       'Item_Type_Soft Drinks', 'Item_Type_Starchy Foods',
       'Outlet_Identifier_OUT010', 'Outlet_Identifier_OUT013',
       'Outlet_Identifier_OUT017', 'Outlet_Identifier_OUT018',
       'Outlet_Identifier_OUT019', 'Outlet_Identifier_OUT027',
       'Outlet_Identifier_OUT035', 'Outlet_Identifier_OUT045',
       'Outlet_Identifier_OUT046', 'Outlet_Iden

In [63]:
# while get_dummies could have been used for Item_Identifier also, but it has no. of unique values 1555. Hence the number of columns would increase
# drastically, increasing the count of columns (dimensions) - "Curse of Dimensionality". Also, Lebel encoder is advised to be used for nominal data 
# while get_dummies is used for ordinal data. 
# Nominal data and ordinal data are both types of categorical data, but they differ in their order or ranking of categories. 
# Nominal data simply classifies categories without any inherent order, like colors (red, blue, green). 
# Ordinal data, on the other hand, classifies categories with a meaningful order or ranking, such as education levels 
# (high school, bachelor's, master's).
# Hence for Item_Identifier we use LabelEncoder, so that in one column we have multiple encoded number, and no columns are added. For other columns
# viz - Item_Fat_Content, Outlet_Identifier, Outlet_Location_Type etc. we use get_dummies, hence above rows were added to train_data.

In [64]:
test_data= pd.get_dummies(test_data)
print(test_data.head())

   Item_Identifier  Item_Weight  Item_Visibility  Item_MRP  \
0             1114       20.750         0.007565  107.8622   
1             1078        8.300         0.038428   87.3198   
2             1420       14.600         0.099575  241.7538   
3              817        7.315         0.015388  155.0340   
4             1197       13.600         0.118599  234.2300   

   Outlet_Establishment_Year  Item_Fat_Content_LF  Item_Fat_Content_Low Fat  \
0                       1999                False                      True   
1                       2007                False                     False   
2                       1998                False                      True   
3                       2007                False                      True   
4                       1985                False                     False   

   Item_Fat_Content_Regular  Item_Fat_Content_low fat  Item_Fat_Content_reg  \
0                     False                     False                 Fal

In [65]:
# All of that data is preprocessed, let's save it into new files

train_data.to_csv('processed_train_data.csv',index=False)
print(pd.read_csv('processed_train_data.csv').head(10))

   Item_Identifier  Item_Weight  Item_Visibility  Item_MRP  \
0              156        9.300         0.016047  249.8092   
1                8        5.920         0.019278   48.2692   
2              662       17.500         0.016760  141.6180   
3             1121       19.200         0.000000  182.0950   
4             1297        8.930         0.000000   53.8614   
5              758       10.395         0.000000   51.4008   
6              696       13.650         0.012741   57.6588   
7              738       19.000         0.127470  107.7622   
8              440       16.200         0.016687   96.9726   
9              990       19.200         0.094450  187.8214   

   Outlet_Establishment_Year  Item_Outlet_Sales  Item_Fat_Content_LF  \
0                       1999          3735.1380                False   
1                       2009           443.4228                False   
2                       1999          2097.2700                False   
3                       1998 

In [66]:
test_data.to_csv('processed_test_data.csv',index=False)
print(pd.read_csv('processed_test_data.csv').head(10))

   Item_Identifier  Item_Weight  Item_Visibility  Item_MRP  \
0             1114       20.750         0.007565  107.8622   
1             1078        8.300         0.038428   87.3198   
2             1420       14.600         0.099575  241.7538   
3              817        7.315         0.015388  155.0340   
4             1197       13.600         0.118599  234.2300   
5              465        9.800         0.063817  117.1492   
6              610       19.350         0.082602   50.1034   
7              268        9.195         0.015782   81.0592   
8              674        6.305         0.123365   95.7436   
9              172        5.985         0.005698  186.8924   

   Outlet_Establishment_Year  Item_Fat_Content_LF  Item_Fat_Content_Low Fat  \
0                       1999                False                      True   
1                       2007                False                     False   
2                       1998                False                      True   
3

In [67]:
# let's start training our ML model. Not using any library like scikit learn, but trying the learnings from Andrew Ng classes

train_data = pd.read_csv('processed_train_data.csv')
train_y = train_data.pop('Item_Outlet_Sales')
train_x = train_data

In [68]:
print(f"X Shape: {train_x.shape}, X Type:{type(train_x)})")
print(train_x)
print(f"y Shape: {train_y.shape}, y Type:{type(train_y)})")
print(train_y)

X Shape: (8519, 46), X Type:<class 'pandas.core.frame.DataFrame'>)
      Item_Identifier  Item_Weight  Item_Visibility  Item_MRP  \
0                 156        9.300         0.016047  249.8092   
1                   8        5.920         0.019278   48.2692   
2                 662       17.500         0.016760  141.6180   
3                1121       19.200         0.000000  182.0950   
4                1297        8.930         0.000000   53.8614   
...               ...          ...              ...       ...   
8514              370        6.865         0.056783  214.5218   
8515              897        8.380         0.046982  108.1570   
8516             1357       10.600         0.035186   85.1224   
8517              681        7.210         0.145221  103.1332   
8518               50       14.800         0.044878   75.4670   

      Outlet_Establishment_Year  Item_Fat_Content_LF  \
0                          1999                False   
1                          2009         

In [69]:
X  = train_x.to_numpy()
Y  = train_y.to_numpy()
print(type(X))
print(type(Y))
w_size = len(X[0])
print(w_size)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
46


In [34]:
b_init = 785.1811367994083
w_init = np.random.uniform(0, 1, w_size)
print(w_init)
print(f"w_init shape: {w_init.shape}, b_init type: {type(b_init)}")

[0.49386532 0.27860199 0.18127279 0.99827739 0.83556756 0.26004257
 0.47816635 0.96090829 0.17621644 0.57177156 0.93224362 0.32997676
 0.41553769 0.82773918 0.26627487 0.00208105 0.25146278 0.94217896
 0.82289367 0.38152619 0.56998813 0.32330328 0.74066318 0.2577017
 0.34301447 0.69109908 0.25676977 0.71441448 0.13885877 0.29853626
 0.63614611 0.38523198 0.72833793 0.39169827 0.67230182 0.47770074
 0.85026785 0.12706128 0.22068636 0.59883297 0.0179919  0.23896085
 0.70931461 0.91276301 0.35345413 0.71943766]
w_init shape: (46,), b_init type: <class 'float'>


In [35]:
def predict_single_loop(x, w, b): 
    """
    single predict using linear regression
    
    Args:
      x (ndarray): Shape (n,) example with multiple features
      w (ndarray): Shape (n,) model parameters    
      b (scalar):  model parameter     
      
    Returns:
      p (scalar):  prediction
    """
    n = x.shape[0]
    p = 0
    for i in range(n):
        p_i = x[i] * w[i]  
        p = p + p_i         
    p = p + b                
    return p

In [36]:
# get a row from our training data
x_vec = X[0,:]
print(f"x_vec shape {x_vec.shape}, x_vec value: {x_vec}")

# make a prediction
f_wb = predict_single_loop(x_vec, w_init, b_init)
print(f"f_wb shape {f_wb.shape}, prediction: {f_wb}")

x_vec shape (46,), x_vec value: [156 9.3 0.016047301 249.8092 1999 False True False False False False
 False False False True False False False False False False False False
 False False False False False False False False False False False False
 True False True False True False False False True False False]
f_wb shape (), prediction: 2787.357260296407


In [37]:
def predict(x, w, b): 
    """
    single predict using linear regression
    Args:
      x (ndarray): Shape (n,) example with multiple features
      w (ndarray): Shape (n,) model parameters   
      b (scalar):             model parameter 
      
    Returns:
      p (scalar):  prediction
    """
    p = np.dot(x, w) + b  
    #print (p.shape)
    return p 

In [38]:
# get a row from our training data
x_vec = X[0,:]
print(f"x_vec shape {x_vec.shape}, x_vec value: {x_vec}")

# make a prediction
f_wb = predict(x_vec,w_init, b_init)
print("prediction:" + str(f_wb))

x_vec shape (46,), x_vec value: [156 9.3 0.016047301 249.8092 1999 False True False False False False
 False False False True False False False False False False False False
 False False False False False False False False False False False False
 True False True False True False False False True False False]
prediction:2787.357260296407


In [72]:
# Compute Cost With Multiple Variables

def compute_cost(X, y, w, b): 
    """
    compute cost
    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters  
      b (scalar)       : model parameter
      
    Returns:
      cost (scalar): cost
    """
    m = X.shape[0]
    cost = 0.0
    for i in range(m):                                
        f_wb_i = np.dot(X[i], w) + b           #(n,)(n,) = scalar (see np.dot)
        cost = cost + (f_wb_i - y[i])**2       #scalar
    cost = cost / (2 * m)                      #scalar    
    return cost

In [40]:
# Compute and display cost using our pre-chosen optimal parameters. 
cost = compute_cost(X, Y, w_init, b_init)
print(f'Cost at optimal w : {cost}')

Cost at optimal w : 1746404.478909463


In [73]:
# Compute Gradient with Multiple Variables

def compute_gradient(X, y, w, b): 
    """
    Computes the gradient for linear regression 
    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters  
      b (scalar)       : model parameter
      
    Returns:
      dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w. 
      dj_db (scalar):       The gradient of the cost w.r.t. the parameter b. 
    """
    m,n = X.shape           #(number of examples, number of features)
    dj_dw = np.zeros((n,))
    dj_db = 0.

    for i in range(m):                             
        err = (np.dot(X[i], w) + b) - y[i]   
        for j in range(n):                         
            dj_dw[j] = dj_dw[j] + err * X[i, j]    
        dj_db = dj_db + err                        
    dj_dw = dj_dw / m                                
    dj_db = dj_db / m                                
        
    return dj_db, dj_dw

In [42]:
#Compute and display gradient 
tmp_dj_db, tmp_dj_dw = compute_gradient(X, Y, w_init, b_init)
print(f'dj_db at initial w,b: {tmp_dj_db}')
print(f'dj_dw at initial w,b: \n {tmp_dj_dw}')

dj_db at initial w,b: 805.6630788712908
dj_dw at initial w,b: 
 [ 7.25957895e+05  1.03316475e+04  6.42343525e+01  5.73145016e+04
  1.61031922e+06  3.46385057e+01  5.02588930e+02  2.43216436e+02
  1.17623242e+01  1.34568826e+01  7.66610483e+01  2.55149489e+01
  1.07442399e+01  5.39415450e+01  5.24966659e+01  7.80735607e+01
  9.53829198e+01  1.28482284e+01  7.88745784e+01  1.11120321e+02
  4.20032720e+01  2.70084293e+01  3.82087369e+00  9.80018396e+01
  3.21301339e+01  7.04047396e+00  1.73018019e+02  7.44130757e+01
  7.05458691e+01  1.09039376e+02  1.63572263e+02 -7.83918339e+01
  5.96883433e+01  8.66926844e+01  7.80249796e+01  6.90603018e+01
  7.44130757e+01  9.97078441e+01  6.31542159e+02  3.10657545e+02
  2.16926897e+02  2.78078637e+02  3.36590283e+02  4.38425254e+02
  1.09039376e+02 -7.83918339e+01]


In [70]:
# Implementing Gradient Descent
import math
import copy

def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters): 
    """
    Performs batch gradient descent to learn theta. Updates theta by taking 
    num_iters gradient steps with learning rate alpha
    
    Args:
      X (ndarray (m,n))   : Data, m examples with n features
      y (ndarray (m,))    : target values
      w_in (ndarray (n,)) : initial model parameters  
      b_in (scalar)       : initial model parameter
      cost_function       : function to compute cost
      gradient_function   : function to compute the gradient
      alpha (float)       : Learning rate
      num_iters (int)     : number of iterations to run gradient descent
      
    Returns:
      w (ndarray (n,)) : Updated values of parameters 
      b (scalar)       : Updated value of parameter 
      """
    
    # An array to store cost J and w's at each iteration primarily for graphing later
    J_history = []
    w = copy.deepcopy(w_in)  #avoid modifying global w within function
    b = b_in
    c = 0
    
    for i in range(num_iters):
        c = c + 1
        # Calculate the gradient and update the parameters
        dj_db,dj_dw = gradient_function(X, y, w, b)   ##None

        # Update Parameters using w, b, alpha and gradient
        w = w - alpha * dj_dw               ##None
        b = b - alpha * dj_db               ##None
      
        # Save cost J at each iteration
        if i<100000:      # prevent resource exhaustion 
            J_history.append(cost_function(X, y, w, b))

        # Print cost every at intervals 10 times or as many iterations if < 10
        #if (i%math.ceil(num_iters/100) == 0):
        if (i%100 == 0):
            print(f"Iteration {i:4d}: Cost {J_history[-1]:8.2f}   ")
        
    return w, b, J_history #return final w,b and J history for graphing

In [45]:
# initialize parameters
initial_w = np.zeros_like(w_init)
initial_b = 0.
# some gradient descent settings
iterations = 500
alpha = 9.9e-10 # the initial value for this was 5.0e-7, now changed so that convergence can happen.
# run gradient descent 
print(initial_w)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [46]:
w_final, b_final, J_hist = gradient_descent(X, Y, initial_w, initial_b,
                                                    compute_cost, compute_gradient, 
                                                    alpha, iterations)
print(f"b,w found by gradient descent: {b_final:0.2f},{w_final} ")
m,_ = X.shape
for i in range(m):
    print(f"prediction: {np.dot(X[i], w_final) + b_final:0.2f}, target value: {Y[i]}")

Iteration    0: Cost 3812961.54   
Iteration  100: Cost 2398608.43   
Iteration  200: Cost 1835745.20   
Iteration  300: Cost 1611460.02   
Iteration  400: Cost 1521809.11   
b,w found by gradient descent: 0.00,[ 3.11598718e-01  5.52286510e-03  2.26316945e-05  8.97638937e-02
  8.50272386e-01  1.36337927e-05  2.46790623e-04  1.55887679e-04
  4.98765903e-06  4.47754809e-06  2.40177891e-05  1.23456753e-05
  5.17722379e-06  3.49649172e-05  3.78324896e-05  4.19461866e-05
  7.05906794e-05  1.19815421e-05  1.65383121e-05  4.25367661e-05
  2.05353245e-05  4.62088987e-06  3.98654384e-06  6.70895518e-05
  2.17530605e-05  9.86034998e-06 -3.17830284e-05  5.32955008e-05
  5.46485471e-05  3.60117754e-05 -2.99187454e-05  1.28897835e-04
  6.04095956e-05  4.69680659e-05  5.16142170e-05  5.56335392e-05
  5.32955008e-05  2.20543149e-04  1.51938652e-04  7.73290109e-05
  1.62026209e-04  1.86422082e-04 -6.17017738e-05  3.22569466e-04
  3.60117754e-05  1.28897835e-04] 
prediction: 1770.78, target value: 3735

In [70]:
# We can see above that predictions are not very good. Hence let's try feature engineering.

#print(train_x.head())
mu   = np.mean(train_x,axis=0)   
sigma  = np.std(train_x,axis=0) 
X_mean = (train_x - mu)
X_norm = (train_x - mu)/sigma
print(X_norm)

      Item_Identifier  Item_Weight  Item_Visibility  Item_MRP  \
0           -1.388462    -0.769598        -0.970582  1.746938   
1           -1.717885    -1.497133        -0.907946 -1.489096   
2           -0.262192     0.995427        -0.956764  0.009762   
3            0.759463     1.361347        -1.281681  0.659682   
4            1.151209    -0.849240        -1.281681 -1.399305   
...               ...          ...              ...       ...   
8514        -0.912135    -1.293725        -0.180855  1.180344   
8515         0.260877    -0.967626        -0.370861 -0.527506   
8516         1.284759    -0.489777        -0.599546 -0.897362   
8517        -0.219902    -1.219465         1.533626 -0.608170   
8518        -1.624400     0.414260        -0.411653 -1.052394   

      Outlet_Establishment_Year  Item_Fat_Content_LF  \
0                      0.138865            -0.196271   
1                      1.333806            -0.196271   
2                      0.138865            -0.19627

In [26]:
print(train_x.head())
print(train_y.head())

   Item_Identifier  Item_Weight  Item_Visibility  Item_MRP  \
0              156         9.30         0.016047  249.8092   
1                8         5.92         0.019278   48.2692   
2              662        17.50         0.016760  141.6180   
3             1121        19.20         0.000000  182.0950   
4             1297         8.93         0.000000   53.8614   

   Outlet_Establishment_Year  Item_Fat_Content_LF  Item_Fat_Content_Low Fat  \
0                       1999                False                      True   
1                       2009                False                     False   
2                       1999                False                      True   
3                       1998                False                     False   
4                       1987                False                      True   

   Item_Fat_Content_Regular  Item_Fat_Content_low fat  Item_Fat_Content_reg  \
0                     False                     False                 Fal

In [81]:
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
np.set_printoptions(precision=2)

In [84]:
scaler = StandardScaler()
X_norm = scaler.fit_transform(train_x)
print(X_norm)

[[-1.39 -0.77 -0.97 ...  0.73 -0.35 -0.35]
 [-1.72 -1.5  -0.91 ... -1.38  2.86 -0.35]
 [-0.26  1.   -0.96 ...  0.73 -0.35 -0.35]
 ...
 [ 1.28 -0.49 -0.6  ...  0.73 -0.35 -0.35]
 [-0.22 -1.22  1.53 ... -1.38  2.86 -0.35]
 [-1.62  0.41 -0.41 ...  0.73 -0.35 -0.35]]


In [85]:
sgdr = SGDRegressor(max_iter=1000)
sgdr.fit(X_norm, train_y)
print(sgdr)
print(f"number of iterations completed: {sgdr.n_iter_}, number of weight updates: {sgdr.t_}")

SGDRegressor()
number of iterations completed: 34, number of weight updates: 289647.0


In [86]:
b_norm = sgdr.intercept_
w_norm = sgdr.coef_
print(f"model parameters:                   w: {w_norm}, b:{b_norm}")
print( "model parameters from previous lab: w: [110.56 -21.27 -32.71 -37.97], b: 363.16")

model parameters:                   w: [ 3.16e+01  2.43e+00 -8.37e+00  9.78e+02 -2.58e+01 -1.08e+01 -2.73e+01
  3.20e+01  6.03e+00 -3.13e+00 -1.80e+00  2.71e+01  2.20e+01  2.88e+01
 -5.70e+00 -4.10e+00 -2.13e+01  8.49e+00 -1.23e+01 -6.25e+00 -8.99e-01
 -1.99e+00  1.32e+01 -7.12e+00  5.47e-01  9.67e+00 -2.03e+02 -2.62e+01
  4.75e+01 -4.90e+01 -1.69e+02  2.17e+02  7.60e+01 -2.14e+01  7.85e+01
 -3.17e+01 -2.62e+01  9.10e+01 -6.96e+01 -5.84e+01  6.78e+01 -1.14e+01
 -2.73e+02  8.03e+01 -4.90e+01  2.17e+02], b:[2198.16]
model parameters from previous lab: w: [110.56 -21.27 -32.71 -37.97], b: 363.16


In [88]:
# make a prediction using sgdr.predict()
y_pred_sgd = sgdr.predict(X_norm)
# make a prediction using w,b. 
y_pred = np.dot(X_norm, w_norm) + b_norm  
print(f"prediction using np.dot() and sgdr.predict match: {(y_pred == y_pred_sgd).all()}")

print(f"Prediction on training set:\n{y_pred[:4]}" )
print(f"Target values \n{train_y[:4]}")

prediction using np.dot() and sgdr.predict match: True
Prediction on training set:
[3885.33  549.6  2243.72 1049.97]
Target values 
0    3735.1380
1     443.4228
2    2097.2700
3     732.3800
Name: Item_Outlet_Sales, dtype: float64


In [99]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
#X must be a 2-D Matrix
linear_model.fit(X_norm, Y)

In [100]:
b = linear_model.intercept_
w = linear_model.coef_
print(f"w = {w:}, b = {b:0.2f}")
print(f"'manual' prediction: f_wb = wx+b : {1200*w + b}")

w = [ 1.97e+01  5.70e-01 -1.58e+01  9.69e+02 -3.09e+01 -8.92e+00 -7.50e+00
  1.27e+01  3.58e+00 -9.08e+00  1.93e+00  6.23e-01  2.00e+00  9.65e+00
 -7.31e+00 -3.96e+00  1.41e+01  5.37e+00 -8.18e+00 -1.88e+01  9.54e-01
 -6.05e+00  1.76e+01 -1.50e+00  2.29e+00  6.61e+00 -1.96e+02 -9.11e+00
  4.38e+01 -4.84e+01 -1.72e+02  2.07e+02  5.29e+01 -1.63e+01  7.17e+01
 -1.35e+01 -9.11e+00  9.64e+01 -8.55e+01 -5.21e+01  5.34e+01 -3.41e+00
 -2.70e+02  8.49e+01 -4.84e+01  2.07e+02], b = 2181.19
'manual' prediction: f_wb = wx+b : [ 2.58e+04  2.86e+03 -1.67e+04  1.17e+06 -3.50e+04 -8.52e+03 -6.81e+03
  1.74e+04  6.48e+03 -8.72e+03  4.50e+03  2.93e+03  4.58e+03  1.38e+04
 -6.59e+03 -2.57e+03  1.91e+04  8.63e+03 -7.63e+03 -2.04e+04  3.33e+03
 -5.08e+03  2.33e+04  3.86e+02  4.93e+03  1.01e+04 -2.33e+05 -8.75e+03
  5.47e+04 -5.59e+04 -2.05e+05  2.50e+05  6.57e+04 -1.73e+04  8.82e+04
 -1.41e+04 -8.75e+03  1.18e+05 -1.00e+05 -6.03e+04  6.63e+04 -1.92e+03
 -3.22e+05  1.04e+05 -5.59e+04  2.50e+05]


In [107]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
y_pred = linear_model.predict(X_norm)
print("Prediction on training set:", y_pred)
X_test = pd.read_csv('processed_test_data.csv')
mae = mean_absolute_error(train_y, y_pred)
print("mean_absolute_error:", mae)
mse = mean_squared_error(train_y, y_pred)
print("mean_squared_error:", mse)
y_pred_test = linear_model.predict(X_test)
print("Prediction on testing set:", y_pred_test)

Prediction on training set: [3998.7   559.92 2369.41 ... 1522.6  1390.32 1213.16]
mean_absolute_error: 835.6643155223994
mean_squared_error: 1270109.1917809336
Prediction on testing set: [ 66948.44  46047.64 202100.06 ...  83462.6  158483.87  37291.23]




In [71]:
from sklearn.model_selection import train_test_split
X = pd.read_csv('processed_train_data.csv')
Y = X.pop('Item_Outlet_Sales')
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(6815, 46) (1704, 46) (6815,) (1704,)


In [72]:
from xgboost import XGBRegressor
m = XGBRegressor()
#X must be a 2-D Matrix
m.fit(X_train, y_train)

In [78]:
y_pred = m.predict(X_test)
print(Y_test.shape)

(1704,)


In [83]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse}")
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")

MSE: 1454511.6050452772
RMSE: 1206.0313449679809
