In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

import warnings
warnings.filterwarnings('ignore')

# Problem Statement

Predict the item outlet sales

# Data Gathering

In [23]:
df=pd.read_csv('sales_data.csv')
df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [25]:
df.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


# 1. Item_Identifier

In [26]:
df['Item_Identifier'].value_counts()

FDW13    10
FDG33    10
NCY18     9
FDD38     9
DRE49     9
         ..
FDY43     1
FDQ60     1
FDO33     1
DRF48     1
FDC23     1
Name: Item_Identifier, Length: 1559, dtype: int64

In [27]:
df=pd.get_dummies(df,columns=['Item_Identifier'])
df

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,...,Item_Identifier_NCZ05,Item_Identifier_NCZ06,Item_Identifier_NCZ17,Item_Identifier_NCZ18,Item_Identifier_NCZ29,Item_Identifier_NCZ30,Item_Identifier_NCZ41,Item_Identifier_NCZ42,Item_Identifier_NCZ53,Item_Identifier_NCZ54
0,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,...,0,0,0,0,0,0,0,0,0,0
1,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,...,0,0,0,0,0,0,0,0,0,0
2,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,...,0,0,0,0,0,0,0,0,0,0
3,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,...,0,0,0,0,0,0,0,0,0,0
4,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,...,0,0,0,0,0,0,0,0,0,0
8519,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,...,0,0,0,0,0,0,0,0,0,0
8520,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,...,0,0,0,0,0,0,0,0,0,0
8521,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,...,0,0,0,0,0,0,0,0,0,0


In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Columns: 1585 entries, Item_Weight to Item_Type_Starchy Foods
dtypes: float64(4), int64(6), uint8(1575)
memory usage: 13.5 MB


# 2. Item_Weight

In [28]:
df['Item_Weight'].value_counts()

12.150    86
17.600    82
13.650    77
11.800    76
15.100    68
          ..
7.275      2
7.685      1
9.420      1
6.520      1
5.400      1
Name: Item_Weight, Length: 415, dtype: int64

In [29]:
df.replace({' ':np.nan},inplace=True)

In [30]:
df.isna().sum()

Item_Weight              1463
Item_Fat_Content            0
Item_Visibility             0
Item_Type                   0
Item_MRP                    0
                         ... 
Item_Identifier_NCZ30       0
Item_Identifier_NCZ41       0
Item_Identifier_NCZ42       0
Item_Identifier_NCZ53       0
Item_Identifier_NCZ54       0
Length: 1570, dtype: int64

In [31]:
Item_Weight_median=df['Item_Weight'].median()
Item_Weight_median

12.6

In [32]:
df['Item_Weight'].fillna(Item_Weight_median,inplace=True)

In [33]:
df['Item_Weight'].isna().sum()

0

# 3. Item_Fat_Content

In [34]:
df['Item_Fat_Content'].value_counts().to_dict()

{'Low Fat': 5089, 'Regular': 2889, 'LF': 316, 'reg': 117, 'low fat': 112}

In [35]:
Item_Fat_Content_value={'Low Fat': 0, 'Regular': 1, 'LF': 3, 'reg': 4, 'low fat': 5}
df['Item_Fat_Content'].replace(Item_Fat_Content_value,inplace=True)

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Columns: 1570 entries, Item_Weight to Item_Identifier_NCZ54
dtypes: float64(4), int64(2), object(5), uint8(1559)
memory usage: 13.4+ MB


# 4. Item_Visibility

In [39]:
df['Item_Visibility'].isna().sum()

0

# 5. Item_Type

In [40]:
df['Item_Type'].value_counts()

Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64

In [41]:
df=pd.get_dummies(df,columns=['Item_Type'])
df

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,...,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods
0,9.300,0,0.016047,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,...,0,0,0,0,0,0,0,0,0,0
1,5.920,1,0.019278,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,...,0,0,0,0,0,0,0,0,1,0
2,17.500,0,0.016760,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,...,0,0,0,0,1,0,0,0,0,0
3,19.200,1,0.000000,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800,...,1,0,0,0,0,0,0,0,0,0
4,8.930,0,0.000000,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,6.865,0,0.056783,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834,...,0,0,0,0,0,0,0,1,0,0
8519,8.380,1,0.046982,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850,...,0,0,0,0,0,0,0,0,0,0
8520,10.600,0,0.035186,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136,...,0,0,1,0,0,0,0,0,0,0
8521,7.210,1,0.145221,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976,...,0,0,0,0,0,0,0,1,0,0


In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Columns: 1585 entries, Item_Weight to Item_Type_Starchy Foods
dtypes: float64(4), int64(6), uint8(1575)
memory usage: 13.5 MB


# 6. Item_MRP

In [42]:
df['Item_MRP'].isna().sum()

0

# 7. Outlet_Identifier

In [43]:
df['Outlet_Identifier'].value_counts().to_dict()

{'OUT027': 935,
 'OUT013': 932,
 'OUT049': 930,
 'OUT046': 930,
 'OUT035': 930,
 'OUT045': 929,
 'OUT018': 928,
 'OUT017': 926,
 'OUT010': 555,
 'OUT019': 528}

In [44]:
Outlet_Identifier_value={'OUT027': 27,
 'OUT013': 13,
 'OUT049': 49,
 'OUT046': 46,
 'OUT035': 35,
 'OUT045': 45,
 'OUT018': 18,
 'OUT017': 17,
 'OUT010': 10,
 'OUT019': 19}

In [45]:
df['Outlet_Identifier'].replace(Outlet_Identifier_value,inplace=True)

# 8. Outlet_Establishment_Year

In [46]:
df['Outlet_Establishment_Year'].isna().sum()

0

In [47]:
df['Outlet_Establishment_Year'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 8523 entries, 0 to 8522
Series name: Outlet_Establishment_Year
Non-Null Count  Dtype
--------------  -----
8523 non-null   int64
dtypes: int64(1)
memory usage: 66.7 KB


# 9. Outlet_Size

In [48]:
df['Outlet_Size'].isna().sum()

2410

In [49]:
Outlet_Size_mode=df['Outlet_Size'].mode()
Outlet_Size_mode[0]

'Medium'

In [50]:
df['Outlet_Size'].fillna(Outlet_Size_mode[0],inplace=True)

In [51]:
df['Outlet_Size'].isna().sum()

0

In [52]:
df['Outlet_Size'].value_counts().to_dict()

{'Medium': 5203, 'Small': 2388, 'High': 932}

In [53]:
Outlet_Size_value={ 'Small': 0,'Medium': 1, 'High': 2}
df['Outlet_Size'].replace(Outlet_Size_value,inplace=True)

# 10. Outlet_Location_Type

In [54]:
df['Outlet_Location_Type'].isna().sum()

0

In [55]:
df['Outlet_Location_Type'].value_counts().to_dict()

{'Tier 3': 3350, 'Tier 2': 2785, 'Tier 1': 2388}

In [56]:
Outlet_Location_Type_value={'Tier 1': 1, 'Tier 2': 2, 'Tier 3': 3}
df['Outlet_Location_Type'].replace(Outlet_Location_Type_value,inplace=True)

# 11. Outlet_Type

In [57]:
df['Outlet_Type'].isna().sum()

0

In [58]:
df['Outlet_Type'].value_counts().to_dict()

{'Supermarket Type1': 5577,
 'Grocery Store': 1083,
 'Supermarket Type3': 935,
 'Supermarket Type2': 928}

In [59]:
Outlet_Type_value={'Supermarket Type1': 1,'Supermarket Type2': 2,
 'Supermarket Type3': 3,'Grocery Store': 4,}
df['Outlet_Type'].replace(Outlet_Type_value,inplace=True)

In [61]:
df

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,...,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods
0,9.300,0,0.016047,249.8092,49,1999,1,1,1,3735.1380,...,0,0,0,0,0,0,0,0,0,0
1,5.920,1,0.019278,48.2692,18,2009,1,3,2,443.4228,...,0,0,0,0,0,0,0,0,1,0
2,17.500,0,0.016760,141.6180,49,1999,1,1,1,2097.2700,...,0,0,0,0,1,0,0,0,0,0
3,19.200,1,0.000000,182.0950,10,1998,1,3,4,732.3800,...,1,0,0,0,0,0,0,0,0,0
4,8.930,0,0.000000,53.8614,13,1987,2,3,1,994.7052,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,6.865,0,0.056783,214.5218,13,1987,2,3,1,2778.3834,...,0,0,0,0,0,0,0,1,0,0
8519,8.380,1,0.046982,108.1570,45,2002,1,2,1,549.2850,...,0,0,0,0,0,0,0,0,0,0
8520,10.600,0,0.035186,85.1224,35,2004,0,2,1,1193.1136,...,0,0,1,0,0,0,0,0,0,0
8521,7.210,1,0.145221,103.1332,18,2009,1,3,2,1845.5976,...,0,0,0,0,0,0,0,1,0,0


# 12. Item_Outlet_Sales

In [38]:
df['Item_Outlet_Sales'].isna().sum()

0

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Columns: 1585 entries, Item_Weight to Item_Type_Starchy Foods
dtypes: float64(4), int64(6), uint8(1575)
memory usage: 13.5 MB


# Training

In [63]:
x=df.drop('Item_Outlet_Sales',axis=1)
y=df['Item_Outlet_Sales']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=25)

In [64]:
model=LinearRegression()
model.fit(x_train,y_train)

LinearRegression()

In [66]:
# Training Data Set
y_pred_train = model.predict(x_train)

mse = mean_squared_error(y_train, y_pred_train)
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_train, y_pred_train)
print("MAE :",mae)

r2 = r2_score(y_train, y_pred_train)
print("R Squared :",r2)

MSE : 1209081.1765216244
RMSE : 1099.5822736483271
MAE : 814.4222684198677
R Squared : 0.5819457155434875


In [67]:
# Testing Dataset
y_pred = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_test, y_pred)
print("MAE :",mae)

r2 = r2_score(y_test, y_pred)
print("R Squared :",r2)

MSE : 5.205378261067718e+18
RMSE : 2281529807.183706
MAE : 55255089.65042269
R Squared : -1741513935481.8418


# Testing single columns

In [68]:
column_names=x.columns
column_names

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_MRP',
       'Outlet_Identifier', 'Outlet_Establishment_Year', 'Outlet_Size',
       'Outlet_Location_Type', 'Outlet_Type', 'Item_Identifier_DRA12',
       ...
       'Item_Type_Fruits and Vegetables', 'Item_Type_Hard Drinks',
       'Item_Type_Health and Hygiene', 'Item_Type_Household', 'Item_Type_Meat',
       'Item_Type_Others', 'Item_Type_Seafood', 'Item_Type_Snack Foods',
       'Item_Type_Soft Drinks', 'Item_Type_Starchy Foods'],
      dtype='object', length=1584)

In [69]:
df

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,...,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods
0,9.300,0,0.016047,249.8092,49,1999,1,1,1,3735.1380,...,0,0,0,0,0,0,0,0,0,0
1,5.920,1,0.019278,48.2692,18,2009,1,3,2,443.4228,...,0,0,0,0,0,0,0,0,1,0
2,17.500,0,0.016760,141.6180,49,1999,1,1,1,2097.2700,...,0,0,0,0,1,0,0,0,0,0
3,19.200,1,0.000000,182.0950,10,1998,1,3,4,732.3800,...,1,0,0,0,0,0,0,0,0,0
4,8.930,0,0.000000,53.8614,13,1987,2,3,1,994.7052,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,6.865,0,0.056783,214.5218,13,1987,2,3,1,2778.3834,...,0,0,0,0,0,0,0,1,0,0
8519,8.380,1,0.046982,108.1570,45,2002,1,2,1,549.2850,...,0,0,0,0,0,0,0,0,0,0
8520,10.600,0,0.035186,85.1224,35,2004,0,2,1,1193.1136,...,0,0,1,0,0,0,0,0,0,0
8521,7.210,1,0.145221,103.1332,18,2009,1,3,2,1845.5976,...,0,0,0,0,0,0,0,1,0,0


In [70]:
print(Item_Fat_Content_value)
print("*"*15)


print(Outlet_Identifier_value)
print("*"*15)

print(Outlet_Size_value)
print("*"*15)


print(Outlet_Location_Type_value)
print("*"*15)


print(Outlet_Type_value)

{'Low Fat': 0, 'Regular': 1, 'LF': 3, 'reg': 4, 'low fat': 5}
***************
{'OUT027': 27, 'OUT013': 13, 'OUT049': 49, 'OUT046': 46, 'OUT035': 35, 'OUT045': 45, 'OUT018': 18, 'OUT017': 17, 'OUT010': 10, 'OUT019': 19}
***************
{'Small': 0, 'Medium': 1, 'High': 2}
***************
{'Tier 1': 1, 'Tier 2': 2, 'Tier 3': 3}
***************
{'Supermarket Type1': 1, 'Supermarket Type2': 2, 'Supermarket Type3': 3, 'Grocery Store': 4}


In [71]:
Item_Weight=9.8
Item_Fat_Content='Regular'
Item_Visibility=0.5
Item_MRP=250
Outlet_Identifier='OUT018'
Outlet_Establishment_Year=2001
Outlet_Size='High'
Outlet_Location_Type='Tier 1'
Outlet_Type='Supermarket Type3'
Item_Type='Seafood'
Item_Identifier='FDP49'

# Item_Outlet_Sales= ?

In [72]:
data={'Item_Fat_Content':Item_Fat_Content_value,'Outlet_Identifier':Outlet_Identifier_value,
      'Outlet_Size':Outlet_Size_value,'Outlet_Location_Type':Outlet_Location_Type_value,
     'Outlet_Type':Outlet_Type_value,"columns":list(column_names)}

data

{'Item_Fat_Content': {'Low Fat': 0,
  'Regular': 1,
  'LF': 3,
  'reg': 4,
  'low fat': 5},
 'Outlet_Identifier': {'OUT027': 27,
  'OUT013': 13,
  'OUT049': 49,
  'OUT046': 46,
  'OUT035': 35,
  'OUT045': 45,
  'OUT018': 18,
  'OUT017': 17,
  'OUT010': 10,
  'OUT019': 19},
 'Outlet_Size': {'Small': 0, 'Medium': 1, 'High': 2},
 'Outlet_Location_Type': {'Tier 1': 1, 'Tier 2': 2, 'Tier 3': 3},
 'Outlet_Type': {'Supermarket Type1': 1,
  'Supermarket Type2': 2,
  'Supermarket Type3': 3,
  'Grocery Store': 4},
 'columns': ['Item_Weight',
  'Item_Fat_Content',
  'Item_Visibility',
  'Item_MRP',
  'Outlet_Identifier',
  'Outlet_Establishment_Year',
  'Outlet_Size',
  'Outlet_Location_Type',
  'Outlet_Type',
  'Item_Identifier_DRA12',
  'Item_Identifier_DRA24',
  'Item_Identifier_DRA59',
  'Item_Identifier_DRB01',
  'Item_Identifier_DRB13',
  'Item_Identifier_DRB24',
  'Item_Identifier_DRB25',
  'Item_Identifier_DRB48',
  'Item_Identifier_DRC01',
  'Item_Identifier_DRC12',
  'Item_Identifier_DR

In [73]:
# define one hot encoded column

Item_Type='Item_Type_'+Item_Type
Item_Type

'Item_Type_Seafood'

In [74]:
Item_Identifier='Item_Identifier_' + Item_Identifier
Item_Identifier

'Item_Identifier_FDP49'

In [75]:
item_type_index = np.where(column_names == Item_Type)[0][0]
item_type_index

1580

In [76]:
Item_Identifier_index=np.where(column_names == Item_Identifier)[0][0]
Item_Identifier_index

776

In [77]:
arr = np.zeros(x.shape[1])
arr[0]=Item_Weight
arr[1]=data['Item_Fat_Content'][Item_Fat_Content]
arr[2]=Item_Visibility
arr[3]=Item_MRP
arr[4]=data['Outlet_Identifier'][Outlet_Identifier]
arr[5]=Outlet_Establishment_Year
arr[6]=data['Outlet_Size'][Outlet_Size]
arr[7]=data['Outlet_Location_Type'][Outlet_Location_Type]
arr[8]=data['Outlet_Type'][Outlet_Type]
arr[item_type_index]=1
arr[Item_Identifier_index]=1
arr

array([9.8, 1. , 0.5, ..., 0. , 0. , 0. ])

In [79]:
model.predict([arr])

array([2.59877088e+11])

In [80]:
import pickle

with open('linear_model.pkl','wb')as f:
    pickle.dump(model,f)

In [81]:
import json

with open('project_data.json','w') as f:
    json.dump(data,f)