<a href="https://colab.research.google.com/github/victoriawhite17/Sales_Predictions/blob/main/Transforming_Sales_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sales Predictions
- Victoria White
- 17 August 2022

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn import set_config
set_config(display='diagram')
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from google.colab import drive
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): the dataset in the next cell is read from a published URL and
# no visible cell reads from /content/drive -- confirm this mount is still needed.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Location of the sales data: a Google Sheets document published as CSV.
SALES_CSV_URL = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTm38yOK7rXwn_p6FXOnqmqh_Wbq_C1RrpevFVocGFoK7d7cCqW80-yQwpQMP0t-H7Cei94Wo0E8cEO/pub?gid=1919646656&single=true&output=csv'
# Load the raw sales table straight from the published sheet.
df = pd.read_csv(SALES_CSV_URL)

In [3]:
# Drop exact duplicate rows.
# drop_duplicates() returns a new DataFrame and leaves the original untouched,
# so the result must be assigned back for the removal to actually apply to df.
df = df.drop_duplicates()
# Show the de-duplicated frame as the cell output.
df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [4]:
# Count the distinct non-missing values in every column; an unexpectedly high
# count on a categorical column hints at inconsistent labels.
df.nunique(axis=0, dropna=True)

Item_Identifier              1559
Item_Weight                   415
Item_Fat_Content                5
Item_Visibility              7880
Item_Type                      16
Item_MRP                     5938
Outlet_Identifier              10
Outlet_Establishment_Year       9
Outlet_Size                     3
Outlet_Location_Type            3
Outlet_Type                     4
Item_Outlet_Sales            3493
dtype: int64

In [5]:
# List the distinct Item_Fat_Content labels to spot inconsistent spellings.
pd.unique(df['Item_Fat_Content'])

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [6]:
# Collapse the inconsistent Item_Fat_Content spellings onto the two canonical
# labels ('Low Fat' and 'Regular').
# The cleaned column is assigned back to df rather than calling
# replace(..., inplace=True) on df['Item_Fat_Content']: in-place mutation through
# a column selection is chained assignment, which pandas warns about and which
# does not modify df when copy-on-write is enabled.
fat_content_fixes = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_fixes)

In [7]:
# Re-check the labels after cleanup; only the canonical spellings should remain.
df.loc[:, 'Item_Fat_Content'].unique()

array(['Low Fat', 'Regular'], dtype=object)

In [8]:
# Separate the prediction target from the feature matrix.
target_col = 'Item_Outlet_Sales'
# Columns left out of the features, in addition to the target itself.
excluded_cols = ['Item_Identifier', 'Item_Weight', 'Outlet_Identifier']
y = df[target_col]
X = df.drop(columns=excluded_cols + [target_col])

In [9]:
# Split features and target into train and test sets; random_state pins the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
X_train.info()
# Feature types:
# - nominal:   Item_Fat_Content, Item_Type, Outlet_Type
# - numerical: Item_Visibility, Item_MRP, Outlet_Establishment_Year
# - ordinal:   Outlet_Size, Outlet_Location_Type
# NOTE(review): the preprocessing below one-hot encodes every object column,
# so the ordinal features are currently treated as nominal -- confirm that is intended.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6392 entries, 4776 to 7270
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Fat_Content           6392 non-null   object 
 1   Item_Visibility            6392 non-null   float64
 2   Item_Type                  6392 non-null   object 
 3   Item_MRP                   6392 non-null   float64
 4   Outlet_Establishment_Year  6392 non-null   int64  
 5   Outlet_Size                4580 non-null   object 
 6   Outlet_Location_Type       6392 non-null   object 
 7   Outlet_Type                6392 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 449.4+ KB


None

In [10]:
# Column selectors: route columns to a preprocessing branch by dtype.
# Numeric dtypes go to the numeric branch ...
num_selector = make_column_selector(dtype_include='number')
# ... and object (string) dtypes go to the categorical branch.
cat_selector = make_column_selector(dtype_include='object')

In [11]:
# Building blocks for the preprocessing pipelines.
# Imputers: most frequent value for categorical gaps, mean for numeric gaps.
freq_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy='mean')
# Standardizes numeric features.
scaler = StandardScaler()
# Dense one-hot encoding; categories unseen during fit are ignored at transform time.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name first
# and fall back to the old one on releases that do not know it yet.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [12]:
# Numeric branch: mean-impute first, then scale.
numeric_steps = [mean_imputer, scaler]
num_pipe = make_pipeline(*numeric_steps)
# Render the pipeline diagram as the cell output.
num_pipe

In [13]:
# Categorical branch: fill gaps with the most frequent label, then one-hot encode.
categorical_steps = [freq_imputer, ohe]
cat_pipe = make_pipeline(*categorical_steps)
# Render the pipeline diagram as the cell output.
cat_pipe

In [14]:
# Pair each branch pipeline with the selector that picks its columns, then
# combine both branches into a single column transformer.
num_tuple, cat_tuple = (num_pipe, num_selector), (cat_pipe, cat_selector)
preprocessor = make_column_transformer(num_tuple, cat_tuple)
# Render the combined transformer diagram as the cell output.
preprocessor

In [15]:
# Fit the preprocessing steps on the training split only, so the test split
# does not influence the fitted preprocessing.
preprocessor.fit(X_train, y=None)

In [16]:
# Apply the already-fitted preprocessor to both splits, training split first.
X_train_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

In [18]:
# Preview the transformed training matrix (the array repr is abbreviated in the output).
X_train_processed

array([[-0.71277507,  1.82810922,  1.32784893, ...,  0.        ,
         1.        ,  0.        ],
       [-1.29105225,  0.60336888,  1.32784893, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.81331864,  0.24454056,  0.13618724, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.92052713,  1.52302674,  0.49368575, ...,  1.        ,
         0.        ,  0.        ],
       [-0.2277552 , -0.38377708,  1.0895166 , ...,  1.        ,
         0.        ,  0.        ],
       [-0.95867683, -0.73836105, -0.10214509, ...,  1.        ,
         0.        ,  0.        ]])

In [19]:
# Preview the transformed test matrix (the array repr is abbreviated in the output).
X_test_processed

array([[-0.77664625, -0.99881554, -1.29380678, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.1003166 , -1.58519423, -0.10214509, ...,  1.        ,
         0.        ,  0.        ],
       [-0.48299432, -1.59578435,  0.13618724, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.21832428,  1.09397975,  0.49368575, ...,  1.        ,
         0.        ,  0.        ],
       [-0.77809567, -0.36679966,  0.13618724, ...,  1.        ,
         0.        ,  0.        ],
       [-0.77976293,  0.11221189,  1.0895166 , ...,  1.        ,
         0.        ,  0.        ]])