<a href="https://colab.research.google.com/github/zevy613/supervised-machine-learning/blob/main/Project1_part5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [137]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
set_config(display='diagram')

filename = "/content/drive/MyDrive/Colab Notebooks/CodingDojo/05IntroML/sales_predictions.csv"
df = pd.read_csv(filename)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [138]:
#first lets check for duplicates
df.duplicated().sum()

0

In [139]:
df['Item_Fat_Content'].value_counts()
# from here we clearly see mistakes in the spelling of low fat and regular fat. lets fix these.

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [140]:
df['Item_Fat_Content'].replace('LF','Low Fat', inplace=True)
df['Item_Fat_Content'].replace('low fat','Low Fat', inplace=True)
df['Item_Fat_Content'].replace('reg','Regular', inplace=True)
df['Item_Fat_Content'].value_counts()
# now we have just the two categories we want.

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [141]:
#train test split
X = df.drop(columns = ['Item_Identifier','Outlet_Establishment_Year', 'Item_Outlet_Sales'])
y = df['Item_Outlet_Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [142]:
X_train.isna().sum()
#Lets check the type of the columns that are missing data

Item_Weight             1107
Item_Fat_Content           0
Item_Visibility            0
Item_Type                  0
Item_MRP                   0
Outlet_Identifier          0
Outlet_Size             1812
Outlet_Location_Type       0
Outlet_Type                0
dtype: int64

In [143]:
display("The type of item weight is ", df['Item_Weight'].dtype)
display("The type of Outlet size is ", df['Outlet_Size'].dtype)

'The type of item weight is '

dtype('float64')

'The type of Outlet size is '

dtype('O')

In [144]:
# We'll need to impute the values for the numeric and catgorical columns.

In [145]:
#We begin by instantiating the selectors we need
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

In [146]:
#Next we'll instantiate our imputers
mean_imputer = SimpleImputer(strategy='mean')
freq_imputer = SimpleImputer(strategy='most_frequent')

In [147]:
#Group them into tuples
num_tuple = (mean_imputer, num_selector)
cat_tuple = (freq_imputer, cat_selector)

In [148]:
#instantiate and fit transformer
col_transformer = make_column_transformer(num_tuple, cat_tuple, remainder='passthrough')
col_transformer.fit(X_train)

In [149]:
#perform the transformation
X_train_imputed = col_transformer.transform(X_train)
X_test_imputed = col_transformer.transform(X_test)

In [150]:
#convert to a data frame
X_train_df = pd.DataFrame(X_train_imputed, columns=X_train.columns)
X_train_df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,16.35,0.029565,256.4646,Low Fat,Household,OUT018,Medium,Tier 3,Supermarket Type2
1,15.25,0.0,179.766,Regular,Snack Foods,OUT018,Medium,Tier 3,Supermarket Type2
2,12.35,0.158716,157.2946,Regular,Meat,OUT049,Medium,Tier 1,Supermarket Type1
3,7.975,0.014628,82.325,Low Fat,Baking Goods,OUT035,Small,Tier 2,Supermarket Type1
4,19.35,0.016645,120.9098,Low Fat,Frozen Foods,OUT045,Medium,Tier 2,Supermarket Type1


In [151]:
#check to see if there are any null values
X_train_df.isna().any()

Item_Weight             False
Item_Fat_Content        False
Item_Visibility         False
Item_Type               False
Item_MRP                False
Outlet_Identifier       False
Outlet_Size             False
Outlet_Location_Type    False
Outlet_Type             False
dtype: bool

In [152]:
#Perfect, we have no missing data! We are ready for modeling