### Import the required libraries

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt #for visualizations

### Import the dataset

In [18]:
# Load the raw dataset; the outputs further down show the columns Gender, Age, Salary and Purchased
data = pd.read_csv('ProductPurchase.csv')

In [19]:
#Split the dataframe into model inputs (X) and the value to predict (Y)
featureFrame = data.iloc[:, :-1] #every column except the last one
targetSeries = data.iloc[:, -1] #only the last column
X = featureFrame.values #independent variables
Y = targetSeries.values #dependent variable

In [20]:
# Show the feature matrix (Gender, Age, Salary per row); the nan entries are the missing values handled below
print(X)

[['Male' 40.0 58000.0]
 ['Female' 28.0 50000.0]
 ['Male' 56.0 72000.0]
 ['Female' 23.0 55000.0]
 ['Female' 38.0 nan]
 ['Male' nan 66000.0]
 ['Male' 49.0 45000.0]
 ['Male' 52.0 38000.0]
 ['Female' 32.0 62000.0]
 ['Male' 55.0 nan]]


In [21]:
# Show the target vector: one 'Yes'/'No' Purchased label per row
print(Y)

['No' 'No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes']


### Handling missing data

In [22]:
#Method 1: Delete rows with missing data
rawRows = pd.read_csv('ProductPurchase.csv')
print("Old dataframe length:", rawRows.shape[0])
# dropna() discards every row that has at least one missing value
dropData = rawRows.dropna()
print("New dataframe length:", dropData.shape[0])

#Note: Dropping rows is advisable only when the missing data is a very small share
#(about 1%) of the entire dataset; here it removes 3 of the 10 rows

Old dataframe length: 10
New dataframe length: 7


In [23]:
#Method 2A: Impute missing values with pandas (column means)
imputeData = pd.read_csv('ProductPurchase.csv')
print("Null values before imputation:\n",imputeData.isnull().sum())
# Average only the numeric columns (Age, Salary). Gender and Purchased hold text,
# and pandas >= 2.0 raises a TypeError when asked for the mean of such columns.
columnMeans = imputeData.mean(numeric_only=True)
# fill missing values with mean column values; fillna matches the means to columns by label,
# so the text columns are left untouched. Reassigning (rather than inplace=True) keeps the step explicit.
imputeData = imputeData.fillna(columnMeans)
print("\nDataframe after imputation:\n",imputeData)

Null values before imputation:
 Gender       0
Age          1
Salary       2
Purchased    0
dtype: int64

Dataframe after imputation:
    Gender        Age   Salary Purchased
0    Male  40.000000  58000.0        No
1  Female  28.000000  50000.0        No
2    Male  56.000000  72000.0        No
3  Female  23.000000  55000.0       Yes
4  Female  38.000000  55750.0       Yes
5    Male  41.444444  66000.0       Yes
6    Male  49.000000  45000.0        No
7    Male  52.000000  38000.0       Yes
8  Female  32.000000  62000.0       Yes
9    Male  55.000000  55750.0       Yes


In [31]:
#Method 2B: Impute missing values with sklearn
from sklearn.impute import SimpleImputer
# Columns 1 and 2 of X are the numeric features (Age, Salary); column 0 is Gender and is left alone
numericCols = slice(1, 3)
# fit() learns the per-column means and returns the fitted imputer itself
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X[:, numericCols])
# Write the imputed values back into the same columns of X
X[:, numericCols] = imputer.transform(X[:, numericCols])
print(X)

[['Male' 40.0 58000.0]
 ['Female' 28.0 50000.0]
 ['Male' 56.0 72000.0]
 ['Female' 23.0 55000.0]
 ['Female' 38.0 55750.0]
 ['Male' 41.44444444444444 66000.0]
 ['Male' 49.0 45000.0]
 ['Male' 52.0 38000.0]
 ['Female' 32.0 62000.0]
 ['Male' 55.0 55750.0]]


### Encoding Categorical Data

In [36]:
#Method 1: OrdinalEncoder
from sklearn.preprocessing import OrdinalEncoder
# Work on a copy: a plain assignment would only alias `data`, so writing the encoded
# Gender column back would silently overwrite the original dataframe as well.
ordinalData = data.copy()
encoder = OrdinalEncoder()
encoder.fit(ordinalData[["Gender"]])
ordinalData[["Gender"]] = encoder.transform(ordinalData[["Gender"]])
print(ordinalData)

#Each category is replaced by a number (here Female -> 0.0, Male -> 1.0), which implies an order.
#If there is no such relation between the values of the column, this method is not ideal.
# An example where this method can be used is the size of a T-shirt: (small, medium, large) can be encoded as (0,1,2)

   Gender   Age   Salary Purchased
0     1.0  40.0  58000.0        No
1     0.0  28.0  50000.0        No
2     1.0  56.0  72000.0        No
3     0.0  23.0  55000.0       Yes
4     0.0  38.0      NaN       Yes
5     1.0   NaN  66000.0       Yes
6     1.0  49.0  45000.0        No
7     1.0  52.0  38000.0       Yes
8     0.0  32.0  62000.0       Yes
9     1.0  55.0      NaN       Yes


In [47]:
#Method 2: Dummy Variables
dummyData = pd.read_csv('ProductPurchase.csv')
# dtype=int keeps the indicator columns as 0/1 integers. Without it the dtype depends on the
# pandas version (uint8 before 2.0, bool from 2.0 on, which prints True/False instead of 1/0).
encodedGender = pd.get_dummies(dummyData.Gender, dtype=int)
tempData = pd.concat([dummyData, encodedGender], axis='columns')
#'Gender' is redundant once encoded, and 'Male' is dropped due to the dummy variable trap:
#with only two categories, Male is always 1 - Female, so keeping both adds no information
encodedData = tempData.drop(['Gender','Male'], axis='columns')
#Read more about it here: https://www.algosome.com/articles/dummy-variable-trap-regression.html
print(encodedData)

    Age   Salary Purchased  Female
0  40.0  58000.0        No       0
1  28.0  50000.0        No       1
2  56.0  72000.0        No       0
3  23.0  55000.0       Yes       1
4  38.0      NaN       Yes       1
5   NaN  66000.0       Yes       0
6  49.0  45000.0        No       0
7  52.0  38000.0       Yes       0
8  32.0  62000.0       Yes       1
9  55.0      NaN       Yes       0


In [49]:
#Method 3: Label Encoder
from sklearn.preprocessing import LabelEncoder
labelEncoderData = pd.read_csv('ProductPurchase.csv')
encoder = LabelEncoder()
# fit_transform learns the distinct Gender labels and replaces them with integer codes
# in a single step (here Female -> 0, Male -> 1)
# NOTE(review): LabelEncoder appears to be intended for target labels rather than feature
# columns - confirm against the scikit-learn documentation before reusing this pattern
labelEncoderData['Gender'] = encoder.fit_transform(labelEncoderData['Gender'])
labelEncoderData

Unnamed: 0,Gender,Age,Salary,Purchased
0,1,40.0,58000.0,No
1,0,28.0,50000.0,No
2,1,56.0,72000.0,No
3,0,23.0,55000.0,Yes
4,0,38.0,,Yes
5,1,,66000.0,Yes
6,1,49.0,45000.0,No
7,1,52.0,38000.0,Yes
8,0,32.0,62000.0,Yes
9,1,55.0,,Yes
