# Import the necessary libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

  return f(*args, **kwds)


# Import the dataset

In [3]:
df = pd.read_csv("Data.csv")
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# Basic Data Exploration

In [5]:
#Get a rough feel of the data
df.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [14]:
#Check the size of dataset
df.shape

(10, 4)

In [11]:
#Check the data types of the objects
df.dtypes

Country       object
Age          float64
Salary       float64
Purchased     object
dtype: object

In [6]:
#Check for any null values
df.isnull()

Unnamed: 0,Country,Age,Salary,Purchased
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,True,False
5,False,False,False,False
6,False,True,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [15]:
#Check for total number of null values present
df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

# Seperating into dependant and independant variables

Finally, we define both our independant (features) variables and the dependant variables. In this case the first three columns are the features and the last column is the dependant variable. 

In [23]:
#Defining the independant and dependant variables
X = df.iloc[:,:-1].values
X #Defining the independant variables

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [24]:
y= df.iloc[:,-1].values #Defining the depedant variable
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

###### Note : 
<li>loc gets rows (and/or columns) with particular labels.
<li>iloc gets rows (and/or columns) at integer locations.

# Handling missing values

We specify the missing values and the strategy. In this case the strategy is that the missing values will be replaced by the mean


In [25]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,strategy='mean')

Now we need to transform and fit our data for the imputation. Select only the numerical columns

In [26]:
imputer.fit(X[:,1:3])  #This will select the second and third column
X[:,1:3] = imputer.transform(X[:,1:3]) #This will replace the missing values with the mean for the columns 2 and 3
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

 # Encoding the independant Variables

In machine learning, all the data fed in must be numerical. Categorical variables can be represented by numbers through encoding and thus, encoding is important. 

In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [28]:
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])],remainder = 'passthrough') #passthrough will let you keep the remaining columns i.e. Age and Salary as well
X=ct.fit_transform(X)
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [29]:
#Convert to numpy array
X = np.array(X)
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

# Ecoding the dependant Variable

In [30]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

# Splitting the dataset into Training and Test

We do the splitting of train and test set before feature scaling. This is because, the test set is always hidden and shouldnt be leaked into the training set while training the model. In feature scaling we perform either normalization or standardization i.e. converting normal distrubtion to standard normal distribution. If feature scaling was done prior to splitting dataset , this could leak the standard deviation and mean of all the values including the ones of the test set. This is called information leakage. 

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [37]:
X_train

array([[0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778]], dtype=object)

In [38]:
X_test

array([[0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0]], dtype=object)

In [39]:
y_train

array([0, 1, 1, 0, 0, 0, 1, 1])

In [40]:
y_test

array([1, 0])

# Feature Scaling

The goal of feature scaling is to make all the values in the similar range<br>
Note : Dont apply feature scaling on the one hot encoding dummy variables 

In [41]:
from sklearn.preprocessing import StandardScaler
sc= StandardScaler()
X_train[:,3:] = sc.fit_transform(X_train[:,3:])
X_test[:,3:] =sc.transform(X_test[:,3:])

In [42]:
X_train

array([[0.0, 1.0, 0.0, 1.7094303252608214, 1.6259327914558346],
       [1.0, 0.0, 0.0, -0.8588309659817607, 0.002817907784152554],
       [1.0, 0.0, 0.0, -1.2539480877113887, -0.9101842142811686],
       [0.0, 0.0, 1.0, -0.5076157466665356, -1.5188522956580495],
       [1.0, 0.0, 0.0, 0.5240789600719373, 0.5100413089315532],
       [0.0, 0.0, 1.0, -0.6612724051169467, -0.6058501735927282],
       [1.0, 0.0, 0.0, 1.3143132035311933, 1.2201540705379141],
       [0.0, 1.0, 0.0, -0.2661552833873187, -0.3240593951775053]],
      dtype=object)

In [43]:
X_test

array([[0.0, 0.0, 1.0, -2.8344165746299006, -1.92463101657597],
       [0.0, 1.0, 0.0, -2.241740892035459, -1.3159629351990891]],
      dtype=object)

##### Note : Why fit_transform() on train set and transform() on test set ?
<p>fit_transform() is used on the training data so that we can scale the training data and also learn the scaling parameters of that data. Here, the model built by us will learn the mean and variance of the features of the training set. These learned parameters are then used to scale our test data.
So what actually is happening here! 🤔
The fit method is calculating the mean and variance of each of the features present in our data. The transform method is transforming all the features using the respective mean and variance.
Now, we want scaling to be applied to our test data too and at the same time do not want to be biased with our model. We want our test data to be a completely new and a surprise set for our model. The transform method helps us in this case.</p>

<p>transform()
Using the transform method we can use the same mean and variance as it is calculated from our training data to transform our test data. Thus, the parameters learned by our model using the training data will help us to transform our test data.</p>
<br>
Thus, this prevents information leakage. 