**Import Libraries**

In [31]:
import pandas as pd
import numpy as np

**Import Dataset**

In [32]:
# Read Data.csv (path is relative to the current working directory) into a
# DataFrame and print every row so the raw values, including NaNs, are visible.
dataset = pd.read_csv(
    'Data.csv'
)
print(dataset)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [33]:
# Separate the table into model inputs and the value to predict:
#   x -> every column except the last one (Country, Age, Salary)
#   y -> the last column only (Purchased)
# .values hands back plain NumPy arrays rather than pandas objects.
x, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values
print(x, y, sep='\n')

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [34]:
# You can see that there are some missing values (nan). If missing values are
# not handled properly, you may end up building a biased machine learning
# model, leading to incorrect results.

**Handling missing values**

In [35]:
# How to handle missing values?
#   1. Imputation - fill each missing value with a substitute, e.g.
#        a. Mean imputation
#        b. Median imputation
#   2. Deletion - remove the rows/columns that contain missing values

In [36]:
from sklearn.impute import SimpleImputer

# Columns 1 and 2 of x hold Age and Salary. Each NaN in those columns is
# replaced by the mean of the non-missing values of the same column; the
# Country column (index 0) is not touched.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


**Encoding Categorical Variables**

In [37]:
# Since most machine learning models only accept numerical variables,
# preprocessing the categorical variables becomes a necessary step. We need to
# convert these categorical variables to numbers such that the model is able to
# understand and extract valuable information.


# Encoding is applied column-wise, to the columns of the data that hold categorical values.

In [38]:
# Encoding independent variables
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encoding replaces the Country column (index 0) with one 0/1 indicator
# column per category, so no order or rank between the categories is implied.
# remainder='passthrough' keeps the remaining columns as they are; in the
# printed result they follow the indicator columns.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
# Wrap the transformed data in np.array so that x is a NumPy array for the
# splitting and scaling steps that follow.
x = np.array(ct.fit_transform(x))
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [39]:
# Label Encoding - a technique that converts categorical labels into integers
# so that they can be used by machine learning models which only take numerical
# data. It is an important pre-processing step in a machine-learning project.
# Here it is used to encode the dependent variable.

# LabelEncoder assigns one integer per class (0, 1, 2, ...), so the codes carry
# an implied order/rank. That is fine for a target column, but for nominal input
# features prefer one-hot encoding.

In [40]:
# Encoding dependent variables
from sklearn.preprocessing import LabelEncoder

# Learn the distinct labels in y, then replace each label with its integer
# code. For this data the printed result shows 'No' -> 0 and 'Yes' -> 1.
le = LabelEncoder().fit(y)
y = le.transform(y)
print(y)

[0 1 0 0 1 1 0 1 0 1]


**Split data into training set & test set**

In [41]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. random_state fixes the shuffle so
# the same rows land in each split every time the cell is run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)


In [42]:
# Feature rows that ended up in the training split (still unscaled at this point).
print(x_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [43]:
# Feature rows held out for the test split (still unscaled at this point).
print(x_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [45]:
# Encoded target values that belong to the training rows.
print(y_train)

[0 1 0 0 1 1 0 1]


In [46]:
# Encoded target values that belong to the held-out test rows.
print(y_test)

[0 1]


**Feature Scaling**

In [48]:
# Feature scaling is a data preprocessing technique used to transform the values
# of features or variables in a dataset to a similar scale. The purpose is to
# ensure that all features contribute equally to the model and to avoid the
# domination of features with larger values.

In [54]:
from sklearn.preprocessing import StandardScaler

# Not every feature needs scaling. This cell nevertheless standardizes every
# column of x_train, including the three one-hot country columns, because the
# scaler is applied to the whole array.
# The per-column mean and standard deviation are learned from the training
# rows only ...
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
# ... and those same training statistics are reused for the test rows. The
# scaler is deliberately NOT re-fitted on x_test, so the test data stays unseen.
x_test = scaler.transform(x_test)


In [55]:
# Show the training features after standardization.
print("Scaled x_train",x_train)

Scaled x_train [[-0.77459667 -0.57735027  1.29099445 -0.19159184 -1.07812594]
 [-0.77459667  1.73205081 -0.77459667 -0.01411729 -0.07013168]
 [ 1.29099445 -0.57735027 -0.77459667  0.56670851  0.63356243]
 [-0.77459667 -0.57735027  1.29099445 -0.30453019 -0.30786617]
 [-0.77459667 -0.57735027  1.29099445 -1.90180114 -1.42046362]
 [ 1.29099445 -0.57735027 -0.77459667  1.14753431  1.23265336]
 [-0.77459667  1.73205081 -0.77459667  1.43794721  1.57499104]
 [ 1.29099445 -0.57735027 -0.77459667 -0.74014954 -0.56461943]]


In [56]:
# Show the test features after they were transformed with the scaler fitted on x_train.
print("x_test ",x_test)

x_test  [[-0.77459667  1.73205081 -0.77459667 -1.46618179 -0.9069571 ]
 [ 1.29099445 -0.57735027 -0.77459667 -0.44973664  0.20564034]]
