In [1]:
#Data.csv

**Step 1: Importing the libraries**

In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

**Step 2: Importing dataset**

In [3]:
data = pd.read_csv('Data.csv')

In [4]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [5]:
data.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

**Step 3: Handling the missing data**

In [6]:
median_age = data['Age'].median()

In [7]:
data['Age'].fillna(median_age,inplace=True)

In [8]:
median_salary = data['Salary'].median()

In [9]:
data['Salary'].fillna(median_salary,inplace=True)

In [10]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,61000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 4: Encoding categorical data**

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
le = LabelEncoder()

In [15]:
data['Purchased'] = le.fit_transform(data['Purchased'])

In [16]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,61000.0,1
5,France,35.0,58000.0,1
6,Spain,38.0,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


**Step 5: Creating a dummy variable**

In [20]:
data = pd.get_dummies(data,drop_first=True)

In [21]:
data

Unnamed: 0,Age,Salary,Purchased,Country_Germany,Country_Spain
0,44.0,72000.0,0,0,0
1,27.0,48000.0,1,0,1
2,30.0,54000.0,0,1,0
3,38.0,61000.0,0,0,1
4,40.0,61000.0,1,1,0
5,35.0,58000.0,1,0,0
6,38.0,52000.0,0,0,1
7,48.0,79000.0,1,0,0
8,50.0,83000.0,0,1,0
9,37.0,67000.0,1,0,0


**Step 6: Splitting the datasets into training sets and Test sets**

In [24]:
cols = [feat for feat in data.columns if feat!= 'Purchased']

In [25]:
cols

['Age', 'Salary', 'Country_Germany', 'Country_Spain']

In [29]:
X = data[cols]
Y = data.Purchased

In [30]:
from sklearn.model_selection import train_test_split

In [34]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)

In [35]:
x_test

Unnamed: 0,Age,Salary,Country_Germany,Country_Spain
8,50.0,83000.0,1,0
3,38.0,61000.0,0,1


In [36]:
x_train.head(2)

Unnamed: 0,Age,Salary,Country_Germany,Country_Spain
1,27.0,48000.0,0,1
9,37.0,67000.0,0,0


**Step 7: Feature Scaling**

In [37]:
from sklearn.preprocessing import StandardScaler

In [38]:
scl = StandardScaler()

In [39]:
x_train = pd.DataFrame(scl.fit_transform(x_train),columns=x_train.columns)

In [41]:
x_test = pd.DataFrame(scl.fit_transform(x_test),columns=x_test.columns)