In [None]:
#Data.csv

**Step 1: Importing the libraries**

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

**Step 2: Importing dataset**

In [2]:
df=pd.read_csv("data.csv")

In [3]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 3: Handling the missing data**

In [4]:
df.Age=df.Age.fillna(round(df.Age.mean(),0))
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,39.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [5]:
df.Salary=df.Salary.fillna(round(df.Salary.mean(),-3))
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,64000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,39.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 4: Encoding categorical data**

In [6]:
df.Purchased=df.Purchased.replace({"No":0,"Yes":1})
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,64000.0,1
5,France,35.0,58000.0,1
6,Spain,39.0,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


**Step 5: Creating a dummy variable**

In [7]:
country=pd.get_dummies(df.Country)
country

Unnamed: 0,France,Germany,Spain
0,1,0,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,1,0
5,1,0,0
6,0,0,1
7,1,0,0
8,0,1,0
9,1,0,0


In [8]:
df1=pd.concat([df,country],axis=1)

In [9]:
df1=df1.drop(columns=["Country"])

In [10]:
df1

Unnamed: 0,Age,Salary,Purchased,France,Germany,Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
4,40.0,64000.0,1,0,1,0
5,35.0,58000.0,1,1,0,0
6,39.0,52000.0,0,0,0,1
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


**Step 6: Splitting the datasets into training sets and Test sets**

In [11]:
x=df1.drop(columns=["Purchased"])

In [12]:
x

Unnamed: 0,Age,Salary,France,Germany,Spain
0,44.0,72000.0,1,0,0
1,27.0,48000.0,0,0,1
2,30.0,54000.0,0,1,0
3,38.0,61000.0,0,0,1
4,40.0,64000.0,0,1,0
5,35.0,58000.0,1,0,0
6,39.0,52000.0,0,0,1
7,48.0,79000.0,1,0,0
8,50.0,83000.0,0,1,0
9,37.0,67000.0,1,0,0


In [13]:
y=df1.Purchased
y

0    0
1    1
2    0
3    0
4    1
5    1
6    0
7    1
8    0
9    1
Name: Purchased, dtype: int64

In [14]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

**Step 7: Feature Scaling**

In [15]:
scaler = StandardScaler()
X_trains= scaler.fit_transform(X_train)
X_tests=scaler.transform(X_test)

In [16]:
X_trains

array([[ 0.24913644, -0.85408104, -0.40824829, -0.8660254 ,  1.15470054],
       [ 0.10380685, -0.02627942, -0.40824829, -0.8660254 ,  1.15470054],
       [-0.04152274,  0.52558833,  2.44948974, -0.8660254 , -0.8660254 ],
       [ 1.84776193,  1.99723566, -0.40824829,  1.15470054, -0.8660254 ],
       [-1.49481864, -1.22199287, -0.40824829, -0.8660254 ,  1.15470054],
       [-1.05882987, -0.67012512, -0.40824829,  1.15470054, -0.8660254 ],
       [ 0.39446603,  0.24965446, -0.40824829,  1.15470054, -0.8660254 ]])

In [17]:
X_tests

array([[ 0.97578439,  0.98547812,  2.44948974, -0.8660254 , -0.8660254 ],
       [ 1.55710275,  1.62932383,  2.44948974, -0.8660254 , -0.8660254 ],
       [-0.33218192, -0.30221329,  2.44948974, -0.8660254 , -0.8660254 ]])