# Data Preprocessing

In [1]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('Data.csv')
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


- x = Country, Age, Salary (independent variables).
- y = Purchase (dependent variable)
- contry -> categorical data
- age -> discrete data


In [3]:
df.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [7]:
#independent variables
x = df.iloc[:,:-1].values
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [9]:
#dependent variables
y = df.iloc[:,3:].values
y

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes']], dtype=object)

# Handling of missing values

In [16]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.NaN, strategy = 'mean')
imputer.fit(x[:,1:3])
x[:,1:3] = imputer.transform(x[:,1:3])

In [18]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# Spliting the data into the training dataset and testing dataset

In [20]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state=0)

In [21]:
x_train

array([['Germany', 40.0, 63777.77777777778],
       ['France', 37.0, 67000.0],
       ['Spain', 27.0, 48000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Spain', 38.0, 61000.0],
       ['France', 44.0, 72000.0],
       ['France', 35.0, 58000.0]], dtype=object)

In [22]:
x_test

array([['Germany', 30.0, 54000.0],
       ['Germany', 50.0, 83000.0]], dtype=object)

In [23]:
y_train

array([['Yes'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes']], dtype=object)

In [24]:
y_test

array([['No'],
       ['No']], dtype=object)

# Encoding Categorical Data

In [32]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()
x[:,0] = labelencoder.fit_transform(x[:,0])

In [26]:
x

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

# Data Transformation

In [33]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

columntransformer = ColumnTransformer([('encoder',OneHotEncoder(),[0])])
x1 = np.array(columntransformer.fit_transform(x),dtype=np.str)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  x1 = np.array(columntransformer.fit_transform(x),dtype=np.str)


In [34]:
x1

array([['0.0', '1.0'],
       ['1.0', '0.0'],
       ['1.0', '0.0'],
       ['1.0', '0.0'],
       ['1.0', '0.0'],
       ['0.0', '1.0'],
       ['1.0', '0.0'],
       ['0.0', '1.0'],
       ['1.0', '0.0'],
       ['0.0', '1.0']], dtype='<U32')

# Feature Scaling

In [39]:
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x1 = sc_x.fit_transform(x1)
#x_test = sc_x.fit_transform(x2)

In [37]:
x1

array([[-1.22474487,  1.22474487],
       [ 0.81649658, -0.81649658],
       [ 0.81649658, -0.81649658],
       [ 0.81649658, -0.81649658],
       [ 0.81649658, -0.81649658],
       [-1.22474487,  1.22474487],
       [ 0.81649658, -0.81649658],
       [-1.22474487,  1.22474487],
       [ 0.81649658, -0.81649658],
       [-1.22474487,  1.22474487]])