# Data Preprocessing Tools

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the Dataset

In [2]:
# Load the raw data from the CSV file into a DataFrame.
dataset = pd.read_csv('Data.csv')
# Split the frame into the feature matrix X (inputs) and the target Y (output).
# In iloc[rows, cols] the first ':' selects every row.
# ':-1' selects every column except the last one, i.e. the features.
X = dataset.iloc[:, :-1].values
# '-1:' selects only the last column, i.e. the target. Unlike a hard-coded
# '+3:' this stays correct if feature columns are added or removed. Because
# it is a slice, Y keeps its two-dimensional (n_rows, 1) shape.
Y = dataset.iloc[:, -1:].values


In [3]:
# Display the loaded DataFrame to inspect the raw rows, including the
# missing Age and Salary entries that are imputed in the next section.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# Taking care of missing data

In [4]:
from sklearn.impute import SimpleImputer

# Configure an imputer that replaces every missing value (NaN) with the
# mean of the column it sits in.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# The slice 1:3 covers column indices 1 and 2 (the end index is exclusive),
# which are the numeric Age and Salary columns. Column 0 (Country) is text
# and is left untouched.
numeric_block = X[:, 1:3]

# Learn the column means and write the filled-in values back into X.
X[:, 1:3] = imputer.fit_transform(numeric_block)
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [5]:
# Inspect the target values taken from the dataset; at this point they are
# still the raw 'Yes'/'No' strings (encoding happens further down).
print(Y)

[['No']
 ['Yes']
 ['No']
 ['No']
 ['Yes']
 ['Yes']
 ['No']
 ['Yes']
 ['No']
 ['Yes']]


# Encoding categorical data

## Encoding the Independent Variable

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (Country) and pass every other column through
# unchanged. The encoded dummy columns come first in the result, followed
# by the passed-through columns.
#
# sparse_threshold=0 forces a dense result. OneHotEncoder produces a sparse
# matrix, and with the default threshold ColumnTransformer may return a
# sparse matrix when the combined output is mostly zeros (e.g. many
# categories). np.array() on a sparse matrix does not give a usable 2-D array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


## Encoding the Dependent Variable

In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# LabelEncoder expects a one-dimensional array of labels. Y was sliced from
# the DataFrame as a single-column 2-D array, which makes scikit-learn emit
# its column_or_1d conversion warning. Flatten it explicitly so the encoding
# runs cleanly; ravel() is a no-op if Y is already 1-D.
Y = le.fit_transform(Y.ravel())

print(Y)

[0 1 0 0 1 1 0 1 0 1]


  y = column_or_1d(y, warn=True)


# Splitting the dataset into the Training set and Test set

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 1
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

In [9]:
# Inspect the training features produced by the split (not yet scaled).
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [10]:
# Inspect the held-out test features produced by the split (not yet scaled).
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [11]:
# Inspect the encoded labels that belong to the training rows.
print(y_train)

[0 1 0 0 1 1 0 1]


In [12]:
# Inspect the encoded labels that belong to the held-out test rows.
print(y_test)

[0 1]


# Feature Scaling

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Only the numeric columns are standardised. Applying StandardScaler to the
# whole matrix would also rescale the 0/1 country dummy columns, so those
# are passed through untouched instead.
dummy_cols = [0, 1, 2]   # one-hot country indicators
scaled_cols = [3, 4]     # Age and Salary

# Listing the passthrough columns first keeps the original column order in
# the transformed output.
ct = ColumnTransformer(
    transformers=[
        ('skip', 'passthrough', dummy_cols),
        ('encoder', StandardScaler(), scaled_cols),
    ],
    remainder='passthrough',
)

# Fit the scaler on the training rows only, then reuse those statistics for
# the test rows. Note that re-running this cell rescales data that has
# already been scaled.
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

In [14]:
# Inspect the training features after the two numeric columns were standardised.
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412422]
 [0.0 1.0 0.0 -0.014117293757057777 -0.0701316764163537]
 [1.0 0.0 0.0 0.566708506533324 0.6335624327104549]
 [0.0 0.0 1.0 -0.30453019390224867 -0.3078661727429786]
 [0.0 0.0 1.0 -1.9018011447007988 -1.4204636155515817]
 [1.0 0.0 0.0 1.1475343068237058 1.2326533634535488]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638883]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [15]:
# Inspect the test features after scaling them with the transformer fitted
# on the training set.
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860726]
 [1.0 0.0 0.0 -0.44973664397484414 0.20564033932253056]]
