## Libraries Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

## Loading Data

In [2]:
# Read the raw table; the relative path is resolved against the working directory.
dataset = pd.read_csv(filepath_or_buffer='Data.csv')
# Dump the whole frame as plain text so every row can be inspected.
print(dataset)

        State        Age       Salary Purchased
0  California  42.000000  60000.00000        No
1     Florida  29.000000  55000.00000       Yes
2       Texas  27.000000  72000.00000        No
3     Florida  26.000000  48000.00000        No
4       Texas  39.000000  65111.11111       Yes
5  California  34.000000  54000.00000       Yes
6     Florida  36.888889  57000.00000        No
7  California  46.000000  80000.00000       Yes
8       Texas  51.000000  85000.00000        No
9  California  38.000000  75000.00000       Yes


In [3]:
# Features (independent variables): every column except the last one,
# picked by label and converted to a NumPy array.
X = dataset[dataset.columns[:-1]].values
print(X)

[['California' 42.0 60000.0]
 ['Florida' 29.0 55000.0]
 ['Texas' 27.0 72000.0]
 ['Florida' 26.0 48000.0]
 ['Texas' 39.0 65111.111110000005]
 ['California' 34.0 54000.0]
 ['Florida' 36.88888889 57000.0]
 ['California' 46.0 80000.0]
 ['Texas' 51.0 85000.0]
 ['California' 38.0 75000.0]]


In [4]:
# Target (dependent variable): the values of the last column, picked by label.
y = dataset[dataset.columns[-1]].values
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dataset.describe()

Unnamed: 0,Age,Salary
count,10.0,10.0
mean,36.888889,65111.111111
std,8.171012,12350.838428
min,26.0,48000.0
25%,30.25,55500.0
50%,37.444444,62555.555555
75%,41.25,74250.0
max,51.0,85000.0


## Working with missing data

In [6]:
# Add some missing values in the features so the imputation step further down
# has gaps to fill.
#
# Each assignment goes through .loc and names the column it targets. Masking
# the whole frame (dataset[dataset == value]) would also blank out a matching
# number in any other column.
dataset.loc[dataset['Age'] == 51, 'Age'] = np.nan
dataset.loc[dataset['Salary'] == 60000, 'Salary'] = np.nan
print(dataset)

        State        Age       Salary Purchased
0  California  42.000000          NaN        No
1     Florida  29.000000  55000.00000       Yes
2       Texas  27.000000  72000.00000        No
3     Florida  26.000000  48000.00000        No
4       Texas  39.000000  65111.11111       Yes
5  California  34.000000  54000.00000       Yes
6     Florida  36.888889  57000.00000        No
7  California  46.000000  80000.00000       Yes
8       Texas        NaN  85000.00000        No
9  California  38.000000  75000.00000       Yes


In [7]:
# Features: rebuild the array from the frame that now contains NaN entries
# (every column except the last one, picked by label).
X = dataset[dataset.columns[:-1]].values
print(X)

[['California' 42.0 nan]
 ['Florida' 29.0 55000.0]
 ['Texas' 27.0 72000.0]
 ['Florida' 26.0 48000.0]
 ['Texas' 39.0 65111.111110000005]
 ['California' 34.0 54000.0]
 ['Florida' 36.88888889 57000.0]
 ['California' 46.0 80000.0]
 ['Texas' nan 85000.0]
 ['California' 38.0 75000.0]]


In [8]:
# Target: the values of the last column, picked by label.
y = dataset[dataset.columns[-1]].values
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [9]:
# Fill the gaps in the numeric feature columns with scikit-learn's SimpleImputer.

from sklearn.impute import SimpleImputer

# Every NaN is replaced by the mean of the column it sits in.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
# Columns 1 and 2 (Age, Salary) are fitted and filled in a single call.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

print(X)

[['California' 42.0 65679.01234555554]
 ['Florida' 29.0 55000.0]
 ['Texas' 27.0 72000.0]
 ['Florida' 26.0 48000.0]
 ['Texas' 39.0 65111.111110000005]
 ['California' 34.0 54000.0]
 ['Florida' 36.88888889 57000.0]
 ['California' 46.0 80000.0]
 ['Texas' 35.320987654444444 85000.0]
 ['California' 38.0 75000.0]]


## Encoding Categorical Data

In [10]:
# Encoding an independent variable - 'State'

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode column 0 ('State'); the remaining columns pass through
# unchanged and end up to the right of the generated dummy columns.
ct = ColumnTransformer(transformers=[('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],
                      remainder='passthrough')

# Use the built-in float as dtype: the np.float alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
X = np.array(ct.fit_transform(X), dtype=float)

print(X)

# Dummy-column layout in the output below. With categories='auto' the
# categories are taken in sorted order, so the mapping is the same on every run:
# 'California'> 100
# 'Florida'> 010
# 'Texas'> 001

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.20000000e+01
  6.56790123e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 2.90000000e+01
  5.50000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01
  7.20000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 2.60000000e+01
  4.80000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.90000000e+01
  6.51111111e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.40000000e+01
  5.40000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.68888889e+01
  5.70000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.60000000e+01
  8.00000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.53209877e+01
  8.50000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.80000000e+01
  7.50000000e+04]]


In [14]:
# Turn the dependent variable ('Purchased') into integer codes.

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Learn the label set first, then map every label to its integer code.
y = le.fit(y).transform(y)

print(y)

# Resulting codes:
# No  > 0
# Yes > 1

[0 1 0 0 1 1 0 1 0 1]


## Train & Test Sets - Splitting Data

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

# One shape per line: full feature matrix, training part, test part.
print(X.shape, X_train.shape, X_test.shape, sep='\n')

(10, 5)
(8, 5)
(2, 5)


In [17]:
# Same check for the target vector: full, training and test shapes, one per line.
print(y.shape, y_train.shape, y_test.shape, sep='\n')

(10,)
(8,)
(2,)


## Feature Scaling

Standardization vs Normalization

In [18]:
from sklearn.preprocessing import StandardScaler

# Learn each column's mean and standard deviation from the training rows only.
sc_X = StandardScaler().fit(X_train)

# Apply the training statistics to both splits; the test rows never take part in fitting.
# Every column is rescaled, including the one-hot 'State' columns.
X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)

print(X_train)

[[-1.         -0.77459667  2.64575131  0.42967955  0.25609847]
 [ 1.         -0.77459667 -0.37796447  0.26685361  1.21635534]
 [-1.          1.29099445 -0.37796447 -1.19857978 -0.72573721]
 [-1.          1.29099445 -0.37796447  0.08593591 -0.53152795]
 [ 1.         -0.77459667 -0.37796447  1.56946108  1.70187847]
 [-1.          1.29099445 -0.37796447 -1.68705758 -1.40546959]
 [ 1.         -0.77459667 -0.37796447  0.91815734  0.31124431]
 [ 1.         -0.77459667 -0.37796447 -0.38445012 -0.82284183]]


## Data Pre-Processing Template

In [19]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing dataset
dataset = pd.read_csv("Data.csv")
# Features: every column except the last one.
X = dataset.iloc[:, :-1].values
# Target: the last column, addressed relative to the end (like X) instead of
# the hard-coded position 3, so the template keeps working when the number of
# feature columns changes.
Y = dataset.iloc[:, -1].values
# NOTE(review): this result is discarded because it is not the last expression
# of the cell; move it to its own cell if the summary should be displayed.
dataset.describe()

# Split dataset into train and test set (20% held out, fixed seed for a repeatable split)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.20,random_state=0)
