# Data preprocessing

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [41]:
# Load the raw data set from the CSV file in the working directory.
dataset = pd.read_csv("data.csv")
# Preview the first five rows (five is also the default for head()).
dataset.head(5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [42]:
# Feature matrix: every column except the last one, converted to a NumPy array.
X = dataset.iloc[:, :-1].to_numpy()
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [43]:
# Dependent variable: the 'Purchased' column, converted to a NumPy array.
# Positional alternative: dataset.iloc[:, 3]
y = dataset["Purchased"].to_numpy()
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [44]:
# Missing data: count the missing values in each column.
dataset.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [45]:
# Handling missing data
# 1. Drop the rows that contain missing values - this throws data away and
#    can be bad for the model.
# 2. Replace each missing value with the mean of its column.

# Option 2, using scikit-learn's SimpleImputer.
from sklearn.impute import SimpleImputer as imp

# 'mean' is already the default strategy; it is spelled out here for clarity.
# SimpleImputer works column by column, so every NaN is replaced by the mean
# of its own column.
imputer = imp(strategy="mean")
# Fit the imputer on columns 1 and 2 (Age and Salary) and fill them in one step.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [46]:
# Encode the categorical data, because models work on numbers.
from sklearn.preprocessing import LabelEncoder

# Learn the country labels first, then replace column 0 with their integer codes.
labelencoder_X = LabelEncoder().fit(X[:, 0])
X[:, 0] = labelencoder_X.transform(X[:, 0])
print(X)  # values after encoding

# A problem remains: models work with equations, so a model would read these
# codes as an ordering (France: 0 < Germany: 1 < Spain: 2) that does not exist.
# To correct this we use dummy encoding.

[[0 44.0 72000.0]
 [2 27.0 48000.0]
 [1 30.0 54000.0]
 [2 38.0 61000.0]
 [1 40.0 63777.77777777778]
 [0 35.0 58000.0]
 [2 38.77777777777778 52000.0]
 [0 48.0 79000.0]
 [1 50.0 83000.0]
 [0 37.0 67000.0]]


# Dummy Encoding
Dummy (one-hot) encoding creates one column per category: each row gets a 1 in the column of its own category and a 0 in every other position.

# Now we can use ColumnTransformer to transform multiple columns with different encodings

*ColumnTransformer* lets us apply a transformer to particular columns. It helps us fit multiple transformations to multiple columns with a single fit() or fit_transform() statement. For example, we can impute the mean in column 1 and one-hot encode column 2 with a single fit statement.

In [47]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (the country) and pass the other columns through unchanged.
columntransform = make_column_transformer(
    (OneHotEncoder(categories="auto"), [0]),
    remainder="passthrough",
)
X = columntransform.fit_transform(X)
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [48]:
# transform Dependent variable ml model know its category and there is no relation
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
print(y)

[0 1 0 0 1 1 0 1 0 1]


In [49]:
# TRANSFORMED DATA: features, a separator line, then the target.
print(X, "----------------------------", y, sep="\n")

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
----------------------------
[0 1 0 0 1 1 0 1 0 1]


In [50]:
# Train / test split
from sklearn.model_selection import train_test_split

# random_state works like a seed: the same value gives the same split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
print(X_train)  # the training part of the split

[[0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 37.0 67000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


###  Feature Scaling
Feature scaling means putting the different features on the same scale.
Many machine learning models rely on the Euclidean distance between two data points,
so if one feature has a much larger range than the others, that feature will dominate the distance and the smaller features will effectively be ignored.


There are several ways to scale your data:
1. Standardisation
     for each observation of each feature, we subtract the mean of all the values of that feature and divide by their standard deviation: $x_{\text{stand}} = \dfrac{x - \operatorname{mean}(x)}{\operatorname{std}(x)}$
     
2. Normalisation
    for each observation of each feature, we subtract the minimum of all the values of that feature and divide by the difference between the maximum and the minimum of those values: $x_{\text{norm}} = \dfrac{x - \min(x)}{\max(x) - \min(x)}$
    

In [52]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training set only, then scale the training set with it.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
# The scaler is already fitted on X_train, so the test set is only transformed.
X_test = sc_X.transform(X_test)
# Do the dummy variables need to be scaled as well? It depends: some people
# scale them and some do not. Scaling changes little because everything ends up
# on the same scale, but the columns no longer show directly which country a
# row belongs to. Either way, scaling will not break the data.
# Does y need feature scaling? No, for the most part it does not.

In [53]:
# Display the training features after standard scaling.
X_train

array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
       [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
       [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
       [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
       [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
       [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
       [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
       [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]])