# Data Preprocessing
- Dealing with Duplicate Values
- Dealing with Missing Values
- Scaling
- Dealing with Categorical Data
- Splitting Data For Training and testing

# Some Other steps
- Transformations
- Descretization


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv("Data.csv")
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    11 non-null     object 
 1   Age        10 non-null     float64
 2   Salary     10 non-null     float64
 3   Purchased  11 non-null     object 
dtypes: float64(2), object(2)
memory usage: 480.0+ bytes


In [4]:
df.nunique()

Country      3
Age          9
Salary       9
Purchased    2
dtype: int64

In [5]:
print("Countries :",df.Country.unique())
print("Purchased :",df.Purchased.unique())


Countries : ['France' 'Spain' 'Germany']
Purchased : ['No' 'Yes']


# Dealing with Duplicate Values

In [6]:
df.duplicated().sum()

1

In [7]:
df.drop_duplicates(inplace=True)

In [8]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# ===========================

# Dealing with Missing Values
- If the number of missing values is large with respect to total values we drop the column
- If missing values are in numerical column then replace by mean or median
-If missing values are in categorical Column then they can replaces by mode


In [9]:
'''
If null values are not represented by np.nan in the dataset then 
replace the symbol used to represent null values with np.nan.

'''

'\nIf null values are not represented by np.nan in the dataset then \nreplace the symbol used to represent null values with np.nan.\n\n'

In [10]:
df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

using Pandas


In [11]:
avg_age=df.Age.mean()
avg_salary=df.Salary.mean()
print("Average Age :",avg_age)
print("Average Salary :",avg_salary)

Average Age : 38.77777777777778
Average Salary : 63777.77777777778


In [12]:
df.Age.replace(np.nan,avg_age)

0    44.000000
1    27.000000
2    30.000000
3    38.000000
4    40.000000
5    35.000000
6    38.777778
7    48.000000
8    50.000000
9    37.000000
Name: Age, dtype: float64

In [13]:
df.Salary.replace(np.nan,avg_salary)

0    72000.000000
1    48000.000000
2    54000.000000
3    61000.000000
4    63777.777778
5    58000.000000
6    52000.000000
7    79000.000000
8    83000.000000
9    67000.000000
Name: Salary, dtype: float64

In [14]:
df               #We need to set inplace=True 

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [15]:
df.Age.replace(np.nan,df.Age.mean(),inplace=True)
df.Salary.replace(np.nan,df.Salary.mean(),inplace=True)

In [16]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### Using scikit Learn

In [17]:
df2=pd.read_csv("Data.csv")
df.drop_duplicates(inplace=True)

In [18]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [19]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(missing_values=np.nan,strategy='mean')
df2[['Age',"Salary"]]=imputer.fit_transform(df2[['Age',"Salary"]])
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,64100.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.6,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# ========================================

# Scaling 
- Used to bring features to same scale

    1.standard Scaler
    
    2.Min Max Scaler

In [20]:
X=df2[['Country','Age','Salary']].values
Y=df2[['Purchased']].values

import copy
X2=copy.copy(X)

In [21]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 64100.0],
       ['France', 35.0, 58000.0],
       ['Spain', 38.6, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [22]:
Y

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['Yes']], dtype=object)

## Standard Scaler

- X_sclaed=(X-X_mean)/X_Std
- Performs z-score transforamtion
- Unit variance
- Zero Mean

In [23]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X[:,1:]=sc.fit_transform(X[:,1:])

In [24]:
X

array([['France', 0.8205484434881531, 0.752334427262497],
       ['Spain', -1.7626596193449222, -1.5332385163197724],
       ['Germany', -1.3067993729626148, -0.961845280424205],
       ['Spain', -0.0911720492764617, -0.2952198385460431],
       ['Germany', 0.2127347816450766, 0.0],
       ['France', -0.5470322956587691, -0.5809164564938268],
       ['Spain', 0.0, -1.152309692389394],
       ['France', 1.4283621053312296, 1.4189598691406589],
       ['Germany', 1.7322689362527681, 1.7998886930710372],
       ['France', -0.24312546473723085, 0.27617339734952423],
       ['France', -0.24312546473723085, 0.27617339734952423]],
      dtype=object)

In [25]:
print(X[:,1].var())        #unit variance
print(X[:,1].mean())       #Zero mean


1.0
-2.4223047810003414e-16


In [26]:
print(X[:,2].var())        #unit variance
print(X[:,2].mean())       #Zero mean


0.9999999999999999
-3.027880976250427e-17


## MinMax Scaler
- X_scaled = (X-X_min)/(X_max-X_min)
- scales the data between range 0 to 1

In [27]:
from sklearn.preprocessing import MinMaxScaler
mm=MinMaxScaler()
X2[:,1:]=mm.fit_transform(X2[:,1:])
X2

array([['France', 0.7391304347826089, 0.6857142857142855],
       ['Spain', 0.0, 0.0],
       ['Germany', 0.1304347826086958, 0.17142857142857149],
       ['Spain', 0.4782608695652175, 0.37142857142857144],
       ['Germany', 0.5652173913043479, 0.45999999999999996],
       ['France', 0.34782608695652173, 0.2857142857142856],
       ['Spain', 0.5043478260869567, 0.11428571428571432],
       ['France', 0.9130434782608696, 0.8857142857142857],
       ['Germany', 1.0, 1.0],
       ['France', 0.43478260869565233, 0.5428571428571427],
       ['France', 0.43478260869565233, 0.5428571428571427]], dtype=object)

# ==================================================

# Dealing With Categorical data
- One Hot Encoding
- Ordinal Encoding 
- Label Encoding

### Using Pandas

In [28]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [29]:
#One Hot Encoding
dummy1=pd.get_dummies(df.Country)
dummy1

Unnamed: 0,France,Germany,Spain
0,1,0,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,1,0
5,1,0,0
6,0,0,1
7,1,0,0
8,0,1,0
9,1,0,0


In [30]:
df=pd.concat([dummy1,df],axis=1)

In [31]:
df

Unnamed: 0,France,Germany,Spain,Country,Age,Salary,Purchased
0,1,0,0,France,44.0,72000.0,No
1,0,0,1,Spain,27.0,48000.0,Yes
2,0,1,0,Germany,30.0,54000.0,No
3,0,0,1,Spain,38.0,61000.0,No
4,0,1,0,Germany,40.0,63777.777778,Yes
5,1,0,0,France,35.0,58000.0,Yes
6,0,0,1,Spain,38.777778,52000.0,No
7,1,0,0,France,48.0,79000.0,Yes
8,0,1,0,Germany,50.0,83000.0,No
9,1,0,0,France,37.0,67000.0,Yes


In [32]:
# Ordinal / Label
df.Country.map({'France' : 0, 'Spain' : 1, 'Germany' : 2})


0    0
1    1
2    2
3    1
4    2
5    0
6    1
7    0
8    2
9    0
Name: Country, dtype: int64

## using Scikit Learn 

In [33]:
X

array([['France', 0.8205484434881531, 0.752334427262497],
       ['Spain', -1.7626596193449222, -1.5332385163197724],
       ['Germany', -1.3067993729626148, -0.961845280424205],
       ['Spain', -0.0911720492764617, -0.2952198385460431],
       ['Germany', 0.2127347816450766, 0.0],
       ['France', -0.5470322956587691, -0.5809164564938268],
       ['Spain', 0.0, -1.152309692389394],
       ['France', 1.4283621053312296, 1.4189598691406589],
       ['Germany', 1.7322689362527681, 1.7998886930710372],
       ['France', -0.24312546473723085, 0.27617339734952423],
       ['France', -0.24312546473723085, 0.27617339734952423]],
      dtype=object)

## Column Transformer

In [34]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct=ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])],remainder='passthrough')
ct.fit_transform(X)

array([[1.0, 0.0, 0.0, 0.8205484434881531, 0.752334427262497],
       [0.0, 0.0, 1.0, -1.7626596193449222, -1.5332385163197724],
       [0.0, 1.0, 0.0, -1.3067993729626148, -0.961845280424205],
       [0.0, 0.0, 1.0, -0.0911720492764617, -0.2952198385460431],
       [0.0, 1.0, 0.0, 0.2127347816450766, 0.0],
       [1.0, 0.0, 0.0, -0.5470322956587691, -0.5809164564938268],
       [0.0, 0.0, 1.0, 0.0, -1.152309692389394],
       [1.0, 0.0, 0.0, 1.4283621053312296, 1.4189598691406589],
       [0.0, 1.0, 0.0, 1.7322689362527681, 1.7998886930710372],
       [1.0, 0.0, 0.0, -0.24312546473723085, 0.27617339734952423],
       [1.0, 0.0, 0.0, -0.24312546473723085, 0.27617339734952423]],
      dtype=object)

### Applying Multiple Transformations

In [35]:
df[['Country','Age','Salary','Purchased']]

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [36]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

X4=df[['Country','Age','Salary','Purchased']]
ct2=ColumnTransformer(transformers=[('Encoder1',OneHotEncoder(),[0]),('Scaler1' , StandardScaler(), [1,2]),
                                        ('Encoder2', OrdinalEncoder(), [3])])

ct2.fit_transform(X4)

array([[ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         7.58874362e-01,  7.49473254e-01,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        -1.71150388e+00, -1.43817841e+00,  1.00000000e+00],
       [ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
        -1.27555478e+00, -8.91265492e-01,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        -1.13023841e-01, -2.53200424e-01,  0.00000000e+00],
       [ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.77608893e-01,  6.63219199e-16,  1.00000000e+00],
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -5.48972942e-01, -5.26656882e-01,  1.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00, -1.07356980e+00,  0.00000000e+00],
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.34013983e+00,  1.38753832e+00,  1.00000000e+00],
       [ 0.00000000e+00,  1.00000000e+00,  0.000

## Label Encoder

In [37]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Y = le.fit_transform(Y)
Y


  return f(*args, **kwargs)


array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1])

In [38]:
le.fit_transform(df.Country)

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0])

# ====================================

# Splitting the data into training and testing sets
- The model is trained using training set
- The performance is evaluated on testing data (unseen data)

In [39]:
X

array([['France', 0.8205484434881531, 0.752334427262497],
       ['Spain', -1.7626596193449222, -1.5332385163197724],
       ['Germany', -1.3067993729626148, -0.961845280424205],
       ['Spain', -0.0911720492764617, -0.2952198385460431],
       ['Germany', 0.2127347816450766, 0.0],
       ['France', -0.5470322956587691, -0.5809164564938268],
       ['Spain', 0.0, -1.152309692389394],
       ['France', 1.4283621053312296, 1.4189598691406589],
       ['Germany', 1.7322689362527681, 1.7998886930710372],
       ['France', -0.24312546473723085, 0.27617339734952423],
       ['France', -0.24312546473723085, 0.27617339734952423]],
      dtype=object)

In [41]:
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1])

In [42]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=5)

In [43]:
x_train

array([['Germany', 0.2127347816450766, 0.0],
       ['France', 1.4283621053312296, 1.4189598691406589],
       ['Spain', -1.7626596193449222, -1.5332385163197724],
       ['France', 0.8205484434881531, 0.752334427262497],
       ['France', -0.24312546473723085, 0.27617339734952423],
       ['Spain', 0.0, -1.152309692389394],
       ['Spain', -0.0911720492764617, -0.2952198385460431]], dtype=object)

In [44]:
y_train

array([1, 1, 1, 0, 1, 0, 0])

In [45]:
x_test

array([['France', -0.5470322956587691, -0.5809164564938268],
       ['Germany', 1.7322689362527681, 1.7998886930710372],
       ['Germany', -1.3067993729626148, -0.961845280424205],
       ['France', -0.24312546473723085, 0.27617339734952423]],
      dtype=object)

In [46]:
y_test

array([1, 0, 0, 1])

In [16]:
a=1
b="v"
c="c"
d="e"
e=3
f=4
g=9

q=[a,b,c,d,e,f,g]

In [17]:
q

[1, 'v', 'c', 'e', 3, 4, 9]

In [19]:
q1=q[4:]

In [20]:
q1.insert(0,1)
q1

[1, 3, 4, 9]

In [22]:
import numpy as np
q1=np.array(q1).reshape(-1,1)

In [23]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
q1=sc.fit_transform(q1)
q1

array([[-1.10264561],
       [-0.42409446],
       [-0.08481889],
       [ 1.61155897]])

In [33]:
q1[:,0][0]

-1.1026456085839622

In [35]:
q1[:,0][3]

1.611558966391945