# Data Preprocessing Tools

In [43]:
!wget https://github.com/devilsboy/DataPreprocessing/blob/main/Data.csv

--2020-11-15 07:27:28--  https://github.com/devilsboy/DataPreprocessing/blob/main/Data.csv
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘Data.csv’

Data.csv                [ <=>                ]  93.02K  --.-KB/s    in 0.04s   

2020-11-15 07:27:28 (2.31 MB/s) - ‘Data.csv’ saved [95250]



## Importing the libraries

In [44]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [45]:
data = pd.read_csv('Data.csv')
data

ParserError: ignored

## Taking care of missing data

In [46]:
from sklearn.impute import SimpleImputer

# 'np.nan' signifies that we are targeting missing values
# and the strategy we are choosing is replacing it with 'mean'
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

imputer.fit(data.iloc[:, 1:3])
data.iloc[:, 1:3] = imputer.transform(data.iloc[:, 1:3])  

# print the dataset
data

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.0,0.0,0.73913,0.685714,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.130435,0.171429,0.0
3,0.0,0.0,1.0,0.478261,0.371429,0.0
4,0.0,1.0,0.0,0.565217,0.450794,1.0
5,1.0,0.0,0.0,0.347826,0.285714,1.0
6,0.0,0.0,1.0,0.512077,0.114286,0.0
7,1.0,0.0,0.0,0.913043,0.885714,1.0
8,0.0,1.0,0.0,1.0,1.0,0.0
9,1.0,0.0,0.0,0.434783,0.542857,1.0


## Encoding categorical data

### OneHotEncoding


In [47]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
# [0] signifies the index of the column we are appliying the encoding on
data = pd.DataFrame(ct.fit_transform(data))
data

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,1.0,0.0,0.0,0.73913,0.685714,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,1.0,0.0,1.0,0.0,0.130435,0.171429,0.0
3,1.0,0.0,0.0,1.0,0.478261,0.371429,0.0
4,1.0,0.0,1.0,0.0,0.565217,0.450794,1.0
5,0.0,1.0,0.0,0.0,0.347826,0.285714,1.0
6,1.0,0.0,0.0,1.0,0.512077,0.114286,0.0
7,0.0,1.0,0.0,0.0,0.913043,0.885714,1.0
8,1.0,0.0,1.0,0.0,1.0,1.0,0.0
9,0.0,1.0,0.0,0.0,0.434783,0.542857,1.0


### LabelEncoding

In [48]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data.iloc[:,-1] = le.fit_transform(data.iloc[:,-1])
data

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,1.0,0.0,0.0,0.73913,0.685714,0
1,1.0,0.0,0.0,1.0,0.0,0.0,1
2,1.0,0.0,1.0,0.0,0.130435,0.171429,0
3,1.0,0.0,0.0,1.0,0.478261,0.371429,0
4,1.0,0.0,1.0,0.0,0.565217,0.450794,1
5,0.0,1.0,0.0,0.0,0.347826,0.285714,1
6,1.0,0.0,0.0,1.0,0.512077,0.114286,0
7,0.0,1.0,0.0,0.0,0.913043,0.885714,1
8,1.0,0.0,1.0,0.0,1.0,1.0,0
9,0.0,1.0,0.0,0.0,0.434783,0.542857,1


## Normalizing the data

In [49]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(data))
data

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,1.0,0.0,0.0,0.73913,0.685714,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,1.0,0.0,1.0,0.0,0.130435,0.171429,0.0
3,1.0,0.0,0.0,1.0,0.478261,0.371429,0.0
4,1.0,0.0,1.0,0.0,0.565217,0.450794,1.0
5,0.0,1.0,0.0,0.0,0.347826,0.285714,1.0
6,1.0,0.0,0.0,1.0,0.512077,0.114286,0.0
7,0.0,1.0,0.0,0.0,0.913043,0.885714,1.0
8,1.0,0.0,1.0,0.0,1.0,1.0,0.0
9,0.0,1.0,0.0,0.0,0.434783,0.542857,1.0


## Splitting the dataset

In [50]:
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
print("Independent Variable\n")
print(X)
print("\nDependent Variable\n")
print(y)

Independent Variable

[[0.         1.         0.         0.         0.73913043 0.68571429]
 [1.         0.         0.         1.         0.         0.        ]
 [1.         0.         1.         0.         0.13043478 0.17142857]
 [1.         0.         0.         1.         0.47826087 0.37142857]
 [1.         0.         1.         0.         0.56521739 0.45079365]
 [0.         1.         0.         0.         0.34782609 0.28571429]
 [1.         0.         0.         1.         0.51207729 0.11428571]
 [0.         1.         0.         0.         0.91304348 0.88571429]
 [1.         0.         1.         0.         1.         1.        ]
 [0.         1.         0.         0.         0.43478261 0.54285714]]

Dependent Variable

[0. 1. 0. 0. 1. 1. 0. 1. 0. 1.]


In [51]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [52]:
print(X_train)

[[0.         1.         0.         0.         0.43478261 0.54285714]
 [0.         1.         0.         0.         0.34782609 0.28571429]
 [1.         0.         0.         1.         0.51207729 0.11428571]
 [0.         1.         0.         0.         0.73913043 0.68571429]
 [0.         1.         0.         0.         0.91304348 0.88571429]
 [1.         0.         1.         0.         0.56521739 0.45079365]
 [1.         0.         1.         0.         1.         1.        ]
 [1.         0.         0.         1.         0.         0.        ]]


In [53]:
print(X_test)

[[1.         0.         1.         0.         0.13043478 0.17142857]
 [1.         0.         0.         1.         0.47826087 0.37142857]]
