# Preprocessing

## importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## calling the dataset

In [2]:
dataset = pd.read_csv("Data.csv")
X = dataset.iloc[:, :-1].values      ##from the dataset, X is ref to feature/independent variables
Y = dataset.iloc[:, -1].values       ##from the dataset, Y is ref to dependent/target variables

print (X)
print (Y)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Filling the missing data

### using mean technique

In [3]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
print (X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


### using median technique

In [10]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
print (X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


### Most Frequent Imputation

For categorical variables, you can fill missing values with the most frequent value in the column.

In [16]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent') 
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
print (X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 48000.0]
 ['France' 35.0 58000.0]
 ['Spain' 27.0 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


### Constant imputation

You can fill missing values with a constant value of your choice. This is useful when missing values have a specific meaning in the data.

In [40]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
print (X)

[[0.0 1.0 0.0 1.0 1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 1.0 0.0 0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 1.0 0.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 1.0 1.0 0.0 0.0 35.0 58000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 1.0 1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 1.0 0.0 0.0 1.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 1.0 1.0 0.0 0.0 37.0 67000.0]]


If missing values are too prevalent or cannot be imputed accurately, you may choose to simply drop rows or columns with missing values.

## Encoding values

### encoding the independent values

Encoding in the context of machine learning is the process of converting categorical variables into a numerical representation that can be used by machine learning algorithms for analysis and modeling.

#### One Hot encoding

Converts categorical variables into binary vectors, where each category becomes a binary feature.

Suitable for nominal variables without any inherent order.

Helps prevent ordinality assumption by the model.

Implemented using libraries like scikit-learn's OneHotEncoder.

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])],remainder='passthrough')
X = np.array(ct.fit_transform(X))
print (X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### encoding the dependent values

#### Label Encoding

Label encoding is a method of converting categorical variables into numerical format by assigning a unique integer label to each category. This encoding preserves the ordinal relationship among categories, making it suitable for categorical variables with a natural order

In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
Y = le.fit_transform(Y)
print (Y)


[0 1 0 0 1 1 0 1 0 1]


In [6]:
xy_table = np.column_stack((X, Y))   ##prints as full table
print (xy_table)

[[1.0 0.0 0.0 44.0 72000.0 0]
 [0.0 0.0 1.0 27.0 48000.0 1]
 [0.0 1.0 0.0 30.0 54000.0 0]
 [0.0 0.0 1.0 38.0 61000.0 0]
 [0.0 1.0 0.0 40.0 63777.77777777778 1]
 [1.0 0.0 0.0 35.0 58000.0 1]
 [0.0 0.0 1.0 38.77777777777778 52000.0 0]
 [1.0 0.0 0.0 48.0 79000.0 1]
 [0.0 1.0 0.0 50.0 83000.0 0]
 [1.0 0.0 0.0 37.0 67000.0 1]]


## Splitting the data for training and testing

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=1)

In [10]:
print (X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [11]:
print (X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [12]:
print (Y_train)

[0 1 0 0 1 1 0 1]


In [13]:
print (Y_test)

[0 1]
