<a href="https://colab.research.google.com/github/yashasangani/Machine-Learning/blob/main/Feature_Scaling/Column_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
# Suppress every warning raised after this cell so the outputs below stay
# uncluttered. NOTE: this also hides library deprecation warnings.
from warnings import filterwarnings

filterwarnings(action="ignore")

In [29]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder

In [30]:
# Load the toy COVID dataset directly from GitHub and preview the first rows.
covid_csv_url = "https://raw.githubusercontent.com/yashasangani/Datasets/main/covid_toy.csv"
df = pd.read_csv(covid_csv_url, sep=",")
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [65]:
# Number of rows per city (this column is one-hot encoded further down).
df["city"].value_counts()

Unnamed: 0_level_0,count
city,Unnamed: 1_level_1
Kolkata,32
Bangalore,30
Delhi,22
Mumbai,16


In [31]:
# Missing-value count per column; shows which columns need imputing.
df.isna().sum()

Unnamed: 0,0
age,0
gender,0
fever,10
cough,0
city,0
has_covid,0


In [32]:
from sklearn.model_selection import train_test_split

In [33]:
# Separate the predictors from the `has_covid` label, then hold out 20% of
# the rows for testing. The fixed random_state keeps the split reproducible.
features = df.drop(columns=["has_covid"])
target = df["has_covid"]
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)
x_train.shape

(80, 5)

# Feature preprocessing (imputation and encoding) without "ColumnTransformer"

In [50]:
# Fill the missing `fever` readings with SimpleImputer's default strategy.
# The imputer is fitted on the training split only and then reused on the
# test split, so the fill value is learned from training data alone.
fever_col = ['fever']
si = SimpleImputer()
x_train_fever = si.fit_transform(x_train[fever_col])
x_test_fever = si.transform(x_test[fever_col])

x_train_fever.shape

(80, 1)

In [51]:
# Ordinal-encode `cough` with an explicit category order,
# so 'Mild' maps to 0 and 'Strong' maps to 1.
cough_levels = ['Mild', 'Strong']
oe = OrdinalEncoder(categories=[cough_levels])
x_train_cough = oe.fit_transform(x_train[['cough']])

# The encoder fitted on the training split is reused for the test split.
x_test_cough = oe.transform(x_test[['cough']])

x_train_cough.shape

(80, 1)

In [52]:
# OneHotEncoding -> gender, city
# drop="first" removes one dummy column per feature, so the encoded block
# here has 4 columns (1 for gender + 3 for city).
# `sparse_output=False` makes the encoder return a dense NumPy array. It
# replaces the old `sparse` keyword, which was deprecated in scikit-learn 1.2
# and removed in 1.4 (passing `sparse=` there raises a TypeError).
ohe = OneHotEncoder(drop="first", sparse_output=False)
x_train_gender_city = ohe.fit_transform(x_train[['gender', 'city']])

# Reuse the encoder fitted on the training split for the test data.
x_test_gender_city = ohe.transform(x_test[['gender', 'city']])

x_train_gender_city.shape

(80, 4)

In [53]:
# Extract the `age` column, which needs no transformation: drop every column
# handled by the imputer/encoders above and keep what is left as a 2-D array.
already_handled_cols = ["gender", "fever", "cough", "city"]
x_train_age = x_train.drop(columns=already_handled_cols).to_numpy()

# Same extraction for the test split.
x_test_age = x_test.drop(columns=already_handled_cols).to_numpy()

x_train_age.shape

(80, 1)

In [54]:
# Stitch the separately transformed pieces back into one feature matrix.
# Column order: age | gender/city one-hot | fever | cough.
x_train_transformed = np.concatenate(
    (x_train_age, x_train_gender_city, x_train_fever, x_train_cough),
    axis=1,
)

# The test matrix MUST use the same column order as the training matrix;
# otherwise a model fitted on the training columns would read the wrong
# feature from each test column.
x_test_transformed = np.concatenate(
    (x_test_age, x_test_gender_city, x_test_fever, x_test_cough),
    axis=1,
)

x_train_transformed.shape

(80, 7)

# Feature preprocessing (imputation and encoding) with "ColumnTransformer"

In [55]:
from sklearn.compose import ColumnTransformer

In [61]:
# A single ColumnTransformer replaces the manual per-column steps. Each entry
# is (name, transformer, columns). remainder='passthrough' keeps the columns
# not listed (here only `age`) and places them after the transformed ones.
# `sparse_output=False` replaces the old `sparse` keyword of OneHotEncoder,
# which was deprecated in scikit-learn 1.2 and removed in 1.4.
transformer = ColumnTransformer(
    transformers=[
        ('trf1', SimpleImputer(), ['fever']),
        ('trf2', OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"]),
        ('trf3', OneHotEncoder(drop='first', sparse_output=False), ["gender", "city"]),
    ],
    remainder='passthrough',
)

In [71]:
# Fit every sub-transformer on the training split and transform it in one call.
t = transformer.fit_transform(X=x_train)
t.shape

(80, 7)

In [72]:
# Wrap the transformed array in a DataFrame for inspection. Column order:
# fever, cough, the four gender/city one-hot columns, then passed-through age.
df1 = pd.DataFrame(data=t)
df1

Unnamed: 0,0,1,2,3,4,5,6
0,99.0,0.0,0.0,0.0,0.0,0.0,22.0
1,104.0,1.0,0.0,0.0,0.0,0.0,56.0
2,98.0,0.0,0.0,0.0,1.0,0.0,31.0
3,104.0,1.0,0.0,1.0,0.0,0.0,75.0
4,99.0,0.0,1.0,0.0,0.0,0.0,72.0
...,...,...,...,...,...,...,...
75,101.0,1.0,0.0,0.0,1.0,0.0,51.0
76,99.0,0.0,1.0,0.0,0.0,0.0,65.0
77,104.0,0.0,1.0,0.0,0.0,1.0,42.0
78,104.0,0.0,0.0,0.0,0.0,0.0,18.0


In [74]:
# Apply the already-fitted transformer to the test split (transform only, no
# refitting) and confirm it yields the same number of columns as training.
x_test_t = transformer.transform(x_test)
x_test_t.shape

(20, 7)