## **Encoding features using Column Transformer**

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load the toy COVID dataset into a DataFrame.
# NOTE(review): absolute path (looks like a Colab upload location) -- adjust when running elsewhere.
data = pd.read_csv(filepath_or_buffer='/content/covid_toy.csv')

In [4]:
# Peek at five randomly chosen rows to get a feel for the columns.
data.sample(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
49,44,Male,104.0,Mild,Mumbai,No
17,40,Female,98.0,Strong,Delhi,No
39,50,Female,103.0,Mild,Kolkata,No
26,19,Female,100.0,Mild,Kolkata,Yes
16,69,Female,103.0,Mild,Kolkata,Yes


In [6]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(100, 6)

In [5]:
# Missing values per column: only `fever` has gaps (10 NaNs); every other column is complete.
data.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

### **Train-test Split step**

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; `has_covid` is the target, everything
# else is a feature. `random_state` pins the shuffle so the split -- and every
# output derived from it -- is reproducible on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['has_covid']),
    data['has_covid'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Display the training features (pandas truncates the middle rows of long frames).
X_train

Unnamed: 0,age,gender,fever,cough,city
28,16,Male,104.0,Mild,Kolkata
31,83,Male,103.0,Mild,Kolkata
9,64,Female,101.0,Mild,Delhi
83,17,Female,104.0,Mild,Kolkata
26,19,Female,100.0,Mild,Kolkata
...,...,...,...,...,...
96,51,Female,101.0,Strong,Kolkata
97,20,Female,101.0,Mild,Bangalore
59,6,Female,104.0,Mild,Kolkata
2,42,Male,101.0,Mild,Delhi


## **1. Aam Zindagi — the ordinary way (without using ColumnTransformer)**

In [9]:
# Impute the missing `fever` readings (SimpleImputer defaults to the column mean).
si = SimpleImputer()
X_train_fever = si.fit_transform(X_train[['fever']])

# Apply the statistic learned on the training split to the test split.
# Re-fitting here would impute with the test set's own mean, i.e. leak
# test-set information into preprocessing.
X_test_fever = si.transform(X_test[['fever']])

X_train_fever.shape

(80, 1)

In [10]:
# Ordinal-encode `cough` with an explicit order: Mild -> 0, Strong -> 1.
oe = OrdinalEncoder(categories=[['Mild','Strong']])
X_train_cough = oe.fit_transform(X_train[['cough']])

# Encode the test split with the encoder fitted on the training split;
# an encoder must never be re-fitted on test data.
X_test_cough = oe.transform(X_test[['cough']])

X_train_cough.shape

(80, 1)

In [11]:
# One-hot encode `gender` and `city`, dropping the first category of each
# column to avoid a redundant (perfectly collinear) indicator.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4 -- switch the keyword when upgrading scikit-learn.
ohe = OneHotEncoder(drop='first',sparse=False)
X_train_gender_city = ohe.fit_transform(X_train[['gender','city']])

# Only transform the test split. Re-fitting on it could yield a different set,
# order, or number of columns if its categories differ from the training split,
# silently misaligning train and test features.
X_test_gender_city = ohe.transform(X_test[['gender','city']])

X_train_gender_city.shape



(80, 4)

In [12]:
# `age` needs no preprocessing: strip the four columns handled above and keep
# what is left as a 2-D NumPy array so it can be stacked with the other pieces.
X_train_age = X_train.drop(['gender', 'fever', 'cough', 'city'], axis=1).to_numpy()

# Same extraction for the test split.
X_test_age = X_test.drop(['gender', 'fever', 'cough', 'city'], axis=1).to_numpy()

X_train_age.shape

(80, 1)

In [13]:
# Stitch the individually processed pieces back together column-wise.
# Column order: age | fever | gender/city one-hot | cough.
X_train_transformed = np.hstack(
    (X_train_age, X_train_fever, X_train_gender_city, X_train_cough)
)

# Same layout for the test split.
X_test_transformed = np.hstack(
    (X_test_age, X_test_fever, X_test_gender_city, X_test_cough)
)

X_train_transformed.shape

(80, 7)

## **2. Mentos Zindagi — the smarter way (using ColumnTransformer)**

In [14]:
from sklearn.compose import ColumnTransformer

In [15]:
# Bundle the three per-column preprocessing steps into a single transformer.
# Output columns follow the order of the steps, with passthrough columns last.
transformer = ColumnTransformer(
    transformers=[
        # impute missing fever readings (SimpleImputer default strategy)
        ('tnf1', SimpleImputer(), ['fever']),
        # ordered categories: Mild -> 0, Strong -> 1
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        # one-hot encode, dropping the first category of each column
        ('tnf3', OneHotEncoder(drop='first', sparse=False), ['gender', 'city']),
    ],
    # columns not named above (here: age) are appended unchanged
    remainder='passthrough',
)

In [16]:
# Fit every step on the training split and transform it in one call.
# NOTE(review): this rebinds X_train from a DataFrame to the transformed array,
# so the cell is not safely re-runnable without redoing the train/test split.
X_train = transformer.fit_transform(X_train)



In [17]:
# Apply the already-fitted steps to the test split (transform only, no re-fit).
# NOTE(review): this rebinds X_test to the transformed array, like X_train above it.
X_test = transformer.transform(X_test)

In [20]:
# Wrap the transformed training array in a DataFrame (default integer column labels) for inspection.
X_train_convert = pd.DataFrame(data=X_train)


In [21]:
# Wrap the transformed test array in a DataFrame (default integer column labels) for inspection.
X_test_convert = pd.DataFrame(data=X_test)

In [23]:
# Spot-check random rows of both transformed frames; the cell value is a (train, test) tuple.
(X_train_convert.sample(n=10), X_test_convert.sample(n=5))

(             0    1    2    3    4    5     6
 71   98.000000  1.0  1.0  0.0  0.0  1.0  23.0
 10  101.000000  1.0  0.0  1.0  0.0  0.0  34.0
 39  100.930556  0.0  1.0  0.0  0.0  1.0  23.0
 51  104.000000  0.0  1.0  0.0  0.0  1.0  42.0
 25  101.000000  0.0  0.0  0.0  0.0  1.0  19.0
 5    99.000000  0.0  1.0  0.0  0.0  0.0  65.0
 27  102.000000  0.0  0.0  0.0  0.0  0.0  69.0
 79  100.000000  1.0  0.0  0.0  0.0  0.0  47.0
 70  104.000000  0.0  1.0  0.0  0.0  1.0  44.0
 3   104.000000  0.0  0.0  0.0  1.0  0.0  17.0,
              0    1    2    3    4    5     6
 15  101.000000  0.0  0.0  0.0  0.0  1.0  65.0
 3   101.000000  0.0  0.0  0.0  0.0  0.0  38.0
 10  100.000000  0.0  1.0  0.0  0.0  0.0  11.0
 4   100.930556  0.0  1.0  1.0  0.0  0.0  38.0
 9   104.000000  1.0  0.0  0.0  1.0  0.0  54.0)