# ColumnTransformer applies different preprocessing steps (imputation, encoding, scaling, ...) to different columns in a single step

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.impute import SimpleImputer

In [2]:
# Location of the toy COVID dataset, kept in one named constant so it is easy to repoint.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs where it exists.
DATA_PATH = "//Users//udayladdha//Desktop//DataSets//covid_toy.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [5]:
# Count missing values per column; fever is the only column with NaNs, so it is
# filled in later with SimpleImputer.
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [6]:
# city is nominal (no natural order between the cities) -> one-hot encode it
df["city"].value_counts()

Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: city, dtype: int64

In [8]:
# cough is treated as ordinal categorical data (Mild, Strong) -> ordinal encode it
df["cough"].value_counts()

Mild      62
Strong    38
Name: cough, dtype: int64

In [9]:
# gender is nominal categorical data -> one-hot encode it
df["gender"].value_counts()

Female    59
Male      41
Name: gender, dtype: int64

In [4]:
from sklearn.model_selection import train_test_split

In [10]:
# Split the frame into the label series (has_covid) and the feature matrix
# (every remaining column).
y = df["has_covid"]
x = df.drop(columns=["has_covid"])

In [11]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=32,
)

In [12]:
# Inspect the training features produced by the split (fever still contains NaNs at this point)
x_train

Unnamed: 0,age,gender,fever,cough,city
26,19,Female,100.0,Mild,Kolkata
8,19,Female,100.0,Strong,Bangalore
7,20,Female,,Strong,Mumbai
12,25,Female,99.0,Strong,Kolkata
59,6,Female,104.0,Mild,Kolkata
...,...,...,...,...,...
62,56,Female,104.0,Strong,Bangalore
54,60,Female,99.0,Mild,Mumbai
5,84,Female,,Mild,Bangalore
43,22,Female,99.0,Mild,Bangalore


In [13]:
# Inspect the training labels; has_covid is left as raw Yes/No strings
y_train

26    Yes
8      No
7     Yes
12     No
59    Yes
     ... 
62    Yes
54    Yes
5     Yes
43    Yes
87     No
Name: has_covid, Length: 80, dtype: object

# 1. Without using column transformer

In [14]:
# impute the missing values in the fever column with SimpleImputer

In [15]:
# Imputer for the fever column; "mean" is SimpleImputer's default strategy, spelled out here.
si = SimpleImputer(strategy="mean")

In [17]:
# Keep fever as a one-column frame because the imputer expects 2-D input.
fever_train = x_train[["fever"]]
fever_test = x_test[["fever"]]

# Learn the fill value on the training split only, then reuse it on the test split.
x_train_fever = si.fit_transform(fever_train)
x_test_fever = si.transform(fever_test)

In [18]:
# ordinal encoding on the cough column

In [19]:
# Re-display the cough categories; the encoder needs them listed in order
df["cough"].value_counts()

Mild      62
Strong    38
Name: cough, dtype: int64

In [28]:
# Explicit category order so that Mild is encoded as 0 and Strong as 1.
cough_levels = ["Mild", "Strong"]
oe = OrdinalEncoder(categories=[cough_levels])

In [34]:
# Fit the encoder on the training split only, then apply the same mapping to both splits.
oe.fit(x_train[["cough"]])
x_train_cough = oe.transform(x_train[["cough"]])
x_test_cough = oe.transform(x_test[["cough"]])

In [35]:
# one hot encoding on gender and city

In [48]:
# Re-display the gender categories before one-hot encoding
df["gender"].value_counts()

Female    59
Male      41
Name: gender, dtype: int64

In [49]:
# Re-display the city categories before one-hot encoding
df["city"].value_counts()

Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: city, dtype: int64

In [56]:
# One-hot encoder for the nominal columns: dense integer output, with the first
# category of each feature dropped.
ohe = OneHotEncoder(
    dtype=int,
    sparse_output=False,
    drop="first",
)

In [59]:
# Both nominal columns are encoded together by the same encoder.
nominal_cols = ["gender", "city"]

# Learn the categories from the training split, then encode both splits with them.
x_train_gender_city = ohe.fit(x_train[nominal_cols]).transform(x_train[nominal_cols])
x_test_gender_city = ohe.transform(x_test[nominal_cols])

In [60]:
# extracting age from the train/test splits (it needs no transformation)

In [77]:
# age needs no preprocessing. Select it by label rather than by position so the
# extraction does not silently pick another column if the column order changes;
# the double brackets keep the result 2-D so it can be concatenated column-wise.
x_train_age = x_train[["age"]].to_numpy()

In [79]:
# Sanity check: one age column, one row per training sample
x_train_age.shape

(80, 1)

In [80]:
# Select age by label rather than by position so the extraction does not depend
# on column order; the double brackets keep the result 2-D for concatenation.
x_test_age = x_test[["age"]].to_numpy()

In [81]:
# Sanity check: one age column, one row per test sample
x_test_age.shape

(20, 1)

In [82]:
# concatenating every column

In [101]:
# Stack the processed pieces side by side. Column order: age, imputed fever,
# one-hot gender/city, ordinal cough.
train_parts = (x_train_age, x_train_fever, x_train_gender_city, x_train_cough)
x_train_concatinated = np.hstack(train_parts)

In [103]:
# Stack the processed test pieces side by side. Column order: age, imputed fever,
# one-hot gender/city, ordinal cough.
test_parts = (x_test_age, x_test_fever, x_test_gender_city, x_test_cough)
x_test_concatinated = np.hstack(test_parts)

In [104]:
# Sanity check: 7 columns = age (1) + fever (1) + one-hot gender/city (4) + cough (1)
x_train_concatinated.shape

(80, 7)

In [105]:
# Sanity check: the test matrix has the same 7 columns as the training matrix
x_test_concatinated.shape

(20, 7)

# 2. Using ColumnTransformer

In [107]:
from sklearn.compose import ColumnTransformer

In [116]:
# One (name, transformer, columns) entry per preprocessing step.
column_steps = [
    # fill missing fever values
    ("tnf1", SimpleImputer(), ["fever"]),
    # encode cough with an explicit category order: Mild, then Strong
    ("tnf2", OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"]),
    # one-hot encode the nominal columns, dropping the first category of each
    ("tnf3", OneHotEncoder(sparse_output=False, drop="first"), ["gender", "city"]),
]

# remainder="passthrough" keeps the columns not listed above (here: age) unchanged
# instead of dropping them.
ct = ColumnTransformer(
    transformers=column_steps,
    remainder="passthrough",
)

In [125]:
# Fit every transformer on the training split and keep the transformed matrix,
# rather than discarding it after reading its shape, so it can be used downstream.
x_train_transformed = ct.fit_transform(x_train)
x_train_transformed.shape

(80, 7)

In [124]:
# Apply the already-fitted transformers to the test split (no refitting) and keep
# the transformed matrix, rather than discarding it after reading its shape.
x_test_transformed = ct.transform(x_test)
x_test_transformed.shape

(20, 7)