`class sklearn.compose.ColumnTransformer(transformers, *, remainder='drop', sparse_threshold=0.3, n_jobs=None, transformer_weights=None, verbose=False, verbose_feature_names_out=True)`


This estimator allows different columns or column subsets of the input to be transformed separately and the features generated by each transformer will be concatenated to form a single feature space. This is useful for heterogeneous or columnar data, to combine several feature extraction mechanisms or transformations into a single transformer.

In [28]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [5]:
# Read the toy COVID dataset directly from the course repository.
COVID_TOY_CSV = 'https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day28-column-transformer/covid_toy.csv'
df = pd.read_csv(COVID_TOY_CSV)

# Preview the first ten rows.
df.head(10)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
5,84,Female,,Mild,Bangalore,Yes
6,14,Male,101.0,Strong,Bangalore,No
7,20,Female,,Strong,Mumbai,Yes
8,19,Female,100.0,Strong,Bangalore,No
9,64,Female,101.0,Mild,Delhi,No


In [6]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [7]:
# Frequency of each distinct value in the 'cough' column.
df['cough'].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [8]:
# Frequency of each distinct value in the 'city' column.
df['city'].value_counts()

city
Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: count, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

# Features are the first five columns (age, gender, fever, cough, city);
# the target is the last column (has_covid).
# random_state pins the shuffle so the split -- and every output derived from
# it in the cells below -- is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 0:5],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=42,
)

In [10]:
# Training features produced by the split above.
x_train

Unnamed: 0,age,gender,fever,cough,city
47,18,Female,104.0,Mild,Bangalore
79,48,Female,103.0,Mild,Kolkata
21,73,Male,98.0,Mild,Bangalore
45,72,Male,99.0,Mild,Bangalore
17,40,Female,98.0,Strong,Delhi
...,...,...,...,...,...
46,19,Female,101.0,Mild,Mumbai
53,83,Male,98.0,Mild,Delhi
3,31,Female,98.0,Mild,Kolkata
39,50,Female,103.0,Mild,Kolkata


In [11]:
# Held-out test features produced by the split above.
x_test

Unnamed: 0,age,gender,fever,cough,city
86,25,Male,104.0,Mild,Bangalore
87,47,Male,101.0,Strong,Bangalore
7,20,Female,,Strong,Mumbai
97,20,Female,101.0,Mild,Bangalore
16,69,Female,103.0,Mild,Kolkata
25,23,Male,,Mild,Mumbai
93,27,Male,100.0,Mild,Kolkata
32,34,Female,101.0,Strong,Delhi
56,71,Male,,Strong,Kolkata
48,66,Male,99.0,Strong,Bangalore


# Without Column Transformer

`SimpleImputer`: a univariate imputer for completing missing values with simple strategies.
It replaces missing values using a descriptive statistic (e.g. mean, median, or most frequent) along each column, or using a constant value.


In [12]:
# Impute missing values in the 'fever' column with SimpleImputer (default settings).
si = SimpleImputer()

# Learn the fill value from the training split only ...
x_train_fever = si.fit_transform(x_train[['fever']])

# ... and reuse it on the test split. Calling fit_transform here would re-fit
# the imputer on the test data, so the two splits would be filled with
# different values and test statistics would leak into preprocessing.
x_test_fever = si.transform(x_test[['fever']])

x_train_fever.shape

(80, 1)

In [15]:
# Ordinal-encode 'cough' with an explicit category order: Mild -> 0, Strong -> 1.
oe = OrdinalEncoder(categories=[['Mild', 'Strong']])
x_train_cough = oe.fit_transform(x_train[['cough']])

# Apply the already-fitted encoder to the test split instead of re-fitting it;
# preprocessing must be learned from the training data only.
x_test_cough = oe.transform(x_test[['cough']])

x_train_cough.shape

(80, 1)

In [17]:
# Inspect the ordinal-encoded training 'cough' column.
x_train_cough

array([[0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],

In [19]:
# One-hot encode the nominal columns 'gender' and 'city'.
# drop='first' removes the first category of each feature, so gender yields
# 1 column and city yields 3.
# sparse_output=False returns a dense NumPy array; it replaces the `sparse`
# keyword, which was deprecated in scikit-learn 1.2 and removed in 1.4.
ohe = OneHotEncoder(drop='first', sparse_output=False)
x_train_gender_city = ohe.fit_transform(x_train[['gender', 'city']])

# Reuse the categories learned from the training split. Re-fitting on the test
# split could produce a different number or order of columns whenever a
# category happens to be absent from the (small) test set.
x_test_gender_city = ohe.transform(x_test[['gender', 'city']])

x_train_gender_city.shape



(80, 4)

In [20]:
x_train_gender_city
# The first column represents gender; the remaining 3 columns represent city.

array([[0., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 1., 0., 0.],
       [1., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 0.],
       [1., 0., 0., 1.],
       [1., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.],
       [1., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 1., 0.],


In [21]:
# Extracting age: drop every other feature so that only 'age' remains,
# then convert the one-column frame to a NumPy array.
non_age_cols = ['gender', 'fever', 'city', 'cough']
x_train_age = x_train.drop(columns=non_age_cols).to_numpy()
x_test_age = x_test.drop(columns=non_age_cols).to_numpy()

x_train_age.shape

(80, 1)

In [22]:
# Inspect the extracted training 'age' values.
x_train_age

array([[18],
       [48],
       [73],
       [72],
       [40],
       [19],
       [11],
       [64],
       [49],
       [83],
       [60],
       [27],
       [34],
       [ 5],
       [69],
       [44],
       [27],
       [54],
       [42],
       [68],
       [84],
       [14],
       [51],
       [75],
       [81],
       [26],
       [69],
       [79],
       [34],
       [49],
       [23],
       [80],
       [47],
       [65],
       [65],
       [83],
       [11],
       [ 5],
       [46],
       [60],
       [ 5],
       [49],
       [16],
       [80],
       [ 8],
       [55],
       [12],
       [73],
       [19],
       [82],
       [22],
       [17],
       [13],
       [65],
       [81],
       [10],
       [71],
       [75],
       [16],
       [65],
       [51],
       [19],
       [42],
       [64],
       [ 6],
       [24],
       [14],
       [70],
       [15],
       [10],
       [82],
       [64],
       [56],
       [51],
       [25],
       [19],
       [83],

In [24]:
# Label-encode the target column 'has_covid'.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Fit the label -> integer mapping on the training target only ...
y_train_new = le.fit_transform(y_train)
# ... and apply that same mapping to the test target, so class ids are
# guaranteed to agree between the two splits.
y_test_new = le.transform(y_test)

In [25]:
# Check the shape of the encoded training target.
y_train_new.shape

(80,)

In [26]:
# Inspect the integer-encoded training target.
y_train_new

array([0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1])

In [30]:
# Stitch the individually transformed pieces back together column-wise,
# in the order: age, fever, gender/city dummies, cough.
train_parts = (x_train_age, x_train_fever, x_train_gender_city, x_train_cough)
test_parts = (x_test_age, x_test_fever, x_test_gender_city, x_test_cough)

x_train_transformed = np.hstack(train_parts)
x_test_transformed = np.hstack(test_parts)

x_train_transformed.shape

(80, 7)

In [31]:
# Inspect the manually assembled training feature matrix.
x_train_transformed

array([[ 18.        , 104.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ],
       [ 48.        , 103.        ,   0.        ,   0.        ,
          1.        ,   0.        ,   0.        ],
       [ 73.        ,  98.        ,   1.        ,   0.        ,
          0.        ,   0.        ,   0.        ],
       [ 72.        ,  99.        ,   1.        ,   0.        ,
          0.        ,   0.        ,   0.        ],
       [ 40.        ,  98.        ,   0.        ,   1.        ,
          0.        ,   0.        ,   1.        ],
       [ 19.        , 100.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   1.        ],
       [ 11.        , 100.        ,   1.        ,   0.        ,
          0.        ,   0.        ,   0.        ],
       [ 64.        , 101.        ,   0.        ,   1.        ,
          0.        ,   0.        ,   0.        ],
       [ 49.        , 102.        ,   0.        ,   1.        ,
          0.    

# With Column Transformer

In [32]:
from sklearn.compose import ColumnTransformer

In [43]:
# Bundle every per-column preprocessing step into a single ColumnTransformer.
# Each entry is a (name, transformer, columns) tuple. Columns that are not
# listed are kept unchanged because remainder='passthrough'.
# NOTE: the variable name `transfomer` (sic) is kept as-is because the cells
# below refer to it by that name.
transfomer = ColumnTransformer(
    transformers=[
        ('tnf1', SimpleImputer(), ['fever']),
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        # sparse_output replaces the `sparse` keyword (deprecated in
        # scikit-learn 1.2, removed in 1.4).
        ('tnf3', OneHotEncoder(sparse_output=False, drop='first'), ['gender', 'city']),
    ],
    remainder='passthrough',
)

In [44]:
# Fit every transformer on the training split and check the resulting shape.
transfomer.fit_transform(x_train).shape



(80, 7)

In [45]:
# Apply the already-fitted transformers to the test split (no re-fitting).
transfomer.transform(x_test).shape

(20, 7)