In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [3]:
data = pd.read_csv('Covid_toy.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,age,gender,fever,cough,city,has_covid
0,0,60,Male,103.0,Mild,Kolkata,No
1,1,27,Male,100.0,Mild,Delhi,Yes
2,2,42,Male,101.0,Mild,Delhi,No
3,3,31,Female,98.0,Mild,Kolkata,No
4,4,65,Female,101.0,Mild,Mumbai,No


In [5]:
df = pd.DataFrame(data)

# Remove the 'Unnamed: 0' column
df = df.drop('Unnamed: 0', axis=1)

# Display the modified DataFrame
print(df)

    age  gender  fever   cough       city has_covid
0    60    Male  103.0    Mild    Kolkata        No
1    27    Male  100.0    Mild      Delhi       Yes
2    42    Male  101.0    Mild      Delhi        No
3    31  Female   98.0    Mild    Kolkata        No
4    65  Female  101.0    Mild     Mumbai        No
..  ...     ...    ...     ...        ...       ...
95   12  Female  104.0    Mild  Bangalore        No
96   51  Female  101.0  Strong    Kolkata       Yes
97   20  Female  101.0    Mild  Bangalore        No
98    5  Female   98.0  Strong     Mumbai        No
99   10  Female   98.0  Strong    Kolkata       Yes

[100 rows x 6 columns]


In [6]:
df.head(10)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
5,84,Female,,Mild,Bangalore,Yes
6,14,Male,101.0,Strong,Bangalore,No
7,20,Female,,Strong,Mumbai,Yes
8,19,Female,100.0,Strong,Bangalore,No
9,64,Female,101.0,Mild,Delhi,No


In [7]:
df['cough'].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [8]:
df['city'].value_counts()

city
Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: count, dtype: int64

In [9]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['has_covid']),df['has_covid'],test_size=0.2)

In [11]:
X_train

Unnamed: 0,age,gender,fever,cough,city
32,34,Female,101.0,Strong,Delhi
22,71,Female,98.0,Strong,Kolkata
0,60,Male,103.0,Mild,Kolkata
67,65,Male,99.0,Mild,Bangalore
77,8,Female,101.0,Mild,Kolkata
...,...,...,...,...,...
48,66,Male,99.0,Strong,Bangalore
92,82,Female,102.0,Strong,Kolkata
4,65,Female,101.0,Mild,Mumbai
79,48,Female,103.0,Mild,Kolkata


# 1.Usual method

In [12]:
#adding simple imputer to fever column

In [13]:
si = SimpleImputer()

In [14]:
X_train_fever = si.fit_transform(X_train[['fever']])

In [15]:
X_train_fever

array([[101.  ],
       [ 98.  ],
       [103.  ],
       [ 99.  ],
       [101.  ],
       [ 99.  ],
       [ 99.  ],
       [ 98.  ],
       [100.75],
       [104.  ],
       [102.  ],
       [ 99.  ],
       [100.75],
       [ 98.  ],
       [100.  ],
       [ 99.  ],
       [ 99.  ],
       [101.  ],
       [102.  ],
       [100.75],
       [100.  ],
       [ 99.  ],
       [104.  ],
       [104.  ],
       [101.  ],
       [100.75],
       [ 99.  ],
       [ 98.  ],
       [100.  ],
       [ 98.  ],
       [100.  ],
       [100.  ],
       [104.  ],
       [101.  ],
       [104.  ],
       [101.  ],
       [101.  ],
       [ 98.  ],
       [102.  ],
       [100.  ],
       [102.  ],
       [104.  ],
       [100.75],
       [101.  ],
       [ 98.  ],
       [101.  ],
       [100.  ],
       [102.  ],
       [104.  ],
       [103.  ],
       [103.  ],
       [100.  ],
       [104.  ],
       [100.75],
       [104.  ],
       [100.75],
       [101.  ],
       [101.  ],
       [101.  

In [16]:
#Also add it to test data
X_test_fever = si.fit_transform(X_test[['fever']])

X_train_fever.shape

(80, 1)

In [17]:
#Ordinal Encoding for Cough column

In [18]:
oe = OrdinalEncoder(categories=[['Mild','Strong']])
X_train_cough = oe.fit_transform(X_train[['cough']])

#Also to the test data
X_test_cough = oe.fit_transform(X_test[['cough']])

X_train_cough.shape


(80, 1)

In [19]:
#OneHotEncoding on Gender,City
ohe = OneHotEncoder(drop='first',sparse=False)
X_train_gender_city = ohe.fit_transform(X_train[['gender','city']])

#also the test data
X_test_gender_city = ohe.fit_transform(X_test[['gender','city']])

X_train_gender_city.shape




(80, 4)

In [20]:
X_train_gender_city

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.],
       [1., 0., 1., 0.],
       [1., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 1., 0.],
       [1., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 1., 0.],
       [1., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 1., 0.],
       [0., 0., 0., 0.],


In [21]:
#Extracting Age
X_train_age = X_train.drop(columns=['gender','fever','cough','city']).values

In [22]:
#also the test data
X_test_age = X_test.drop(columns=['gender','fever','cough','city']).values

X_train_age.shape

(80, 1)

In [23]:
X_train_transformed = np.concatenate((X_train_age,X_train_fever,X_train_gender_city,X_train_cough),axis=1)
X_test_transformed = np.concatenate((X_test_age,X_test_fever,X_test_gender_city,X_test_cough),axis=1)

In [24]:
X_train_transformed

array([[ 34.  , 101.  ,   0.  ,   1.  ,   0.  ,   0.  ,   1.  ],
       [ 71.  ,  98.  ,   0.  ,   0.  ,   1.  ,   0.  ,   1.  ],
       [ 60.  , 103.  ,   1.  ,   0.  ,   1.  ,   0.  ,   0.  ],
       [ 65.  ,  99.  ,   1.  ,   0.  ,   0.  ,   0.  ,   0.  ],
       [  8.  , 101.  ,   0.  ,   0.  ,   1.  ,   0.  ,   0.  ],
       [ 22.  ,  99.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ],
       [ 25.  ,  99.  ,   0.  ,   0.  ,   1.  ,   0.  ,   1.  ],
       [ 12.  ,  98.  ,   1.  ,   0.  ,   0.  ,   0.  ,   1.  ],
       [ 42.  , 100.75,   0.  ,   0.  ,   0.  ,   0.  ,   1.  ],
       [ 18.  , 104.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ],
       [ 69.  , 102.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ],
       [ 59.  ,  99.  ,   0.  ,   1.  ,   0.  ,   0.  ,   1.  ],
       [ 84.  , 100.75,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ],
       [ 34.  ,  98.  ,   1.  ,   0.  ,   1.  ,   0.  ,   1.  ],
       [ 27.  , 100.  ,   1.  ,   1.  ,   0.  ,   0.  ,   0.  ],
       [ 72.  ,  99.  ,  

In [25]:
X_train_transformed.shape

(80, 7)

# 2.Column transformation method

In [26]:
from sklearn.compose import ColumnTransformer

In [31]:
transformer = ColumnTransformer(transformers=[
    ('tnf1',SimpleImputer(),['fever']),
    ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
    ('tnf3',OneHotEncoder(sparse=False,drop='first'),['gender','city'])
],remainder='passthrough')

In [34]:
transformer.fit_transform(X_train).shape



(80, 7)

In [35]:
transformer.transform(X_test).shape

(20, 7)