In [114]:
import numpy as np
import pandas as pd

In [115]:
# Load the toy COVID dataset and preview three randomly chosen rows.
df = pd.read_csv("covid_toy.csv")
df.sample(n=3)

Unnamed: 0,age,gender,fever,cough,city,has_covid
16,69,Female,103.0,Mild,Kolkata,Yes
94,79,Male,,Strong,Kolkata,Yes
67,65,Male,99.0,Mild,Bangalore,No


In [116]:
# Count the missing values in each column (isna is the canonical name for isnull).
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [117]:
# Replace the original city labels with randomly assigned Nepali cities.
# A seeded Generator is used so the assignment (and therefore the saved CSV,
# the train/test split contents and every encoded array below) is identical
# on each Restart & Run All.
nepal_cities = [
    "Kathmandu", "Pokhara", "Lalitpur", "Bhaktapur",
    "Biratnagar", "Birgunj", "Dharan", "Butwal",
    "Hetauda", "Nepalgunj",
]
rng = np.random.default_rng(42)
df["city"] = rng.choice(nepal_cities, size=len(df))

In [118]:
# Spot-check four random rows after the city column has been replaced.
df.sample(n=4)

Unnamed: 0,age,gender,fever,cough,city,has_covid
7,20,Female,,Strong,Bhaktapur,Yes
29,34,Female,,Strong,Pokhara,Yes
79,48,Female,103.0,Mild,Butwal,Yes
55,81,Female,101.0,Mild,Birgunj,Yes


In [119]:
# Save the modified frame to a new CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="updated_covid_toy.csv", index=False)

In [120]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed random_state keeps the split reproducible.
# Features are every column except the 'has_covid' target.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["has_covid"]),
    df["has_covid"],
    test_size=0.1,
    random_state=42,
)

# Encoding each column manually

In [121]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


In [122]:
# Fill missing 'fever' readings with the mean learned from the training split only;
# the same fitted imputer is then applied to the test split.
si = SimpleImputer(strategy="mean")  # "mean" is also the default strategy
fever_train, fever_test = X_train[["fever"]], X_test[["fever"]]
si.fit(fever_train)
X_train_fever = si.transform(fever_train)
X_test_fever = si.transform(fever_test)
X_train_fever.shape

(90, 1)

In [123]:
# Ordinal-encode 'cough' with an explicit category order, so that
# 'Mild' maps to 0 and 'Strong' maps to 1.
cough_levels = ['Mild', 'Strong']
oe = OrdinalEncoder(categories=[cough_levels])
X_train_cough = oe.fit(X_train[["cough"]]).transform(X_train[["cough"]])
X_test_cough = oe.transform(X_test[["cough"]])
X_train_cough.shape

(90, 1)

In [124]:
# One-hot encode the nominal columns. drop='first' removes one dummy column per
# feature, and sparse_output=False returns a dense array instead of a sparse matrix.
nominal_cols = ["gender", "city"]
ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)
ohe.fit(X_train[nominal_cols])
X_train_gender_city = ohe.transform(X_train[nominal_cols])
X_test_gender_city = ohe.transform(X_test[nominal_cols])
X_train_gender_city.shape

(90, 10)

##### Now we extract the `age` column and combine it with all the encoded values

In [126]:
# The columns below were encoded separately, so drop them and keep whatever
# remains (the untouched 'age' column) as a NumPy array.
encoded_cols = ["gender", "fever", "cough", "city"]
X_train_age = X_train.drop(columns=encoded_cols).values
X_test_age = X_test.drop(columns=encoded_cols).values
X_train_age.shape

(90, 1)

##### Since every other column is already a NumPy array after `transform`, we use the `.values` attribute (an attribute, not a method call) on the age subsets of `X_train` and `X_test`, because they are still pandas DataFrames

In [127]:
# Stitch the per-column arrays together side by side, in the order:
# age, one-hot gender/city, imputed fever, ordinal cough.
X_train_transformed = np.hstack(
    [X_train_age, X_train_gender_city, X_train_fever, X_train_cough]
)
X_test_transformed = np.hstack(
    [X_test_age, X_test_gender_city, X_test_fever, X_test_cough]
)
X_train_transformed.shape

(90, 13)

# Encoding data using ColumnTransformer

##### All of these encodings can be done in a single step using ColumnTransformer

In [None]:
from sklearn.compose import ColumnTransformer

# Each step is a (name, transformer, columns) triple, mirroring the manual
# imputation / ordinal / one-hot steps performed earlier in the notebook.
column_steps = [
    ('t1', SimpleImputer(), ['fever']),
    ('t2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
    ('t3', OneHotEncoder(sparse_output=False, drop='first'), ['gender', 'city']),
]

# remainder='passthrough' keeps the columns not listed in any step, untransformed.
# remainder='drop' would discard those columns instead.
transformer = ColumnTransformer(transformers=column_steps, remainder='passthrough')

In [129]:
# Fit the ColumnTransformer on the training split only, then reuse the fitted
# transformer on the test split so that no test-set statistics (e.g. the fever
# mean) leak into preprocessing. This mirrors the manual path, which also
# produced both a train and a test array.
X_train_new = transformer.fit_transform(X_train)
X_test_new = transformer.transform(X_test)

In [130]:
# Print the shape of the manual result first, then the ColumnTransformer result.
for encoded in (X_train_transformed, X_train_new):
    print(encoded.shape)


(90, 13)
(90, 13)


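##### Both approaches produce arrays of the same shape containing the same features, although the column order differs: ColumnTransformer emits columns in the order its transformers are listed and appends passthrough columns (here `age`) last. ColumnTransformer makes encoding much easier than doing every step manually.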

In [131]:
# Display the ColumnTransformer output. Columns follow the transformer order
# (fever, cough, one-hot gender/city) with the passthrough column last.
X_train_new

array([[ 98.,   0.,   0., ...,   0.,   0.,  64.],
       [101.,   0.,   1., ...,   0.,   0.,  15.],
       [ 98.,   1.,   1., ...,   0.,   0.,  34.],
       ...,
       [104.,   0.,   1., ...,   0.,   0.,  51.],
       [102.,   1.,   0., ...,   0.,   0.,  82.],
       [100.,   1.,   0., ...,   0.,   0.,  11.]], shape=(90, 13))

In [133]:
# Display the manually assembled array. Columns follow the concatenation order:
# age, one-hot gender/city, fever, cough.
X_train_transformed

array([[ 64.,   0.,   0., ...,   0.,  98.,   0.],
       [ 15.,   1.,   0., ...,   0., 101.,   0.],
       [ 34.,   1.,   0., ...,   0.,  98.,   1.],
       ...,
       [ 51.,   1.,   0., ...,   0., 104.,   0.],
       [ 82.,   0.,   0., ...,   0., 102.,   1.],
       [ 11.,   0.,   0., ...,   0., 100.,   1.]], shape=(90, 13))