In [136]:

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns


In [137]:

# Load the COVID toy dataset from a CSV path relative to the notebook's working directory.
df = pd.read_csv("dataset/covid_toy.csv")


In [138]:

# Quick overview of the raw frame: random sample, shape, dtypes and null counts.
print("sample: \n  {}".format(df.sample(2)))

print()

print(f"shpae: {df.shape}")

print()

# df.info() writes its report to stdout itself and returns None, so it must be
# called as a statement; interpolating it into an f-string printed "info : None".
print("info : ")
df.info()


print()

print(f"Missing value count: \n {df.isnull().sum()}")


sample: 
      age  gender  fever   cough    city has_covid
84   69  Female   98.0  Strong  Mumbai        No
23   80  Female   98.0    Mild   Delhi       Yes

shpae: (100, 6)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB
info : None

Missing value count: 
 age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64


In [139]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing.
# A fixed random_state makes the split (and every downstream output) reproducible
# across kernel restarts; without it each run produces a different partition.
# The target is dropped by name rather than by position so the feature selection
# does not silently break if the column order changes.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["has_covid"]),
    df["has_covid"],
    test_size=0.3,
    random_state=42,
)

print(f"shape of X_train : {X_train.shape}")
print(f"shape of X_test : {X_test.shape}")
print(f"shape of y_train : {y_train.shape}")
print(f"shape of y_test : {y_test.shape}")


shape of X_train : (70, 5)
shape of X_test : (30, 5)
shape of y_train : (70,)
shape of y_test : (30,)


<br>
<br>


# Without ColumnTransformer (manual preprocessing):


<br>
<br>

#### Impute Missing Values:

In [140]:

from sklearn.impute import SimpleImputer

# Fill NaN values in the "fever" column with the column mean.
# The mean is learned from the training split only and then applied to both splits.
si = SimpleImputer(strategy="mean")
si.fit(X_train[["fever"]])

X_train_fever = si.transform(X_train[["fever"]])
X_test_fever = si.transform(X_test[["fever"]])

X_train_fever.shape


(70, 1)

#### Ordinal Encoding:

In [141]:
from sklearn.preprocessing import OrdinalEncoder

# "cough" is ordinal: the explicit category list fixes the order Mild < Strong.
oe = OrdinalEncoder(categories=[["Mild","Strong"]])

# Fit on the training split only; the test split is transformed with the
# already-fitted encoder (never re-fit on test data).
X_train_cough = oe.fit_transform(X_train[['cough']])
X_test_cough = oe.transform(X_test[['cough']])

X_test_cough.shape

(30, 1)

#### OHE -> City and Gender:

In [142]:
# Count how often each gender category occurs in the training split.
X_train["gender"].value_counts()

gender
Female    38
Male      32
Name: count, dtype: int64

In [143]:
# Count how often each city category occurs in the training split.
X_train["city"].value_counts()

city
Kolkata      27
Bangalore    18
Delhi        17
Mumbai        8
Name: count, dtype: int64

In [144]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the nominal columns; drop="first" removes one dummy per feature
# and sparse_output=False returns a dense NumPy array.
ohe = OneHotEncoder(drop="first",sparse_output=False)

# Learn the category set from the training split only.
X_train_gender_city = ohe.fit_transform(X_train[["gender","city"]])

# Apply the fitted encoder to the test split. Re-fitting here would learn the
# categories from the test rows, so the test columns could differ in number or
# meaning from the training columns.
X_test_gender_city = ohe.transform(X_test[["gender","city"]])

X_test_gender_city.shape

(30, 4)

#### Separating: Age

In [145]:

# The other feature pieces are NumPy arrays, so "age" is pulled out as one too.
# Selecting with a list of columns keeps the result 2-D, i.e. (n_rows, 1),
# which is the shape np.concatenate(..., axis=1) needs to line the pieces up.
X_train_age = X_train[["age"]].values
X_test_age = X_test[["age"]].values

X_train_age.shape


(70, 1)

#### Concatenate everything:

In [146]:
# Sanity check: every training piece must have the same number of rows before concatenation.
print(
    *(piece.shape for piece in (X_train_age, X_train_cough, X_train_gender_city, X_train_fever)),
    sep="\n",
)

(70, 1)
(70, 1)
(70, 4)
(70, 1)


In [160]:

# Stitch the separately transformed pieces together column-wise (axis=1).
# Column order: age, cough, gender/city dummies, fever.
X_train_transformed = np.concatenate(
    [X_train_age, X_train_cough, X_train_gender_city, X_train_fever],
    axis=1,
)
X_test_transformed = np.concatenate(
    [X_test_age, X_test_cough, X_test_gender_city, X_test_fever],
    axis=1,
)

X_train_transformed.shape


(70, 7)

<br>
<br>
<br>

# Now using ColumnTransformer:

<br>
<br>
<br>

In [149]:
# List the column names of the full frame (features plus target).
df.columns

Index(['age', 'gender', 'fever', 'cough', 'city', 'has_covid'], dtype='object')

In [157]:
from sklearn.compose import ColumnTransformer

# One object that applies a transformer to each listed group of columns:
#   tnf1 -> impute "fever", tnf2 -> ordinal-encode "cough",
#   tnf3 -> one-hot encode "gender" and "city".
# Columns not listed in any transformer are passed through unchanged.
ctm = ColumnTransformer(
    [
        ("tnf1", SimpleImputer(), ["fever"]),
        ("tnf2", OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"]),
        ("tnf3", OneHotEncoder(drop='first', sparse_output=False), ["gender", "city"]),
    ],
    remainder="passthrough",
)


In [159]:
# Fit all transformers on the training split, transform it, and check the output shape.
ctm.fit_transform(X_train).shape

(70, 7)

In [161]:
# Transform the test split without re-fitting (requires ctm to have been fitted already).
ctm.transform(X_test).shape

(30, 7)