In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the toy COVID dataset and preview its first rows.
# NOTE(review): the path below is absolute and tied to one machine's home
# directory; it has to be adjusted before the notebook can run elsewhere.
df = pd.read_csv(filepath_or_buffer="/home/yash/Downloads/covid_toy.csv")
df.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


## Without Column Transformer

In [3]:
from sklearn.model_selection import train_test_split

# The first five columns (age, gender, fever, cough, city) are the features and
# has_covid is the label. 20% of the rows are held out, stratified on the label,
# with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, :5],
    df["has_covid"],
    test_size=0.2,
    random_state=0,
    stratify=df["has_covid"],
)

In [4]:
# Replace missing fever readings with the mean fever of the training split.
# The imputer is fitted on the training rows only and then reused for the test
# rows, so no test-set statistic is used for imputation.
from sklearn.impute import SimpleImputer

si = SimpleImputer(strategy="mean")
x_train_fever = si.fit_transform(x_train[["fever"]])  # single-column 2-D input -> (n_rows, 1)
x_test_fever = si.transform(x_test[["fever"]])

In [5]:
# Age needs no preprocessing; it is only turned into an (n_rows, 1) column
# array so it can be stacked next to the other feature blocks later on.
x_train_age = x_train[["age"]].to_numpy()
x_test_age = x_test[["age"]].to_numpy()

In [6]:
# One-hot encode the nominal columns gender and city.
x_train_gc = pd.get_dummies(x_train[["gender", "city"]])

# Encode the test rows the same way, then force them onto the training column
# layout: a category missing from the test split becomes an all-zero column and
# the column order is guaranteed to match the training matrix.
x_test_gc = pd.get_dummies(x_test[["gender", "city"]]).reindex(
    columns=x_train_gc.columns, fill_value=0
)

# Drop the pandas wrappers; downstream cells stack plain arrays.
x_train_gc, x_test_gc = np.array(x_train_gc), np.array(x_test_gc)

In [7]:
from sklearn.preprocessing import OrdinalEncoder

# Cough severity is ordered, so it gets integer codes instead of dummies.
# The explicit category list pins the order: Mild first, Strong second.
oe = OrdinalEncoder(categories=[["Mild", "Strong"]])
x_train_cough = oe.fit_transform(x_train[["cough"]])
x_test_cough = oe.transform(x_test[["cough"]])

In [8]:
from sklearn.preprocessing import LabelEncoder

# Turn the has_covid labels into integer class ids. The mapping is learned from
# the training labels and reused unchanged for the test labels.
le = LabelEncoder()
y_train_has_covid = le.fit_transform(y_train)
y_test_has_covid = le.transform(y_test)

In [9]:
# Stitch the separately preprocessed pieces back into one training frame, in the
# order age | gender+city dummies | fever | cough, and label the columns by hand.
# NOTE: this rebinds x_train to the processed frame, so the preprocessing cells
# above (which index x_train by its raw column names) cannot be re-run after
# this cell without re-running the train/test split first.
x_train = pd.DataFrame(
    np.hstack((x_train_age, x_train_gc, x_train_fever, x_train_cough)),
    columns=["Age", "Female", "Male", "Bangalore", "Delhi", "Kolkata", "Mumbai", "Fever", "Cough"],
)
x_train.head()

Unnamed: 0,Age,Female,Male,Bangalore,Delhi,Kolkata,Mumbai,Fever,Cough
0,34.0,1.0,0.0,0.0,0.0,0.0,1.0,100.736111,1.0
1,24.0,0.0,1.0,0.0,0.0,1.0,0.0,98.0,0.0
2,46.0,0.0,1.0,1.0,0.0,0.0,0.0,103.0,1.0
3,8.0,1.0,0.0,0.0,0.0,1.0,0.0,101.0,0.0
4,15.0,0.0,1.0,0.0,1.0,0.0,0.0,101.0,0.0


In [10]:
# Assemble the test frame with exactly the same column layout as the training
# frame: age | gender+city dummies | fever | cough.
# NOTE: this rebinds x_test to the processed frame, so the preprocessing cells
# above (which index x_test by its raw column names) cannot be re-run after
# this cell without re-running the train/test split first.
x_test = pd.DataFrame(
    np.hstack((x_test_age, x_test_gc, x_test_fever, x_test_cough)),
    columns=["Age", "Female", "Male", "Bangalore", "Delhi", "Kolkata", "Mumbai", "Fever", "Cough"],
)
x_test.head()

Unnamed: 0,Age,Female,Male,Bangalore,Delhi,Kolkata,Mumbai,Fever,Cough
0,49.0,1.0,0.0,0.0,1.0,0.0,0.0,101.0,0.0
1,64.0,0.0,1.0,1.0,0.0,0.0,0.0,102.0,0.0
2,38.0,1.0,0.0,1.0,0.0,0.0,0.0,101.0,0.0
3,65.0,0.0,1.0,0.0,1.0,0.0,0.0,99.0,0.0
4,34.0,0.0,1.0,0.0,0.0,1.0,0.0,98.0,1.0


In [11]:
# Wrap the encoded label arrays in single-column DataFrames, replacing the raw
# string labels previously held in y_train / y_test.
y_train, y_test = pd.DataFrame(y_train_has_covid), pd.DataFrame(y_test_has_covid)

## With Column Transformer

In [12]:
# Reload the raw data so this section starts from its own untouched frame.
a = pd.read_csv("/home/yash/Downloads/covid_toy.csv")
from sklearn.model_selection import train_test_split

# Split the frame loaded in this cell (`a`) rather than the `df` left over from
# the previous section; otherwise `a` is dead code and this section silently
# depends on earlier notebook state. The split arguments match the first
# section (20% test, seed 0, stratified on has_covid), so both approaches
# receive the same rows.
train_x, test_x, train_y, test_y = train_test_split(
    a.iloc[:, 0:5],
    a["has_covid"],
    test_size=0.2,
    random_state=0,
    stratify=a["has_covid"],
)

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# General shape:
#   ColumnTransformer(transformers=[(name, transformer, columns), ...],
#                     remainder="passthrough" | "drop")
# Each step is applied to its own columns and the outputs are concatenated in
# step order; with remainder="passthrough" the untouched columns (here: age)
# are appended at the end instead of being dropped.
transformer = ColumnTransformer(
    transformers=[
        # fever: fill missing values (SimpleImputer with its default settings)
        ("tf1", SimpleImputer(), ["fever"]),
        # cough: ordered codes, Mild first and Strong second
        ("tf2", OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"]),
        # gender, city: dense one-hot columns, first category of each dropped
        ("tf3", OneHotEncoder(sparse_output=False, drop="first"), ["gender", "city"]),
    ],
    remainder="passthrough",
)

# Learn every step from the training rows only, then apply the fitted
# transformer to the test rows. Both names are rebound to the transformed arrays.
train_x = transformer.fit_transform(train_x)
test_x = transformer.transform(test_x)

In [14]:
# Show the transformed training matrix. The columns are unlabeled; following the
# transformer's step order they are:
#   0 = fever (imputed), 1 = cough (ordinal), 2 = gender Male,
#   3-5 = city Delhi / Kolkata / Mumbai, 6 = age (passed through).
pd.DataFrame(train_x)

Unnamed: 0,0,1,2,3,4,5,6
0,100.736111,1.0,0.0,0.0,0.0,1.0,34.0
1,98.000000,0.0,1.0,0.0,1.0,0.0,24.0
2,103.000000,1.0,1.0,0.0,0.0,0.0,46.0
3,101.000000,0.0,0.0,0.0,1.0,0.0,8.0
4,101.000000,0.0,1.0,1.0,0.0,0.0,15.0
...,...,...,...,...,...,...,...
75,101.000000,0.0,0.0,0.0,0.0,1.0,19.0
76,101.000000,0.0,0.0,0.0,1.0,0.0,83.0
77,103.000000,0.0,0.0,0.0,1.0,0.0,48.0
78,104.000000,0.0,0.0,0.0,1.0,0.0,17.0
