# Data Preparation

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch
import torchvision
from torch.utils.data import Dataset

## Data Wrangling

In [2]:
# Loading CSV, 
df = pd.read_csv("covid-data.csv")

# Renaming columns,
df.rename(columns = {"DATE_DIED": "DECEASED", 
                     "HIPERTENSION": "HYPERTENSION",
                     "CLASIFFICATION_FINAL": "CLASSIFICATION",
                     "USMER": "USMR"}, inplace = True)

# Decoding dataframe,
df = df.astype("string")

df.loc[df["DECEASED"] != "9999-99-99", "DECEASED"] = "Yes"
df.loc[df["DECEASED"] == "9999-99-99", "DECEASED"] = "No"

df.loc[df["SEX"] == "1", "SEX"] = "Female"
df.loc[df["SEX"] == "2", "SEX"] = "Male"

df.loc[df["CLASSIFICATION"] >= "4", "CLASSIFICATION"] = "Negative Case"
df.loc[df["CLASSIFICATION"] == "3", "CLASSIFICATION"] = "Serious Case"
df.loc[df["CLASSIFICATION"] == "2", "CLASSIFICATION"] = "Moderate Case"
df.loc[df["CLASSIFICATION"] == "1", "CLASSIFICATION"] = "Mild Case"

df.loc[df["PNEUMONIA"] == "1", "PNEUMONIA"] = "Yes"
df.loc[df["PNEUMONIA"] == "2", "PNEUMONIA"] = "No"
df.loc[df["PNEUMONIA"] == "99", "PNEUMONIA"] = ""

df.loc[df["DIABETES"] == "1", "DIABETES"] = "Yes"
df.loc[df["DIABETES"] == "2", "DIABETES"] = "No"
df.loc[df["DIABETES"] == "98", "DIABETES"] = ""

df.loc[df["ASTHMA"] == "1", "ASTHMA"] = "Yes"
df.loc[df["ASTHMA"] == "2", "ASTHMA"] = "No"
df.loc[df["ASTHMA"] == "98", "ASTHMA"] = ""

df.loc[df["INMSUPR"] == "1", "INMSUPR"] = "Yes"
df.loc[df["INMSUPR"] == "2", "INMSUPR"] = "No"
df.loc[df["INMSUPR"] == "98", "INMSUPR"] = ""

df.loc[df["HYPERTENSION"] == "1", "HYPERTENSION"] = "Yes"
df.loc[df["HYPERTENSION"] == "2", "HYPERTENSION"] = "No"
df.loc[df["HYPERTENSION"] == "98", "HYPERTENSION"] = ""

df.loc[df["CARDIOVASCULAR"] == "1", "CARDIOVASCULAR"] = "Yes"
df.loc[df["CARDIOVASCULAR"] == "2", "CARDIOVASCULAR"] = "No"
df.loc[df["CARDIOVASCULAR"] == "98", "CARDIOVASCULAR"] = ""

df.loc[df["OBESITY"] == "1", "OBESITY"] = "Yes"
df.loc[df["OBESITY"] == "2", "OBESITY"] = "No"
df.loc[df["OBESITY"] == "98", "OBESITY"] = ""

df.loc[df["RENAL_CHRONIC"] == "1", "RENAL_CHRONIC"] = "Yes"
df.loc[df["RENAL_CHRONIC"] == "2", "RENAL_CHRONIC"] = "No"
df.loc[df["RENAL_CHRONIC"] == "98", "RENAL_CHRONIC"] = ""

df.loc[df["OTHER_DISEASE"] == "1", "OTHER_DISEASE"] = "Yes"
df.loc[df["OTHER_DISEASE"] == "2", "OTHER_DISEASE"] = "No"
df.loc[df["OTHER_DISEASE"] == "98", "OTHER_DISEASE"] = ""

df.loc[df["TOBACCO"] == "1", "TOBACCO"] = "Yes"
df.loc[df["TOBACCO"] == "2", "TOBACCO"] = "No"
df.loc[df["TOBACCO"] == "98", "TOBACCO"] = ""

df.loc[df["USMR"] == "1", "USMR"] = "Yes"
df.loc[df["USMR"] == "2", "USMR"] = "No"

df.loc[df["COPD"] == "1", "COPD"] = "Yes"
df.loc[df["COPD"] == "2", "COPD"] = "No"
df.loc[df["COPD"] == "98", "COPD"] = ""

df.loc[df["PATIENT_TYPE"] == "1", "PATIENT_TYPE"] = "Home"
df.loc[df["PATIENT_TYPE"] == "2", "PATIENT_TYPE"] = "Hospital"

df.loc[df["SEX"] == "Male", "PREGNANT"] = "No"
df.loc[df["PREGNANT"] == "1", "PREGNANT"] = "Yes"
df.loc[df["PREGNANT"] == "2", "PREGNANT"] = "No"
df.loc[df["PREGNANT"] == "98", "PREGNANT"] = ""

df.loc[df["ICU"] == "1", "ICU"] = "Yes"
df.loc[df["ICU"] == "2", "ICU"] = "No"
df.loc[df["ICU"] == "97", "ICU"] = ""
df.loc[df["ICU"] == "99", "ICU"] = ""

df.loc[df["INTUBED"] == "1", "INTUBED"] = "Yes"
df.loc[df["INTUBED"] == "2", "INTUBED"] = "No"
df.loc[df["INTUBED"] == "97", "INTUBED"] = ""
df.loc[df["INTUBED"] == "99", "INTUBED"] = ""

# Binning the age values,
df['AGE'] = df['AGE'].astype(int)
age_bins = [0, 5, 10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 80, 90, 100, 120]
age_labels = ["0-5", "5-10", "10-15", "15-20", "20-25", "25-30", "30-35", "35-40", 
              "40-50", "50-60", "60-70", "70-80", "80-90", "90-100", "100-120"]
df['AGE'] = pd.cut(df['AGE'], bins = age_bins, labels = age_labels)
df['AGE'] = df['AGE'].astype("string")

# Cleaning dataframe.
df.drop(columns = ["ICU", "INTUBED"], inplace = True)
df.replace('', np.nan, inplace = True)
df.dropna(inplace = True)

# Saving dataframe,
df.to_csv("covid-data-cleaned.csv")
df.info()
df

<class 'pandas.core.frame.DataFrame'>
Index: 1018204 entries, 0 to 1048574
Data columns (total 19 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   USMR            1018204 non-null  string
 1   MEDICAL_UNIT    1018204 non-null  string
 2   SEX             1018204 non-null  string
 3   PATIENT_TYPE    1018204 non-null  string
 4   DECEASED        1018204 non-null  string
 5   PNEUMONIA       1018204 non-null  string
 6   AGE             1018204 non-null  string
 7   PREGNANT        1018204 non-null  string
 8   DIABETES        1018204 non-null  string
 9   COPD            1018204 non-null  string
 10  ASTHMA          1018204 non-null  string
 11  INMSUPR         1018204 non-null  string
 12  HYPERTENSION    1018204 non-null  string
 13  OTHER_DISEASE   1018204 non-null  string
 14  CARDIOVASCULAR  1018204 non-null  string
 15  OBESITY         1018204 non-null  string
 16  RENAL_CHRONIC   1018204 non-null  string
 17  TOBACCO      

Unnamed: 0,USMR,MEDICAL_UNIT,SEX,PATIENT_TYPE,DECEASED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HYPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASSIFICATION
0,No,1,Female,Home,Yes,Yes,60-70,No,No,No,No,No,Yes,No,No,No,No,No,Serious Case
1,No,1,Male,Home,Yes,Yes,70-80,No,No,No,No,No,Yes,No,No,Yes,Yes,No,Negative Case
2,No,1,Male,Hospital,Yes,No,50-60,No,Yes,No,No,No,No,No,No,No,No,No,Serious Case
3,No,1,Female,Home,Yes,No,50-60,No,No,No,No,No,No,No,No,No,No,No,Negative Case
4,No,1,Male,Home,Yes,No,60-70,No,Yes,No,No,No,Yes,No,No,No,No,No,Serious Case
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,No,13,Male,Home,No,No,35-40,No,No,No,No,No,No,No,No,No,No,No,Negative Case
1048571,Yes,13,Male,Hospital,No,No,50-60,No,No,No,No,No,Yes,No,No,No,No,No,Negative Case
1048572,No,13,Male,Home,No,No,50-60,No,No,No,No,No,No,No,No,No,No,No,Negative Case
1048573,No,13,Male,Home,No,No,25-30,No,No,No,No,No,No,No,No,No,No,No,Negative Case


## Creating Machine Learning Dataset

In [3]:
class COVID_DATASET(Dataset):
    def __init__(self, X, Y, features, classes):

        # Converting X and Y to tensors,
        self.Y = torch.tensor(Y, dtype = torch.float32)
        self.X = torch.tensor(X, dtype = torch.float32)
        self.features = features
        self.classes = classes
        del X, Y, classes, features

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index], self.Y[index]

In [4]:
# Extracting features and class labels,
classes = list(df["DECEASED"].unique())
features = {}
for var in df.drop(columns = ["DECEASED"]).columns:
    features[var] = []
    if df[var].dtype == "string":
        for unique_value in df[var].unique():
            features[var].append(str(unique_value))
    else:
        pass

df_DECEASED_Yes = df.loc[df["DECEASED"] == "Yes"]
df_DECEASED_No = df.loc[df["DECEASED"] == "No"]
df_DECEASED_No = df_DECEASED_No.sample(n = len(df_DECEASED_Yes))
df = pd.concat([df_DECEASED_No, df_DECEASED_Yes])

# One-hot encoding categorical variables,
df_encoded = pd.get_dummies(df, dtype = int)
del df

df_X = df_encoded.drop(columns = ["DECEASED_No", "DECEASED_Yes"])
df_Y = df_encoded[["DECEASED_No", "DECEASED_Yes"]]
del df_encoded

# Creating training and testing split,
X_train, X_test, Y_train, Y_test = train_test_split(df_X, df_Y, test_size = 0.2, random_state = 69)
del df_X, df_Y

# Converting to numpy arrays,
X_train = X_train.to_numpy(dtype = np.float32)
X_test = X_test.to_numpy(dtype = np.float32)
Y_train = Y_train.to_numpy(dtype = np.int32)
Y_test = Y_test.to_numpy(dtype = np.int32)

# Saving training dataset,
training_dataset = COVID_DATASET(X_train, Y_train, features, classes)
del X_train, Y_train
torch.save(training_dataset, "training.pt")
del training_dataset

# Saving test dataset,
test_dataset = COVID_DATASET(X_test, Y_test, features, classes)
del X_test, Y_test
torch.save(test_dataset, "test.pt")
del test_dataset