<a href="https://colab.research.google.com/github/weasel-codes/covid-patient-recovery/blob/main/Covid19_Patient_Recovery_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Relevant imports

In [1]:
import numpy as np
import matplotlib.pyplot as plot
import pandas as pd

# Importing Dataset

In [2]:
# Load the raw patient records and keep only the columns used downstream.
dataset = pd.read_csv('PatientInfo.csv')

# Select the columns with a list rather than a set: a set has no defined
# order (so the column order could differ between runs) and pandas 2.x
# rejects set indexers with a TypeError.
dataset = dataset[['sex', 'age', 'infection_case', 'confirmed_date',
                   'released_date', 'deceased_date', 'state']]

print("No. of Nan values per column : ")
dataset.isnull().sum()

No. of Nan values per column : 


state                0
deceased_date     5099
sex               1122
age               1380
confirmed_date       3
infection_case     919
released_date     3578
dtype: int64

# Pre-Processing Dataset

In [3]:
print("Shape of dataset before removing Nan from any column : ", dataset.shape)

# Rows without a confirmation date cannot yield a duration.
dataset = dataset.dropna(subset=['confirmed_date'], how='all')
# Drop patients whose state is still 'isolated'.
dataset = dataset[dataset.state != 'isolated']
# Keep only rows that have at least one of the two outcome dates.
dataset = dataset.dropna(subset=['deceased_date', 'released_date'], how='all')

# Forward-fill the remaining gaps in the categorical columns. The filled
# columns are assigned back to the frame instead of calling
# fillna(..., inplace=True) on a column selection: that chained in-place form
# is deprecated and does not update the frame under pandas copy-on-write.
fill_columns = ['sex', 'age', 'infection_case']
dataset = dataset.assign(**{column: dataset[column].ffill() for column in fill_columns})

print("No. of Nan values per column after removing Nans : ")
print(dataset.isnull().sum())

print("\n\nShape of dataset after removing Nan from any column : ", dataset.shape)
print("Sample Dataset : \n", dataset.iloc[0])

Shape of dataset before removing Nan from any column :  (5165, 7)
No. of Nan values per column after removing Nans : 
state                0
deceased_date     1580
sex                  0
age                  0
confirmed_date       0
infection_case       0
released_date       64
dtype: int64


Shape of dataset after removing Nan from any column :  (1646, 7)
Sample Dataset : 
 state                    released
deceased_date                 NaN
sex                          male
age                           50s
confirmed_date         2020-01-23
infection_case    overseas inflow
released_date          2020-02-05
Name: 0, dtype: object


## Processing dataset columns

In [4]:
# Pull each column of interest out of the frame as its own NumPy array.
age, sex, state, confirm, deceased, infection, release = (
    np.array(dataset[column])
    for column in ('age', 'sex', 'state', 'confirmed_date',
                   'deceased_date', 'infection_case', 'released_date')
)

## Generating array

In [5]:
# Place the per-column arrays side by side, one row per patient, in the order
# [age, sex, infection, confirm, release, deceased, state].
columns_in_order = (age, sex, infection, confirm, release, deceased, state)
data = np.column_stack(columns_in_order)
print(data)

[['50s' 'male' 'overseas inflow' ... '2020-02-05' nan 'released']
 ['30s' 'male' 'overseas inflow' ... '2020-03-02' nan 'released']
 ['50s' 'male' 'contact with patient' ... '2020-02-19' nan 'released']
 ...
 ['30s' 'female' 'Itaewon Clubs' ... '2020-06-12' nan 'released']
 ['30s' 'female' 'overseas inflow' ... '2020-06-13' nan 'released']
 ['30s' 'female' 'overseas inflow' ... '2020-06-24' nan 'released']]


## Processing age for removing 's'

In [6]:
# Age arrives as a label such as '50s'; strip the 's' characters and store the
# numeric value back into column 0. Each `row` is a view into `data`, so the
# assignment updates the array itself.
for row in data:
    row[0] = pd.to_numeric(row[0].replace('s', ''))
print(data)

[[50 'male' 'overseas inflow' ... '2020-02-05' nan 'released']
 [30 'male' 'overseas inflow' ... '2020-03-02' nan 'released']
 [50 'male' 'contact with patient' ... '2020-02-19' nan 'released']
 ...
 [30 'female' 'Itaewon Clubs' ... '2020-06-12' nan 'released']
 [30 'female' 'overseas inflow' ... '2020-06-13' nan 'released']
 [30 'female' 'overseas inflow' ... '2020-06-24' nan 'released']]


## Create new column for no. of days.

In [7]:
print("Shape before adding column : ", data.shape)
# Append one zero-filled column on the right; it is a placeholder that a later
# cell fills with the number of days.
new_column = np.zeros((data.shape[0], 1))
data = np.hstack((data, new_column))
print("Shape after adding column : ", data.shape)

Shape before adding column :  (1646, 7)
Shape after adding column :  (1646, 8)


## Update No. of days
Dataset example :
[age, sex, infection, confirm, release, deceased, state, no_of_days]

In [8]:
# Row layout: [age, sex, infection, confirm, release, deceased, state, no_of_days]
one_day = np.timedelta64(1, 'D')
for row in data:
    # The deceased date is used only for a non-released patient that has one;
    # in every other case the released date marks the end of the interval.
    use_release_date = row[6] == 'released' or pd.isna(row[5])
    end_date = row[4] if use_release_date else row[5]
    row[7] = (np.datetime64(end_date) - np.datetime64(row[3])) / one_day

print("Check for days change : ", data[0])

Check for days change :  [50 'male' 'overseas inflow' '2020-01-23' '2020-02-05' nan 'released' 13.0]


## Selecting final relevant columns from array for processing data
Dataset example : [age, sex, infection, state, no_of_days]

In [9]:
# Keep age, sex, infection, state and no_of_days (columns 0, 1, 2, 6, 7);
# the three raw date columns are left behind.
keep_columns = [0, 1, 2, 6, 7]
relevant_data = data[:, keep_columns]

## Separating relevant columns from X and Y
We only need the number of days, not the individual dates.

In [10]:
# Features are age, sex, infection and no_of_days; the target is the state column.
feature_columns = [0, 1, 2, 4]
target_column = 3
X = relevant_data[:, feature_columns]
Y = relevant_data[:, target_column]
print(X[0], Y[0], sep='\n')

[50 'male' 'overseas inflow' 13.0]
released


## Encoding Dataset
encoding dataset ref : https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/

In [11]:
# Encoding categorical data: label encoding for gender and one-hot encoding for the infection case.
# Label encoding when there is a relation between values, e.g. male or female: each value gets an integer code (0, 1, 2, ...).
# One-hot encoding when there is no relation between values, e.g. infection case: makes a separate column for each value, holding binary values.
# reference : https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [12]:
# Use a dedicated encoder per column so each keeps its own class mapping.
# Refitting a single shared LabelEncoder on Y would overwrite the mapping
# learned for the sex column, making that column impossible to decode later.
sex_encoder = LabelEncoder()
X[:,1] = sex_encoder.fit_transform(X[:,1])  # sex -> integer codes

# `le` remains the encoder fitted on the target, exactly as it was after the
# original cell ran.
le = LabelEncoder()
Y = le.fit_transform(Y)  # state -> integer codes

# One-hot encode the infection case (column 2 of X); the remaining columns are
# passed through unchanged. The dense-output keyword is `sparse_output` in
# current scikit-learn and `sparse` in older releases, so try the new name
# first and fall back when it is not accepted.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
ct = ColumnTransformer([('encoder', encoder, [2])],remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [13]:
# Both arrays are displayed after encoding has been applied, so both labels
# say "After" (the X label previously read "Before").
print("After encoding X : ", X[0:5])
print("After encoding Y : ", Y[0:5])

Before encoding X :  [[0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 1.0 50 1 13.0]
 [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 1.0 30 1 32.0]
 [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
  0.0 0.0 0.0 50 1 20.0]
 [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 1.0 20 1 16.0]
 [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
  0.0 0.0 0.0 20 0 24.0]]
After encoding Y :  [1 1 1 1 1]


# Separating Dataset into Training and testing

In [14]:
# Split features and target into four arrays: X_train, X_test, Y_train, Y_test.
# 20% of the rows are held out for testing; random_state is fixed so the split
# is the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1
)

for label, part in (
    ("Train set for X : ", X_train),
    ("Test set for X : ", X_test),
    ("Train set for Y : ", Y_train),
    ("Test set for Y : ", Y_test),
):
    print(label, part.shape)

Train set for X :  (1316, 24)
Test set for X :  (330, 24)
Train set for Y :  (1316,)
Test set for Y :  (330,)
