# Covid19 Therapy Prediction

In [137]:
import pandas as pd                 # pandas is a dataframe library
import matplotlib.pyplot as plt      # matplotlib.pyplot plots data

%matplotlib inline

### Definition of features

| Feature  | Description | Comments |
|--------------|-------------|--------|
| gender   | Gender of patiant | Class variable (m=Male, f=Female)
| dob | Date of birth |
| ethnicity | Ethnicity of patient |
| s_date | Covid19 Symptoms StartDate |
| h_date | Hospitalization Date |
| therapy | Class variable (1=Therapy A, 2=Therapy B) |  |

In [138]:
from random import choice, random
import time
from datetime import date, datetime

gender_map = {'m': 0, 'f': 1}
ethnicity_map = {'white_british': 0, 'white_irish': 1, 'white_gypsy': 2, 'white_other': 3, 
                'mixed_white_and_black_carribbean': 4, 'mixed_white_and_black_african': 5,
                'mixed_white_and_asain': 6, 'mixed_other': 7,
                'asian_indian': 8, 'asian_pakistani': 9, 'asian_chinese': 10, 'asian_other': 11,
                'black_african': 12, 'black_caribbean': 13, 'black_other': 14}
therapy_map = {'therapy A': 0, 'therapy B': 1, 'therapy C': 2}

genders = [*gender_map]
ethnicities = [*ethnicity_map]
therapies = [*therapy_map]

ISO_FMT = '%Y-%m-%d'

def random_date(start, end):
    stime = time.mktime(time.strptime(start, ISO_FMT))
    etime = time.mktime(time.strptime(end, ISO_FMT))
    ptime = stime + random() * (etime - stime)
    return time.strftime(ISO_FMT, time.localtime(ptime))

def get_age(dob):
    return date.today().year - datetime.strptime(dob, ISO_FMT).year
    

def random_sample_data():
    return {
        'gender': choice(genders),
        'ethnicity': choice(ethnicities),
        'dob': random_date('1930-01-01', '2020-01-01'),
        'therapy': choice(therapies) 
    }
    
samples_count = 1000
samples_df = pd.DataFrame.from_dict([random_sample_data() for _ in range(samples_count)])
samples_df.head()

Unnamed: 0,dob,ethnicity,gender,therapy
0,1979-03-21,asian_pakistani,m,therapy C
1,1944-03-26,asian_pakistani,f,therapy A
2,1972-12-22,black_african,f,therapy A
3,1931-06-27,mixed_white_and_black_african,m,therapy C
4,1949-05-20,white_irish,m,therapy B


# Mold Data

In [139]:
samples_df['gender'] = samples_df['gender'].map(gender_map)

samples_df['ethnicity'] = samples_df['ethnicity'].map(ethnicity_map)

samples_df['age'] = samples_df['dob'].apply(get_age)

samples_df['therapy'] = samples_df['therapy'].map(therapy_map)

In [140]:
samples_df.head()

Unnamed: 0,dob,ethnicity,gender,therapy,age
0,1979-03-21,9,0,2,41
1,1944-03-26,9,1,0,76
2,1972-12-22,12,1,0,48
3,1931-06-27,5,0,2,89
4,1949-05-20,1,0,1,71


###  Check for null values

In [141]:
samples_df.isnull().values.any()

False

### Check class distribution

Check id labeled data is evenly distributed

In [142]:
num_obs = len(samples_df)
for therapy in therapies:
    num_therapy = len(samples_df.loc[samples_df['therapy'] == therapy_map[therapy]])
    print("Number of {} cases:  {} ({:2.2f}%)".format(therapy, num_therapy, (num_therapy/num_obs) * 100))

Number of therapy A cases:  322 (32.20%)
Number of therapy B cases:  346 (34.60%)
Number of therapy C cases:  332 (33.20%)


### Prepare features/predcited data sets

In [143]:
from sklearn import preprocessing

feature_col_names = ['age', 'gender', 'ethnicity']
predicted_class_names = ['therapy']

X = samples_df[feature_col_names].values     # predictor feature columns
y = samples_df[predicted_class_names].values # predicted class (1=true, 0=false)

standard_scaler = preprocessing.StandardScaler()
X_scaled = standard_scaler.fit_transform(X)

In [144]:
X_scaled_df = pd.DataFrame(X_scaled, columns=feature_col_names)
X_scaled_df.describe()

Unnamed: 0,age,gender,ethnicity
count,1000.0,1000.0,1000.0
mean,1.218886e-16,-1.036948e-16,8.482104000000001e-17
std,1.0005,1.0005,1.0005
min,-1.794009,-0.9940179,-1.708101
25%,-0.873577,-0.9940179,-0.7727961
50%,0.06623233,-0.9940179,-0.07131704
75%,0.8413328,1.006018,0.8639884
max,1.655188,1.006018,1.565467


### Spliting the data

70% for training, 30% for testing

In [145]:
from sklearn.model_selection import train_test_split

split_test_size = 0.30
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=split_test_size, random_state=42) 

In [146]:
print("{0:0.2f}% in training set".format((len(X_train)/len(samples_df.index)) * 100))
print("{0:0.2f}% in test set".format((len(X_test)/len(samples_df.index)) * 100))

70.00% in training set
30.00% in test set


# Training Initial Algorithm - Naive Bayes

In [147]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train, y_train.ravel())

GaussianNB(priors=None, var_smoothing=1e-09)

### Performance on Training Data

In [148]:
# import the performance metrics library
from sklearn import metrics

In [149]:
nb_predict_train = nb_model.predict(X_train)

print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, nb_predict_train)))

Accuracy: 0.3800


### Performance on Testing Data

In [150]:
nb_predict_test = nb_model.predict(X_test)

print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, nb_predict_test)))

Accuracy: 0.2833


### Metrics

In [151]:
print("Confusion Matrix")
print("{0}".format(metrics.confusion_matrix(y_test, nb_predict_test)))
print("")

print("Classification Report")
print(metrics.classification_report(y_test, nb_predict_test))

Confusion Matrix
[[15 63 26]
 [16 49 29]
 [18 63 21]]

Classification Report
              precision    recall  f1-score   support

           0       0.31      0.14      0.20       104
           1       0.28      0.52      0.36        94
           2       0.28      0.21      0.24       102

    accuracy                           0.28       300
   macro avg       0.29      0.29      0.27       300
weighted avg       0.29      0.28      0.26       300



# Random Forest

In [152]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42, n_estimators=10)
rf_model.fit(X_train, y_train.ravel()) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

### Predict Training Data

In [153]:
rf_predict_train = rf_model.predict(X_train)
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, rf_predict_train)))

Accuracy: 0.8986


### Predict Test Data

In [154]:
rf_predict_test = rf_model.predict(X_test)
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, rf_predict_test)))

Accuracy: 0.3233


In [155]:
print(metrics.confusion_matrix(y_test, rf_predict_test) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, rf_predict_test))

[[41 40 23]
 [30 35 29]
 [47 34 21]]

Classification Report
              precision    recall  f1-score   support

           0       0.35      0.39      0.37       104
           1       0.32      0.37      0.34        94
           2       0.29      0.21      0.24       102

    accuracy                           0.32       300
   macro avg       0.32      0.32      0.32       300
weighted avg       0.32      0.32      0.32       300

