In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

train_data = pd.read_excel("data/cases_2021_train_processed_2.xlsx")
test_data = pd.read_excel("data/cases_2021_test_processed_unlabelled_2.xlsx")

# 1.1 Feature Selection

In [None]:
train = train_data.copy()
train = train[['age', 'country', 'chronic_disease_binary', 'Case_Fatality_Ratio','outcome_group']]
test = test_data.copy()
test = test[['age', 'country', 'chronic_disease_binary', 'Case_Fatality_Ratio']]
train_data, test_data = train,test 

# 1.2 Feature Mapping

In [None]:
train = train_data.copy()
train['country'] = pd.factorize(train['country'])[0]
train['chronic_disease_binary'] = pd.factorize(train['chronic_disease_binary'])[0]
new_label = {"outcome_group": {"deceased": 0, "hospitalized": 1, "nonhospitalized": 2}}
train.replace(new_label, inplace = True)
test = test_data.copy()
test['country'] = pd.factorize(test['country'])[0]
test['chronic_disease_binary'] = pd.factorize(test['chronic_disease_binary'])[0]
train_data, test_data = train,test 

# 1.3 Balancing Classes

In [None]:
def show_train_dataset_pie_chart(train_dataset: pd.DataFrame, title: str):
    plt.figure()
    data = train_dataset.groupby("outcome_group").size()
    print("\n" + title + ",")
    print(data)
    data = [int(data[0]), int(data[1]), int(data[2])]
    labels = ["deceased", "hospitalized", "nonhospitalized"]
    colours = sns.color_palette('pastel')[0:4]
    plt.pie(x=data, labels=labels, colors=colours, autopct='%.0f%%')
    plt.title(title)
    plt.show()

show_train_dataset_pie_chart(train_data, "Before Balancing")

deceased = train_data[train_data["outcome_group"] == 0]
new_deceased = deceased.sample(frac=10, replace=True, random_state=1)
new_deceased.reset_index(inplace=True, drop=True)

hospitalized = train_data[train_data["outcome_group"] == 1]
hospitalized_sample = np.random.choice(hospitalized.index, 3000, replace=True)
new_hospitalized = hospitalized.drop(hospitalized_sample)
new_hospitalized.reset_index(inplace=True, drop=True)

nonhospitalized = train_data[train_data["outcome_group"] == 2]
new_nonhospitalized = nonhospitalized.sample(frac=3.3, replace=True, random_state=1)
new_nonhospitalized.reset_index(inplace=True, drop=True)

new_train = pd.concat([new_deceased, new_hospitalized, new_nonhospitalized])
new_train.sort_index(axis = 0, inplace=True)
new_train.reset_index(inplace=True, drop=True)

show_train_dataset_pie_chart(new_train, "After Balancing")

train_data = new_train

# 1.4 Building Models

### Train/Validation Split,

In [None]:
train_data, validation_data = train_test_split(train_data, test_size=0.2)

### XG Boost,

In [46]:
# Takes about 1 - 2 minutes to run.

# Decide number of k-fold splits
k = 5
# Create model with blank parameters
xgb_model = xgb.XGBClassifier(random_state = 1)
# Create space of possible parameters
parameter_search_space = {
    "learning_rate": [0.2, 0.3],
    "max_depth": [6, 8, 10],
    "n_estimators": [150, 250],
    "objective": ["multi:softmax"],
    "num_class": [3]
}
# Create grid search cross validation object
grid_search_cv = GridSearchCV(
    estimator=xgb_model,
    param_grid=parameter_search_space,
    scoring="f1_macro",
    cv=k,
    verbose=10
)
# Put data and labels in proper format
data = train_data.iloc[:, :4].values
labels = train_data.iloc[:, 4].values.reshape(-1, 1)
# Fit grid search object
grid_search_cv.fit(data, labels)
# Print and save results.
print("XG Boost GridSearchCV best score = " + str(grid_search_cv.best_score_))
print("XG Boost GridSearchCV best parameters = " + str(grid_search_cv.best_params_))
predictions = grid_search_cv.predict(data)
_, _, fscore, _ = precision_recall_fscore_support(predictions, labels)
print("XG Boost GridSearchCV deceased class f1-score = " + str(fscore[0]))
accuracy = accuracy_score(predictions, labels)
print("XG Boost GridSearchCV accuracy score = " + str(accuracy))
pd.DataFrame(grid_search_cv.cv_results_).to_csv("xgboost_results.csv")
xgb_model = grid_search_cv.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5; 1/12] START learning_rate=0.2, max_depth=6, n_estimators=150, num_class=3, objective=multi:softmax
[CV 1/5; 1/12] END learning_rate=0.2, max_depth=6, n_estimators=150, num_class=3, objective=multi:softmax;, score=0.811 total time=   0.6s
[CV 2/5; 1/12] START learning_rate=0.2, max_depth=6, n_estimators=150, num_class=3, objective=multi:softmax
[CV 2/5; 1/12] END learning_rate=0.2, max_depth=6, n_estimators=150, num_class=3, objective=multi:softmax;, score=0.802 total time=   0.6s
[CV 3/5; 1/12] START learning_rate=0.2, max_depth=6, n_estimators=150, num_class=3, objective=multi:softmax
[CV 3/5; 1/12] END learning_rate=0.2, max_depth=6, n_estimators=150, num_class=3, objective=multi:softmax;, score=0.818 total time=   0.6s
[CV 4/5; 1/12] START learning_rate=0.2, max_depth=6, n_estimators=150, num_class=3, objective=multi:softmax
[CV 4/5; 1/12] END learning_rate=0.2, max_depth=6, n_estimators=150, num_class=3, objectiv

### Random Forest,

In [None]:
# Takes about 3-4 minutes to run.

# Decide number of k-fold splits
k = 5
# Create model with blank parameters
rf_model = RandomForestClassifier(random_state = 44)
# Create space of possible parameters
parameter_search_space = {
    'n_estimators': [50, 100, 150, 200],
    'criterion': ["gini", "entropy"],
    'max_features': [1, 2, 3, 4]
  }
# Create grid search cross validation object
grid_search_cv = GridSearchCV(
    estimator=rf_model,
    param_grid=parameter_search_space,
    scoring="f1_macro",
    cv=k,
    verbose=10
)
# Put data and labels in proper format
data = train_data.iloc[:, :4].values
labels = train_data.iloc[:, 4].values.ravel()
# Fit grid search object
grid_search_cv.fit(data, labels)
# Print and save results.
print("Random Forest GridSearchCV best score = " + str(grid_search_cv.best_score_))
print("Random Forest GridSearchCV best parameters = " + str(grid_search_cv.best_params_))
predictions = grid_search_cv.predict(data)
_, _, fscore, _ = precision_recall_fscore_support(predictions, labels)
print("Random Forest GridSearchCV deceased class f1-score = " + str(fscore[0]))
accuracy = accuracy_score(predictions, labels)
print("Random Forest GridSearchCV accuracy score = " + str(accuracy))
pd.DataFrame(grid_search_cv.cv_results_).to_csv("rf_results.csv")
rf_model = grid_search_cv.best_estimator_

### MLP Classifier

In [None]:
# Takes about 12 minutes to run.

from sklearn.neural_network import MLPClassifier
# x_train = train_data[['age', 'country', 'chronic_disease_binary', 'Case_Fatality_Ratio']]
# y_train = train_data[['outcome_group']]
mlp_gs = MLPClassifier()
parameter_space = {
    'hidden_layer_sizes': [(10,30,10),(20,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

data = train_data.iloc[:, :4].values
labels = train_data.iloc[:, 4].values.reshape(-1, 1)
grid_search_cv = GridSearchCV(
    estimator=mlp_gs,
    param_grid=parameter_space,
    scoring="f1_macro",
    cv=5,
    verbose=10
)
grid_search_cv.fit(data,labels.ravel())

print("MLP Classifier GridSearchCV best score = " + str(grid_search_cv.best_score_))
print("MLP Classifier GridSearchCV best parameters = " + str(grid_search_cv.best_params_))
predictions = grid_search_cv.predict(data)
_, _, fscore, _ = precision_recall_fscore_support(predictions, labels)
print("MLP Classifier GridSearchCV deceased class f1-score = " + str(fscore[0]))
accuracy = accuracy_score(predictions, labels)
print("MLP Classifier GridSearchCV accuracy score = " + str(accuracy))
pd.DataFrame(grid_search_cv.cv_results_).to_csv("MLP Classifier_results.csv")
mlp_gs = grid_search_cv.best_estimator_


### Saving Hyper Parameter Results As Text Files

### Random Forest,

In [69]:
# UNCOMMENT TO REPRODUCE randomforest_tuning.txt FILE
"""
from sklearn.model_selection import KFold
data = train_data.iloc[:, :4].values
labels = train_data.iloc[:, 4].values.ravel()
rf_model = RandomForestClassifier(random_state = 44)
kf = KFold(n_splits=k)
n_estimators = [50, 100, 150, 200]
criterion = ["gini", "entropy"]
max_features = [1, 2, 3, 4]
text_file = open("randomforest_tuning.txt", "w")
for c in criterion:
    for n in n_estimators:
        for m in max_features:
            rf_model.set_params(criterion = c, n_estimators = n, max_features = m)
            deceased_f1score = 0
            mean_macro_f1score = 0
            accuracy = 0
            for i, (train_index, test_index) in enumerate(kf.split(data)):
                train_fold_data = np.take(data, train_index, 0)
                train_fold_labels = np.take(labels, train_index, 0)
                test_fold_data = np.take(data, test_index, 0)
                test_fold_labels = np.take(labels, test_index, 0)
                rf_model.fit(train_fold_data, train_fold_labels)
                predictions = rf_model.predict(test_fold_data)
                _, _, fscore, _ = precision_recall_fscore_support(predictions, test_fold_labels)
                deceased_f1score += fscore[0]
                mean_macro_f1score += ( fscore[0] + fscore[1] + fscore[2] ) / 3
                accuracy += accuracy_score(predictions, test_fold_labels)
            deceased_f1score = deceased_f1score / k
            mean_macro_f1score = mean_macro_f1score / k
            accuracy = accuracy / k
            output = "\ncriterion="+str(c)+",n_estimators="+str(n)+",max_features="+str(m)+" -> mean_macro_f1_score="+str(mean_macro_f1score)+",deceased_f1score="+str(deceased_f1score)+",accuracy="+str(accuracy)
            print(output)
            n = text_file.write(output)
text_file.close()
"""



criterion=gini,n_estimators=50,max_features=1 -> mean_macro_f1_score=0.8150136925404992,deceased_f1score=0.7294147046682057,accuracy=0.8198258665714221

criterion=gini,n_estimators=152,max_features=2 -> mean_macro_f1_score=0.8156660073405828,deceased_f1score=0.7304317157789292,accuracy=0.8204024219531991

criterion=gini,n_estimators=153,max_features=3 -> mean_macro_f1_score=0.8155120830777107,deceased_f1score=0.7303376452170739,accuracy=0.8202377112264131

criterion=gini,n_estimators=153,max_features=4 -> mean_macro_f1_score=0.8151214020911279,deceased_f1score=0.7302632352557803,accuracy=0.8198258750511753

criterion=gini,n_estimators=100,max_features=1 -> mean_macro_f1_score=0.8153279628980364,deceased_f1score=0.7294036085533686,accuracy=0.8201141866610768

criterion=gini,n_estimators=153,max_features=2 -> mean_macro_f1_score=0.8156672860312512,deceased_f1score=0.7303850836168241,accuracy=0.8204024304329524

criterion=gini,n_estimators=153,max_features=3 -> mean_macro_f1_score=0.8155

### XGBoost,

In [67]:
# UNCOMMENT TO REPRODUCE xgboost_tuning.txt FILE
"""
from sklearn.model_selection import KFold
xgb_model_clone = xgb.XGBClassifier(random_state = 1)
kf = KFold(n_splits=k)
learning_rate = [0.2, 0.3]
max_depth = [6, 8, 10]
n_estimators = [150, 250]
objective = "multi:softmax"
num_class = 3
xgb_model_clone.set_params(objective = "multi:softmax", num_class = 3)
text_file = open("xgboost_tuning.txt", "w")
for l in learning_rate:
    for n in n_estimators:
        for d in max_depth:
            xgb_model_clone.set_params(learning_rate = l, n_estimators = n, max_depth=d)
            deceased_f1score = 0
            mean_macro_f1score = 0
            accuracy = 0
            for i, (train_index, test_index) in enumerate(kf.split(data)):
                train_fold_data = np.take(data, train_index, 0)
                train_fold_labels = np.take(labels, train_index, 0)
                test_fold_data = np.take(data, test_index, 0)
                test_fold_labels = np.take(labels, test_index, 0)
                xgb_model_clone.fit(train_fold_data, train_fold_labels)
                predictions = xgb_model_clone.predict(test_fold_data)
                _, _, fscore, _ = precision_recall_fscore_support(predictions, test_fold_labels)
                deceased_f1score += fscore[0]
                mean_macro_f1score += ( fscore[0] + fscore[1] + fscore[2] ) / 3
                accuracy += accuracy_score(predictions, test_fold_labels)
            deceased_f1score = deceased_f1score / k
            mean_macro_f1score = mean_macro_f1score / k
            accuracy = accuracy / k
            output = "\nlearning_rate="+str(l)+",n_estimators="+str(n)+",max_depth="+str(d)+" -> mean_macro_f1_score="+str(mean_macro_f1score)+",deceased_f1score="+str(deceased_f1score)+",accuracy="+str(accuracy)
            print(output)
            n = text_file.write(output)
text_file.close()
"""



learning_rate=0.2,n_estimators=150,max_depth=6 -> mean_macro_f1_score=0.8135220243145304,deceased_f1score=0.7261081386042167,accuracy=0.8186316714034568

learning_rate=0.2,n_estimators=153,max_depth=8 -> mean_macro_f1_score=0.8137886347202773,deceased_f1score=0.7270035925773769,accuracy=0.8187139758883305

learning_rate=0.2,n_estimators=153,max_depth=10 -> mean_macro_f1_score=0.815123462767669,deceased_f1score=0.7291226916112434,accuracy=0.8199905518589485

learning_rate=0.2,n_estimators=250,max_depth=6 -> mean_macro_f1_score=0.8139689928476109,deceased_f1score=0.7272859564100248,accuracy=0.8188374834941603

learning_rate=0.2,n_estimators=153,max_depth=8 -> mean_macro_f1_score=0.8137886347202773,deceased_f1score=0.7270035925773769,accuracy=0.8187139758883305

learning_rate=0.2,n_estimators=153,max_depth=10 -> mean_macro_f1_score=0.815123462767669,deceased_f1score=0.7291226916112434,accuracy=0.8199905518589485

learning_rate=0.3,n_estimators=150,max_depth=6 -> mean_macro_f1_score=0.813

### MLP,

In [68]:
# UNCOMMENT TO REPRODUCE mlps_tuning.txt FILE
"""
from sklearn.model_selection import KFold
data = train_data.iloc[:, :4].values
labels = train_data.iloc[:, 4].values.ravel()
mlp_gs = MLPClassifier()
hidden_layer_sizes = [(10,30,10),(20,)]
activation =  ['tanh', 'relu']
solver =  ['sgd', 'adam']
alpha =  [0.0001, 0.05]
learning_rate =  ['constant','adaptive']

kf = KFold(n_splits=k)
text_file = open("mlp_tuning.txt", "w")
for h in hidden_layer_sizes:
    for act in activation:
        for s in solver:
            for a in alpha:
                for l in learning_rate:
                    mlp_gs.set_params(hidden_layer_sizes = h, activation = act, solver = s, alpha = a, learning_rate = l)
                    deceased_f1score = 0
                    mean_macro_f1score = 0
                    accuracy = 0
                    for i, (train_index, test_index) in enumerate(kf.split(data)):
                        train_fold_data = np.take(data, train_index, 0)
                        train_fold_labels = np.take(labels, train_index, 0)
                        test_fold_data = np.take(data, test_index, 0)
                        test_fold_labels = np.take(labels, test_index, 0)
                        rf_model.fit(train_fold_data, train_fold_labels)
                        predictions = rf_model.predict(test_fold_data)
                        _, _, fscore, _ = precision_recall_fscore_support(predictions, test_fold_labels)
                        deceased_f1score += fscore[0]
                        mean_macro_f1score += ( fscore[0] + fscore[1] + fscore[2] ) / 3
                        accuracy += accuracy_score(predictions, test_fold_labels)
                    deceased_f1score = deceased_f1score / k
                    mean_macro_f1score = mean_macro_f1score / k
                    accuracy = accuracy / k
                    output = "\nhidden_layer_sizes="+str(h)+",activation="+str(act)+",solver="+str(s)+",alpha="+str(a)+",learning_rate="+str(l)+" -> mean_macro_f1_score="+str(mean_macro_f1score)+",deceased_f1score="+str(deceased_f1score)+",accuracy="+str(accuracy)
                    print(output)
                    n = text_file.write(output)
text_file.close()
"""


hidden_layer_sizes=(10, 30, 10),activation=tanh,solver=sgd,alpha=0.0001,learning_rate=constant -> mean_macro_f1_score=0.8153298056812126,deceased_f1score=0.7303349953227495,accuracy=0.8200317804191644

hidden_layer_sizes=(10, 30, 10),activation=tanh,solver=sgd,alpha=0.0001,learning_rate=adaptive -> mean_macro_f1_score=0.8153298056812126,deceased_f1score=0.7303349953227495,accuracy=0.8200317804191644

hidden_layer_sizes=(10, 30, 10),activation=tanh,solver=sgd,alpha=0.05,learning_rate=constant -> mean_macro_f1_score=0.8153298056812126,deceased_f1score=0.7303349953227495,accuracy=0.8200317804191644

hidden_layer_sizes=(10, 30, 10),activation=tanh,solver=sgd,alpha=0.05,learning_rate=adaptive -> mean_macro_f1_score=0.8153298056812126,deceased_f1score=0.7303349953227495,accuracy=0.8200317804191644

hidden_layer_sizes=(10, 30, 10),activation=tanh,solver=adam,alpha=0.0001,learning_rate=constant -> mean_macro_f1_score=0.8153298056812126,deceased_f1score=0.7303349953227495,accuracy=0.8200317804

# 1.5 Check for overfitting

### XG Boost,

In [None]:
# Checking for overfitting on XG Boost model by comparing results on train versus validation datasets.
train_data_formatted = train_data.iloc[:, :4].values
train_labels_truth = train_data.iloc[:, 4].values.reshape(-1, 1)
train_labels_predicted = xgb_model.predict(train_data_formatted)
train_data_score = f1_score(train_labels_predicted, train_labels_truth, average = "macro")

validation_data_formatted = validation_data.iloc[:, :4].values
validation_labels_truth = validation_data.iloc[:, 4].values.reshape(-1, 1)
validation_labels_predicted = xgb_model.predict(validation_data_formatted)
validation_data_score = f1_score(validation_labels_predicted, validation_labels_truth, average = "macro")

print("Training Dataset F1-Score = " + str(train_data_score))
print("Validation Dataset F1-Score = " + str(validation_data_score))

### Random Forest,

In [None]:
# Checking for overfitting on Random Forest model by comparing results on train versus validation datasets.
train_data_formatted = train_data.iloc[:, :4].values
train_labels_truth = train_data.iloc[:, 4].values.ravel()
train_labels_predicted = rf_model.predict(train_data_formatted)
train_data_score = f1_score(train_labels_predicted, train_labels_truth, average = "macro")

validation_data_formatted = validation_data.iloc[:, :4].values
validation_labels_truth = validation_data.iloc[:, 4].values.ravel()
validation_labels_predicted = rf_model.predict(validation_data_formatted)
validation_data_score = f1_score(validation_labels_predicted, validation_labels_truth, average = "macro")

print("Training Dataset F1-Score = " + str(train_data_score))
print("Validation Dataset F1-Score = " + str(validation_data_score))

accuracy = accuracy_score(validation_labels_predicted, validation_labels_truth)
print("Validation dataset accuracy score = " + str(accuracy)) 

### MLP Classifier

In [None]:
train_data_formatted = train_data.iloc[:, :4].values
train_labels_truth = train_data.iloc[:, 4].values.reshape(-1, 1)
train_labels_predicted = mlp_gs.predict(train_data_formatted)
train_data_score = f1_score(train_labels_predicted, train_labels_truth, average = "macro")

validation_data_formatted = validation_data.iloc[:, :4].values
validation_labels_truth = validation_data.iloc[:, 4].values.reshape(-1, 1)
validation_labels_predicted = mlp_gs.predict(validation_data_formatted)
validation_data_score = f1_score(validation_labels_predicted, validation_labels_truth, average = "macro")

print("Training Dataset F1-Score = " + str(train_data_score))
print("Validation Dataset F1-Score = " + str(validation_data_score))

# 1.7 Prediction on test sets

In [None]:
import csv
# USING XG BOOST FOR NOW BUT WE CAN SUB THIS OUT FOR BEST PERFORMING MODEL LATER
testing_data = test_data.iloc[:, :4].values
predicted_labels = xgb_model.predict(testing_data)
# CHANGE MODEL NAME TO BEST PERFORMING MODEL LATER
model_name = "xgboost"
result_data_frame = pd.DataFrame(testing_data, columns=["age", "country", "chronic_disease_binary", "Case_Fatality_Ratio"])

# This function is from the TA
def create_submission_file(y_preds, file_name):
    with open(file_name, 'w') as csvfile:
        wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        wr.writerow(["Id", "Prediction"])
        for i, pred in enumerate(y_preds):
            wr.writerow([str(i), str(pred)])
create_submission_file(predicted_labels, "submission_"+model_name+".csv")

### Random Forest

In [None]:
# Depending on the final model selection, we can delete one of these

import csv
testing_data = test_data.iloc[:, :4].values
predicted_labels = rf_model.predict(testing_data)

model_name = "random_forest"
result_data_frame = pd.DataFrame(testing_data, columns=["age", "country", "chronic_disease_binary", "Case_Fatality_Ratio"])

# This function is from the TA
def create_submission_file(y_preds, file_name):
    with open(file_name, 'w') as csvfile:
        wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        wr.writerow(["Id", "Prediction"])
        for i, pred in enumerate(y_preds):
            wr.writerow([str(i), str(pred)])
create_submission_file(predicted_labels, "submission_"+model_name+".csv")