In [None]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, fbeta_score, accuracy_score, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold, cross_validate
from imblearn.under_sampling import RandomUnderSampler

from sklearn import svm
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold, cross_val_score
import os

In [None]:
# Read the processed labelled cases and the unlabelled test cases from the
# current working directory.
test = pd.read_csv('cases_2021_test_processed_unlabelled_2.csv')
train = pd.read_csv('cases_2021_train_processed_2.csv')

# Raise the pandas display limits to 1000 rows / 1000 columns.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

<h3>### PART 1.1 FEATURE SELECTION ### </h3>

In [None]:
# Drop the country / province columns from both frames.
# Rationale recorded by the authors:
#   - the information is presumed to be implied by the other data
#     (incident rate, confirmed, etc.);
#   - there are too many distinct countries to treat each one as a level;
#   - some values are present in train but not test and vice versa.
test = test.drop(['province', 'country'], axis=1)
train = train.drop(['province', 'country'], axis=1)

In [None]:
# Drop the coordinate columns from both frames; the authors consider this
# information to be somewhat implicit in the remaining values.
test = test.drop(['latitude', 'longitude'], axis=1)
train = train.drop(['latitude', 'longitude'], axis=1)

In [None]:
# The raw Confirmed / Deaths / Recovered / Active counts are not used directly:
# their range of values is very wide, so the same information is captured as
# ratios instead (which also leaves fewer variables).
#   Case_Fatality_Ratio = Deaths / Confirmed      (already in the data)
#   Incident_Rate       = derivation not known to the authors
#                         (something with population and cases)
#   Case_Active_Ratio   = Active / Confirmed * 100   (added here)
# Recovered is dropped without a ratio of its own because it is expected to be
# highly correlated with Case_Fatality_Ratio and Case_Active_Ratio.
# NOTE(review): rows with Confirmed == 0 would produce NaN/inf here - confirm
# that none exist in the input files.
RAW_COUNT_COLUMNS = ['Confirmed', 'Deaths', 'Recovered', 'Active']

train = train.assign(
    Case_Active_Ratio=train['Active'].div(train['Confirmed']).mul(100)
).drop(columns=RAW_COUNT_COLUMNS)

test = test.assign(
    Case_Active_Ratio=test['Active'].div(test['Confirmed']).mul(100)
).drop(columns=RAW_COUNT_COLUMNS)

<h3>### PART 1.2 MAPPING THE FEATURES ###</h3>

In [None]:
# Encode outcome_group as integer category codes.
# Mapping recorded by the authors:
#   dec      = 0
#   hosp     = 1
#   non-hosp = 2
train['outcome_group'] = train['outcome_group'].astype('category').cat.codes

In [None]:
# Mapping sex
# 0 = Female
# 1 = Male
# The category list is learned from the training column and reused for the
# test column, so a given label always maps to the same code in both frames.
# Encoding each frame independently would assign different codes whenever the
# two frames do not contain exactly the same set of values. A test value that
# is missing or was never seen in training is encoded as -1.
sex_categories = pd.Categorical(train.sex).categories

train.sex = pd.Categorical(train.sex, categories=sex_categories).codes
test.sex = pd.Categorical(test.sex, categories=sex_categories).codes

In [None]:
# Mapping chronic_disease_binary
# 0 = False
# 1 = True
# The category list is learned from the training column and reused for the
# test column, so a given value always maps to the same code in both frames
# (e.g. a test file containing only True would otherwise be encoded as 0).
# A test value that is missing or was never seen in training is encoded as -1.
chronic_categories = pd.Categorical(train.chronic_disease_binary).categories

train.chronic_disease_binary = pd.Categorical(
    train.chronic_disease_binary, categories=chronic_categories).codes
test.chronic_disease_binary = pd.Categorical(
    test.chronic_disease_binary, categories=chronic_categories).codes

In [None]:
# Replace the confirmation date with its month number,
# e.g. 2020-04-23 -> 4.
test = test.assign(
    date_confirmation=pd.DatetimeIndex(test['date_confirmation']).month)
train = train.assign(
    date_confirmation=pd.DatetimeIndex(train['date_confirmation']).month)

<h3>### PART 1.3 Balancing Classes ###</h3>

In [None]:
# SMOTE oversampling is tried further down; the authors are unsure how much it
# will help but consider it worth a try.
#
# The validation set has to be split off BEFORE any oversampling.
# Each row is sent to train when a seeded uniform draw is below 0.8, giving an
# approximately 80/20 train/validation split.
np.random.seed(459)
train_ind = np.random.rand(len(train)) < 0.8

validation, train = train[~train_ind], train[train_ind]

In [None]:
# Only the training split is oversampled: separate its label column from the
# feature columns first.
Y_train = train['outcome_group']
X_train = train.drop('outcome_group', axis=1)

In [None]:
# Seeded SMOTE resampler, fitted on (and applied to) the training split only.
smote = SMOTE(random_state=459)
X_oversample, Y_oversample = smote.fit_resample(X=X_train, y=Y_train)

In [None]:
# This is our final training set: the oversampled features with the
# oversampled labels attached as the last column.
# `assign` builds a new frame, so `X_oversample` itself stays label-free
# (binding `train = X_oversample` and then adding the column would silently
# put the label into the feature frame as well).
train = X_oversample.assign(outcome_group=Y_oversample)

# The CSVs are written WITH the index as their first (unnamed) column; the
# model cells below strip that column again after reading the files back.
train.to_csv("train.csv")
validation.to_csv("validation.csv")
test.to_csv("test.csv")

<h3>### PART 1.4.1 KNN ###</h3>

In [None]:
# read
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
validation = pd.read_csv('validation.csv')

# processing for knn
# The first column of each CSV is the index written by to_csv; drop it.
train_knn = train.iloc[:, 1:]
test_knn = test.iloc[:, 1:].values

# Select columns by NAME rather than by hard-coded position, so the label is
# found wherever it sits in each file and the validation features are taken in
# exactly the same order as the training features.
feature_cols = [col for col in train_knn.columns if col != 'outcome_group']
cols = feature_cols + ['outcome_group']
validation_knn = validation[cols]

#x_train, y_train
X_train_knn = train_knn[feature_cols].values
y_train_knn = train_knn['outcome_group'].values
#x_val, y_val
X_validation_knn = validation_knn[feature_cols].values
y_validation_knn = validation_knn['outcome_group'].values

In [None]:
# hyper-para tuning using grid search
def custom_scorer(y_validation_knn, y_pred_knn):
    """Return the F1 score of class 0 (the 'deceased' code in this notebook).

    Parameters
    ----------
    y_validation_knn : array-like
        True class codes.
    y_pred_knn : array-like
        Predicted class codes.

    Returns
    -------
    float
        F1 score computed for label 0 only.

    Notes
    -----
    ``labels=[0]`` pins the score to class 0. Without it, element ``[0]`` of
    the per-class result is the score of the smallest label that happens to be
    present, which is a different class whenever 0 is absent from a fold.
    """
    return f1_score(y_validation_knn, y_pred_knn, labels=[0], average=None)[0]

# Metrics recorded for every parameter combination: overall accuracy, the F1
# of class 0 via custom_scorer ('f1_dec') and macro-averaged F1.
scoring_knn = dict(
    accuracy=make_scorer(accuracy_score),
    f1_dec=make_scorer(custom_scorer),
    f1_score=make_scorer(f1_score, average='macro'),
)

# Search space for the k-nearest-neighbours classifier.
knn_para = dict(
    n_neighbors=range(3, 13),
    weights=["uniform", "distance"],
    algorithm=["auto", "ball_tree", "kd_tree", "brute"],
    metric=["euclidean", "manhattan"],
)

# Shuffled, seeded 5-fold splitter (plain KFold, not stratified).
kfold = KFold(n_splits=5, shuffle=True, random_state=5)
knn = KNeighborsClassifier()

# Exhaustive search; the final refit uses the combination with the best
# 'f1_dec', and a combination whose fit fails is scored 0.
# NOTE(review): train.csv holds the SMOTE-oversampled training data, so the
# folds are cut after oversampling - the CV scores may be optimistic; confirm.
search = GridSearchCV(
    knn,
    knn_para,
    scoring=scoring_knn,
    refit='f1_dec',
    cv=kfold,
    n_jobs=-1,
    error_score=0,
)
results = search.fit(X_train_knn, y_train_knn)

In [None]:
# Configure the KNN estimator with the best parameters found by the search,
# refit it on the training data and predict the validation rows.
# (`set_params` updates `knn` in place, so `knn_best` is the same object.)
knn.set_params(**results.best_params_)
knn_best = knn
knn_best.fit(X_train_knn, y_train_knn)
y_pred_knn = knn_best.predict(X_validation_knn)

# Save the validation predictions as KNN_predict.csv (column 'KNN_labels').
val_id_pred = pd.DataFrame({'KNN_labels': y_pred_knn})
val_id_pred.to_csv('KNN_predict.csv')

# generating knn.txt for all parameter combinations and given metric

In [None]:
%%capture cap --no-stderr
for param, acc, f1, f1d in zip(results.cv_results_['params'], results.cv_results_['mean_test_accuracy'], results.cv_results_['mean_test_f1_score'], results.cv_results_['mean_test_f1_dec']):
    print ("\nParameters:", param, "\nMean accuracy\t   : ", acc, "\nMean macro F1 score: ", f1, "\nMean F1 deceased   : ", f1d)

In [None]:
# Persist the captured grid-search report as knn_tuning.txt.
with open('knn_tuning.txt', 'w') as f:
    f.write(cap.stdout)

In [None]:
# Performance of the best parameter set: cross-validate the tuned estimator
# over the validation data, using the same splitter and metrics as the search.
res = cross_validate(knn_best,
                     X_validation_knn,
                     y_validation_knn,
                     scoring=scoring_knn,
                     cv=kfold)

# Per-fold scores
print("All macro F1 scores across validation data: ")
print(', '.join(str(score) for score in res['test_f1_score']))
print("\nAll F1 scores across validation data for 'deceased':")
print(', '.join(str(score) for score in res['test_f1_dec']))

# Fold averages
print("\nMean accuracy across val    : {:0.3f}".format(res['test_accuracy'].mean()))
print("Mean macro F1 across val    : {:0.3f}".format(res['test_f1_score'].mean()))
print("Mean macro F1 for 'deceased': {:0.3f}".format(res['test_f1_dec'].mean()))

<h3>### PART 1.4.2 SVM ###</h3>

In [None]:
# Directory holding train.csv / validation.csv (the current working directory).
# Named `data_dir` so the builtin `dir()` is not shadowed.
data_dir = os.getcwd()


# Load Data
train = pd.read_csv(os.path.join(data_dir, "train.csv"))
validation = pd.read_csv(os.path.join(data_dir, "validation.csv"))


# Remove column 1 (unnamed column) - the index that to_csv wrote
train = train.iloc[:, 1:]
validation = validation.iloc[:, 1:]

# Split Labeled Data
train_y = train['outcome_group']
train_x = train.drop(columns='outcome_group')

validation_y = validation['outcome_group']
validation_x = validation.drop(columns='outcome_group')

# Unshuffled 5-fold splitter, used when cross-validating on the validation set
kfold = KFold(n_splits=5)

def deceased_f1(truth, pred):
    """Return the F1 score of class 0 (the 'deceased' code in this notebook).

    Parameters
    ----------
    truth : array-like
        True class codes.
    pred : array-like
        Predicted class codes.

    Returns
    -------
    float
        F1 score computed for label 0 only.

    Notes
    -----
    ``labels=[0]`` pins the score to class 0. Without it, element ``[0]`` of
    the per-class result is the score of the smallest label that happens to be
    present, which is a different class whenever 0 is absent from a fold.
    """
    return f1_score(truth, pred, labels=[0], average=None)[0]

# Metrics reported for the SVM.
# NOTE: the key 'f1_micro' actually holds the class-0 ('deceased') F1 computed
# by deceased_f1; the key name is kept because the results cell reads it.
scorers = {
    'f1_score': make_scorer(fbeta_score, beta=1, average='macro'),
    'accuracy': make_scorer(accuracy_score),
    'f1_micro': make_scorer(deceased_f1),
}

# Ranges explored during earlier tuning runs:
# kernels = ['rbf', 'sigmoid','linear']
# C_range = [0.1, 1, 2, 3, 4, 5, 6, 7, 10]
# gamma_range = [2, 1, 0.1, 0.01, 0.001, 0.0001]

# Parameter for the best results
param_grid = {'C': [6],
              'gamma': [.5],
              'kernel': ['rbf']
              }

grid = GridSearchCV(svm.SVC(), param_grid, scoring=scorers, cv=5, refit="accuracy")

# Fitting the model on the TRAINING split. Fitting on validation_x /
# validation_y would leave train_x / train_y unused and would make every later
# prediction on the validation set a prediction on the model's own training data.
grid.fit(train_x, train_y)

In [None]:
# See results for SVM: cross-validate the grid estimator over the validation
# split. `validation_x` / `validation_y` come from the SVM data-loading cell
# (`X_valid` / `y_valid` are only created later, in the XGB section, so using
# them here fails when the notebook is run top to bottom).
results = cross_validate(estimator=grid,
                         X=validation_x,
                         y=validation_y,
                         cv=kfold,
                         scoring=scorers)

# Per-fold scores ('test_f1_micro' holds the class-0 / deceased F1)
print("Macro F1-score:", results['test_f1_score'])
print("Overall accuracy:", results['test_accuracy'])
print("F1-score on deceased:", results['test_f1_micro'])

# Fold averages
print("Mean macro F1-score:", np.mean(results['test_f1_score']))
print("Mean F1-score on deceased:", np.mean(results['test_f1_micro']))
print("Mean overall accuracy:", np.mean(results['test_accuracy']))

In [None]:
# Save predictions for validation set.
# `validation_x` is the validation feature frame built in the SVM data-loading
# cell (`X_valid` is not defined until the XGB section further down).
grid_predictions = grid.predict(validation_x)

pd.DataFrame(grid_predictions, columns=['SVM_labels']).to_csv("SVM_predictions.csv")

<h3>### PART 1.4.3 XGB ###</h3>

In [None]:
# Split both frames into features (every column except 'outcome_group') and a
# list of labels. This is written inline because the `get_X_y` helper is only
# defined in a later cell, so calling it here fails on a fresh top-to-bottom run.
X_train = train.loc[:, ~train.columns.isin(['outcome_group'])]
y_train = list(train['outcome_group'])
X_valid = validation.loc[:, ~validation.columns.isin(['outcome_group'])]
y_valid = list(validation['outcome_group'])

# Base estimator handed to the grid search below
xgb_model = xgb.XGBClassifier(objective='multi:softmax', use_label_encoder=False, eval_metric=['merror'])

xgb.set_config(verbosity=1)

In [None]:
def deceased_f1(truth, pred):
    """Return the F1 score of class 0 (the 'deceased' code in this notebook).

    Parameters
    ----------
    truth : array-like
        True class codes.
    pred : array-like
        Predicted class codes.

    Returns
    -------
    float
        F1 score computed for label 0 only.

    Notes
    -----
    ``labels=[0]`` pins the score to class 0. Without it, element ``[0]`` of
    the per-class result is the score of the smallest label that happens to be
    present, which is a different class whenever 0 is absent from a fold.
    """
    return f1_score(truth, pred, labels=[0], average=None)[0]

# Metrics for the XGBoost grid search: macro F1, accuracy and the class-0
# ('deceased') F1, which is stored under the key 'f1_micro'.
scorers = dict(
    f1_score=make_scorer(fbeta_score, beta=1, average='macro'),
    accuracy=make_scorer(accuracy_score),
    f1_micro=make_scorer(deceased_f1),
)

# Params were altered across multiple grid search runs
# As described in report
params = dict(
    min_child_weight=[1],
    gamma=[0.5],
    subsample=[1],
    colsample_bytree=[0.4],
    n_estimators=[1500],
    max_depth=[5],
    learning_rate=[0.005],
)

# 5-fold search on all cores, refit on the combination with the best accuracy.
# NOTE(review): no `grid.fit(...)` call for this object is visible in the
# notebook - confirm whether the search is still meant to be run.
grid = GridSearchCV(
    xgb_model,
    param_grid=params,
    scoring=scorers,
    refit="accuracy",
    cv=5,
    n_jobs=-1,
)

In [None]:
# Final XGBoost model, using the same hyper-parameter values as the `params`
# grid. `eval_metric` is given to the constructor (as is done for `xgb_model`)
# rather than to fit(), where recent xgboost releases no longer accept it; the
# obsolete `silent` flag is dropped (verbosity is set through xgb.set_config).
classifier = xgb.XGBClassifier(min_child_weight=1,
                               gamma=0.5,
                               subsample=1,
                               colsample_bytree=0.4,
                               n_estimators=1500,
                               max_depth=5,
                               learning_rate=0.005,
                               # early_stopping_rounds=10,
                               objective='multi:softmax',
                               eval_metric=['merror'],
                               nthread=4)

# The metric is tracked on both splits while training
eval_set = [(X_train, y_train), (X_valid, y_valid)]
classifier.fit(X_train, y_train, eval_set=eval_set)

### Performance Check ###
# Metrics are computed inline: the `print_performance_xgb` helper that used to
# be called here is not defined anywhere in this notebook.
# Class predictions are the arg-max of the predicted class probabilities.
train1 = classifier.predict_proba(X_train)
valid1 = classifier.predict_proba(validation.loc[:, ~validation.columns.isin(['outcome_group'])])

print('### Train ###')
print(np.asarray(y_train))
print(np.argmax(train1, axis=1))

y_true = list(y_train)
y_pred = list(np.argmax(train1, axis=1))

print("Accuracy: ", accuracy_score(y_true, y_pred))
print(fbeta_score(y_true, y_pred, beta=1.0, average='macro'))

print('### Validation ###')
y_true = list(validation['outcome_group'])
y_pred = list(np.argmax(valid1, axis=1))

print("Accuracy: ", accuracy_score(y_true, y_pred))
print(fbeta_score(y_true, y_pred, beta=1.0, average='macro'))

# Save the validation predictions: the comparative study reads the
# 'Prediction' column of xgb_predictions.csv, which nothing else writes.
pd.DataFrame({'Prediction': y_pred}).to_csv('xgb_predictions.csv', index=False)

<h3>### PART 1.5 Overfitting ###</h3>

<h3>### PART 1.6 Comparative Study ###</h3>

In [None]:
def get_X_y(data):
    """Split a labelled frame into its features and its labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an 'outcome_group' column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``data`` without the 'outcome_group' column
        and ``y`` is a list of the 'outcome_group' values.
    """
    is_feature = data.columns != 'outcome_group'
    features = data.loc[:, is_feature]
    labels = list(data['outcome_group'])
    return features, labels

# Used to see how we performed on each class
def print_performance(truth, pred):
    """Print overall and per-class metrics for one set of predictions.

    Parameters
    ----------
    truth : array-like of int
        True class codes (0, 1, 2).
    pred : array-like of int
        Predicted class codes, in the same order and of the same length as
        ``truth``.

    Returns
    -------
    None
        Everything is printed: macro F1, accuracy, the detection rate of each
        of the classes 0, 1 and 2 (share of the rows truly in that class that
        were predicted as that class) and the per-class F1 scores.
    """
    # Work on plain arrays: lists then compare element-wise, and pandas index
    # labels play no part in matching a prediction to its true value.
    yt = np.asarray(truth)
    yp = np.asarray(pred)

    f1_s = fbeta_score(yt, yp, beta=1.0, average='macro')
    print("F1 Score: ", f1_s)

    print("Accuracy: ", np.mean(yt == yp))

    class_names = ((0, "Deceased"), (1, "Hospitalized"), (2, "Non-Hospitalized"))
    for code, name in class_names:
        in_class = yt == code
        n_in_class = np.sum(in_class)
        # A class with no true rows has no defined detection rate: report NaN
        # instead of dividing by zero.
        if n_in_class:
            rate = np.sum(yp[in_class] == code) / n_in_class
        else:
            rate = float('nan')
        print("Detection Rate Class {} ({}): ".format(code, name), rate)

    print("By class macro F1: ", f1_score(yt, yp, average=None))


# Load each model's saved validation predictions.
# The Series get their own names (`*_labels`) so that the imported `svm` and
# `xgb` modules and the `knn` estimator are not overwritten - rebinding those
# names breaks any model cell that is re-run after this one.
svm_labels = pd.read_csv('SVM_predictions.csv')['SVM_labels']
print(svm_labels)

knn_labels = pd.read_csv('KNN_predict.csv')['KNN_labels']
print(knn_labels)

xgb_labels = pd.read_csv('xgb_predictions.csv')['Prediction']
print(xgb_labels)

# Ground truth for the same validation rows
truth = pd.read_csv('validation.csv')['outcome_group']
print(truth)

print("### KNN PERFORMANCE ###")
print_performance(truth, knn_labels)
print("\n### SVM PERFORMANCE ###")
print_performance(truth, svm_labels)
print("\n### XGB PERFORMANCE ###")
print_performance(truth, xgb_labels)

<h3>### PART 1.7 Predictions on Test ###</h3>

In [None]:
# `test` was re-read from test.csv and therefore still carries the index column
# that to_csv wrote. Select exactly the columns the classifier was trained on,
# in the same order, before predicting.
test_features = test[list(X_train.columns)]

# Predicted class = arg-max of the predicted class probabilities
test_preds = list(np.argmax(classifier.predict_proba(test_features), axis=1))
test_preds = pd.DataFrame(test_preds, columns=['Prediction'])
test_preds['Id'] = test_preds.index
test_preds.to_csv('predictions.csv', index=False)

# Distribution of the predicted classes
print(test_preds['Prediction'].value_counts())