# 1 Import Dataset

In [None]:
#### IMPORTS ####
import pandas as pd # for data manipulation and .describe()

#### LOAD THE RAW DATA ####
# Full export. Point this at a smaller (or outcome-only) extract for a quicker run.
raw_csv_path = "datasets/latestdata.csv"
cov = pd.read_csv(raw_csv_path)

# Quick look at the data: number of non-missing entries in every column.
# (cov["sex"].describe() is another useful check.)
non_null_counts = cov.count()
print(non_null_counts)



# 2 Data Preparation

## 2.1 Drop irrelevant columns and rows

In [None]:
#### CHOOSE THE COLUMNS FOR THE PROBLEM ####
# Interesting combinations that do not leave quite enough complete rows:
#   ["age", "sex", "outcome", "date_admission_hospital", "date_onset_symptoms", "country"] -> 228 rows
#   ["outcome", "date_admission_hospital", "date_onset_symptoms"] -> 234 rows
#   ["outcome", "date_admission_hospital", "date_confirmation"] -> 262 rows
#   ["outcome", "date_confirmation", "date_onset_symptoms"] -> 3505 rows
# Leaving the dates out is worth a look as well:
#   ["outcome", "country", "age", "sex"] -> 33599 rows

# Columns used from here on: 3493 complete rows, mostly from the Philippines.
fields = ["outcome", "age", "sex", "date_confirmation", "date_onset_symptoms", "country"]

#### DATA CLEANING ####
# Keep only those columns, then discard every row that is missing any of them.
dataset = cov.loc[:, fields].dropna(subset=fields)

# Persist the subset so later runs can start from it instead of the raw file.
dataset.to_csv("datasets/dataset1.csv")
print('stored dataset')

In [None]:
# Shortcut: start from the stored subset rather than re-running the steps above.
import pandas as pd
dataset = pd.read_csv('datasets/dataset1.csv')
print('read dataset from dataset1.csv')
# The row labels saved alongside the data come back as an 'Unnamed: 0' column;
# remove it so only the real fields remain.
dataset = dataset.drop('Unnamed: 0', axis='columns')

## 2.2 Feature encoding

In [None]:
# Collapse the free-text outcome labels into two codes: 0 and 1.
labels_coded_0 = ['died', 'death', 'Deceased', 'dead', 'severe']
labels_coded_1 = [
    'stable',
    'treated in an intensive care unit (14.02.2020)',
    'Symptoms only improved with cough. Currently hospitalized for follow-up.',  # TODO drop and compare results
    'Hospitalized',  # TODO drop and compare
    'discharge',
    'discharged',
    'Discharged',
    'Alive',
    'recovered',
]
outcome_codes = {
    **dict.fromkeys(labels_coded_0, 0),
    **dict.fromkeys(labels_coded_1, 1),
}
dataset = dataset.replace(to_replace={'outcome': outcome_codes})

# Show the first few raw ages before encoding (head must be called, not just referenced).
print(dataset['age'].head())

# Replace each ten-year age bracket with its midpoint.
# The labels are matched exactly rather than as regular expressions: as
# unanchored patterns, '0-9' would also match inside '90-99'.
dataset = dataset.replace(to_replace={'age': {
    '0-9': 5,
    '10-19': 15,
    '20-29': 25,
    '30-39': 35,
    '40-49': 45,
    '50-59': 55,
    '60-69': 65,
    '70-79': 75,
    '80-89': 85,
    '90-99': 95,
}})
# (One-hot encoding the brackets with pd.get_dummies(dataset['age']) is an alternative.)

# Rescale the ages with a min-max scaler so they sit on a comparable scale to
# the other encoded features.
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/test split performed later in this file.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
dataset[['age']] = scaler.fit_transform(dataset[['age']])

# Show the first few ages again, now encoded and scaled.
print(dataset['age'].head())



# Replace the two date columns with days_waiting: the number of whole days
# between symptom onset and confirmation.
#
# The dates are parsed column-wise instead of cell by cell. Writing through
# dataset[col][i] is chained assignment, which does not update the frame under
# pandas copy-on-write, and looking rows up by the label i only works when the
# index happens to be exactly 0..n-1.
from datetime import date, datetime, timedelta  # left in place so these names stay available to later cells

# Both columns hold day.month.year strings; a value that does not fit the
# format raises ValueError, as datetime.strptime did.
confirmed_on = pd.to_datetime(dataset['date_confirmation'], format=r'%d.%m.%Y')
onset_on = pd.to_datetime(dataset['date_onset_symptoms'], format=r'%d.%m.%Y')
gaps = confirmed_on - onset_on

dataset['days_waiting'] = gaps.dt.days
dataset = dataset.drop(columns=['date_confirmation', 'date_onset_symptoms'])

# A confirmation dated before the onset of symptoms is treated as bad data and dropped.
dataset = dataset[dataset['days_waiting'] >= 0]

# Turn the sex labels into integer codes: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
dataset = dataset.replace(to_replace={'sex': sex_codes})

# One-hot encode the country: one indicator column per country is appended to
# the frame and the original text column is removed.
countries_df = pd.get_dummies(dataset['country'])
dataset = pd.concat((dataset, countries_df), axis='columns').drop(columns='country')

# Write the fully encoded dataset to disk for later use.
dataset.to_csv('datasets/dataset1_clean.csv')

## 2.3 Reweighting

## 2.4 Split the dataset into features/labels and test/train

In [None]:
# Separate the target column (outcome) from the predictor columns.
# features = dataset[['days_waiting', 'age', 'sex', 'country']]
labels = dataset['outcome']
features = dataset.drop(columns=['outcome'])

# Show how often each feature combination and each label occurs.
print('features\n', features.value_counts())
print('labels\n', labels.value_counts())

# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=0,
)

# 3 Visualisation

# 4 Model Training

In [None]:
# Extra weight given to class 0 relative to class 1 by the models that accept
# class weights (the ratio presumably comes from the class counts - TODO confirm).
class_weights = {0: 2832 / 624, 1: 1}

# Support Vector Machine
from sklearn import svm
svm_classifier = svm.SVC(gamma='auto', class_weight=class_weights).fit(X_train, y_train)

# Logistic regression
# (a multinomial variant, multi_class='multinomial', was tried previously)
from sklearn import linear_model
LR_classifier = linear_model.LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    class_weight=class_weights,
).fit(X_train, y_train)

# Decision tree
from sklearn import tree
tree_classifier = tree.DecisionTreeClassifier(class_weight=class_weights).fit(X_train, y_train)

# Naive Bayes (Gaussian; MultinomialNB is a possible alternative)
from sklearn.naive_bayes import GaussianNB
nb_classifier = GaussianNB().fit(X_train, y_train)

# XGBoost, with default settings
import xgboost as xgb
xgb_classifier = xgb.XGBClassifier().fit(X_train, y_train)

# 5 Model Evaluation

## 5.1 Use the models to make some predictions

In [None]:
# Run every fitted model over the held-out test features, in the same order
# the models were trained.
(
    svm_predictions,
    LR_predictions,
    tree_predictions,
    nb_predictions,
    xgb_predictions,
) = (
    fitted_model.predict(X_test)
    for fitted_model in (
        svm_classifier,
        LR_classifier,
        tree_classifier,
        nb_classifier,
        xgb_classifier,
    )
)



## 5.2 Measure the accuracy of the predictions

In [None]:
# Score each model's predictions against the true test labels.
# (random forest: placeholder only - no such model is fitted in the cells above)
from sklearn import metrics
(
    svm_accuracy,
    LR_accuracy,
    tree_accuracy,
    nb_accuracy,
    xgb_accuracy,
) = (
    metrics.accuracy_score(y_test, predicted)
    for predicted in (
        svm_predictions,
        LR_predictions,
        tree_predictions,
        nb_predictions,
        xgb_predictions,
    )
)

print(f'SVM score: {svm_accuracy}\nLR score: {LR_accuracy}\nTree score: {tree_accuracy}\nNaive Bayes score: {nb_accuracy}\nXGB score: {xgb_accuracy}')



## 5.3 Produce a more detailed report

In [None]:
# Produce a classification report (per-class precision, recall and F1) for the SVM.
# The legend in the message matches the outcome encoding used earlier in this
# file, which is binary: died/severe outcomes are coded 0 and every other
# outcome is coded 1. There is no class 2.
print(f'Classification report for svm (remember 0:died/severe, 1:alive/recovered (0.805)): \n{metrics.classification_report(y_test, svm_predictions)}')
# These scores are all around the 80% mark.
# I'd like to know what the true and false positive rates are (precision = % of predicted positives that are correct and recall = % of actual positives that are found)
# I think it'll be necessary to adjust the sampling or reweight the inputs
# get rid of hospitalised - they probably recovered
# reweight the died class: for every entry repeat 4 times
# make a few plots such as age/country and days waiting
# Should I be doing k-fold validation? Do I have enough data for that?
# There are also several model parameters that could be tuned