In [None]:
import pandas as pd
from joblib import load
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, precision_score, recall_score,f1_score, roc_curve
from sklearn.utils import resample
from datetime import datetime, timedelta
import numpy as np

In [None]:
def preproc(data, features, target):
    """Clean the raw per-day records and aggregate them to one row per test.

    Processing steps, in order:

    1. ``study_day`` is derived as the number of days between
       ``day_updated_at`` and the day before ``invite_date``; only rows with
       ``study_day < 3`` are kept.
    2. ``shortness_of_breath`` and ``fatigue`` are binarised
       (``'no'`` / missing -> 0, ``'mild'`` / ``'significant'`` / ``'severe'`` -> 1).
    3. Missing values in the indicator columns listed in ``dummies`` become 0
       and the columns are cast to ``int``.
    4. BMI values outside ``[15, 66]`` (including +/-inf) are replaced by the
       median of the in-range values; missing BMI stays missing.
    5. The per-``test_id`` maximum of ``features + [target]`` is taken.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw records. Must contain ``test_id``, ``day_updated_at``,
        ``invite_date``, ``bmi_clean`` and every column named in ``dummies``.
        The frame passed in is not modified.
    features : list of str
        Columns to aggregate and return as predictors.
    target : str
        Name of the label column.

    Returns
    -------
    X : pandas.DataFrame
        Aggregated predictors, without ``target``, ``test_id`` and
        ``date_taken_specific``.
    y : pandas.Series
        Aggregated ``target`` column, row-aligned with ``X``.

    Raises
    ------
    KeyError
        If ``shortness_of_breath`` or ``fatigue`` holds a non-missing value
        other than the four known labels.
    """
    dummies = [
     'result',
     'have_worked_in_hospital_care_facility',
     'have_worked_in_hospital_clinic',
     'have_worked_in_hospital_home_health',
     'have_worked_in_hospital_inpatient',
     'have_worked_in_hospital_other',
     'have_worked_in_hospital_outpatient',
     'have_worked_in_hospital_school_clinic',
     'contact_health_worker',
     'gender',
     'hcw',
     'need_inside_help',
     'need_outside_help',
     'needs_help',
     'housebound_problems',
     'help_available',
     'mobility_aid',
     'has_diabetes',
     'has_heart_disease',
     'has_lung_disease',
     'has_kidney_disease',
     'persistent_cough',
     'fatigue',
     'delirium',
     'shortness_of_breath',
     'fever',
     'diarrhoea',
     'abdominal_pain',
     'chest_pain',
     'hoarse_voice',
     'skipped_meals',
     'loss_of_smell',
     'headache',
     'eye_soreness',
     'nausea',
     'dizzy_light_headed',
     'red_welts_on_face_or_lips',
     'blisters_on_feet',
     'unusual_muscle_pains',
     'sore_throat'
    ]

    # Work on a private copy: the caller's frame must not gain columns or have
    # values rewritten as a side effect of preprocessing.
    data = data.copy()
    data['study_day'] = (pd.to_datetime(data.day_updated_at) - (pd.to_datetime(data.invite_date) - timedelta(days=1))).dt.days
    # .copy() so the assignments below write to a real frame, not to a slice.
    data = data[data.study_day < 3].copy()

    sob_f_dict = {'mild': 1, 'severe': 1, 'no': 0, 'significant': 1}

    def _binarise(value):
        # Missing answers count as "no symptom". Testing with pd.isna avoids
        # relying on NaN being usable as a dictionary key. Unknown labels still
        # raise KeyError so unexpected data is not silently coerced.
        return 0 if pd.isna(value) else sob_f_dict[value]

    for col in ('shortness_of_breath', 'fatigue'):
        data[col] = data[col].map(_binarise)
    data[dummies] = data[dummies].fillna(0).astype(int)

    # Replace BMI outliers with the median of the plausible values only.
    # Both bounds must hold (&): with | every non-missing value would count as
    # plausible and the outliers themselves would leak into the median.
    bmi = data['bmi_clean']
    in_range = (bmi >= 15) & (bmi <= 66)
    bmi_median = bmi[in_range].median()
    data['bmi_clean'] = np.where(in_range | bmi.isna(), bmi, bmi_median)

    df_agg = data.groupby(['test_id'])[list(features) + [target]].max().reset_index()

    # errors='ignore' because date_taken_specific is only present when it was
    # requested as a feature.
    X = df_agg.drop(columns=[target, 'test_id', 'date_taken_specific'], errors='ignore')
    y = df_agg[target]

    return X, y

In [None]:
# Get sensitivity and specificity scores
def sens_spec(y, yp):
    """Return ``(sensitivity, specificity)`` for binary labels and predictions.

    Parameters
    ----------
    y : array-like
        True binary labels (``False``/``True`` or ``0``/``1``).
    yp : array-like
        Predicted binary labels, same length as ``y``.

    Returns
    -------
    tuple of float
        ``(sensitivity, specificity)``: recall of the positive class and
        recall of the negative class.
    """
    sensitivity = recall_score(y, yp)
    # Specificity is the recall of the negative class. Asking for it directly
    # avoids looking the class up by its string name in a classification
    # report, which only works when the negative label prints as 'False'.
    specificity = recall_score(y, yp, pos_label=0)
    return sensitivity, specificity

In [None]:
# Map probability to 4-category classifier
def mapping(x):
    """Translate a predicted probability into a risk category from 1 to 4.

    The cut-offs are read from the notebook-level ``thresholds`` array at
    positions ``idx_high_spec``, ``idx_optimal`` and ``idx_high_sens``.
    The first cut-off that ``x`` reaches (``>=``), checked in that order,
    decides the category; a value reaching none of them is category 1.
    """
    cutoffs = ((4, idx_high_spec), (3, idx_optimal), (2, idx_high_sens))
    for category, cut_idx in cutoffs:
        if x >= thresholds[cut_idx]:
            return category
    return 1

In [None]:
# Load the evaluation data - point the path below at the validation file.
# The first CSV column is read as the index and then discarded, the
# 'unusual_muscle_pains' column is dropped, and max_test is recoded from
# 1/2 to False/True.
test_df = (
    pd.read_csv('../../covid-early-detection/data/NewlyTested_Upto040520.csv', index_col=0)
    .reset_index(drop=True)
    .drop(columns='unusual_muscle_pains')
    .assign(max_test=lambda frame: frame['max_test'].map({1: False, 2: True}))
)

In [None]:
# Number of resampling iterations run by each evaluation loop (for r in range(REPS)).
REPS = 50
# Label column, kept as a list so it can be concatenated with PAT_FEATURES
# when selecting columns for the per-patient aggregation.
TARGET = ['max_test']

## 48-hour predictions

In [None]:
# Number of rows a patient must have to be kept by the filter below; also the
# divisor used when the symptom columns are averaged per patient.
N = 2

In [None]:
# Keep only the patients that have exactly N rows.
two_days_test_df = (
    test_df
    .groupby('patient_id')
    .filter(lambda patient_rows: patient_rows.shape[0] == N)
    .reset_index(drop=True)
)

In [None]:
# Unpack the stored model bundle: the classifier, the three operating-point
# indices into `thresholds`, the ROC arrays and the feature-name lists.
# joblib.load unpickles the file, so only load bundles from a trusted source.
(
    clf,
    idx_optimal,
    idx_high_sens,
    idx_high_spec,
    thresholds,
    fpr,
    tpr,
    FEATURES,
    ALL_SYMPTOMS,
    PAT_FEATURES,
) = load('./Grouped_RF_2_12_05.joblib')

In [None]:
# One row per patient: symptom columns become sum / N, patient-level columns
# and the target become the per-patient mean.
grouped_test_df = pd.concat(
    [
        two_days_test_df.groupby('patient_id')[ALL_SYMPTOMS].agg(lambda col: col.sum() / N),
        two_days_test_df.groupby('patient_id')[PAT_FEATURES + TARGET].mean(),
    ],
    axis=1,
).reset_index(drop=True)

In [None]:
# Bootstrap evaluation of the NHS symptom rule and of the model at its three
# operating points. auc_vals is collected alongside the other metrics but is
# not part of results_1_df.
auc_vals = []
sens_vals = []
spec_vals = []
cl_types = []
res_list = []

for r in range(REPS):

    # Draw a bootstrap replicate: row labels are sampled with replacement and
    # selected with .loc so that repeated labels give repeated rows. Filtering
    # with index.isin(ids) would collapse the repeats and keep only the
    # distinct rows. Seeding with the replicate number makes re-runs
    # reproducible.
    ids = resample(grouped_test_df.index.unique(), random_state=r)
    X_test = grouped_test_df.loc[ids, FEATURES].copy()
    y_test = grouped_test_df.loc[ids, TARGET].values[:,0]
    X_test['p_predicted_covid'] = clf.predict_proba(X_test[FEATURES])[:,1]

    # NHS rule: positive when any of fever, loss of smell or persistent cough is reported.
    nhs_prediction = (X_test['fever']+X_test['loss_of_smell']+X_test['persistent_cough'])>0
    true_y = y_test

    cl_type = 'nhs'
    cl_types.append(cl_type)
    auc_vals.append(roc_auc_score(true_y, nhs_prediction))
    sens_nhs, spec_nhs = sens_spec(true_y, nhs_prediction)
    sens_vals.append(sens_nhs)
    spec_vals.append(spec_nhs)

    # The model AUC depends only on the probabilities, not on the operating
    # point, so it is computed once per replicate.
    p_predicted_covid = X_test['p_predicted_covid']
    model_auc = roc_auc_score(true_y, p_predicted_covid)

    for cl_type, idx in zip(['optimal','high sensitivity', 'high specificity'], [idx_optimal, idx_high_sens, idx_high_spec]):

        # >= so the binary call agrees with mapping(), which treats a
        # probability equal to the threshold as reaching that category.
        # NOTE(review): assumes `thresholds` comes from sklearn's roc_curve,
        # where score >= threshold counts as positive - confirm.
        X_test['predicted_covid'] = p_predicted_covid >= thresholds[idx]

        predicted_covid = X_test['predicted_covid']
        cl_types.append(cl_type)
        auc_vals.append(model_auc)
        sens_val, spec_val = sens_spec(true_y, predicted_covid)
        sens_vals.append(sens_val)
        spec_vals.append(spec_val)

    # Cross-tabulate the 4-category prediction against the swab result.
    # NOTE(review): a (category, result) cell with no rows in a replicate is
    # absent from the pivot and therefore skipped by the mean taken below.
    X_test['4cat_predicted_covid'] = [mapping(p) for p in X_test['p_predicted_covid']]
    aux_df = pd.concat([X_test.reset_index(drop=True), pd.DataFrame(y_test,columns=['swab_test'])], axis=1)
    res_list.append(aux_df.groupby(['4cat_predicted_covid','swab_test']).size().reset_index().pivot_table(index='4cat_predicted_covid',columns='swab_test',values=0))


aux_df = pd.DataFrame({'cl_type':cl_types, 'Sensitivity': sens_vals, 'Specificity': spec_vals})
results_1_df = aux_df.groupby(['cl_type'])[['Sensitivity','Specificity']].agg(['mean','std']).reset_index()
results_2_df = pd.concat(res_list, axis=0).groupby('4cat_predicted_covid').mean()

In [None]:
# Mean and standard deviation of sensitivity / specificity per classifier, to 2 decimals.
results_1_df.round(2)

In [None]:
# Each cell as a percentage of the grand total.
results_2_df.div(results_2_df.sum().sum()).mul(100)

In [None]:
# Each cell as a percentage of its column total.
results_2_df.div(results_2_df.sum(), axis='columns').mul(100)

## 72-hour predictions

In [None]:
# Number of rows a patient must have to be kept by the filter below; also the
# divisor used when the symptom columns are averaged per patient.
N = 3

In [None]:
# Keep only the patients that have exactly N rows.
three_days_test_df = (
    test_df
    .groupby('patient_id')
    .filter(lambda patient_rows: patient_rows.shape[0] == N)
    .reset_index(drop=True)
)

In [None]:
# Unpack the stored model bundle: the classifier, the three operating-point
# indices into `thresholds`, the ROC arrays and the feature-name lists.
# joblib.load unpickles the file, so only load bundles from a trusted source.
(
    clf,
    idx_optimal,
    idx_high_sens,
    idx_high_spec,
    thresholds,
    fpr,
    tpr,
    FEATURES,
    ALL_SYMPTOMS,
    PAT_FEATURES,
) = load('./Grouped_RF_3_12_05.joblib')

In [None]:
# One row per patient: symptom columns become sum / N, patient-level columns
# and the target become the per-patient mean.
grouped_test_df = pd.concat(
    [
        three_days_test_df.groupby('patient_id')[ALL_SYMPTOMS].agg(lambda col: col.sum() / N),
        three_days_test_df.groupby('patient_id')[PAT_FEATURES + TARGET].mean(),
    ],
    axis=1,
).reset_index(drop=True)

In [None]:
# Bootstrap evaluation of the NHS symptom rule and of the model at its three
# operating points. auc_vals is collected alongside the other metrics but is
# not part of results_1_df.
auc_vals = []
sens_vals = []
spec_vals = []
cl_types = []
res_list = []

for r in range(REPS):

    # Draw a bootstrap replicate: row labels are sampled with replacement and
    # selected with .loc so that repeated labels give repeated rows. Filtering
    # with index.isin(ids) would collapse the repeats and keep only the
    # distinct rows. Seeding with the replicate number makes re-runs
    # reproducible.
    ids = resample(grouped_test_df.index.unique(), random_state=r)
    X_test = grouped_test_df.loc[ids, FEATURES].copy()
    y_test = grouped_test_df.loc[ids, TARGET].values[:,0]
    X_test['p_predicted_covid'] = clf.predict_proba(X_test[FEATURES])[:,1]

    # NHS rule: positive when any of fever, loss of smell or persistent cough is reported.
    nhs_prediction = (X_test['fever']+X_test['loss_of_smell']+X_test['persistent_cough'])>0
    true_y =  y_test

    cl_type = 'nhs'
    cl_types.append(cl_type)
    auc_vals.append(roc_auc_score(true_y, nhs_prediction))
    sens_nhs, spec_nhs = sens_spec(true_y, nhs_prediction)
    sens_vals.append(sens_nhs)
    spec_vals.append(spec_nhs)

    # The model AUC depends only on the probabilities, not on the operating
    # point, so it is computed once per replicate.
    p_predicted_covid = X_test['p_predicted_covid']
    model_auc = roc_auc_score(true_y, p_predicted_covid)

    for cl_type, idx in zip(['optimal','high sensitivity', 'high specificity'], [idx_optimal, idx_high_sens, idx_high_spec]):

        # >= so the binary call agrees with mapping(), which treats a
        # probability equal to the threshold as reaching that category.
        # NOTE(review): assumes `thresholds` comes from sklearn's roc_curve,
        # where score >= threshold counts as positive - confirm.
        X_test['predicted_covid'] = p_predicted_covid >= thresholds[idx]

        predicted_covid = X_test['predicted_covid']

        cl_types.append(cl_type)
        auc_vals.append(model_auc)
        sens_val, spec_val = sens_spec(true_y, predicted_covid)
        sens_vals.append(sens_val)
        spec_vals.append(spec_val)

    # Cross-tabulate the 4-category prediction against the swab result.
    # NOTE(review): a (category, result) cell with no rows in a replicate is
    # absent from the pivot and therefore skipped by the mean taken below.
    X_test['4cat_predicted_covid'] = [mapping(p) for p in X_test['p_predicted_covid']]
    aux_df = pd.concat([X_test.reset_index(drop=True), pd.DataFrame(y_test,columns=['swab_test'])], axis=1)
    res_list.append(aux_df.groupby(['4cat_predicted_covid','swab_test']).size().reset_index().pivot_table(index='4cat_predicted_covid',columns='swab_test',values=0))


aux_df = pd.DataFrame({'cl_type':cl_types, 'Sensitivity': sens_vals, 'Specificity': spec_vals})
results_1_df = aux_df.groupby(['cl_type'])[['Sensitivity','Specificity']].agg(['mean','std']).reset_index()
results_2_df = pd.concat(res_list, axis=0).groupby('4cat_predicted_covid').mean()

In [None]:
# Mean and standard deviation of sensitivity / specificity per classifier, to 2 decimals.
results_1_df.round(2)

In [None]:
# Each cell as a percentage of the grand total.
results_2_df.div(results_2_df.sum().sum()).mul(100)

In [None]:
# Each cell as a percentage of its column total.
results_2_df.div(results_2_df.sum(), axis='columns').mul(100)

## Logit model (72-hour predictions)

This model requires slightly different preprocessing of the validation file.

In [None]:
# From here on TARGET is a single column name (a string, not a list): it is
# passed to preproc() as `target` and used as a column label below.
TARGET='result'

In [None]:
# Unpack the stored model bundle: the classifier, the three operating-point
# indices into `thresholds`, the ROC arrays and the feature-name lists.
# joblib.load unpickles the file, so only load bundles from a trusted source.
(
    clf,
    idx_optimal,
    idx_high_sens,
    idx_high_spec,
    thresholds,
    fpr,
    tpr,
    FEATURES,
    ALL_SYMPTOMS,
    PAT_FEATURES,
) = load('./Logit_16_11.joblib')

In [None]:
# Location of the validation file - replace with your own path.
path = '../../anna-may-data-science/hackathon/data/val_set.csv'
# Read the raw validation records with pandas' default parsing options.
test_df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Preprocess only the rows flagged `official`, then put the aggregated label
# back next to the predictors so both can be resampled together.
grouped_test_df, processed_y = preproc(
    data=test_df.loc[test_df['official']],
    features=FEATURES,
    target=TARGET,
)
grouped_test_df = grouped_test_df.assign(**{TARGET: processed_y})

In [None]:
# preproc() drops date_taken_specific from the predictors, so it must not be
# requested from grouped_test_df. Rebuilding the list (instead of calling
# .remove()) keeps this cell safe to re-run: a second .remove() would raise
# ValueError because the name is already gone.
FEATURES = [name for name in FEATURES if name != 'date_taken_specific']

In [None]:
# Bootstrap evaluation of the NHS symptom rule and of the model at its three
# operating points. auc_vals is collected alongside the other metrics but is
# not part of results_1_df.
auc_vals = []
sens_vals = []
spec_vals = []
cl_types = []
res_list = []

for r in range(REPS):

    # Draw a bootstrap replicate: row labels are sampled with replacement and
    # selected with .loc so that repeated labels give repeated rows. Filtering
    # with index.isin(ids) would collapse the repeats and keep only the
    # distinct rows. Seeding with the replicate number makes re-runs
    # reproducible.
    ids = resample(grouped_test_df.index.unique(), random_state=r)
    X_test = grouped_test_df.loc[ids, FEATURES].copy()
    y_test = np.array(grouped_test_df.loc[ids, TARGET].values, dtype=bool)
    X_test['p_predicted_covid'] = clf.predict_proba(X_test[FEATURES])[:,1]

    # NHS rule: positive when any of fever, loss of smell or persistent cough is reported.
    nhs_prediction = (X_test['fever']+X_test['loss_of_smell']+X_test['persistent_cough'])>0
    true_y =  y_test

    cl_type = 'nhs'
    cl_types.append(cl_type)
    auc_vals.append(roc_auc_score(true_y, nhs_prediction))
    sens_nhs, spec_nhs = sens_spec(true_y, nhs_prediction)
    sens_vals.append(sens_nhs)
    spec_vals.append(spec_nhs)

    # The model AUC depends only on the probabilities, not on the operating
    # point, so it is computed once per replicate.
    p_predicted_covid = X_test['p_predicted_covid']
    model_auc = roc_auc_score(true_y, p_predicted_covid)

    for cl_type, idx in zip(['optimal','high sensitivity', 'high specificity'], [idx_optimal, idx_high_sens, idx_high_spec]):

        # >= so the binary call agrees with mapping(), which treats a
        # probability equal to the threshold as reaching that category.
        # NOTE(review): assumes `thresholds` comes from sklearn's roc_curve,
        # where score >= threshold counts as positive - confirm.
        X_test['predicted_covid'] = p_predicted_covid >= thresholds[idx]

        predicted_covid = X_test['predicted_covid']

        cl_types.append(cl_type)
        auc_vals.append(model_auc)
        sens_val, spec_val = sens_spec(true_y, predicted_covid)
        sens_vals.append(sens_val)
        spec_vals.append(spec_val)

    # Cross-tabulate the 4-category prediction against the swab result.
    # NOTE(review): a (category, result) cell with no rows in a replicate is
    # absent from the pivot and therefore skipped by the mean taken below.
    X_test['4cat_predicted_covid'] = [mapping(p) for p in X_test['p_predicted_covid']]
    aux_df = pd.concat([X_test.reset_index(drop=True), pd.DataFrame(y_test,columns=['swab_test'])], axis=1)
    res_list.append(aux_df.groupby(['4cat_predicted_covid','swab_test']).size().reset_index().pivot_table(index='4cat_predicted_covid',columns='swab_test',values=0))


aux_df = pd.DataFrame({'cl_type':cl_types, 'Sensitivity': sens_vals, 'Specificity': spec_vals})
results_1_df = aux_df.groupby(['cl_type'])[['Sensitivity','Specificity']].agg(['mean','std']).reset_index()
results_2_df = pd.concat(res_list, axis=0).groupby('4cat_predicted_covid').mean()

In [None]:
# Mean and standard deviation of sensitivity / specificity per classifier, to 2 decimals.
results_1_df.round(2)

In [None]:
# Each cell as a percentage of the grand total.
results_2_df.div(results_2_df.sum().sum()).mul(100)

In [None]:
# Each cell as a percentage of its column total.
results_2_df.div(results_2_df.sum(), axis='columns').mul(100)