In [23]:
import pandas as pd
from joblib import load
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, precision_score, recall_score,f1_score, roc_curve
from sklearn.utils import resample

In [24]:
# Get sensitivity and specificity scores
def sens_spec(y, yp):
    return recall_score(y,yp), classification_report(y, yp, output_dict=True)['False']['recall']

In [25]:
# Map probability to 4-category classifier
def mapping(x):
    if x >= thresholds[idx_high_spec]:
        return 4
    elif  (x < thresholds[idx_high_spec]) & (x >= thresholds[idx_optimal]):
        return 3
    elif  (x < thresholds[idx_optimal]) & (x >= thresholds[idx_high_sens]):
        return 2
    else:
        return 1      

In [26]:
# Load test data - replace csv with the validation file
test_df = pd.read_csv('../../covid-early-detection/data/NewlyTested_Upto090520.csv', index_col=0).reset_index(drop=True).drop(columns='unusual_muscle_pains')
test_df['max_test'] = test_df.max_test.map({1:False,2:True})

In [27]:
REPS = 50
TARGET = ['max_test']

## 48 hours predictions

In [28]:
N = 2

In [29]:
two_days_test_df = test_df.groupby('patient_id').filter(lambda x: len(x)==N).reset_index(drop=True)

In [30]:
clf, idx_optimal, idx_high_sens, idx_high_spec, thresholds, fpr, tpr, FEATURES, ALL_SYMPTOMS, PAT_FEATURES = load('./Grouped_RF_2_12_05.joblib')

In [31]:
grouped_test_df = pd.concat([two_days_test_df.groupby('patient_id')[ALL_SYMPTOMS].agg(lambda x:x.sum()/N),
                             two_days_test_df.groupby('patient_id')[PAT_FEATURES+TARGET].mean()], axis=1).reset_index(drop=True)

In [32]:
auc_vals = []
sens_vals = []
spec_vals = []
cl_types = []
res_list = []

for r in range(REPS):
    
    ids = resample(grouped_test_df.index.unique())
    X_test = grouped_test_df.loc[grouped_test_df.index.isin(ids), FEATURES]
    y_test = grouped_test_df.loc[grouped_test_df.index.isin(ids), TARGET].values[:,0]
    X_test.loc[:,'p_predicted_covid'] = clf.predict_proba(X_test.loc[:, FEATURES])[:,1]

    for cl_type, idx in zip(['optimal','high sensitivity', 'high specificity'], [idx_optimal, idx_high_sens, idx_high_spec]):
        
        X_test.loc[:,'predicted_covid'] = X_test.loc[:,'p_predicted_covid'] >thresholds[idx]
        
        predicted_covid = X_test.predicted_covid
        p_predicted_covid = X_test.p_predicted_covid
        true_y =  y_test
        cl_types.append(cl_type)
        auc_vals.append(roc_auc_score(true_y, p_predicted_covid))
        sens_val, spec_val = sens_spec(true_y, predicted_covid)
        sens_vals.append(sens_val)
        spec_vals.append(spec_val)
    
    X_test.loc[:,'4cat_predicted_covid'] = list(map(lambda x: mapping(x), X_test.loc[:,'p_predicted_covid']))
    aux_df = pd.concat([X_test.reset_index(drop=True), pd.DataFrame(y_test,columns=['swab_test'])], axis=1)
    res_list.append(aux_df.groupby(['4cat_predicted_covid','swab_test']).size().reset_index().pivot_table(index='4cat_predicted_covid',columns='swab_test',values=0))


aux_df = pd.DataFrame({'cl_type':cl_types, 'Sensitivity': sens_vals, 'Specificity': spec_vals})
results_1_df = aux_df.groupby(['cl_type'])[['Sensitivity','Specificity']].agg(['mean','std']).reset_index()
results_2_df = pd.concat(res_list, axis=0).groupby('4cat_predicted_covid').mean()

In [33]:
results_1_df

Unnamed: 0_level_0,cl_type,Sensitivity,Sensitivity,Specificity,Specificity
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
0,high sensitivity,0.636728,0.062033,0.721405,0.009351
1,high specificity,0.140368,0.046178,0.985019,0.002789
2,optimal,0.165415,0.051843,0.966559,0.004352


In [34]:
results_2_df/results_2_df.sum().sum()*100

swab_test,False,True
4cat_predicted_covid,Unnamed: 1_level_1,Unnamed: 2_level_1
1,69.982881,1.069293
2,23.78247,1.384375
3,1.791252,0.124048
4,1.453842,0.411839


In [35]:
results_2_df/results_2_df.sum()*100

swab_test,False,True
4cat_predicted_covid,Unnamed: 1_level_1,Unnamed: 2_level_1
1,72.139533,35.767635
2,24.51537,46.307054
3,1.846453,4.149378
4,1.498645,13.775934


## 72 hours predictions

In [36]:
N = 3

In [37]:
three_days_test_df = test_df.groupby('patient_id').filter(lambda x: len(x)==N).reset_index(drop=True)

In [38]:
clf, idx_optimal, idx_high_sens, idx_high_spec, thresholds, fpr, tpr, FEATURES, ALL_SYMPTOMS, PAT_FEATURES = load('./Grouped_RF_3_12_05.joblib')

In [39]:
grouped_test_df = pd.concat([three_days_test_df.groupby('patient_id')[ALL_SYMPTOMS].agg(lambda x:x.sum()/N),
                             three_days_test_df.groupby('patient_id')[PAT_FEATURES+TARGET].mean()], axis=1).reset_index(drop=True)

In [40]:
auc_vals = []
sens_vals = []
spec_vals = []
cl_types = []
res_list = []

for r in range(REPS):
    
    ids = resample(grouped_test_df.index.unique())
    X_test = grouped_test_df.loc[grouped_test_df.index.isin(ids), FEATURES]
    y_test = grouped_test_df.loc[grouped_test_df.index.isin(ids), TARGET].values[:,0]
    X_test.loc[:,'p_predicted_covid'] = clf.predict_proba(X_test.loc[:, FEATURES])[:,1]

    for cl_type, idx in zip(['optimal','high sensitivity', 'high specificity'], [idx_optimal, idx_high_sens, idx_high_spec]):
        
        X_test.loc[:,'predicted_covid'] = X_test.loc[:,'p_predicted_covid'] >thresholds[idx]
        
        predicted_covid = X_test.predicted_covid
        p_predicted_covid = X_test.p_predicted_covid
        true_y =  y_test
        cl_types.append(cl_type)
        auc_vals.append(roc_auc_score(true_y, p_predicted_covid))
        sens_val, spec_val = sens_spec(true_y, predicted_covid)
        sens_vals.append(sens_val)
        spec_vals.append(spec_val)
    
    X_test.loc[:,'4cat_predicted_covid'] = list(map(lambda x: mapping(x), X_test.loc[:,'p_predicted_covid']))
    aux_df = pd.concat([X_test.reset_index(drop=True), pd.DataFrame(y_test,columns=['swab_test'])], axis=1)
    res_list.append(aux_df.groupby(['4cat_predicted_covid','swab_test']).size().reset_index().pivot_table(index='4cat_predicted_covid',columns='swab_test',values=0))


aux_df = pd.DataFrame({'cl_type':cl_types, 'Sensitivity': sens_vals, 'Specificity': spec_vals})
results_1_df = aux_df.groupby(['cl_type'])[['Sensitivity','Specificity']].agg(['mean','std']).reset_index()
results_2_df = pd.concat(res_list, axis=0).groupby('4cat_predicted_covid').mean()

In [41]:
results_1_df

Unnamed: 0_level_0,cl_type,Sensitivity,Sensitivity,Specificity,Specificity
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
0,high sensitivity,0.538742,0.042474,0.742903,0.005061
1,high specificity,0.11331,0.021265,0.98062,0.001701
2,optimal,0.225574,0.031402,0.958785,0.002429


In [42]:
results_2_df/results_2_df.sum().sum()*100

swab_test,False,True
4cat_predicted_covid,Unnamed: 1_level_1,Unnamed: 2_level_1
1,72.006586,1.418853
2,20.924002,0.961434
3,2.116242,0.345588
4,1.878601,0.348695


In [43]:
results_2_df/results_2_df.sum()*100

swab_test,False,True
4cat_predicted_covid,Unnamed: 1_level_1,Unnamed: 2_level_1
1,74.290705,46.148017
2,21.587731,31.270523
3,2.183371,11.240212
4,1.938193,11.341248
