In [1]:
import string
import pandas as pd
from config import data_dir
import os 
from collections import Counter
import pickle

In [2]:
os.getcwd()

'C:\\Users\\PC-1\\PycharmProjects\\mdtel'

In [3]:
table = str.maketrans('', '', string.punctuation)

In [4]:
communities = ['diabetes', 'sclerosis', 'depression']

### Table 3: Medical term mentions in 100 posts for 3 OHC Communities

In [5]:
manually_labeled_dir = data_dir + 'manual_labeled'

all_res = []

for comm in communities:
    comm_path = manually_labeled_dir + os.sep + comm + ".xlsx"
    comm_df = pd.read_excel(comm_path)
    comm_df = comm_df[~comm_df['manual_tag'].isna()]
    
    comm_df['tokenized_txt_words'] = comm_df['tokenized_txt'].apply(lambda x: x.split(" "))
    comm_df['tokenized_txt_words'].apply(lambda lst: [w.translate(table) for w in lst])
    
    all_words = []

    for words in comm_df['tokenized_txt_words'].values:
        all_words += words
    all_words_num = sum(Counter(all_words).values())
    
    comm_df['manual_tag_lst'] = comm_df['manual_tag'].apply(lambda x: [y.strip() for y in x.split(',')])
    all_med_terms = []
    for lst in comm_df['manual_tag_lst'].values: 
        all_med_terms += lst
    relevant_med_terms = sum(Counter(all_med_terms).values())
    all_res.append({'Words': all_words_num, 'Relevant Medical Terms': relevant_med_terms})
    
test_set_stats = pd.DataFrame(all_res, index=communities, columns=['Words', 'Relevant Medical Terms'])
test_set_stats

Unnamed: 0,Words,Relevant Medical Terms
diabetes,3881,477
sclerosis,8687,611
depression,8106,716


## Table 4: MDTEL UMLS Entity Linking Performance

In [6]:
!python src\contextual_relevance\evaluate_contextual_relevance_model.py $data_dir

Got data_dir: E:\mdtel_data\data\
            f1_score  roc_auc  recall   acc
diabetes        0.89     0.90    0.99  0.85
sclerosis       0.85     0.90    0.88  0.85
depression      0.79     0.89    0.88  0.81


In [7]:
with open(data_dir + r"contextual_relevance\output_models\trained_models.pickle", 'rb') as f:
    trained_models = pickle.load(f)

### Calculating High-recall candidates filtered out

In [8]:
cms = {comm: trained_models[comm]['confusion_matrix'] for comm in communities}

In [9]:
cms

{'diabetes': array([[31, 16],
        [ 1, 66]], dtype=int64), 'sclerosis': array([[95, 21],
        [12, 92]], dtype=int64), 'depression': array([[79, 25],
        [ 8, 61]], dtype=int64)}

The number of high recall candidates is the sum of all elements in the confusion matrix. 
Some are indeed medical terms, and some are not. 

In [10]:
number_of_high_recall_candidates = {comm: sum(sum(trained_models[comm]['confusion_matrix'])) for comm in communities}

The number of high recall candidates filtered our, is the left column - the column of the terms predicted as "Negative" - not a medical terms. 
Here as well, part of this terms are indeed not medical terms (TN), and some are mistakes (FN).

In [11]:
high_recall_candidates_filtered_out_num = {comm: sum(trained_models[comm]['confusion_matrix'][:, 0]) for comm in communities}

The proportion of the filtered our, is the number of high candidates filtered out (predicted as "Negatives") divided by the number of high recall candidates (Sum of the confusion matrix)

In [12]:
high_recall_candidates_filtered_out = {str(round(high_recall_candidates_filtered_out_num[comm]/number_of_high_recall_candidates[comm]*100, 2)) + "%" for comm in communities}

In [13]:
high_recall_candidates_filtered_out_col = pd.DataFrame(high_recall_candidates_filtered_out, columns=['High-recall candidates filtered out'], index=communities)

In [14]:
high_recall_candidates_filtered_out_col

Unnamed: 0,High-recall candidates filtered out
diabetes,48.64%
sclerosis,50.29%
depression,28.07%


### Calculating accuracy full algorithm

In [15]:
number_of_high_recall_candidates_series = pd.Series(number_of_high_recall_candidates)

In [16]:
test_set_stats['Words']

diabetes      3881
sclerosis     8687
depression    8106
Name: Words, dtype: int64

Now calculating additional true negatives.
Those are all of the words there weren't candidate as high recall list. 
They are true negative because they didn't appear in the confusion matrix, therefore they are not misclassified. 

In [17]:
additional_true_negatives = test_set_stats['Words'] - number_of_high_recall_candidates_series
additional_true_negatives

diabetes      3767
sclerosis     8467
depression    7933
dtype: int64

In [18]:
full_alg_true_pos = {comm: cms[comm][0][0] + additional_true_negatives[comm] for comm in communities}
full_alg_true_pos

{'diabetes': 3798, 'sclerosis': 8562, 'depression': 8012}

Now inserting the true number of true negatives into the confusion matrix

In [19]:
for comm in cms:
    cms[comm][0][0] = full_alg_true_pos[comm]

In [20]:
def calculate_acc_full_alg_for_cms_and_comm(cms, comm):
    tn = cms[comm][0][0]
    tp = cms[comm][1][1]
    negatives = sum(cms[comm][0])
    positives = sum(cms[comm][1])
    acc = (tn + tp) / (negatives + positives)
    acc = str(round(acc * 100, 2)) + "%"
    return acc

And calculating 'Accuracy Full algorithm'

In [21]:
accuracy_full_alg = {comm: calculate_acc_full_alg_for_cms_and_comm(cms, comm) for comm in communities}
acc_full_alg_df = pd.DataFrame(accuracy_full_alg.values(), columns=['Accuracy Full Algorithm'], index=communities)
acc_full_alg_df

Unnamed: 0,Accuracy Full Algorithm
diabetes,99.56%
sclerosis,99.62%
depression,99.59%
