In [75]:
import os
import sys
import csv
import tqdm
from itertools import permutations

# from fuzzyset import FuzzySet

# from cfuzzyset import cFuzzySet as FuzzySet

from rank_bm25 import BM25Okapi, BM25L, BM25Plus

In [85]:
# Load all the strings from meddra 23.1 and map them to the preferred term id
fh = open('../data/meddra_pt_llt_map_omop_v23.1.csv')
reader = csv.reader(fh)
header = next(reader)

meddra_strings = dict()

for pt_concept_id, pt_concept_name, pt_meddra_id, llt_concept_id, llt_concept_name, llt_meddra_id in reader:
    
    meddra_strings[pt_concept_name.lower()] =  pt_meddra_id
    meddra_strings[llt_concept_name.lower()] = pt_meddra_id
        

fh.close()
meddra_strings_sorted = sorted(meddra_strings.keys())
meddra_ids_23_1 = set(meddra_strings.values())

tokenized_corpus = [term.split() for term in meddra_strings_sorted]
bm25 = BM25Okapi(tokenized_corpus, k1=1.2, b=0.75, epsilon=1.0)
# bm25 = BM25L(tokenized_corpus, k1=1.2, b=0.75, delta=0.5)
# bm25 = BM25Plus(tokenized_corpus, k1=1.2, b=0.75, delta=1.0)

In [86]:
# Load the training data from the provided XML documents and parsed
# by the ../src/normalization_construct_trainingdata_step2_buildmap.py script.

fh = open('../data/normalization/train_xml_normalization_map_step2.txt')
reader = csv.reader(fh)
header = next(reader)

training_map = dict()

for source_xml, string, meddra_pt_id, is_abbreviation, expanded_term in reader:
    
    if not meddra_pt_id in meddra_ids_23_1:
        continue
    
    query_string = string.lower()
    
    if is_abbreviation == 'True':
        query_string = expanded_term.lower()
    
    training_map[query_string] = meddra_pt_id
    
fh.close()

In [87]:
# check that the expected meddra ids are in our reference set of meddra strings

meddra_ids_map = set(training_map.values())
len(meddra_ids_map), len(meddra_ids_23_1), len(meddra_ids_map-meddra_ids_23_1)

(1336, 13858, 0)

In [89]:
terms_to_match = set()
for term in training_map.keys():
    if term in meddra_strings:
        # exact match, skip
        continue
    
    terms_to_match.add(term)

len(terms_to_match), len(training_map)

(1392, 2648)

In [90]:
exact_matches = len(training_map)-len(terms_to_match)
exact_matches

1256

In [91]:
query = list(terms_to_match)[6]
tokenized_query = query.split()
doc_scores = bm25.get_top_n(tokenized_query, meddra_strings_sorted, n=10)
print(query, training_map[query])
for term in doc_scores:
    print(meddra_strings[term], term)
# doc_scores, query

elevations in fasting serum ldl 10024910
10024900 ldl
10020635 fasting hyperglycaemia
10020635 fasting hyperglycemia
10020993 fasting hypoglycaemia
10020993 fasting hypoglycemia
10005342 arsenic in serum
10024910 increased ldl
10024909 low ldl
10051718 ldl apheresis
10024900 ldl cholesterol


In [92]:
correct = 0
incorrect = 0

for a in tqdm.tqdm(terms_to_match):
    
    tokenized_query = a.split()
    top_term = bm25.get_top_n(tokenized_query, meddra_strings_sorted, n=1)[0]
    pred_meddra_pt_id = meddra_strings[top_term]

    if pred_meddra_pt_id == training_map[a]:
        correct += 1
    else:
        incorrect += 1

correct/(incorrect+correct), (correct+exact_matches)/(incorrect+correct+exact_matches)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1392/1392 [00:56<00:00, 24.51it/s]


(0.3275862068965517, 0.6465256797583081)

In [95]:
# is the correct mapping within the top N?
correct = 0
incorrect = 0

for a in tqdm.tqdm(terms_to_match):
    
    tokenized_query = a.split()
    top_terms = bm25.get_top_n(tokenized_query, meddra_strings_sorted, n=10)
    pred_meddra_pt_ids = [meddra_strings[t] for t in top_terms]

    if training_map[a] in pred_meddra_pt_ids:
        correct += 1
    else:
        incorrect += 1

correct/(incorrect+correct), (correct+exact_matches)/(incorrect+correct+exact_matches)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1392/1392 [00:58<00:00, 23.96it/s]


(0.6494252873563219, 0.8157099697885196)

In [94]:
# what is the performance of the top prediction if we 
# only look at those where the right answer is in the top 10?
correct = 0
incorrect = 0

for a in tqdm.tqdm(terms_to_match):
    
    tokenized_query = a.split()
    top_terms = bm25.get_top_n(tokenized_query, meddra_strings_sorted, n=10)
    pred_meddra_pt_ids = [meddra_strings[t] for t in top_terms]
    if not training_map[a] in pred_meddra_pt_ids:
        continue
    
    pred_meddra_pt_id = meddra_strings[top_terms[0]]

    if pred_meddra_pt_id == training_map[a]:
        correct += 1
    else:
        incorrect += 1

correct/(incorrect+correct), (correct+exact_matches)/(incorrect+correct+exact_matches)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1392/1392 [00:58<00:00, 23.89it/s]


(0.504424778761062, 0.7925925925925926)