In [6]:
import os
import sys
import csv
import tqdm
import nltk
import string

from nltk.stem.porter import PorterStemmer
from cfuzzyset import cFuzzySet as FuzzySet
from rank_bm25 import BM25Okapi, BM25L, BM25Plus

In [4]:
# Module-level Porter stemmer instance; tokenize() below hands it to stem_tokens().
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    """Return a list with the stem of every token, in input order.

    tokens  -- iterable of token strings
    stemmer -- any object exposing a ``stem(token)`` method
    """
    return [stemmer.stem(token) for token in tokens]

def tokenize(text, sort=True):
    """Normalize a string into a space-separated sequence of word stems.

    Every character found in ``string.punctuation`` is removed, the remainder
    is split with ``nltk.word_tokenize`` and each token is stemmed with the
    module-level ``stemmer``.  When ``sort`` is true (the default) the stems
    are sorted, so the word order of the input does not affect the result.

    Returns the stems joined by single spaces.
    """
    # Mapping every punctuation character to None deletes it from the text.
    without_punct = text.translate(str.maketrans("", "", string.punctuation))
    words = nltk.word_tokenize(without_punct)

    stem_list = stem_tokens(words, stemmer)
    return " ".join(sorted(stem_list) if sort else stem_list)

# Sanity check: punctuation is dropped and the stems come back sorted.
tokenize("type 2 diabetes.")

'2 diabet type'

In [5]:
# Load the training data that was extracted from the provided XML documents by
# the ../src/normalization_construct_trainingdata_step2_buildmap.py script.
# Result: training_map, keyed by the tokenized query string, with the
# meddra_pt_id of the same row as value.

training_map = dict()

# The context manager closes the file even if a row fails to unpack or
# tokenize() raises; newline='' is what the csv module asks for so that
# newlines embedded in quoted fields are parsed correctly.
with open('../data/normalization/train_xml_normalization_map_step2.txt', newline='') as fh:
    reader = csv.reader(fh)
    header = next(reader)  # consume the header row

    for source_xml, raw_string, meddra_pt_id, is_abbreviation, expanded_term in reader:

        # Abbreviations are keyed by their expanded form, everything else by
        # the raw string; both are lower-cased before tokenizing.
        if is_abbreviation == 'True':
            query_string = expanded_term.lower()
        else:
            query_string = raw_string.lower()

        # Rows that tokenize to the same key overwrite earlier ones.
        training_map[tokenize(query_string)] = meddra_pt_id

In [7]:
# Load all the strings from meddra 23.1 and map them to the preferred term id.
# Both the PT name and the LLT name of a row are mapped to that row's
# pt_meddra_id, and both normalized strings are added to the fuzzy set.
meddra_strings = dict()
meddra_fuzzyset = FuzzySet()

# The context manager closes the file even if a row fails to unpack or
# tokenize() raises; newline='' is what the csv module asks for so that
# newlines embedded in quoted fields are parsed correctly.
with open('../data/meddra_pt_llt_map_omop_v23.1.csv', newline='') as fh:
    reader = csv.reader(fh)
    header = next(reader)  # consume the header row

    for pt_concept_id, pt_concept_name, pt_meddra_id, llt_concept_id, llt_concept_name, llt_meddra_id in reader:

        # Tokenize each name once and reuse the key for the dict and the fuzzy set.
        pt_key = tokenize(pt_concept_name.lower())
        llt_key = tokenize(llt_concept_name.lower())

        meddra_strings[pt_key] = pt_meddra_id
        meddra_strings[llt_key] = pt_meddra_id

        meddra_fuzzyset.add(pt_key)
        meddra_fuzzyset.add(llt_key)

# BM25 setup
# The corpus is built from the distinct normalized MedDRA strings in sorted
# order: corpus entry i is the whitespace-split form of meddra_strings_sorted[i].
meddra_strings_sorted = sorted(meddra_strings)
tokenized_corpus = list(map(str.split, meddra_strings_sorted))
bm25 = BM25Okapi(
    tokenized_corpus,
    k1=1.2,
    b=0.75,
    epsilon=1.0,
)

# Number of distinct normalized strings (shown as the cell output).
len(meddra_strings)

58590