In [1227]:
import pandas as pd
import numpy as np
import re
import string
from tqdm import tqdm_notebook
from tqdm import tqdm
import datetime as dt
import pickle
from collections import Counter

import nltk

import pdaactconn as pc
from trialexplorer.mesh_terms import MeSHCatalog
from trialexplorer import AACTStudySet
from trialexplorer import studysimilarity as ssim

import matplotlib.pyplot as plt
%matplotlib inline

tqdm.pandas()

In [3]:
# selecting all interventional studies
conn = pc.AACTConnection(source=pc.AACTConnection.REMOTE)
ss = AACTStudySet.AACTStudySet(conn= conn, tqdm_handler=tqdm_notebook)
ss.add_constraint("study_type = 'Interventional'")
ss.load_studies()

255092 studies loaded!


In [4]:
# loading all dimensional data
ss.add_dimensions('conditions')
ss.refresh_dim_data()

Successfuly added these 1 dimensions: ['conditions']
Failed to add these 0 dimensions: []


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))

Syncing the temp table temp_cur_studies in 511 chunks x 500 records each

Creating index on the temp table
 - Loading dimension conditions
 -- Loading raw data
 -- Sorting index


In [244]:
c = ss.dimensions['conditions']
c.data.head()

Unnamed: 0_level_0,id,name,downcase_name
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NCT00000102,10058348,Congenital Adrenal Hyperplasia,congenital adrenal hyperplasia
NCT00000106,10057277,Rheumatic Diseases,rheumatic diseases
NCT00000108,10057274,Cardiovascular Diseases,cardiovascular diseases
NCT00000108,10057275,Coronary Disease,coronary disease
NCT00000110,10057273,Obesity,obesity


# 1. Extracting Adj and Verbs

In [15]:
all_tag_counts = {}

for cur_cond in tqdm(c.data['downcase_name'].values):
    cur_tokens = nltk.word_tokenize(cur_cond)
    pos_tags = nltk.pos_tag(cur_tokens)
    for word, tag in pos_tags:
        if tag not in all_tag_counts:
            all_tag_counts[tag] = Counter()
        all_tag_counts[tag][word] += 1
        

100%|██████████| 420244/420244 [01:44<00:00, 4023.31it/s]


In [16]:
all_tag_counts.keys()

dict_keys(['JJ', 'NN', 'NNS', ',', 'VBN', 'IN', 'VBP', 'RB', 'VBG', 'CC', 'POS', 'VB', 'DT', '(', ')', 'CD', 'VBZ', 'VBD', 'PRP$', 'FW', ':', 'TO', 'RP', 'SYM', 'UH', 'JJS', 'JJR', 'MD', 'NNP', "''", '.', '``', 'EX', 'WDT', '$', '#', 'WP', 'RBR', 'PRP', 'RBS', 'WRB', 'PDT', 'LS'])

In [19]:
for k, v in all_tag_counts.items():
    print(k)
    print(v.most_common(10))
    print('')

JJ
[('healthy', 10077), ('chronic', 9485), ('acute', 5392), ('coronary', 3841), ('pulmonary', 3835), ('metastatic', 3702), ('multiple', 3636), ('solid', 3468), ('renal', 3340), ('ovarian', 3291)]

NN
[('cancer', 39826), ('disease', 18944), ('stage', 15822), ('cell', 13729), ('lymphoma', 12040), ('carcinoma', 11373), ('breast', 10292), ('pain', 9798), ('leukemia', 9651), ('disorder', 9380)]

NNS
[('diabetes', 10498), ('diseases', 5203), ('disorders', 4874), ('infections', 4816), ('neoplasms', 4726), ('mellitus', 2812), ('tumors', 2599), ('volunteers', 1649), ('syndromes', 1332), ('patients', 1242)]

,
[(',', 28260)]

VBN
[('associated', 368), ('relapsed', 336), ('generalized', 271), ('related', 261), ('acquired', 247), ('transmitted', 208), ('treated', 201), ('delayed', 136), ('localized', 121), ('differentiated', 110)]

IN
[('of', 11445), ('with', 3751), ('in', 2915), ('for', 658), ('on', 460), ('after', 352), ('by', 312), ('without', 293), ('diffuse', 253), ('during', 240)]

VBP
[('me

In [38]:
jj_count = 0
vb_count = 0
for k, v in all_tag_counts.items():
    if k[:2] == 'JJ':
        jj_count += len(v)
    if k[:2] == 'VB':
        vb_count += len(v)
        
print('Unique adjective count : %s' % jj_count)
print('Unique verb count : %s' % vb_count)

Unique adjective count : 6932
Unique verb count : 2356


In [39]:
def extract_adj_and_vb(condition_str):
    cur_cond = condition_str.lower()
    cur_tokens = nltk.word_tokenize(cur_cond)
    pos_tags = nltk.pos_tag(cur_tokens)
    
    all_jj, all_vb = [], []
    for word, tag in pos_tags:
        if tag[:2] == 'JJ':
            if word not in all_jj:
                all_jj.append(word)
        if tag[:2] == 'VB':
            if word not in all_vb:
                all_vb.append(word)
    return all_jj, all_vb

In [44]:
extract_adj_and_vb('Stage II Contiguous Adult Diffuse Mixed Cell Lymphoma')

(['contiguous', 'mixed'], [])

# Extract type, grade, stage, AJCC

In [887]:
all_keywords = ['type', 'genotyp_', 'grade', 'stage', 'ajcc', 'hepatitis']

word_nums = [
    'zero', 'one', 'two', 'three', 'four',
    'five', 'six', 'seven', 'eight', 'nine',
    'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
    'fifteen', 'sixteen', 'seventeen', 'eighteen', 'ninteen',
    'twenty'
]

word2num = dict(zip(word_nums, range(0, 21)))


### first just capture what comes after the space, see what our universe of data looks like

In [888]:
# distinct counters and dicts for each kw
dict_list = []
counter_list = []
for i in range(len(all_keywords)):
    dict_list.append({})
    counter_list.append(Counter())
    
raw_counters = dict(zip(all_keywords, counter_list))
orig_cond_dict = dict(zip(all_keywords, dict_list))

for orig_cond in tqdm(c.data['downcase_name'].values):
    for kw in all_keywords:
        kw_splits = orig_cond.split(kw)
        i = 0
        for kw_split in kw_splits:
            if i > 0:  # ignore first split
                el_list = kw_split.strip().split(' ')
                if len(el_list) > 0:
                    el1 = el_list[0]
                    raw_counters[kw][el1] += 1
                                    
                    if el1 in orig_cond_dict[kw]:
                        orig_cond_dict[kw][el1].append(orig_cond)
                    else:
                        orig_cond_dict[kw][el1] = [orig_cond]
            i += 1

100%|██████████| 420244/420244 [00:00<00:00, 472825.00it/s]


In [889]:
raw_counters['stage'].most_common(20)

[('iv', 3548),
 ('iii', 2599),
 ('ii', 1602),
 ('iiib', 993),
 ('i', 986),
 ('iiia', 898),
 ('iva', 740),
 ('iiic', 699),
 ('ivb', 685),
 ('renal', 655),
 ('iib', 491),
 ('iia', 458),
 ('ib', 301),
 ('ivc', 267),
 ('ia', 250),
 ('iic', 109),
 ('0', 91),
 ('small', 76),
 ('', 72),
 ('breast', 58)]

In [890]:
orig_cond_dict['stage']['s']

['subjects with resectable local or locally advanced, non-metastatic (t2-t4, n0-n3, m0; stages ii and iii) and histologically-confirmed intestinal gc',
 'cancer of the gi system-stages ii iii and iv',
 'stages ii-iii breast cancer',
 'chronic kidney disease stages 3-5',
 'locoregional metastases in malignant melanoma stages iiib/c',
 'stages iii skin melanoma',
 'stages ii skin melanoma',
 'breast cancer stages i through iii',
 'figo stages ii to iv']

### Preprocessing:

In [1023]:
######################################################
# stage 1 preprocessing that occurs on the raw string:
######################################################

def convert_word2num(cur_text):
    """ finds all of the english numbers in a string and replace with numerical numbers """
    rex = '\s' + "|".join(word_nums) + '\s'
    found_list = re.findall(rex, cur_text)
    if len(found_list) > 0:
        for found_text in found_list:
            sub_to = word2num[found_text.strip()]
            cur_text = re.sub(rex, str(sub_to), cur_text)
    return cur_text

def convert_roman_to_str(cur_text):
    """ remove roman numeral charsets """
    # only first 7
    rn_map = {
        chr(8544): 'i',
        chr(8545): 'ii',
        chr(8546): 'iii',
        chr(8547): 'iv',
        chr(8548): 'v',
        chr(8549): 'vi',
        chr(8550): 'vii',
        chr(8560): 'i',
        chr(8561): 'ii',
        chr(8562): 'iii',
        chr(8563): 'iv',
        chr(8564): 'v',
        chr(8565): 'vi',
        chr(8566): 'vii',
    }
    
    for k, v in rn_map.items():
        cur_text = cur_text.replace(k, v)
    return cur_text


def preprocess_cond_str(cur_cond):
    """ all preprocessing routines """
    # word2num
    cur_cond = convert_word2num(cur_cond)
    
    # "through" delim
    cur_cond = cur_cond.replace(' through ', '-')
    
    # "and" delim
    cur_cond = cur_cond.replace(' and ', ',')
    
    # plural keywords
    pl_kw = {
        'stages': 'stage',
        'grades': 'grade',
        'types': 'type',
        'genotypes': 'genotyp_',
        'genotype': 'genotyp_'
    }
    for k, v in pl_kw.items():
        cur_cond = cur_cond.replace(k, v)
    
    # connector delims
    range_delims = {
        " - ": "-",
        " -": "-",
        "- ": "-",
        " / ": "/",
        " /": "/",
        "/ ": "/",
    }
    for k, v in range_delims.items():
        cur_cond = cur_cond.replace(k, v)
        
    # inherient roman numerals in symbol
    cur_cond = convert_roman_to_str(cur_cond)
    
    # ajcc abbrev.
    ajcc_re = r'\s?\(american joint committee on cancer\s?\)\s?' 
    cur_cond = re.sub(ajcc_re, '', cur_cond)
    return cur_cond

#########################################################
# stage 2 preprocessing that occurs at the kw_split level
#########################################################

def preprocess_kw_split(kw_split):
    """ preprocessing for the kw split level """
    
    # replace leading dashes
    kw_split = re.sub(r'^\s?-', '', kw_split)
    
    # strip spaces
    kw_split = kw_split.strip()
    
    return kw_split


#########################################################
# stage 3 preprocessing that occurs at the element level
#########################################################

def preprocess_element(element):
    """ proprocessing for the element level """
    # replace roman numerals with numbers
    # possible extension, extend to full number system
    roman_map = {
        'i': 1,
        'ii': 2,
        'iii': 3,
        'iv': 4,
        'v': 5,
        'vi': 6,
        'vii': 7,
        'viii': 8,
        'ix': 9,
        'x': 10,
        'xi': 11,
        'xii': 12,
        'xiii': 13,
        'xiv': 14,
        'xv': 15,
        'xvi': 16,
        'xvii': 17,
        'xviii': 18,
        'xix': 19,
        'xx': 20,
    }
    
    src_roman = '^i[vx]|^i{1,3}|^vi{1,3}|^xi[vx]|^xi{1,3}|^xvi{0,3}|^[xv]'
    find_list = re.findall(src_roman, element)
    if len(find_list) > 0:
        found_str = find_list[0]
        to_rep = roman_map[found_str]
        element = re.sub(src_roman, str(to_rep), element)
    
    # replace brackets
    element = element.replace(')', '')
    element = element.replace('(', '')
    
    # replace 'diabetes' with ''
    element = element.replace('diabetes', '')
    
    return element

### extractor regex:

In [1142]:
# some kws use 1 regex pattern, ajcc uses another
src1_delimiter = '[\s*\,\-\;\)\(]'

src1 = r'^\d[\w\(\)]*' + src1_delimiter + '*|' + \
        '^i{2,3}[\w\(\)]*' + src1_delimiter + '*|' + \
        '^vi{2,3}[\w\(\)]*' + src1_delimiter + '*|' + \
        '^iv?[a-eg-mo-ru-z]?\d+[\w\(\)]*' + src1_delimiter + '+|' + \
        '^iv?[a-eg-mo-ru-z]?\d+[\w\(\)]*$|' + \
        '^iv?[a-eg-mo-ru-z]?\d*' + src1_delimiter + '+|' + \
        '^iv?[a-eg-mo-ru-z]?\d*$|' + \
        '^v\d+[\w\(\)]*' + src1_delimiter + '+|' + \
        '^v\d+[\w\(\)]*$|' + \
        '^v\d*' + src1_delimiter + '+|' + \
        '^v\d*$|' + \
        '^vi[a-z]\d+[\w\(\)]*' + src1_delimiter + '+|' + \
        '^vi[a-z]\d+[\w\(\)]*$|' + \
        '^vi[a-z]\d*' + src1_delimiter + '+|' + \
        '^vi[a-z]\d*$|' + \
        '^[A-Fa-f]\d+[\w\(\)]*' + src1_delimiter + '+|' + \
        '^[A-Fa-f]\d+[\w\(\)]*$|' + \
        '^[A-Fa-f]\d*' + src1_delimiter + '+|' + \
        '^[A-Fa-f]\d*$|' + \
        '^her\d\+?'

#'^i[^A-za-z ][\w\(\)]*[\s*\,\-]*|' + \
#'^vi*[^A-za-z ][\w\(\)]*[\s*\,\-]*|' + \

src2 = r'^v\d[\w\(\)]*[\s*\,\-]*'  # ajcc

src3 = r'^[abcde]'
regex_dict = {
    'grade': src1,
    'type': src1,
    'genotyp_': src1,
    'stage': src1,
    'ajcc': src2,
    'hepatitis': src3,
}

# pattern for delimiter
src_delim = r'[\s*\,\-]+$'

In [1161]:
cur_text = 'melanoma stage iiib-ivm1a'
preped_cond = preprocess_cond_str(cur_text)
print(preped_cond)
extract_elements_delims(preped_cond, src1, 'stage', do_print=True)

melanoma stage iiib-ivm1a
['melanoma ', ' iiib-ivm1a']
iiib-ivm1a


(['3b', '4m1a'], ['-'])

## Extract function

In [1219]:
def extract_elements_delims(cur_text, regex_str, kw, do_element_pp=True, do_print=False):
    elements = []
    delimiters = []
    
    kw_splits = cur_text.split(kw)
    if do_print:
        print(kw_splits)
        
    if len(kw_splits) == 1:
        return elements, delimiters  # kw not found
    
    for i, raw_kw_split in enumerate(kw_splits):
        if i == 0:  # if the first split has no space, then the kw was part of another word
            if len(raw_kw_split) != 0 and raw_kw_split[-1] in string.ascii_letters:
                if do_print: print('not a real split')
                break  # not a real split
                
        if i > 0:  # ignore first split
            cur_kw_split = preprocess_kw_split(raw_kw_split)
            if do_print: print(cur_kw_split)
            do_loop = True
            while do_loop:
                # find the element and delimiter
                cur_ele_list = re.findall(regex_str, cur_kw_split)
                if len(cur_ele_list) > 0:
                    remaining = cur_ele_list[0]
                    # if we have an element then look for the delimiter
                    delim = re.findall(src_delim, remaining)
                    if len(delim) > 0:
                        if delim[0].strip() == '-':
                            delim_char = '-'
                        else:
                            delim_char = ','
                        delimiters.append(delim_char)
                        # remove delimiter
                        remaining = re.sub(src_delim, '', remaining)
                    else:
                        do_loop = False

                    # append element
                    if do_element_pp:
                        elements.append(preprocess_element(remaining).strip())
                    else:
                        elements.append(remaining.strip())
                    # remove the element
                    cur_kw_split = re.sub(regex_str, '', cur_kw_split)
                else:
                    do_loop = False
    return elements, delimiters


def digstr2parts(digstr):
    """ returns a tuple of the digit and the string parts, if no num part, returns Null for that part """
    re_pattern = '^[0-9]+'
    f_list = re.findall(re_pattern, digstr)
    if len(f_list) == 0:
        return None, digstr  # no number part
    else:
        num_part = int(f_list[0])
        chr_part = re.sub(re_pattern, '', digstr)
        return num_part, chr_part
                

def full_extract(orig_cond):
    """ fully extract elements and delimiters from a condition string """
    preped_cond = preprocess_cond_str(orig_cond)
    extracted_dict = {}
    has_val = False
    
    for kw in all_keywords:
        regex_str = regex_dict[kw]
        do_element_pp = True
        if kw == 'ajcc':
            do_element_pp = False  # no not subsitute romans for ajcc
        
        elements, delimiters = extract_elements_delims(preped_cond, regex_str, kw, do_element_pp=do_element_pp)
        if len(elements) > 0:
            has_val = True
            
        
        # count delimiters:
        dash_delim_count = 0
        for delim in delimiters:
            if delim == '-':
                dash_delim_count += 1
        if dash_delim_count >= 1:
            print(orig_cond)
            print(elements, delimiters)
        
        # post process the elements and delimiters for all elements:
        all_elements = post_process_element_delimiters(elements, delimiters)
            
        extracted_dict[kw] = all_elements
        
    if has_val:
        return extracted_dict
    else:
        return None

In [1220]:
# WIP
def post_process_element_delimiters(elements, delimiters):
    enumeratable_letters = 'abcdef'
    
    if '-' not in delimiters:
        return elements
    else:
        cur_elements = []
        prev_start_num = None
        prev_end_num = None
        prev_start_letter = None
        prev_end_letter = None
        
        for i, delim in enumerate(delimiters):
            # print(i, delim)
            if delim == '-':
                if len(elements) <= i + 1:
                    # print('returning cond')
                    return cur_elements  # return if there is no closing element for the current delimiter
                else:
                    start_num, start_letter = digstr2parts(elements[i])
                    end_num, end_letter = digstr2parts(elements[i+1])
                    #print(start_num, end_num, start_letter, end_letter)
                    
                    ####################
                    # handle cases here
                    ####################
                    
                    # case 1: we have both start and end nums
                    if start_num and end_num:  # both not None
                        first_loop = True
                        for i in range(start_num, end_num):
                            cur_ele = str(i)
                            if first_loop: 
                                cur_ele += start_letter
                            
                            cur_elements.append(cur_ele)
                            first_loop = False
                            
                        # on the end_num enumerate the letters
                        if end_letter != '' and end_letter in enumeratable_letters:
                            for j in range(ord('a'), ord(end_letter) + 1):
                                append_chr = chr(j)
                                cur_ele = str(end_num) + append_chr
                                cur_elements.append(cur_ele)
                        else:
                            cur_elements.append(str(end_num))
                    
                    # case 2: have only letters - enumerate all the letters
                    elif start_num is None and end_num is None:
                        if start_letter != '' and end_letter != '' and \
                        start_letter in enumeratable_letters and \
                        end_letter in enumeratable_letters:
                            for j in range(ord(start_letter), ord(end_letter) + 1):
                                cur_ele = chr(j)
                                cur_elements.append(cur_ele)
                    
                    # case 3: we have start num but no end nums - enumerate only the start num letters
                    elif start_num and end_num is None:
                        if start_letter != '' and end_letter != '' and \
                        start_letter in enumeratable_letters and \
                        end_letter in enumeratable_letters:
                            for j in range(ord(start_letter), ord(end_letter) + 1):
                                append_chr = chr(j)
                                cur_ele = str(start_num) + append_chr
                                cur_elements.append(cur_ele)
                                
                    # case 4: no start num, but end num and prev num - enumerate all numbers except first
                    # and no end letter
                    elif start_num is None and end_num and prev_start_num and end_letter == '':
                        for i in range(prev_start_num + 1, end_num + 1):
                            cur_ele = str(i)
                            cur_elements.append(cur_ele)
                            
                    # case 5: no start num, but end num and prev num - enumerate all numbers except first
                    # and has end letter
                    elif start_num is None and end_num and prev_start_num and end_letter != '':
                        for i in range(prev_start_num + 1, end_num):
                            cur_ele = str(i)
                            cur_elements.append(cur_ele)
                            
                        if end_letter in enumeratable_letters:
                            for j in range(ord('a'), ord(end_letter) + 1):
                                append_chr = chr(j)
                                cur_ele = str(end_num) + append_chr
                                cur_elements.append(cur_ele)
                        else:
                            cur_ele = str(end_num) + end_letter
                            cur_elements.append(cur_ele)
                        
      
                    # storing prev loop values
                    prev_start_num = start_num
                    prev_end_num = end_num
                    prev_start_letter = start_letter
                    prev_end_letter = end_letter
                        
                            
        return cur_elements
                
#post_process_element_delimiters(['b'], ['-'])
post_process_element_delimiters(['3a', 'c', '4'], ['-', '-', ','])

['3a', '3b', '3c', '4']

# testing full extract:

In [1221]:
all_extracted = {}
for orig_cond in tqdm(c.data['downcase_name'].unique()):
    extracted_dict = full_extract(orig_cond)
    if extracted_dict:
        all_extracted[orig_cond] = extracted_dict

 32%|███▏      | 20785/64114 [00:00<00:00, 69076.26it/s]

early stage breast cancer (stage 1-3)
['1', '3'] ['-']
low grade b-cell lymphoma, not otherwise specified
['b'] ['-']
type i-allergy
['1'] ['-']
breast cancer early stage breast cancer (stage 1-3)
['1', '3'] ['-']
stages ii-iii breast cancer
['2', '3'] ['-', ',']
patients with a diagnosis of who grade i-iii meningioma and hemangiopericytoma with adequate bone marrow function
['1', '3'] ['-', ',']
we investigated the relationship between plasma fgf23 levels and endothelial dysfunction in a sizable series of incident stage 3-4 ckd patients.
['3', '4'] ['-', ',']
early stage breast cancer (stage 0-iii)
['0', '3'] ['-']
stage i-iii small cell lung cancer
['1', '3'] ['-', ',']
stage i-iii non-small cell lung cancer
['1', '3'] ['-', ',']
patients with stage 2-4 chronic kidney disease
['2', '4'] ['-', ',']
indolent or intermediate grade b-cell malignancy
['b'] ['-']
chronic kidney disease stages 3-5
['3', '5'] ['-']
erbb2-positive stage i-iii breast cancer
['1', '3'] ['-', ',']
stage ii-iii
[

 64%|██████▍   | 41035/64114 [00:00<00:00, 67954.20it/s]

stage i-ivb colon cancer
['1', '4b'] ['-', ',']
stage i-ivb rectal cancer
['1', '4b'] ['-', ',']
high grade b-cell lymphoma with myc and bcl2 or bcl6 rearrangements
['b'] ['-']
type-2-diabetes mellitus
['2'] ['-']
bone pain in stage i - iii breast cancer
['1', '3'] ['-', ',']
inoperable esophageal cancer stage i-iii
['1', '3'] ['-']
high-grade b-cell lymphoma (hgbl)
['b'] ['-']
locally advanced or metastatic non small cell lung cancer stage iiib - iv
['3b', '4'] ['-']
stage iii-iv or recurrent endometrial cancer
['3', '4'] ['-', ',']
post-menopausal er+ stage i-iiia primary operable breast cancer
['1', '3a'] ['-', ',']
newly diagnosed operative invasive ductal breast carcinoma stage i-ii
['1', '2'] ['-']
type 2-diabetes
['2'] ['-']
stage ia-iia breast cancer
['1a', '2a'] ['-', ',']
figo stage iii-iv
['3', '4'] ['-']
high grade b-cell lymphoma
['b'] ['-']
refractory high grade b-cell lymphoma with myc and bcl2 or bcl6 rearrangements
['b'] ['-']
recurrent high grade b-cell lymphoma with 

 86%|████████▌ | 54846/64114 [00:00<00:00, 68471.48it/s]

stage ib-iiia non-small cell lung carcinoma
['1b', '3a'] ['-', ',']
high-grade b-cell lymphoma with myc, bcl2, and bcl6 rearrangements
['b'] ['-']
gaucher disease type 1-gaucher disease type 3
['1', '3'] ['-']
stage 0-iii breast cancer
['0', '3'] ['-', ',']
stage iiib-c melanoma
['3b', 'c'] ['-', ',']
glycogen storage disease type ii-pompe's disease
['2'] ['-']
small intestinal high grade b-cell lymphoma, not otherwise specified
['b'] ['-']
high grade b-cell lymphoma, not otherwise specified
['b'] ['-']
unresectable stage iiib-iv malignant melanoma
['3b', '4'] ['-', ',']
high-grade b-cell lymphoma
['b'] ['-']
refractory high grade b-cell lymphoma
['b'] ['-']
recurrent high grade b-cell lymphoma
['b'] ['-']
ckd stage 1-4
['1', '4'] ['-']
stage i-ii head and neck cancer
['1', '2'] ['-', ',']
advanced (stage iiib-c-iv) ovarian, primary peritoneal and fallopian tube cancer
['3b', 'c', '4'] ['-', '-', ',']
prevention of hpv types 16- and 18-related cervical cancer, cervical intraepithelial 

100%|██████████| 64114/64114 [00:00<00:00, 68283.41it/s]

math teachers (grades 2-8)
['2', '8'] ['-']
math students (grades 2-8)
['2', '8'] ['-']
stage ii-ivb operable hnscc oral cavity
['2', '4b'] ['-', ',']
high grade b-cell lymphoma with myc and bcl2 and/or bcl6 rearrangements
['b'] ['-']
stage ia - ib non small cell lung cancer
['1a', '1b'] ['-', ',']
early stage non-small-cell lung cancer (stage 1-2)
['1', '2'] ['-']
melanoma stage iiib-ivm1a
['3b', '4m1a'] ['-']
stage ib-iib cutaneous t-cell lymphoma
['1b', '2b'] ['-', ',']





In [1226]:
all_extracted['erbb2-positive stage i-iii breast cancer']

{'type': [],
 'genotyp_': [],
 'grade': [],
 'stage': ['1', '2', '3'],
 'ajcc': [],
 'hepatitis': []}

## All texts

In [1145]:
all_keywords

['type', 'genotyp_', 'grade', 'stage', 'ajcc', 'hepatitis']

In [1146]:
# distinct counters and dicts for each kw
dict_list = []
counter_list = []
skipped_list = []
for i in range(len(all_keywords)):
    dict_list.append({})
    skipped_list.append([])
    counter_list.append(Counter())
    
raw_counters = dict(zip(all_keywords, counter_list))
orig_kw_dict = dict(zip(all_keywords, dict_list))
skipped_dict = dict(zip(all_keywords, skipped_list))

#for orig_cond in tqdm(['diphtheria; haemophilus influenzae type b; hepatitis b; tetanus; whole cell pertussis']):
for orig_cond in tqdm(c.data['downcase_name'].values):
    preped_cond = preprocess_cond_str(orig_cond)
    for kw in all_keywords:
        regex_str = regex_dict[kw]
        do_element_pp = True
        if kw == 'ajcc':
            do_element_pp = False  # no not subsitute romans for ajcc
        
        elements, delimiters = extract_elements_delims(preped_cond, regex_str, kw, do_element_pp=do_element_pp)
        
        if len(elements) == 0 and kw in preped_cond:
            if preped_cond not in skipped_dict[kw]:
                skipped_dict[kw].append(preped_cond)
        
        for element in elements:
            raw_counters[kw][element] += 1
            
            if element in orig_kw_dict[kw]:
                orig_kw_dict[kw][element].append(orig_cond)
            else:
                orig_kw_dict[kw][element] = [orig_cond]
            

100%|██████████| 420244/420244 [00:05<00:00, 72694.10it/s]


In [1147]:
skipped_dict['stage']

['end-stage renal disease',
 'small cell lung cancer ex10sive stage',
 'end stage renal disease',
 'ex10sive stage small cell lung cancer',
 'limited stage small cell lung cancer',
 'renal disease, end-stage',
 'end-stage kidney disease',
 'end stage liver disease',
 'early-stage breast cancer',
 'third stage of labour',
 'early stage esophageal adenocarcinoma',
 'end-stage renal failure',
 'stage, intraocular melanoma',
 'ex10sive stage unresectable',
 "early stage parkinson's disease",
 'renal failure, end stage',
 'end-stage renal disease patients on hemodialysis',
 'early stage breast cancer',
 "advanced stage diffuse large b-cell non-hodgkin's lymphoma",
 'early stage parkinson disease',
 'end-stage chronic renal disease',
 'end stage renal disease (esrd)',
 "advanced stage parkinson's disease",
 'labor stage, first',
 'stage t3-4nxm0 gastric cancer',
 'ex10sive stage lung small cell carcinoma',
 'end-stage heart failure',
 'endstage renal disease',
 'second stage labor',
 'end st

In [1148]:
raw_counters['stage'].most_common(20)

[('4', 3613),
 ('3', 2673),
 ('2', 1635),
 ('1', 1017),
 ('3b', 1012),
 ('3a', 905),
 ('4a', 741),
 ('3c', 702),
 ('4b', 689),
 ('2b', 494),
 ('2a', 461),
 ('1b', 305),
 ('4c', 267),
 ('1a', 252),
 ('2c', 110),
 ('0', 94),
 ('5', 33),
 ('1c', 29),
 ('b', 23),
 ('c', 11)]

In [1160]:
orig_kw_dict['stage']['5d']

['chronic-kidney disease stage 5d on stable hemodialysis',
 'chronic-kidney disease stage 5d on stable hemodialysis']

In [1149]:
skipped_dict['type']

['adult nasal type extranodal nk/t-cell lymphoma',
 'stereotyped behavior',
 'adult type ovarian granulosa cell tumor',
 'childhood nasal type extranodal nk/t-cell lymphoma',
 'salivary gland malignant mixed cell type tumor',
 'dementia, alzheimer type',
 'neoplasms by histologic type',
 'schizoaffective disorder, manic type',
 "psychosis,behavioral disturbances associated with dementia of the alzheimer's type",
 'locally advanced undifferentiated carcinoma nasopharyngeal type ucnt',
 'dementia of alzheimer type',
 'pneumonic-type adenocarcinoma (p-adc)',
 'senile dementia, alzheimer type',
 'other subtype of b-cell lymphoma',
 'hypersensitivity, delayed type',
 '10sion-type headache',
 'wildtype',
 'intraabdominal cancers (various type)',
 "dementia of the alzheimer's type",
 'childhood botryoid-type embryonal rhabdomyosarcoma',
 'influenza a virus, h5n1 subtype',
 'high-type anal fistulae',
 'psoriasis-type psoriasis',
 'plaque-type psoriasis',
 '10sion type headache',
 'adult type d

In [1150]:
raw_counters['type'].most_common(20)

[('2', 5425),
 ('1', 2119),
 ('b', 124),
 ('3', 40),
 ('a', 22),
 ('1a', 13),
 ('c', 12),
 ('6', 11),
 ('4', 11),
 ('2b', 9),
 ('16', 8),
 ('2a', 7),
 ('c1', 7),
 ('18', 6),
 ('5', 6),
 ('11', 4),
 ('2;', 4),
 ('7', 3),
 ('b3', 3),
 ('d', 2)]

In [1153]:
skipped_dict['grade']

['low grade astrocytomas',
 'low-grade astrocytoma, nos',
 'low-grade lymphoma',
 'high grade glioma',
 'lymphoma, low-grade',
 'childhood high-grade cerebellar astrocytoma',
 'childhood high-grade cerebral astrocytoma',
 'childhood low-grade cerebral astrocytoma',
 'childhood low-grade cerebellar astrocytoma',
 'intermediate-grade lymphoma',
 'high-grade squamous intraepithelial lesion',
 'low-grade squamous intraepithelial lesion',
 'high-grade salivary gland carcinoma',
 'high-grade salivary gland mucoepidermoid carcinoma',
 'low-grade salivary gland carcinoma',
 'low-grade salivary gland mucoepidermoid carcinoma',
 'low grade lymphoma',
 "low-grade or follicular b-cell non-hodgkin's lymphoma",
 'localised high grade osteosarcoma of the limbs',
 'high-grade lymphoma',
 'pressure ulcer, grade',
 'recurrent high-grade gliomas',
 'progressive low-grade gliomas',
 'high grade gliomas',
 'lymphoma, intermediate-grade',
 'lymphoma, high-grade',
 'high-grade gliomas',
 'myelodysplastic syn

In [1154]:
raw_counters['grade'].most_common(20)

[('3', 537),
 ('2', 447),
 ('1', 418),
 ('b', 43),
 ('4', 36),
 ('3a', 17),
 ('3b', 11),
 ('d', 2),
 ('8', 2),
 ('c', 1)]

In [1155]:
skipped_dict['hepatitis']

['hepatitis, viral, human',
 'nonalcoholic steatohepatitis',
 'hepatitis',
 'hepatitis, chronic active',
 'alcoholic hepatitis',
 'hepatitis, alcoholic',
 'hepatitis, chronic',
 'sequelae of viral hepatitis',
 'non alcoholic steatohepatitis',
 'chronic hepatitis',
 'autoimmune hepatitis',
 'non-alcoholic steatohepatitis (nash)',
 'non-alcoholic steatohepatitis',
 'nash-nonalcoholic steatohepatitis',
 'non-alcoholic steatohepatitis(nash)',
 'acute ebv hepatitis',
 'acute cmv hepatitis',
 'acute hepatitis',
 'liver cirrhosis due to virus c chronic hepatitis',
 'nonalcoholic steatohepatitis (nash)',
 'steatohepatitis',
 'acute on chronic hepatitis',
 'nevirapine induced hepatitis',
 'viral hepatitis',
 'hepatitis, toxic',
 'nash (non-alcoholic steato-hepatitis)',
 'chronic alcoholic hepatitis',
 'infections, human immunodeficiency virus,hepatitis',
 'acute alcoholic hepatitis',
 'chronic hcv-related hepatitis',
 'ischemic hepatitis',
 'hypoxic hepatitis',
 'severe alcoholic hepatitis',
 '

In [1156]:
raw_counters['hepatitis'].most_common(20)

[('c', 1592), ('b', 883), ('a', 63), ('d', 23), ('e', 16)]

In [1157]:
raw_counters['ajcc'].most_common(20)

[('v7', 2178), ('v8', 2050), ('v6', 564)]

In [1158]:
all_keywords

['type', 'genotyp_', 'grade', 'stage', 'ajcc', 'hepatitis']

In [1159]:
for cur_c in [raw_counters[x] for x in all_keywords]:
    print(list(cur_c.keys()))

['2', '1', '6', '11', '16', '18', 'b', '4', '3', '1a', 'b;', '2a', '2b', 'a', '5', 'c', '2dm', 'a2', 'a3', 'c1', '2c', '0', 'd', '7', '2d', 'b3', 'b2', '3b', '2;', '8', '10', '14', '3a', '2e', '9']
['1', '3', '1a', '4', '1b', '5', '2']
['1', '2', '3', '4', '3b', 'b', '3a', 'c', 'd', '8']
['1', '2', '3', '4a', '4b', '1b', '4', '2b', '2c', '2a', '3b', '0', '3c', '3a', '1a', '4s', '4c', '5', 'a', 'b', 'c', '3c1', '3c2', '0a', '0is', 'b1', '1c', '3a1', '3a1i', '3a1ii', '3a2', '1b2', '1a1', '1a2', '1b1', '5d', 'd', '2a1', '2a2', 'd0', 'her2+', '1a3', '5t', '3d', '4a2', '4a1', '4m1a']
['v7', 'v6', 'v8']
['b', 'c', 'd', 'a', 'e']


In [1041]:
orig_kw_dict['type']['2dm']

['hypercholesterolemia with type2dm']

In [None]:
import re 

full_dissim_score = 10.

def list_jaccard_dist(l1, l2):
    """ provided 2 lists, compute the jacard similarity between them """
    full_list = []
    for cur_term in l1:
        full_list.append(cur_term)
    for cur_term in l2:
        if cur_term not in full_list:
            full_list.append(cur_term)

    if len(full_list) == 0:
        return -1.
    intersect_count = 0
    for term in full_list:
        if term in l1 and term in l2:
            intersect_count += 1

    return intersect_count / len(full_list)


def digstr2parts(digstr):
    """ returns a tuple of the digit and the string parts, if no num part, returns Null for that part """
    re_pattern = '^[0-9]+'
    f_list = re.findall(re_pattern, digstr)
    if len(f_list) == 0:
        return None, digstr  # no number part
    else:
        num_part = int(f_list[0])
        chr_part = re.sub(re_pattern, '', digstr)
        return num_part, chr_part
    
    
def calc_stage_diff(s1, s2):
    """ compute distance between 3b and 1 for example """
    enumeratable_letters = 'abcdef'
    d1, c1 = digstr2parts(s1)
    d2, c2 = digstr2parts(s2)
    
    # case 1, both digits are None
    if d1 is None and d2 is None and \
    c1 in enumeratable_letters and \
    c2 in enumeratable_letters:
        return abs(ord(c2) - ord(c1))
    
    # case 2, 1 digit is None
    if d1 is not None and d2 is None:
        return full_dissim_score
    if d1 is None and d2 is not None:
        return full_dissim_score
    
    # case 3,
    if d1 is not None and d2 is not None:
        if c1 == '' and c2 == '':
            return abs(d2 - d1)
        elif c1 != '' and c1 == c2:
            return abs(d2 - d1)
        else:
            return abs(d2 - d1) + .5
        
    return full_dissim_score
        

# function to compute simiarity between two extracted stage dicts
def stage_sim_dist(stage_dict1, stage_dict2):
    """ stage_dict has key type, grade etc 
    if dissimilar, returns 10
    if stage, type, grade, returns number diff
    if both letters, returns char ord diff
    
    for full_match types applies an (1 - jaccard multiplier) to list diffs
    """
    # if 1 of the dict is blank, return full dissimilarity
    if len(stage_dict1) == 0 and len(stage_dict2) == 0:
        return 0.
    elif len(stage_dict1) == 0 or len(stage_dict2) == 0:
        return full_dissim_score
    
    # hepatitis, genotype, ajcc must be full match
    full_sim_required_types = ['hepatitis', 'ajcc', 'genotyp_']
    full_match_scores = []
    for cur_type in full_sim_required_types:
        if len(stage_dict1[cur_type]) > 0 and len(stage_dict2[cur_type]) > 0:
            one_m_jdist = 1 - list_jaccard_dist(stage_dict1[cur_type], stage_dict2[cur_type])
            full_match_scores.append(one_m_jdist * full_dissim_score)
        elif len(stage_dict1[cur_type]) > 0 or len(stage_dict2[cur_type]) > 0:
            full_match_scores.append(full_dissim_score)
    # if any was filled, then it means some full-match dicts were not null
    if len(full_match_scores) > 0:
        max_full_score = max(full_match_scores)
        if max_full_score > 0:  # only returns if the full match was non-zero other wise check part matches
            return max_full_score
    
    # grade, type, stage can be partially matched
    partial_sim_required_types = ['grade', 'type', 'stage']
    l1, l2 = [], []
    for cur_type in partial_sim_required_types:
        for cur_item in stage_dict1[cur_type]:  # append to l1
            if cur_item not in l1 and cur_item != '':
                l1.append(cur_item)
        for cur_item in stage_dict2[cur_type]: # append to l2
            if cur_item not in l2 and cur_item != '':
                l2.append(cur_item)
                
    # the elements that are not matching contribute to the difference (if they match, they contribute 0 score)
    l1_diff, l2_diff = [], []
    l1_diff = [x for x in l1 if x not in l2]
    l2_diff = [x for x in l2 if x not in l1]
    
    # how many were matched?
    num_matched = len(l1) - len(l1_diff)
    num_diff = len(l1_diff) + len(l2_diff)
    
    # compute the average distance between l1 and l2 values
    all_min_dist = []
    for l1_diff_val in l1_diff:
        if len(l2_diff) > 0:  # exit condition
            all_dist = [calc_stage_diff(l1_diff_val, x) for x in l2_diff]
            min_dist = min(all_dist)
            min_idx = all_dist.index(min_dist)
            del l2_diff[min_idx]
            all_min_dist.append(min_dist)
        else:
            all_min_dist.append(full_dissim_score)
    
    # if any left in l2_diff, add them as fully dissim
    for l2_diff_val in l2_diff:
        all_min_dist.append(full_dissim_score)
    
    if len(all_min_dist) == 0:
        return 0.
    
    mean_dist = sum(all_min_dist) / len(all_min_dist)
    return mean_dist * num_diff / (num_matched + num_diff)

In [None]:
stage_dict1 = {
    'grade':[],
    'type':[],
    'stage':[],
    'genotyp_':[],
    'ajcc':['v7'],
    'hepatitis':[]
}

stage_dict2 = {
    'grade':['2'],
    'type':[],
    'stage':[],
    'genotyp_':[],
    'ajcc':['v7'],
    'hepatitis':[]
}

stage_sim_dist(stage_dict1, stage_dict2)

### seen types:
- number
- roman numerals
- english words as numbers
- qualifiers that follow the numbers such as a, b, c
- "-" between valid numbers that signify interval
- / that signify and