In [1]:
import os
import csv

import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

## Exploring the problem

Let's begin by looking a little more closely at the issue. Our first pass at identifying adverse event terms is to look through the labels for exact string matches to the MedDRA adverse event terminology terms. This misses many of the terms that were identified manually through a review of 200 notes (by the amazing TAC team). 

We start by looking at the manual annotations provided by the TAC team. 

NOTE: In the manual annotations, the PT is also listed as an LLT, so we only need to look at the LLT column. 

In [162]:
# Manual annotations provided by the TAC team.
manual_annotations_file = '../data/200_manual_annotations_csv/FinalReferenceStandard200Labels_comma.csv'

manual_df = pd.read_csv(manual_annotations_file)

# Lower-cased drug name, used as the join key against the other tables.
manual_df['drug'] = manual_df['Drug Name'].str.lower()

# Word-order-insensitive key: the whitespace-separated words of the matching
# string, sorted alphabetically and re-joined. Non-string values (e.g. NaN)
# go through str() first, so a missing value becomes the literal 'nan'.
matching_string_sorted = [
    ' '.join(sorted(str(raw_string).split()))
    for raw_string in manual_df['Matching String']
]
manual_df['matching_string_sorted'] = matching_string_sorted

print(manual_df.shape)
manual_df.head()

(14487, 16)


Unnamed: 0,Index,Drug ID,Drug Name,Section LOINC,Section Display Name,MedDRA PT,PT ID,MedDRA LLT,LLT ID,Matching String,UMLS CUI,UMLS PrefName,Flag 1,Flag 2,drug,matching_string_sorted
0,1,00a8921e-46a6-4df1-a744-9e532b6fb06f,NUCYNTA,34084-4,ADVERSE REACTIONS,Abdominal discomfort,10000059.0,Abdominal discomfort,10000059.0,abdominal discomfort,C0232487,Abdominal discomfort,,,nucynta,abdominal discomfort
1,2,00a8921e-46a6-4df1-a744-9e532b6fb06f,NUCYNTA,34084-4,ADVERSE REACTIONS,Abnormal dreams,10000125.0,Abnormal dreams,10000125.0,abnormal dreams,C0234458,Dream disorder,,,nucynta,abnormal dreams
2,3,00a8921e-46a6-4df1-a744-9e532b6fb06f,NUCYNTA,34084-4,ADVERSE REACTIONS,Agitation,10001497.0,Agitation,10001497.0,agitation,C0085631,Agitation,,,nucynta,agitation
3,4,00a8921e-46a6-4df1-a744-9e532b6fb06f,NUCYNTA,34084-4,ADVERSE REACTIONS,Alanine aminotransferase increased,10001551.0,Alanine aminotransferase increased,10001551.0,alanine aminotransferase increased,C0151905,Alanine aminotransferase increased,,,nucynta,alanine aminotransferase increased
4,5,00a8921e-46a6-4df1-a744-9e532b6fb06f,NUCYNTA,34084-4,ADVERSE REACTIONS,Alcohol interaction,10001597.0,Interaction with alcohol,10022527.0,interaction with alcohol,C0853206,Alcohol interaction,,,nucynta,alcohol interaction with


In [163]:
# How many annotated strings are (case-insensitively) identical to the
# MedDRA LLT they were mapped to?
total_n = len(manual_df)
is_exact_match = manual_df['Matching String'].str.lower().eq(manual_df['MedDRA LLT'].str.lower())
exact_n = int(is_exact_match.sum())

exact_n, total_n, exact_n/total_n

(10188, 14487, 0.7032511907227169)

In [164]:
# Rows whose annotated string differs from the mapped LLT (missing values count as different).
manual_df.loc[manual_df['Matching String'].str.lower().ne(manual_df['MedDRA LLT'].str.lower())]

Unnamed: 0,Index,Drug ID,Drug Name,Section LOINC,Section Display Name,MedDRA PT,PT ID,MedDRA LLT,LLT ID,Matching String,UMLS CUI,UMLS PrefName,Flag 1,Flag 2,drug,matching_string_sorted
12,13,00a8921e-46a6-4df1-a744-9e532b6fb06f,NUCYNTA,34084-4,ADVERSE REACTIONS,Chronic respiratory disease,10061768.0,Chronic respiratory disease,10061768.0,chronic pulmonary disease,C0264220,Chronic disease of respiratory system,,,nucynta,chronic disease pulmonary
25,26,00a8921e-46a6-4df1-a744-9e532b6fb06f,NUCYNTA,34084-4,ADVERSE REACTIONS,Drug dependence,10013663.0,Dependence addictive,10012336.0,addiction,C0085281,Addictive Behavior,,,nucynta,addiction
37,38,00a8921e-46a6-4df1-a744-9e532b6fb06f,NUCYNTA,34084-4,ADVERSE REACTIONS,Gastrointestinal disorder,10017944.0,Gastrointestinal symptom NOS,10018011.0,gastrointestinal effects,C0426576,Gastrointestinal symptom,,,nucynta,effects gastrointestinal
46,47,00a8921e-46a6-4df1-a744-9e532b6fb06f,NUCYNTA,34084-4,ADVERSE REACTIONS,Hypotension,10021097.0,Hypotensive,10021107.0,hypotensive effects,C0857353,Hypotensive,,,nucynta,effects hypotensive
52,53,00a8921e-46a6-4df1-a744-9e532b6fb06f,NUCYNTA,34084-4,ADVERSE REACTIONS,Muscle contractions involuntary,10028293.0,Muscle contractions involuntary,10028293.0,involuntary muscle contractions,C0235086,Involuntary muscle contraction,,,nucynta,contractions involuntary muscle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14474,14475,fff5d805-4ffd-4e8e-8e63-6f129697563e,ZYKADIA,43685-7,WARNINGS AND PRECAUTIONS,Interstitial lung disease,10022611.0,Interstitial lung disease,10022611.0,ild,C0206062,Lung Diseases,,,zykadia,ild
14476,14477,fff5d805-4ffd-4e8e-8e63-6f129697563e,ZYKADIA,43685-7,WARNINGS AND PRECAUTIONS,Lipase increased,10024574.0,Lipase increased,10024574.0,elevations of lipase,C1963823,Hyperlipasaemia,,,zykadia,elevations lipase of
14477,14478,fff5d805-4ffd-4e8e-8e63-6f129697563e,ZYKADIA,43685-7,WARNINGS AND PRECAUTIONS,Maternal drugs affecting foetus,10026923.0,Drug toxicity NEC affecting foetus,10013750.0,embryofetal toxicity,C0270009,Fetal or neonatal effect of toxic substance tr...,,,zykadia,embryofetal toxicity
14484,14485,fff5d805-4ffd-4e8e-8e63-6f129697563e,ZYKADIA,43685-7,WARNINGS AND PRECAUTIONS,Transaminases increased,10054889.0,Transaminases increased,10054889.0,elevated transaminases,C0438717,Transaminases increased,,,zykadia,elevated transaminases


Let's compare that to what we get if we use the MedDRA vocabulary itself and look for the terms again. This should produce about the same result as the quick calculation above. 

In [11]:
# Read the MedDRA PT <-> LLT mapping table (the file name indicates v23.1).
meddra_23p1_file = '../data/meddra_pt_llt_map_omop_v23.1.csv'
meddra_df = pd.read_csv(filepath_or_buffer=meddra_23p1_file)
meddra_df.head(n=5)

Unnamed: 0,pt_concept_id,pt_concept_name,pt_meddra_id,llt_concept_id,llt_concept_name,llt_meddra_id
0,788094,Defiant behaviour,10077244,788741,Defiant behavior,10077245
1,788095,Proctectomy,10077252,37585820,Total mesorectal excision,10069130
2,788095,Proctectomy,10077252,43010901,Lower anterior resection,10083500
3,788095,Proctectomy,10077252,37585816,Abdomino-perineal resection of rectum,10000108
4,788095,Proctectomy,10077252,45887267,Abdominoperineal resection,10075500


In [117]:
# Lookup from any MedDRA id (LLT or PT) to its PT id.
# Each PT id is also registered as mapping to itself, so PT ids can be
# looked up directly. Writes happen row by row in file order.
llt2pt = {}
for llt_code, pt_code in zip(meddra_df['llt_meddra_id'], meddra_df['pt_meddra_id']):
    llt2pt[llt_code] = pt_code
    llt2pt[pt_code] = pt_code

In [36]:
# Every distinct lower-cased MedDRA term name, LLT and PT names pooled together.
llt_names_lower = meddra_df['llt_concept_name'].str.lower()
pt_names_lower = meddra_df['pt_concept_name'].str.lower()
meddra_terms = set(llt_names_lower).union(pt_names_lower)
len(meddra_terms)

62571

In [39]:
# Count the manual annotations whose lower-cased matching string is itself a
# MedDRA term. Non-string entries (e.g. missing values) are skipped.
meddra_exact_n = sum(
    1
    for matching_string in manual_df['Matching String']
    if type(matching_string) is str and matching_string.lower() in meddra_terms
)

meddra_exact_n, total_n, meddra_exact_n/total_n

(10051, 14487, 0.6937944363912473)

## Non-exact Matches Review

Cindy Chen did excellent work to review many of the ones that did not match and figure out the reason. Terms in different order and wording differences were found to be the largest two categories.

In [134]:
# Reviewed list of terms that were not matched, with notes on why.
unscored_file = '../data/unscored_terms_with_notes.csv'
unscored_df = pd.read_csv(unscored_file)
unscored_df['drug'] = unscored_df['drug'].str.lower()

# Resolve each LLT id to its PT id; ids absent from llt2pt become -1.
unscored_df['pt_id'] = [
    int(llt2pt.get(int(llt_code), -1))
    for llt_code in unscored_df['llt_id'].astype(str)
]

# (drug, pt_id) tuple per row, used later for membership tests against annotation sets.
unscored_df['drug,pt_id'] = list(
    unscored_df[['drug', 'pt_id']].itertuples(index=False, name=None)
)

unscored_df.head()

Unnamed: 0,drug,meddra_term,term in label,Problem,Why was it not annotated?,class,Pred0,Pred1,scored,split,Property,section,llt_id,other notes,pt_id,"drug,pt_id"
0,actemra,absolute neutrophil count decreased,decreases in neutrophil,Term in different order,,is_event,0,0,not_scored,train,20859,AR,10059234,,10029366,"(actemra, 10029366)"
1,actemra,arthritis bacterial,bacterial arthritis,Term in different order,terms in different order,is_event,0,0,not_scored,train,20310,AR,10053555,,10053555,"(actemra, 10053555)"
2,actemra,ast increased,,,,is_event,0,0,not_scored,train,20069,AR,10003544,,10003481,"(actemra, 10003481)"
3,actemra,bilirubin total increased,,,,is_event,0,0,not_scored,train,20968,AR,10056806,,10005364,"(actemra, 10005364)"
4,actemra,cholesterol total increased,,,,is_event,0,0,not_scored,train,20568,AR,10008671,,10005425,"(actemra, 10005425)"


In [42]:
# Number of rows (with a non-null drug) per reviewer-assigned Problem category.
unscored_df.groupby('Problem')['drug'].count()

Problem
Abbreviations                                                       3
Abbreviations, Extra word in meddra                                 1
Abbreviations, Parentheses, Term in different order                 3
Abbreviations, Term in different order                              1
Abbreviations, Term in different order, Wording Differences         2
Abbreviations, Word insertion and lists                             1
Abbreviations, Word insertion and lists, Wording Differences        1
Abbreviations, Wording Differences                                  2
Extra word in meddra                                               14
Extra word in meddra, Hyphens                                       1
Extra word in meddra, Spelling                                      1
Extra word in meddra, Term in different order                       1
Extra word in meddra, Wording Differences                           2
Hyphens                                                            18
Hyphens, Par

In [43]:
# Rows whose Problem is exactly 'Wording Differences' (combined categories are excluded).
unscored_df.loc[unscored_df['Problem'].eq('Wording Differences')]

Unnamed: 0,drug,meddra_term,term in label,Problem,Why was it not annotated?,class,Pred0,Pred1,scored,split,Property,section,llt_id,other notes
5,ACTEMRA,death,,Wording Differences,not mentioned in AR,is_event,0,0,not_scored,train,20608,AR,10011906,
13,ACTEMRA,neoplasm malignant,"Malignancies During the 24 week, controlled pe...",Wording Differences,"""malignancies"" instead of neoplasm",is_event,0,0,not_scored,train,20578,AR,10028997,
20,ADCETRIS,death,,Wording Differences,not mentioned in AR,is_event,0,0,not_scored,train,21164,AR,10011906,
21,ADCETRIS,drug toxicity,The most common adverse reactions occurring in...,Wording Differences,not explicitly mentioned,is_event,0,0,not_scored,train,21116,AR,10013746,
22,ADCETRIS,hematotoxicity,,Wording Differences,not explicitly mentioned� what does 'hematotox...,is_event,0,0,not_scored,train,20749,AR,10061196,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,INVOKANA,upper limb fracture,Fractures were observed as early as 12 weeks a...,Wording Differences,,is_event,0,0,not_scored,test,2128,AR,10061394,
608,ISTODAX,arrhythmia supraventricular,supraventricular arrhythmia,Wording Differences,,is_event,0,0,not_scored,train,20192,AR,10003130,
609,ISTODAX,ecg nonspecific st-t change,Electrocardiogram ST-T wave changes,Wording Differences,,is_event,0,0,not_scored,train,20678,AR,10057502,
610,ISTODAX,electrocardiogram change,Electrocardiogram ST-T wave changes,Wording Differences,,is_event,0,0,not_scored,train,21149,AR,10061116,


## DeepCADRME

DeepCADRME is a method for identifying adverse event terms from the labels when they are not continuous. It should help to address the primary two categories. We (Cindy) ran DeepCADRME on 199 of the 200 labels from TAC (one label wasn't available for some reason). Here we explore how many of the missing ~4k annotations are recovered using this method. 

In [52]:
# DeepCADRME predictions with their proposed MedDRA matches.
deepcadrme_file = '../data/deepcadrme_guess_terms_meddramatch.csv'
deepcadrme_df = pd.read_csv(deepcadrme_file)

# Derive the drug name from the label file name, e.g. 'ACTEMRA.xml' -> 'actemra'.
# The '.xml' extension is removed with an anchored pattern rather than
# str.strip('.xml'): strip() treats its argument as a *set of characters* and
# removes any of '.', 'x', 'm', 'l' from both ends, which would eat into a
# stem that starts or ends with one of those lower-case letters.
deepcadrme_df['drug'] = (
    deepcadrme_df['file']
    .str.replace(r'\.xml$', '', regex=True)
    .str.lower()
)

deepcadrme_df.head()

Unnamed: 0.1,Unnamed: 0,file,term,start,len,match_method,proposed_meddra_term,proposed_meddra_strings,drug
0,0,ACTEMRA.xml,upper respiratory tract infections,410,34,fuzzy,10046306,infection respiratory tract upper,actemra
1,1,ACTEMRA.xml,nasopharyngitis,446,15,exact,10028810,nasopharyngitis,actemra
2,2,ACTEMRA.xml,headache,463,8,exact,10019211,headache,actemra
3,3,ACTEMRA.xml,hypertension,473,12,exact,10020772,hypertension,actemra
4,4,ACTEMRA.xml,increased alt,487,13,fuzzy,10001551,alanine aminotransferase increased,actemra


In [58]:
# Distinct drugs in each source, and how many appear in both.
manual_drugs = set(manual_df['drug'])
deepcadrme_drugs = set(deepcadrme_df['drug'])
len(manual_drugs), len(deepcadrme_drugs), len(manual_drugs & deepcadrme_drugs)

(200, 199, 199)

In [171]:
# Manual annotations that are not an exact (case-insensitive) LLT match and
# that carry a PT id.
is_nonexact = manual_df['Matching String'].str.lower().ne(manual_df['MedDRA LLT'].str.lower())
has_pt_id = manual_df['PT ID'].notnull()
nonexact_manual_df = manual_df[is_nonexact & has_pt_id]
nonexact_manual_df.shape

(4251, 16)

In [84]:
# Collapse the non-exact annotations to distinct (drug, PT id) pairs, with the PT id as an int.
nonexact_annots = {
    (drug_name, int(pt_code))
    for drug_name, pt_code in zip(nonexact_manual_df['drug'], nonexact_manual_df['PT ID'])
}
len(nonexact_annots)

3200

In [85]:
# Distinct (drug, proposed MedDRA id) pairs produced by DeepCADRME.
deepcadrme_annots = set(
    deepcadrme_df[['drug', 'proposed_meddra_term']].itertuples(index=False, name=None)
)
len(deepcadrme_annots)

9399

In [87]:
# Count and share of the non-exact (drug, PT) annotations that DeepCADRME also produced.
recovered_annots = nonexact_annots.intersection(deepcadrme_annots)
len(recovered_annots), len(recovered_annots)/len(nonexact_annots)

(1278, 0.399375)

In [126]:
# Distinct (drug, PT id) pairs among the reviewed unscored terms, and how many
# of them DeepCADRME also produced.
unscored_annots = {
    (drug_name, pt_code)
    for drug_name, pt_code in zip(unscored_df['drug'], unscored_df['pt_id'])
}

len(unscored_annots), len(unscored_annots.intersection(deepcadrme_annots))

(1447, 513)

In [138]:
# Per-Problem row counts, restricted to unscored terms whose (drug, PT) pair DeepCADRME produced.
found_by_deepcadrme = unscored_df['drug,pt_id'].isin(deepcadrme_annots)
unscored_df[found_by_deepcadrme].groupby('Problem')['drug'].count()

Problem
Abbreviations                                                      2
Abbreviations, Extra word in meddra                                1
Abbreviations, Term in different order                             1
Extra word in meddra                                               7
Extra word in meddra, Spelling                                     1
Extra word in meddra, Term in different order                      1
Extra word in meddra, Wording Differences                          1
Hyphens                                                           13
Hyphens, Parentheses                                               1
Hyphens, Word insertion and lists                                  1
Parentheses                                                        6
Parentheses, Term in different order                               5
Parentheses, Term in different order, Word insertion and lists     2
Parentheses, Word insertion and lists                              1
Spelling                  

## N-gram Resort

Explore how many of the terms could be identified by looking at different permutations of the words in an adverse event term.

In [156]:
# Word-order-insensitive form of every MedDRA term: words sorted alphabetically and re-joined.
meddra_terms_sorted = {' '.join(sorted(term.split())) for term in meddra_terms}

In [168]:
# Manual annotations whose word-sorted string equals some word-sorted MedDRA term.
in_sorted_vocab = manual_df['matching_string_sorted'].isin(meddra_terms_sorted)
meddra_exactsorted_n = int(in_sorted_vocab.sum())

meddra_exactsorted_n, total_n, meddra_exactsorted_n/total_n

(10378, 14487, 0.7163663974597916)

In [174]:
# Of the non-exact annotations, how many match a MedDRA term once word order is ignored?
resort_mask = nonexact_manual_df['matching_string_sorted'].isin(meddra_terms_sorted)
resort_matches = int(resort_mask.sum())

resort_matches, len(nonexact_manual_df)

(417, 4251)

In [178]:
# Non-exact annotations recovered by ignoring word order, collapsed to distinct
# (drug, PT id) pairs.
resort_matches_df = nonexact_manual_df[nonexact_manual_df['matching_string_sorted'].isin(meddra_terms_sorted)]
# 'PT ID' is read as float (e.g. 10000059.0). Cast it to int so these pairs use
# the same id type as nonexact_annots, which also casts 'PT ID' to int.
# The cast is safe because nonexact_manual_df already excludes rows with a null 'PT ID'.
resort_annots = set(zip(resort_matches_df['drug'], resort_matches_df['PT ID'].astype(int)))
len(resort_annots)

380

In [182]:
len(resort_annots & deepcadrme_annots)

298