In [42]:
import os
from glob import glob

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

In [43]:
# Root data folder. The cells below read label XML from its train_xml/ and
# gold_xml/ subfolders and write their CSV outputs into it.
# Change to the appropriate data folder; keep the trailing slash, because
# paths are built by plain string concatenation.
folder = 'data/TAC2017/'

In [44]:
# Extract the text of every <Section> element in the training label files,
# one row per (drug, section), and save the table as CSV.
train_labels = glob(folder+'train_xml/*')
drug_label_text = []
for label in tqdm(train_labels):
    # The drug name is the file name up to its first dot. basename() keeps
    # this correct when glob returns OS-specific path separators.
    drug_name = os.path.basename(label).split('.')[0]
    # Binary mode: the XML parser decodes the bytes itself instead of relying
    # on the platform's default text encoding.
    with open(label, 'rb') as f:
        soup = BeautifulSoup(f, 'xml')
    for section in soup.find_all('Section'):
        section_name = section['name']
        drug_label_text.append([drug_name, section_name, section.text])
drug_label_text = pd.DataFrame(drug_label_text, columns=['drug_name', 'section_name', 'section_text'])
drug_label_text.to_csv(folder+'train_drug_label_text.csv', index=False)
drug_label_text.head()

100%|██████████| 101/101 [00:01<00:00, 79.54it/s]


Unnamed: 0,drug_name,section_name,section_text
0,XEOMIN,adverse reactions,6 ADVERSE REACTIONS\n\n The following adv...
1,XEOMIN,boxed warnings,\n\n BOXED WARNING: WARNING: DISTANT SPREAD...
2,XEOMIN,warnings and precautions,5 WARNINGS AND PRECAUTIONS\n\n\n\n EXCERP...
3,QUTENZA,adverse reactions,6 ADVERSE REACTIONS\n\n The following ser...
4,QUTENZA,warnings and precautions,5 WARNINGS AND PRECAUTIONS\n\n\n\n EXCER...


In [45]:
# Extract the text of every <Section> element in the gold (test) label files,
# one row per (drug, section), and save the table as CSV.
test_labels = glob(folder+'gold_xml/*')
drug_label_text = []
for label in tqdm(test_labels):
    # The drug name is the file name up to its first dot. basename() keeps
    # this correct when glob returns OS-specific path separators.
    drug_name = os.path.basename(label).split('.')[0]
    # Binary mode: the XML parser decodes the bytes itself instead of relying
    # on the platform's default text encoding.
    with open(label, 'rb') as f:
        soup = BeautifulSoup(f, 'xml')
    for section in soup.find_all('Section'):
        section_name = section['name']
        drug_label_text.append([drug_name, section_name, section.text])
drug_label_text = pd.DataFrame(drug_label_text, columns=['drug_name', 'section_name', 'section_text'])
drug_label_text.to_csv(folder+'test_drug_label_text.csv', index=False)
drug_label_text.head()

100%|██████████| 99/99 [00:01<00:00, 88.23it/s] 


Unnamed: 0,drug_name,section_name,section_text
0,IMPAVIDO,adverse reactions,6 ADVERSE REACTIONS\n\n Because clinical ...
1,IMPAVIDO,boxed warnings,\n\n BOXED WARNING: WARNING: EMBRYO-FETAL T...
2,IMPAVIDO,warnings and precautions,5 WARNINGS AND PRECAUTIONS\n\n\n\n EXCER...
3,LIVALO,adverse reactions,6 ADVERSE REACTIONS\n\n The following ser...
4,LIVALO,warnings and precautions,5 WARNINGS AND PRECAUTIONS\n\n\n\n EXCER...


Also extract the manually annotated adverse drug events (ADEs).

In [46]:
# Collect every AdverseReaction <Mention> in the training labels, attach the
# MedDRA normalizations of the <Reaction> with the same string, add the section
# name, and save the table as CSV.
train_labels = glob(folder+'train_xml/*')
label_frames = []  # one merged frame per label file, concatenated once below
section_cat = []
for label in tqdm(train_labels):
    # basename() keeps the drug name correct with OS-specific separators.
    drug_name = os.path.basename(label).split('.')[0]
    # Binary mode: the XML parser decodes the bytes itself.
    with open(label, 'rb') as f:
        soup = BeautifulSoup(f, 'xml')

    # Mentions of type AdverseReaction; the string is lower-cased before it is
    # joined to the reaction strings.
    mentions = []
    for mention in soup.find_all('Mention'):
        if mention['type'] == 'AdverseReaction':
            section_name = mention['section']
            mention_str = mention['str'].lower()
            mentions.append([drug_name, section_name, mention_str])
    mentions_df = pd.DataFrame(mentions, columns=['drug_name', 'section_id', 'reaction_string'])

    # One row per (Reaction, Normalization) pair.
    drug_label_text = []
    for reaction in soup.find_all('Reaction'):
        string = reaction['str']
        for norm in reaction.find_all('Normalization'):
            # The preferred-term name and id are read as a pair: if either
            # attribute is absent, both are recorded as None. Only a missing
            # attribute (KeyError) is tolerated; other errors propagate.
            try:
                meddra_str = norm['meddra_pt']
                meddra_id = norm['meddra_pt_id']
            except KeyError:
                meddra_str, meddra_id = None, None
            # Same pairing rule for the lower-level term.
            try:
                meddra_llt = norm['meddra_llt']
                meddra_llt_id = norm['meddra_llt_id']
            except KeyError:
                meddra_llt, meddra_llt_id = None, None
            drug_label_text.append([drug_name, string, meddra_str, meddra_id, meddra_llt, meddra_llt_id])
    drug_label_text = pd.DataFrame(drug_label_text, columns=['drug_name', 'reaction_string', 'meddra_pt', 'meddra_pt_id', 'meddra_llt', 'meddra_llt_id'])

    # Left join keeps every mention, including those without a normalization.
    drug_label_text = mentions_df.merge(drug_label_text, how='left', on=['drug_name', 'reaction_string'])
    label_frames.append(drug_label_text)
    section_cat.extend([[drug_name, i['id'], i['name']] for i in soup.find_all('Section')])

# Concatenate once instead of growing a DataFrame inside the loop.
drug_label_text_all = pd.concat(label_frames)
section_table = pd.DataFrame(section_cat, columns=['drug_name', 'section_id', 'section_name'])
drug_label_text_all = drug_label_text_all.merge(section_table, on=['drug_name', 'section_id'], how='left')
drug_label_text_all.to_csv(folder+'train_drug_label_text_manual_ades.csv', index=False)
drug_label_text_all.head()

100%|██████████| 101/101 [00:01<00:00, 62.35it/s]


Unnamed: 0,drug_name,section_id,reaction_string,meddra_pt,meddra_pt_id,meddra_llt,meddra_llt_id,section_name
0,XEOMIN,S1,hypersensitivity,Hypersensitivity,10020751.0,,,adverse reactions
1,XEOMIN,S1,dysphagia,Dysphagia,10013950.0,,,adverse reactions
2,XEOMIN,S1,breathing difficulties,Dyspnoea,10013968.0,Difficulty breathing,10012791.0,adverse reactions
3,XEOMIN,S1,spread of effects from toxin,,,,,adverse reactions
4,XEOMIN,S1,dysphagia,Dysphagia,10013950.0,,,adverse reactions


In [61]:
# Collect every AdverseReaction <Mention> in the gold (test) labels, attach the
# MedDRA normalizations of the <Reaction> with the same string, add the section
# name, and save the table as CSV.
test_labels = glob(folder+'gold_xml/*')
label_frames = []  # one merged frame per label file, concatenated once below
section_cat = []
for label in tqdm(test_labels):
    # basename() keeps the drug name correct with OS-specific separators.
    drug_name = os.path.basename(label).split('.')[0]
    # Binary mode: the XML parser decodes the bytes itself.
    with open(label, 'rb') as f:
        soup = BeautifulSoup(f, 'xml')

    # Mentions of type AdverseReaction; the string is lower-cased before it is
    # joined to the reaction strings.
    mentions = []
    for mention in soup.find_all('Mention'):
        if mention['type'] == 'AdverseReaction':
            section_name = mention['section']
            mention_str = mention['str'].lower()
            mentions.append([drug_name, section_name, mention_str])
    mentions_df = pd.DataFrame(mentions, columns=['drug_name', 'section_id', 'reaction_string'])

    # One row per (Reaction, Normalization) pair.
    drug_label_text = []
    for reaction in soup.find_all('Reaction'):
        string = reaction['str']
        for norm in reaction.find_all('Normalization'):
            # The preferred-term name and id are read as a pair: if either
            # attribute is absent, both are recorded as None. Only a missing
            # attribute (KeyError) is tolerated; other errors propagate.
            try:
                meddra_str = norm['meddra_pt']
                meddra_id = norm['meddra_pt_id']
            except KeyError:
                meddra_str, meddra_id = None, None
            # Same pairing rule for the lower-level term.
            try:
                meddra_llt = norm['meddra_llt']
                meddra_llt_id = norm['meddra_llt_id']
            except KeyError:
                meddra_llt, meddra_llt_id = None, None
            drug_label_text.append([drug_name, string, meddra_str, meddra_id, meddra_llt, meddra_llt_id])
    drug_label_text = pd.DataFrame(drug_label_text, columns=['drug_name', 'reaction_string', 'meddra_pt', 'meddra_pt_id', 'meddra_llt', 'meddra_llt_id'])

    # Left join keeps every mention, including those without a normalization.
    drug_label_text = mentions_df.merge(drug_label_text, how='left', on=['drug_name', 'reaction_string'])
    label_frames.append(drug_label_text)
    section_cat.extend([[drug_name, i['id'], i['name']] for i in soup.find_all('Section')])

# Concatenate once instead of growing a DataFrame inside the loop.
drug_label_text_all = pd.concat(label_frames)
section_table = pd.DataFrame(section_cat, columns=['drug_name', 'section_id', 'section_name'])
drug_label_text_all = drug_label_text_all.merge(section_table, on=['drug_name', 'section_id'], how='left')
drug_label_text_all.to_csv(folder+'test_drug_label_text_manual_ades.csv', index=False)
drug_label_text_all.head()

100%|██████████| 99/99 [00:01<00:00, 65.35it/s]


Unnamed: 0,drug_name,section_id,reaction_string,meddra_pt,meddra_pt_id,meddra_llt,meddra_llt_id,section_name
0,IMPAVIDO,S1,nausea,Nausea,10028813,,,adverse reactions
1,IMPAVIDO,S1,vomiting,Vomiting,10047700,,,adverse reactions
2,IMPAVIDO,S1,diarrhea,Diarrhoea,10012735,Diarrhea,10012727.0,adverse reactions
3,IMPAVIDO,S1,headache,Headache,10019211,,,adverse reactions
4,IMPAVIDO,S1,decreased appetite,Decreased appetite,10061428,,,adverse reactions


----

In [47]:
# Dump every <Mention> element of the training labels to CSV: one row per
# mention, one column per XML attribute, plus the drug name.
train_labels = glob(folder+'train_xml/*')
mention_records = []
for label in tqdm(train_labels):
    # basename() keeps the drug name correct with OS-specific separators.
    drug_name = os.path.basename(label).split('.')[0]
    # Binary mode: the XML parser decodes the bytes itself.
    with open(label, 'rb') as f:
        soup = BeautifulSoup(f, 'xml')
    for mention in soup.find_all('Mention'):
        # drug_name first so it stays the leading column; the attribute dict
        # is expanded directly instead of via a per-file .apply(pd.Series).
        mention_records.append({'drug_name': drug_name, **mention.attrs})
# Build the frame once from all records instead of concatenating per file.
mentions_all = pd.DataFrame(mention_records)
mentions_all.to_csv(folder+'train_drug_label_mentions.csv', index=False)
mentions_all.head()

100%|██████████| 101/101 [00:05<00:00, 19.02it/s]


Unnamed: 0,drug_name,id,section,type,start,len,str
0,XEOMIN,M1,S1,AdverseReaction,143,16,Hypersensitivity
1,XEOMIN,M2,S1,AdverseReaction,235,9,Dysphagia
2,XEOMIN,M3,S1,AdverseReaction,249,22,Breathing Difficulties
3,XEOMIN,M4,S1,AdverseReaction,353,28,Spread of Effects from Toxin
4,XEOMIN,M5,S1,AdverseReaction,545,9,dysphagia


In [48]:
# Dump every <Mention> element of the gold (test) labels to CSV: one row per
# mention, one column per XML attribute, plus the drug name.
test_labels = glob(folder+'gold_xml/*')
mention_records = []
for label in tqdm(test_labels):
    # basename() keeps the drug name correct with OS-specific separators.
    drug_name = os.path.basename(label).split('.')[0]
    # Binary mode: the XML parser decodes the bytes itself.
    with open(label, 'rb') as f:
        soup = BeautifulSoup(f, 'xml')
    for mention in soup.find_all('Mention'):
        # drug_name first so it stays the leading column; the attribute dict
        # is expanded directly instead of via a per-file .apply(pd.Series).
        mention_records.append({'drug_name': drug_name, **mention.attrs})
# Build the frame once from all records instead of concatenating per file.
mentions_all = pd.DataFrame(mention_records)
mentions_all.to_csv(folder+'test_drug_label_mentions.csv', index=False)
mentions_all.head()

100%|██████████| 99/99 [00:04<00:00, 20.81it/s]


Unnamed: 0,drug_name,id,section,type,start,len,str
0,IMPAVIDO,M1,S1,AdverseReaction,371,6,nausea
1,IMPAVIDO,M2,S1,AdverseReaction,379,8,vomiting
2,IMPAVIDO,M3,S1,AdverseReaction,389,8,diarrhea
3,IMPAVIDO,M4,S1,AdverseReaction,399,8,headache
4,IMPAVIDO,M5,S1,AdverseReaction,409,18,decreased appetite


---

In [49]:
# Dump every <Relation> element of the training labels to CSV: one row per
# relation, one column per XML attribute, plus the drug name.
train_labels = glob(folder+'train_xml/*')
relation_records = []
for label in tqdm(train_labels):
    # basename() keeps the drug name correct with OS-specific separators.
    drug_name = os.path.basename(label).split('.')[0]
    # Binary mode: the XML parser decodes the bytes itself.
    with open(label, 'rb') as f:
        soup = BeautifulSoup(f, 'xml')
    # Labels without any relation simply contribute no records.
    for relation in soup.find_all('Relation'):
        relation_records.append({'drug_name': drug_name, **relation.attrs})
# Build the frame once from all records instead of concatenating per file.
relations_all = pd.DataFrame(relation_records)
relations_all.to_csv(folder+'train_drug_label_relations.csv', index=False)
relations_all.head()

100%|██████████| 101/101 [00:02<00:00, 50.24it/s]


Unnamed: 0,drug_name,id,type,arg1,arg2
0,XEOMIN,RL1,Effect,M42,M41
1,XEOMIN,RL2,Negated,M42,M40
2,XEOMIN,RL3,Hypothetical,M99,M98
3,XEOMIN,RL4,Hypothetical,M100,M98
4,XEOMIN,RL5,Hypothetical,M102,M101


In [50]:
# Dump every <Relation> element of the gold (test) labels to CSV: one row per
# relation, one column per XML attribute, plus the drug name.
test_labels = glob(folder+'gold_xml/*')
relation_records = []
for label in tqdm(test_labels):
    # basename() keeps the drug name correct with OS-specific separators.
    drug_name = os.path.basename(label).split('.')[0]
    # Binary mode: the XML parser decodes the bytes itself.
    with open(label, 'rb') as f:
        soup = BeautifulSoup(f, 'xml')
    # Labels without any relation simply contribute no records.
    for relation in soup.find_all('Relation'):
        relation_records.append({'drug_name': drug_name, **relation.attrs})
# Build the frame once from all records instead of concatenating per file.
relations_all = pd.DataFrame(relation_records)
relations_all.to_csv(folder+'test_drug_label_relations.csv', index=False)
relations_all.head()

100%|██████████| 99/99 [00:01<00:00, 50.72it/s]


Unnamed: 0,drug_name,id,type,arg1,arg2
0,IMPAVIDO,RL1,Negated,M14,M13
1,IMPAVIDO,RL2,Effect,M21,M20
2,IMPAVIDO,RL3,Effect,M23,M22
3,IMPAVIDO,RL4,Effect,M28,M30
4,IMPAVIDO,RL5,Effect,M29,M30


---

In [51]:
# Dump every <Reaction> of the training labels together with its nested
# <Normalization> elements to CSV: one row per (reaction, normalization) pair.
# The two `id` attributes are renamed to `reaction_id` and `norm_id` so they do
# not collide.
train_labels = glob(folder+'train_xml/*')
reaction_records = []
for label in tqdm(train_labels):
    # basename() keeps the drug name correct with OS-specific separators.
    drug_name = os.path.basename(label).split('.')[0]
    # Binary mode: the XML parser decodes the bytes itself.
    with open(label, 'rb') as f:
        soup = BeautifulSoup(f, 'xml')
    for reaction in soup.find_all('Reaction'):
        reaction_attrs = {('reaction_id' if key == 'id' else key): value
                          for key, value in reaction.attrs.items()}
        # A reaction without any normalization still yields one row, with the
        # normalization columns left empty.
        norm_attr_dicts = [norm.attrs for norm in reaction.find_all('Normalization')] or [{}]
        for norm_attrs in norm_attr_dicts:
            renamed_norm = {('norm_id' if key == 'id' else key): value
                            for key, value in norm_attrs.items()}
            reaction_records.append({'drug_name': drug_name, **reaction_attrs, **renamed_norm})
# Build the frame once from all records instead of concatenating per file.
reactions_all = pd.DataFrame(reaction_records)
reactions_all.to_csv(folder+'train_drug_label_reactions.csv', index=False)
reactions_all.head()

100%|██████████| 101/101 [00:04<00:00, 20.27it/s]


Unnamed: 0,drug_name,reaction_id,str,norm_id,meddra_pt,meddra_pt_id,meddra_llt,meddra_llt_id,flag
0,XEOMIN,AR1,hypersensitivity,AR1.N1,Hypersensitivity,10020751.0,,,
1,XEOMIN,AR2,dysphagia,AR2.N1,Dysphagia,10013950.0,,,
2,XEOMIN,AR3,breathing difficulties,AR3.N1,Dyspnoea,10013968.0,Difficulty breathing,10012791.0,
3,XEOMIN,AR4,spread of effects from toxin,AR4.N1,,,,,unmapped
4,XEOMIN,AR5,neck pain,AR5.N1,Neck pain,10028836.0,,,


In [52]:
# Dump every <Reaction> of the gold (test) labels together with its nested
# <Normalization> elements to CSV: one row per (reaction, normalization) pair.
# The two `id` attributes are renamed to `reaction_id` and `norm_id` so they do
# not collide.
test_labels = glob(folder+'gold_xml/*')
reaction_records = []
for label in tqdm(test_labels):
    # basename() keeps the drug name correct with OS-specific separators.
    drug_name = os.path.basename(label).split('.')[0]
    # Binary mode: the XML parser decodes the bytes itself.
    with open(label, 'rb') as f:
        soup = BeautifulSoup(f, 'xml')
    for reaction in soup.find_all('Reaction'):
        reaction_attrs = {('reaction_id' if key == 'id' else key): value
                          for key, value in reaction.attrs.items()}
        # A reaction without any normalization still yields one row, with the
        # normalization columns left empty.
        norm_attr_dicts = [norm.attrs for norm in reaction.find_all('Normalization')] or [{}]
        for norm_attrs in norm_attr_dicts:
            renamed_norm = {('norm_id' if key == 'id' else key): value
                            for key, value in norm_attrs.items()}
            reaction_records.append({'drug_name': drug_name, **reaction_attrs, **renamed_norm})
# Build the frame once from all records instead of concatenating per file.
reactions_all = pd.DataFrame(reaction_records)
reactions_all.to_csv(folder+'test_drug_label_reactions.csv', index=False)
reactions_all.head()

100%|██████████| 99/99 [00:04<00:00, 21.95it/s]


Unnamed: 0,drug_name,reaction_id,str,norm_id,meddra_pt,meddra_pt_id,meddra_llt,meddra_llt_id,flag
0,IMPAVIDO,AR1,nausea,AR1.N1,Nausea,10028813,,,
1,IMPAVIDO,AR2,vomiting,AR2.N1,Vomiting,10047700,,,
2,IMPAVIDO,AR3,diarrhea,AR3.N1,Diarrhoea,10012735,Diarrhea,10012727.0,
3,IMPAVIDO,AR4,headache,AR4.N1,Headache,10019211,,,
4,IMPAVIDO,AR5,decreased appetite,AR5.N1,Decreased appetite,10061428,,,


---

In [70]:
# Add per-term flags (discontinuous / negated / hypothetical / exact MedDRA
# match) to the training manual-ADE table and write it back to the same CSV.
flag_cols = ['discontinuous_term', 'negated_term', 'hypothetical_term']

mentions_all = pd.read_csv(folder+'train_drug_label_mentions.csv')
# A mention is flagged as discontinuous when its `start` value contains a
# comma. Cast to str first: read_csv yields integers for comma-free values,
# and `',' in <int>` would raise TypeError.
mentions_all['discontinuous_term'] = (
    mentions_all['start'].astype(str).str.contains(',', regex=False).astype(int)
)

relations_all = pd.read_csv(folder+'train_drug_label_relations.csv')
relations_all['negated_term'] = (relations_all['type'] == 'Negated').astype(int)
relations_all['hypothetical_term'] = (relations_all['type'] == 'Hypothetical').astype(int)
# A mention (arg1) can take part in several relations. Take the max of each
# flag per mention so that, e.g., a Hypothetical relation is not lost when the
# same mention also has an Effect relation (keeping a single "first" row per
# mention dropped the flags of the other rows).
relation_flags = relations_all.groupby(['drug_name', 'arg1'], as_index=False)[
    ['negated_term', 'hypothetical_term']
].max()

mentions_all = mentions_all.merge(relation_flags, left_on=['drug_name', 'id'],
                                  right_on=['drug_name', 'arg1'], how='left')
# Mentions that are not arg1 of any relation get 0 for both relation flags.
mentions_all[['negated_term', 'hypothetical_term']] = mentions_all[['negated_term', 'hypothetical_term']].fillna(0)
mentions_all['str'] = mentions_all['str'].str.lower()
# Collapse to one row per (drug, section, lower-cased string); a flag is set if
# any mention with that string has it. Columns are selected with a list:
# tuple-style selection on a groupby is rejected by current pandas.
mentions_all = mentions_all.groupby(['drug_name', 'section', 'str'])[flag_cols].max().reset_index()
#######
drug_label_text_all = pd.read_csv(folder+'train_drug_label_text_manual_ades.csv')
# This cell overwrites its own input file. Drop columns added by a previous run
# so that re-running the cell does not produce duplicated *_x / *_y columns.
drug_label_text_all = drug_label_text_all.drop(
    columns=['section', 'str', 'meddra_exact_term'] + flag_cols, errors='ignore'
)
print(drug_label_text_all.shape[0])
drug_label_text_all_new = drug_label_text_all.merge(mentions_all,
                                                    left_on=['section_id', 'drug_name', 'reaction_string'],
                                                    right_on=['section', 'drug_name', 'str'], how='left')
# 1 when the reaction string equals the lower-cased MedDRA PT or LLT name.
drug_label_text_all_new['meddra_exact_term'] = drug_label_text_all_new.apply(
    lambda x: 1 if str(x['meddra_pt']).lower() == x['reaction_string']
    or str(x['meddra_llt']).lower() == x['reaction_string']
    else 0,
    axis=1)
drug_label_text_all_new.to_csv(folder+'train_drug_label_text_manual_ades.csv', index=False)
drug_label_text_all_new.head(10)

  mentions_all = mentions_all.groupby(['drug_name','section','str'])['discontinuous_term', 'negated_term', 'hypothetical_term'].max().reset_index()


13894


Unnamed: 0,drug_name,section_id,reaction_string,meddra_pt,meddra_pt_id,meddra_llt,meddra_llt_id,section_name,section,str,discontinuous_term,negated_term,hypothetical_term,meddra_exact_term
0,XEOMIN,S1,hypersensitivity,Hypersensitivity,10020751.0,,,adverse reactions,S1,hypersensitivity,0,0.0,0.0,1
1,XEOMIN,S1,dysphagia,Dysphagia,10013950.0,,,adverse reactions,S1,dysphagia,0,0.0,0.0,1
2,XEOMIN,S1,breathing difficulties,Dyspnoea,10013968.0,Difficulty breathing,10012791.0,adverse reactions,S1,breathing difficulties,0,0.0,0.0,0
3,XEOMIN,S1,spread of effects from toxin,,,,,adverse reactions,S1,spread of effects from toxin,0,0.0,0.0,0
4,XEOMIN,S1,dysphagia,Dysphagia,10013950.0,,,adverse reactions,S1,dysphagia,0,0.0,0.0,1
5,XEOMIN,S1,neck pain,Neck pain,10028836.0,,,adverse reactions,S1,neck pain,0,0.0,0.0,1
6,XEOMIN,S1,muscle weakness,Muscular weakness,10028372.0,Muscle weakness,10028350.0,adverse reactions,S1,muscle weakness,0,0.0,0.0,1
7,XEOMIN,S1,injection site pain,Injection site pain,10022086.0,,,adverse reactions,S1,injection site pain,0,0.0,0.0,1
8,XEOMIN,S1,musculoskeletal pain,Musculoskeletal pain,10028391.0,,,adverse reactions,S1,musculoskeletal pain,0,0.0,0.0,1
9,XEOMIN,S1,eyelid ptosis,Eyelid ptosis,10015995.0,,,adverse reactions,S1,eyelid ptosis,0,0.0,0.0,1


In [40]:
# Preview mentions_all as left by whichever flag-merge cell ran last: one row
# per drug_name/section/str with its three flag columns.
mentions_all.head()

Unnamed: 0,drug_name,section,str,discontinuous_term,negated_term,hypothetical_term
0,ADCETRIS,S1,abdominal pain,0,0.0,0.0
1,ADCETRIS,S1,acute respiratory distress syndrome,0,0.0,0.0
2,ADCETRIS,S1,alopecia,0,0.0,0.0
3,ADCETRIS,S1,anaphylaxis,0,0.0,0.0
4,ADCETRIS,S1,anemia,0,0.0,0.0


In [67]:
# Add per-term flags (discontinuous / negated / hypothetical / exact MedDRA
# match) to the gold (test) manual-ADE table and write it back to the same CSV.
flag_cols = ['discontinuous_term', 'negated_term', 'hypothetical_term']

mentions_all = pd.read_csv(folder+'test_drug_label_mentions.csv')
# A mention is flagged as discontinuous when its `start` value contains a
# comma. Cast to str first: read_csv yields integers for comma-free values,
# and `',' in <int>` would raise TypeError.
mentions_all['discontinuous_term'] = (
    mentions_all['start'].astype(str).str.contains(',', regex=False).astype(int)
)

relations_all = pd.read_csv(folder+'test_drug_label_relations.csv')
relations_all['negated_term'] = (relations_all['type'] == 'Negated').astype(int)
relations_all['hypothetical_term'] = (relations_all['type'] == 'Hypothetical').astype(int)
# A mention (arg1) can take part in several relations. Take the max of each
# flag per mention so that, e.g., a Hypothetical relation is not lost when the
# same mention also has an Effect relation (keeping a single "first" row per
# mention dropped the flags of the other rows).
relation_flags = relations_all.groupby(['drug_name', 'arg1'], as_index=False)[
    ['negated_term', 'hypothetical_term']
].max()

mentions_all = mentions_all.merge(relation_flags, left_on=['drug_name', 'id'],
                                  right_on=['drug_name', 'arg1'], how='left')
# Mentions that are not arg1 of any relation get 0 for both relation flags.
mentions_all['negated_term'] = mentions_all['negated_term'].fillna(0)
mentions_all['hypothetical_term'] = mentions_all['hypothetical_term'].fillna(0)
mentions_all['str'] = mentions_all['str'].str.lower()
# Collapse to one row per (drug, section, lower-cased string); a flag is set if
# any mention with that string has it. Columns are selected with a list:
# tuple-style selection on a groupby is rejected by current pandas.
mentions_all = mentions_all.groupby(['drug_name', 'section', 'str'])[flag_cols].max().reset_index()
#######
drug_label_text_all = pd.read_csv(folder+'test_drug_label_text_manual_ades.csv')
# This cell overwrites its own input file. Drop columns added by a previous run
# so that re-running the cell does not produce duplicated *_x / *_y columns.
drug_label_text_all = drug_label_text_all.drop(
    columns=['section', 'str', 'meddra_exact_term'] + flag_cols, errors='ignore'
)
drug_label_text_all_new = drug_label_text_all.merge(mentions_all,
                                                    left_on=['section_id', 'drug_name', 'reaction_string'],
                                                    right_on=['section', 'drug_name', 'str'], how='left')
# 1 when the reaction string equals the lower-cased MedDRA PT or LLT name.
drug_label_text_all_new['meddra_exact_term'] = drug_label_text_all_new.apply(
    lambda x: 1 if str(x['meddra_pt']).lower() == x['reaction_string']
    or str(x['meddra_llt']).lower() == x['reaction_string']
    else 0,
    axis=1)
drug_label_text_all_new.to_csv(folder+'test_drug_label_text_manual_ades.csv', index=False)
drug_label_text_all_new.head(10)

  mentions_all = mentions_all.groupby(['drug_name','section','str'])['discontinuous_term', 'negated_term', 'hypothetical_term'].max().reset_index()


Unnamed: 0,drug_name,section_id,reaction_string,meddra_pt,meddra_pt_id,meddra_llt,meddra_llt_id,section_name,section,str,discontinuous_term,negated_term,hypothetical_term,meddra_exact_term
0,IMPAVIDO,S1,nausea,Nausea,10028813.0,,,adverse reactions,S1,nausea,0,0.0,0.0,1
1,IMPAVIDO,S1,vomiting,Vomiting,10047700.0,,,adverse reactions,S1,vomiting,0,0.0,0.0,1
2,IMPAVIDO,S1,diarrhea,Diarrhoea,10012735.0,Diarrhea,10012727.0,adverse reactions,S1,diarrhea,0,0.0,0.0,1
3,IMPAVIDO,S1,headache,Headache,10019211.0,,,adverse reactions,S1,headache,0,0.0,0.0,1
4,IMPAVIDO,S1,decreased appetite,Decreased appetite,10061428.0,,,adverse reactions,S1,decreased appetite,0,0.0,0.0,1
5,IMPAVIDO,S1,dizziness,Dizziness,10013573.0,,,adverse reactions,S1,dizziness,0,0.0,0.0,1
6,IMPAVIDO,S1,abdominal pain,Abdominal pain,10000081.0,,,adverse reactions,S1,abdominal pain,0,0.0,0.0,1
7,IMPAVIDO,S1,pruritus,Pruritus,10037087.0,,,adverse reactions,S1,pruritus,0,0.0,0.0,1
8,IMPAVIDO,S1,somnolence,Somnolence,10041349.0,,,adverse reactions,S1,somnolence,0,0.0,0.0,1
9,IMPAVIDO,S1,elevated transaminases,Transaminases increased,10054889.0,,,adverse reactions,S1,elevated transaminases,0,0.0,0.0,0


In [68]:
# Sanity check: print the table's shape before and after the flag merge so the
# row counts can be compared.
print(drug_label_text_all.shape)
print(drug_label_text_all_new.shape)

(12788, 8)
(12788, 14)
