In [None]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

In [None]:
# load retrieved_df2 from cadmus output
# NOTE: pickle.load can execute arbitrary code - only load pickles produced by a trusted run.
# The context manager closes the file handle, which pickle.load(open(...)) never did explicitly.
with open('retrieved_df2.p', 'rb') as retrieved_file:
    retrieved_df = pickle.load(retrieved_file)

In [None]:
# NOTE: pickle.load can execute arbitrary code - only load pickles produced by a trusted run.

# hp terms with commas in - cause issues with splitting trigger
with open('hp_commas.p', 'rb') as hp_commas_file:
    hp_commas = pickle.load(hp_commas_file)

# mappings from cui to hpo using mrconso file
with open('cui_fully_mapped.p', 'rb') as cui_mapped_file:
    cui_mapped = pickle.load(cui_mapped_file)

In [None]:
def hp_from_lst(hp):
    """Collapse a list-valued HPO id cell to a single value.

    Parameters
    ----------
    hp : list or any
        Either a list of ids or an already-scalar value.

    Returns
    -------
    The first element when ``hp`` is a non-empty list (any further elements
    are discarded), ``None`` when ``hp`` is an empty list, and ``hp`` itself
    unchanged for every other input.
    """
    if isinstance(hp, list):
        # an empty list previously raised IndexError and aborted the whole .apply()
        return hp[0] if hp else None
    return hp

In [None]:
# list-valued hpo_id cells are reduced to a single id by hp_from_lst; other cells pass through
cui_mapped['hpo_id'] = cui_mapped['hpo_id'].map(hp_from_lst)

In [None]:
# Names given to fields 1-8 of a pipe-delimited MetaMap output line, in file order.
_MM_FIELD_NAMES = ('mmi', 'score', 'preferred_name', 'cui',
                   'sem_type_list', 'trigger', 'location', 'pos')


def _placeholder_mm_record(file_id):
    """Return the all-zero row used when a file or a line could not be parsed."""
    record = {'id': file_id}
    record.update(dict.fromkeys(_MM_FIELD_NAMES, 0))
    return record


def mm_list_to_dict(mm_outfile_path, file_id, hp_cui_lookup=None, comma_labels=None):
    """Parse one MetaMap output file into a list of per-concept dicts.

    Parameters
    ----------
    mm_outfile_path : str
        Path of the pipe-delimited MetaMap output file.
    file_id : str
        Identifier stored under ``'id'`` in every returned dict.
    hp_cui_lookup : container, optional
        Collection tested with ``cui in hp_cui_lookup`` to decide whether the
        comma substitution applies to a row. Defaults to the notebook-level
        ``hp_cui``. NOTE(review): ``hp_cui`` is not defined in the visible
        cells of this notebook - it must exist before calling without this
        argument, otherwise a NameError is raised.
    comma_labels : mapping of columns, optional
        Object with ``'hp_label'`` and ``'hp_label_no_comma'`` columns
        (e.g. a DataFrame). Defaults to the notebook-level ``hp_commas``.

    Returns
    -------
    list of dict
        One dict per line whose second field is ``'MMI'``, with keys ``id``,
        ``mmi``, ``score``, ``preferred_name``, ``cui``, ``sem_type_list``,
        ``trigger`` (list of str), ``location``, ``pos``, ``match_freq``
        (number of trigger entries ending in "0") and ``neg_freq`` (number
        ending in "1"). Lines with fewer than two fields, 'MMI' lines with
        fewer than nine fields, and files with at most one line each yield a
        placeholder dict whose fields are all 0. Other lines are skipped.
    """
    # fall back to the notebook-level globals so existing two-argument calls keep working
    if hp_cui_lookup is None:
        hp_cui_lookup = hp_cui
    if comma_labels is None:
        comma_labels = hp_commas

    with open(mm_outfile_path, 'r') as f:
        mm_lines = f.readlines()

    mm_dicts = []
    # check for files where mm hasn't worked
    if len(mm_lines) > 1:
        # lose first item as this is artefact
        for line in mm_lines[1:]:
            fields = line.split('|')
            if len(fields) < 2:
                # malformed line (e.g. blank): keep a placeholder row, as before
                mm_dicts.append(_placeholder_mm_record(file_id))
            elif fields[1] == 'MMI':
                if len(fields) > 8:
                    record = {'id': file_id}
                    record.update(zip(_MM_FIELD_NAMES, fields[1:9]))
                    mm_dicts.append(record)
                else:
                    # truncated MMI line
                    mm_dicts.append(_placeholder_mm_record(file_id))
    else:
        mm_dicts.append(_placeholder_mm_record(file_id))

    # substitute triggers which include commas to allow split of trigger list.
    # Built once instead of two .iloc lookups per (row, label) combination.
    label_pairs = [
        (com, nocom)
        for com, nocom in zip(comma_labels['hp_label'],
                              comma_labels['hp_label_no_comma'])
        if isinstance(com, str) and isinstance(nocom, str) and com
    ]
    for record in mm_dicts:
        if record['trigger'] == 0 or record['cui'] not in hp_cui_lookup:
            continue
        for key, value in list(record.items()):
            if not isinstance(value, str):
                continue
            for com, nocom in label_pairs:
                if com in value:
                    # Literal replacement: the previous re.sub() call treated the
                    # label as a regex (labels containing e.g. parentheses never
                    # matched) and `re` was never imported, so the resulting
                    # NameError was swallowed and nothing was substituted.
                    value = value.replace(com, nocom)
            record[key] = value

    for record in mm_dicts:
        trigger = record['trigger']
        if isinstance(trigger, str):
            # strip the surrounding brackets, then split into single trigger entries
            trigger_items = trigger[1:-1].split(",")
            record['trigger'] = trigger_items
            # the last character of each entry is used as the negation flag
            record['match_freq'] = sum(1 for item in trigger_items if item.endswith("0"))
            record['neg_freq'] = sum(1 for item in trigger_items if item.endswith("1"))
        else:
            record['trigger'] = 0
            record['match_freq'] = 0
            record['neg_freq'] = 0

    return mm_dicts
            
def all_mm_output(mm_output_dirpath):
    """Parse every ``*.out`` file in a directory with ``mm_list_to_dict``.

    Parameters
    ----------
    mm_output_dirpath : str or Path
        Directory containing the MetaMap ``.out`` files (not searched recursively).

    Returns
    -------
    list of dict
        The concatenated rows of all files, in sorted path order. The file
        stem is passed to ``mm_list_to_dict`` as the file id.
    """
    mm_output_dir = Path(mm_output_dirpath)

    all_mm = []
    # sorted(): glob order depends on the filesystem, which made the row order vary between runs
    for output_path in sorted(mm_output_dir.glob('*.out')):
        # extend in place - `all_mm = all_mm + mm_dicts` re-copied the whole list for every file
        all_mm.extend(mm_list_to_dict(str(output_path), output_path.stem))

    return all_mm

In [None]:
def map_mm_cui(mm_output_dirpath):
    """Build a DataFrame of all MetaMap rows and attach the mapped HPO id.

    Every row is tagged with ``source == 'HPO'``. The rows are left-joined to
    the notebook-level ``cui_mapped`` frame (row ``cui`` against the
    ``cui_mapped`` index), so rows without a match are kept, and the joined
    ``hpo_id`` column is returned under the name ``source_id``.
    """
    records = all_mm_output(mm_output_dirpath)
    tagged = pd.DataFrame(records).assign(source='HPO')
    joined = tagged.merge(cui_mapped, how='left', left_on='cui', right_index=True)
    return joined.rename(columns={'hpo_id': 'source_id'})

def freq_per_paper_mm(mm_output_dirpath, output_type):
    """Summarise MetaMap term frequencies per paper.

    Parameters
    ----------
    mm_output_dirpath : str or Path
        Directory of MetaMap output, passed on to ``map_mm_cui``.
    output_type : str
        Label used as column prefix, e.g. tiab, content, technical text.

    Returns
    -------
    pandas.DataFrame
        One row per paper id, in sorted id order, with columns ``id``,
        ``{output_type}_<src>`` (list of ``(source_id, match_freq)`` tuples),
        ``unique_{output_type}_<src>`` (length of that list) for each
        ``<src>`` in hpo, hgnc, omim, go, msh, and ``unique_{output_type}_total``
        (sum of the five lengths).

    Notes
    -----
    NOTE(review): ``map_mm_cui`` in this notebook tags every row with source
    'HPO', so the hgnc/omim/go/msh lists stay empty unless that changes.
    NOTE(review): rows whose ``source_id`` is NaN (no match in the left join
    done by ``map_mm_cui``) are kept in the lists and therefore counted in the
    ``unique_*`` columns; the lists are also not de-duplicated by id.
    """
    sources = ('HPO', 'HGNC', 'OMIM', 'GO', 'MSH')
    mm_map = map_mm_cui(mm_output_dirpath)

    pid_freq = []
    # A single groupby pass replaces one full-frame scan per paper; sort=True makes
    # the row order reproducible (list(set(ids)) ordering varied between runs).
    for pid, paper_df in mm_map.groupby('id', sort=True):
        row = {'id': pid}
        for source in sources:
            source_df = paper_df[paper_df['source'] == source]
            row[f'{output_type}_{source.lower()}'] = list(
                zip(source_df['source_id'], source_df['match_freq'])
            )
        pid_freq.append(row)

    term_freq_df = pd.DataFrame(pid_freq)

    count_columns = []
    for source in sources:
        list_column = f'{output_type}_{source.lower()}'
        count_column = f'unique_{output_type}_{source.lower()}'
        term_freq_df[count_column] = [len(terms) for terms in term_freq_df[list_column]]
        count_columns.append(count_column)
    term_freq_df[f'unique_{output_type}_total'] = term_freq_df[count_columns].sum(axis=1)

    return term_freq_df


def retrieved_df_mm_output(tiab_output_dir, content_output_dir):
    """Join the per-paper tiab and content term frequencies onto ``retrieved_df``.

    The two ``freq_per_paper_mm`` frames are outer-merged on their shared
    columns, then right-merged against the notebook-level ``retrieved_df``
    (its index against the ``id`` column). Rows without ``content_text`` are
    dropped and ``id`` becomes the index of the returned frame.
    """
    tiab_freqs = freq_per_paper_mm(tiab_output_dir, 'tiab')
    content_freqs = freq_per_paper_mm(content_output_dir, 'content')

    # papers present in either output are kept
    combined_freqs = tiab_freqs.merge(content_freqs, how='outer')

    joined = retrieved_df.merge(
        combined_freqs,
        how='right',
        left_index=True,
        right_on='id',
    )
    with_text = joined.dropna(subset=['content_text'])
    return with_text.set_index('id')

In [None]:
# bibliographic / parsing metadata columns removed from the merged frame
columns_to_drop = [
    'authors',
    'journal',
    'pub_type',
    'pub_date',
    'doi',
    'issn',
    'crossref',
    'full_text_links',
    'licenses',
    'xml_parse_d',
    'html_parse_d',
    'pdf_parse_d',
    'plain_parse_d',
]

# the two arguments are placeholder paths to the tiab and content-text MetaMap output directories
full_df = retrieved_df_mm_output(
    "path_to_tiab_mm_output",
    "path_to_content_text_mm_output",
)
full_df = full_df.drop(columns=columns_to_drop)

In [None]:
# hpo terms which do not have root 'phenotypic abnormality' - need to remove
# NOTE: pickle.load can execute arbitrary code - only load pickles produced by a trusted run.
# The context manager closes the file handle, which pickle.load(open(...)) never did explicitly.
with open('hp_ids_to_parse_out.p', 'rb') as parse_out_file:
    parse_out = pickle.load(parse_out_file)

In [None]:
def _without_parsed_out(term_freqs):
    """Drop (hpo_id, freq) pairs whose id is in parse_out; non-list cells become 0."""
    if not isinstance(term_freqs, list):
        # e.g. NaN left by the outer merge when a paper has no output of this type
        return 0
    return [(hpo_id, freq) for hpo_id, freq in term_freqs if hpo_id not in parse_out]


tiab_parsed = [_without_parsed_out(cell) for cell in full_df['tiab_hpo']]
full_df['tiab_hpo'] = tiab_parsed

# Same guard as for tiab: previously a non-list content_hpo cell raised TypeError
# here because only the tiab loop checked for it.
content_parsed = [_without_parsed_out(cell) for cell in full_df['content_hpo']]
full_df['content_hpo'] = content_parsed

In [None]:
# Persist the merged frame; the context manager guarantees the file is flushed and closed.
with open('ret_mm_df.p', 'wb') as ret_mm_file:
    pickle.dump(full_df, ret_mm_file)