In [1]:
import pandas as pd
import numpy as np
from quickumls import QuickUMLS
from parse_pubmed_xml import PubMedData

In [2]:
# Build the PubMedData object from the '../data' directory (path is relative to the
# notebook's working directory). Later cells read and extend its `text` table.
data = PubMedData.from_dir('../data')

In [3]:
# Display the text table before annotation; the output below shows the columns
# article_title, abstract_text and keywords.
data.text

Unnamed: 0,article_title,abstract_text,keywords
26998820,"Vitamin D, HLA-DRB1 and Epstein-Barr virus ant...",BACKGROUND AND PURPOSE: Our objective was to s...,Epstein-Barr virus; multiple sclerosis; prospe...
31362644,The role of nutritional factors during adolesc...,Objective: The potential role of nutritional f...,Multiple sclerosis; adolescence; nutritional f...
31610401,Randomized-controlled trial of a modified Medi...,BACKGROUND: There is a high level of interest ...,Clinical trial; Diet; Fatigue; Mediterranean; ...
32300060,"Vitamin D, smoking, EBV, and long-term cogniti...","OBJECTIVE: To investigate whether vitamin D, s...",
32310197,Effect of task-oriented circuit training on mo...,BACKGROUND: Exercise training has positive eff...,Multiple sclerosis; balance; cognition; task-o...
32335779,Effects of THC/CBD oromucosal spray on spastic...,INTRODUCTION: The approval of 9-δ-tetrahydocan...,CBD; Clinical practice; Multiple sclerosis; Sp...


In [4]:
def apply_QuickUMLS(txt_array, matcher):
    """
    Apply QuickUMLS on the text in `txt_array` and return the set of identified UMLS CUIs.

    Parameters
    ----------
    txt_array: {str, np.ndarray}
        the text to process with QuickUMLS. A missing value (`None` or any
        float NaN) is never sent to the matcher.
    matcher: QuickUMLS object
        any object exposing `match(text, best_match=..., ignore_syntax=...)`

    Returns
    -------
    cuis: set
        set of cuis identified by QuickUMLS, or `np.nan` when the input is missing
    """

    # A 0-d array is just a boxed scalar; unwrap it so the checks below and the
    # matcher both see the plain Python value.
    if isinstance(txt_array, np.ndarray) and txt_array.ndim == 0:
        txt_array = txt_array.item()

    # Detect missing input by value, not identity: `x is np.nan` is False for
    # float('nan') and for NaNs that are not the very same object as np.nan.
    if txt_array is None or (
        isinstance(txt_array, (float, np.floating)) and np.isnan(txt_array)
    ):
        return np.nan

    cuis = set()
    for candidates in matcher.match(txt_array, best_match=True, ignore_syntax=False):
        # Each element yielded by `match` is indexed as a sequence of candidate
        # dicts; skip an empty one instead of raising IndexError.
        if not candidates:
            continue
        # The first candidate is the one kept for this group.
        cuis.add(candidates[0]['cui'])
    return cuis

In [5]:
# Location passed to QuickUMLS as `quickumls_fp` (relative to the notebook's working
# directory). NOTE(review): presumably a prepared QuickUMLS data directory — confirm.
umls_data = '../data/quickUMLS_eng'
# Single matcher instance, reused for every row in the annotation cell below.
matcher = QuickUMLS(quickumls_fp=umls_data)

In [6]:
# Join only the original text columns. Applying the join to every column makes the
# cell fail on re-run: `all_text` would be concatenated into itself and the set
# stored in `cuis` cannot be passed to ' '.join.
text_columns = ['article_title', 'abstract_text', 'keywords']
data.text['all_text'] = data.text[text_columns].apply(
    # pd.notna drops None as well as NaN, so a missing field is simply skipped.
    lambda row: ' '.join([field for field in row if pd.notna(field)]),
    axis=1,
)
# One set of CUIs per article, extracted from the combined text.
data.text['cuis'] = data.text['all_text'].apply(apply_QuickUMLS, args=(matcher,))
data.text

Unnamed: 0,article_title,abstract_text,keywords,all_text,cuis
26998820,"Vitamin D, HLA-DRB1 and Epstein-Barr virus ant...",BACKGROUND AND PURPOSE: Our objective was to s...,Epstein-Barr virus; multiple sclerosis; prospe...,"Vitamin D, HLA-DRB1 and Epstein-Barr virus ant...","{C0441621, C0018017, C0026769, C0751967, C0087..."
31362644,The role of nutritional factors during adolesc...,Objective: The potential role of nutritional f...,Multiple sclerosis; adolescence; nutritional f...,The role of nutritional factors during adolesc...,"{C1704326, C0018017, C0026769, C3540798, C0441..."
31610401,Randomized-controlled trial of a modified Medi...,BACKGROUND: There is a high level of interest ...,Clinical trial; Diet; Fatigue; Mediterranean; ...,Randomized-controlled trial of a modified Medi...,"{C1704326, C0018017, C1138412, C0180979, C0018..."
32300060,"Vitamin D, smoking, EBV, and long-term cogniti...","OBJECTIVE: To investigate whether vitamin D, s...",,"Vitamin D, smoking, EBV, and long-term cogniti...","{C0429028, C0018017, C0221198, C0337664, C0026..."
32310197,Effect of task-oriented circuit training on mo...,BACKGROUND: Exercise training has positive eff...,Multiple sclerosis; balance; cognition; task-o...,Effect of task-oriented circuit training on mo...,"{C0027902, C0018017, C1299581, C4086490, C0026..."
32335779,Effects of THC/CBD oromucosal spray on spastic...,INTRODUCTION: The approval of 9-δ-tetrahydocan...,CBD; Clinical practice; Multiple sclerosis; Sp...,Effects of THC/CBD oromucosal spray on spastic...,"{C2346845, C1170317, C0018792, C0026769, C1547..."


In [8]:
# Persist the annotated table next to the input data.
# NOTE(review): the `cuis` column holds Python sets; presumably they are written as
# their string representation, whose element order is not stable — confirm that
# downstream readers of this file can cope with that.
data.text.to_excel('../data/examples_with_cuis.xlsx')