<a href="https://colab.research.google.com/github/victormurcia/CTS_Test/blob/main/CTS_Parser_Development_Dask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setting up the Environment

In [1]:
#I need to import locale to ensure that the encoding is set to UTF-8 (weird Google Colab bug)
import locale
locale.getpreferredencoding = lambda: "UTF-8"

#Check the current build in Google Colab
!cat /etc/*release
print('\n')

#Check CUDA version
!nvcc --version
print('\n')

#Ensure that the required packages are installed in the current environment
!pip install numpy --quiet
!pip install pandas --quiet
!pip install spacy==3.4.4 --quiet
!pip install scispacy --quiet
!pip install medspacy --quiet
!pip install negspacy --quiet
!pip install transformers
!pip install seaborn --quiet
!pip install matplotlib --quiet
!pip install "dask[complete]" --quiet
!pip install ipywidgets --quiet
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --quiet 
print('\n')

#Spacy models used for processing biomedical, scientific, or clinical text 
#Spacy pipeline for biomedical data.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz --quiet
#Spacy pipeline for biomedical data. Has a larger vocabulary and 50k word vectors
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz --quiet
#This one is another spacy pipeline with 785k vocabulary and uses scibert-base as a transformer model
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz --quiet
#Spacy pipeline for biomedical data with 600k word vectors
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz --quiet
#A spaCy NER model trained on the CRAFT corpus.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_craft_md-0.5.1.tar.gz --quiet
#A spaCy NER model trained on the JNLPBA corpus.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_jnlpba_md-0.5.1.tar.gz --quiet
#A spaCy NER model trained on the BC5CDR corpus.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz --quiet
#A spaCy NER model trained on the BIONLP13CG corpus.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bionlp13cg_md-0.5.1.tar.gz --quiet
#This is the med7 transformer model found here: https://github.com/kormilitzin/med7
!pip install https://huggingface.co/kormilitzin/en_core_med7_trf/resolve/main/en_core_med7_trf-any-py3-none-any.whl --quiet
#This is the med7 vector model 
!pip install https://huggingface.co/kormilitzin/en_core_med7_lg/resolve/main/en_core_med7_lg-any-py3-none-any.whl --quiet
print('\n')

DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=20.04
DISTRIB_CODENAME=focal
DISTRIB_DESCRIPTION="Ubuntu 20.04.5 LTS"
NAME="Ubuntu"
VERSION="20.04.5 LTS (Focal Fossa)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 20.04.5 LTS"
VERSION_ID="20.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=focal
UBUNTU_CODENAME=focal


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflict

In [2]:
#Import the required libraries/packages
#General utilities
import numpy as np
import pandas as pd
import dask.dataframe as dd
import seaborn as sns
import matplotlib.pyplot as plt
import os, random, time,sys, re
from ipywidgets import widgets, interact, interactive, fixed, interact_manual
from tqdm import tqdm

#NLP Stuff
#Spacy
import spacy
from spacy.lang.en.stop_words import STOP_WORDS #Load stopwords
from spacy.language import Language
from spacy.tokenizer import Tokenizer
#Scispacy
import scispacy
from scispacy.linking import EntityLinker
from scispacy.abbreviation import AbbreviationDetector
from scispacy.hyponym_detector import HyponymDetector
#Medspacy
import medspacy
from medspacy.ner import TargetRule
from medspacy.visualization import visualize_ent
from negspacy.negation import Negex

#To use Transformers models from HuggingFace
import transformers
from transformers import AutoTokenizer, AutoModel,AutoModelForTokenClassification
#NLTK

#Enable data to be extracted from my Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load the Data

In [3]:
#Load the Synthetic Veteran Suicide Dataset from my Google Drive
data_dir = r'/content/drive/MyDrive/csv_usa_100k/' #Establish location of data
print('The data is located at:')
print(data_dir, '\n')

def find_csv_files(data_dir):
  """
  This function finds the .csv files located in data_dir and returns an alphabetically sorted file list.

  Parameters:
  data_dir (str) : Directory of files
  
  Returns:
  data_list (list) : list of .csv files
  """
  data_list = [f for f in os.listdir(data_dir) if f.endswith('.csv')] #List the .csv files
  print('The data files are:')
  data_list.sort() #Sort the .csv files alphabetically
  print(data_list,'\n')

  #Get names of .csv files without the extension for naming stuff later
  f_names = [s.replace(".csv","") for s in data_list]

  return data_list,f_names

#Get .csv files
data_list,f_names = find_csv_files(data_dir)

The data is located at:
/content/drive/MyDrive/csv_usa_100k/ 

The data files are:
['allergies.csv', 'careplans.csv', 'conditions.csv', 'devices.csv', 'encounters.csv', 'imaging_studies.csv', 'immunizations.csv', 'medications.csv', 'observations.csv', 'organizations.csv', 'patients.csv', 'payer_transitions.csv', 'payers.csv', 'procedures.csv', 'providers.csv', 'supplies.csv'] 



Some of these files have information that won't be useful for me so I'll remove them from my list now.  The files that I'll be removing are: encounters, organizations, payer_transitions, payers, and providers. 

In [4]:
files2remove = ['encounters.csv', 'organizations.csv', 'payer_transitions.csv', 'payers.csv', 'providers.csv'] 
def remove_files(data_list,files2remove):
  for f in data_list:
    if f in files2remove:
      data_list.remove(f)

  res = filter(lambda i: i not in files2remove, data_list)
  return list(res)

data_list = remove_files(data_list,files2remove)
print(data_list)

['allergies.csv', 'careplans.csv', 'conditions.csv', 'devices.csv', 'imaging_studies.csv', 'immunizations.csv', 'medications.csv', 'observations.csv', 'patients.csv', 'procedures.csv', 'supplies.csv']


In [5]:
#Add data path to beginning of each element of list so that I can easily access them later
def prepend(data_list, str):
    str += '% s'
    new_list = [str % i for i in data_list]
    return new_list

#Add 
csv_list = prepend(data_list,data_dir)
print('Full path of .csv files \n')
csv_list

Full path of .csv files 



['/content/drive/MyDrive/csv_usa_100k/allergies.csv',
 '/content/drive/MyDrive/csv_usa_100k/careplans.csv',
 '/content/drive/MyDrive/csv_usa_100k/conditions.csv',
 '/content/drive/MyDrive/csv_usa_100k/devices.csv',
 '/content/drive/MyDrive/csv_usa_100k/imaging_studies.csv',
 '/content/drive/MyDrive/csv_usa_100k/immunizations.csv',
 '/content/drive/MyDrive/csv_usa_100k/medications.csv',
 '/content/drive/MyDrive/csv_usa_100k/observations.csv',
 '/content/drive/MyDrive/csv_usa_100k/patients.csv',
 '/content/drive/MyDrive/csv_usa_100k/procedures.csv',
 '/content/drive/MyDrive/csv_usa_100k/supplies.csv']

## Create Dataframes With Dask

In [6]:
cols = list(pd.read_csv(csv_list[8], nrows =0)) #Only read the column headers
print(cols)

['Id', 'BIRTHDATE', 'DEATHDATE', 'SSN', 'DRIVERS', 'PASSPORT', 'PREFIX', 'FIRST', 'LAST', 'SUFFIX', 'MAIDEN', 'MARITAL', 'RACE', 'ETHNICITY', 'GENDER', 'BIRTHPLACE', 'ADDRESS', 'CITY', 'STATE', 'COUNTY', 'ZIP', 'LAT', 'LON', 'HEALTHCARE_EXPENSES', 'HEALTHCARE_COVERAGE']


In [7]:
#After checking the contents of all the .csv files, I decided that the following columns across all .csv files will not be 
#included during loading
omit_cols = ['START','STOP','ENCOUNTER','BASE_COST','PAYER_COVERAGE','DATE','TYPE','DEATHDATE', 
             'SSN', 'DRIVERS', 'PASSPORT', 'MAIDEN','ADDRESS','FIRST', 'LAST', 'SUFFIX','START_YEAR', 
             'END_YEAR', 'PAYER','BASE_COST','QUANTITY']

In [8]:
def read_all_csvs(csv_list,omit_cols):
  """
  This function reads all the csv files in a list, loads each of them into a dask dataframe, and 
  then places them into a list called df_list that can be accessed later.

  Parameters:
  csv_list (list)  : List containing csv files
  omit_cols (list) : List containing names of columns to be omitted during dataframe loading

  Returns:
  df_list (list) :list containing dask dataframes
  """
  #Start timer
  start_time = time.time()

  #Initialize array that will contain the loaded dataframes
  df_list = []

  #Number of .csv files to load
  nFiles = len(csv_list)

  #Iterate through the csv files and load the dataframes
  for j in range(nFiles):
    cols    = list(pd.read_csv(csv_list[j], nrows = 0)) #Only read header row for column names
    temp_df = dd.read_csv(csv_list[j] , usecols =[i for i in cols if i not in omit_cols],assume_missing=True,dtype={'VALUE': 'object'}) #Make dask df
    df_list.append(temp_df)
  
  #End timer
  print("Loading of " + str(nFiles) + " .csv files took: %.2f seconds" % (time.time() - start_time))

  return df_list

df_list = read_all_csvs(csv_list,omit_cols)

Loading of 11 .csv files took: 19.54 seconds


## EDA

In [82]:
#Visualize descriptions columns of dataframe 
def make_countplots(df,name,n):

  if n < 25:
    val = n
  else:
    val = 25
  b = sns.countplot(data=df,y='DESCRIPTION',order = df['DESCRIPTION'].value_counts().index[:25])
  b.axes.set_title("Top "+ str(val) + ' ' + name.capitalize() + " Present in Dataset",fontsize=25)
  b.set_xlabel("Counts",fontsize=15)
  b.set_ylabel(name.capitalize(),fontsize=15)
  b.tick_params(labelsize=8)
  plt.show()

#View the number of unique patients in each dataframe
@interact
def show_nunique_vals_in_df(x=widgets.IntSlider(min=0,max=len(df_list)-1,step=1,value=0)):
  nPatients = df_list[x]['PATIENT'].nunique().compute()
  nDescriptions = df_list[x]['DESCRIPTION'].nunique().compute()
  print('The ' + f_names[x] + '.csv file has' ,nPatients, 'unique patients and',nDescriptions,'unique',f_names[x])
  make_countplots(df_list[x].compute(),f_names[x],nDescriptions)

interactive(children=(IntSlider(value=0, description='x', max=10), Output()), _dom_classes=('widget-interact',…

## Restructuring and Merging Dataframes

In [9]:
def make_patient_df(nPatients,df,df_list, s_list, init_rand=False):
    """
    This function combines the observations for a patient in the Synthetic Veteran Suicide Dataset into a 
    single row in a new dataframe. Doing this should help with the NLP processing later.
    
    Parameters
    ----------
    nPatients : int,  How many patients to process?
    df        : df,   Dataframe containing patient data
    df_list   : list, list of all dataframes in dataset
    init_rand : bool, Use random values to choose patients? Defaults to False so only first 10 patients are selected
    s_list    : list, List of strings to append to column names in the reformatted dataframes 
    ----------
    """

    #Determine number of unique patients in dataframe
    unique_patients = df['PATIENT'].nunique()
    
    #Initialize iterators and patient index
    i = 0; j = random.randrange(unique_patients)
    
    #Initialize index array and list of restructured dataframes
    index_list,rs_df_list = [], []
    
    for i in range(nPatients):
      # Create empty patient dataframe
      columns = df.columns
      new_df  = pd.DataFrame(index=np.arange(nPatients),columns=columns)
      
      #Select a random patient from the existing dataframe (random sample)
      if init_rand == True:
        j = random.randrange(unique_patients)                
      else:
        j = i 
      
      #Check that patient id has not already been added
      if j not in index_list:
        index_list.append(j)
      else:
        while j in index_list:
            j = random.randrange(unique_patients) 

        index_list.append(j)
      
      #Subset the dataframe so as to only show the results associated with a given patient
      patient    = df['PATIENT'][j] #Patient ID
      dummy_df   = df[df['PATIENT'] == patient]
      
      #Programatically populate the dataframe by first making lists out of the columns and then inserting
      #these lists into cells in the resulting dataframe
      for k,col in enumerate(columns):
        if col == 'PATIENT':
            new_df.at[i, 'PATIENT'] = patient    
        else:
            new_df.at[i, col] = dummy_df[col].tolist()
      
      #Append the restructured dataframe to the list of dfs
      rs_df_list.append(new_df)

      #1. Select the source dataframe to be converted from df_list
      #2. Subset this dataframe so that only the rows with the current patient ids are selected
      for m,cdf in enumerate(df_list[1:]):
        columns = cdf.columns

        # Create empty patient dataframe
        new_df  = pd.DataFrame(index=np.arange(nPatients),columns=columns)

        if 'PATIENT' not in columns:
          dummy_df = cdf[cdf['Id'] == patient]
        else:
          dummy_df = cdf[cdf['PATIENT'] == patient]

        dummy_df = dummy_df.compute()

        for k,col in enumerate(columns):
          if 'PATIENT' not in columns:
            if col == 'Id':
              new_df.at[i, 'PATIENT'] = patient    
            else:
              new_df.at[i, col] = dummy_df[col].tolist()
          else:
            if col == 'PATIENT':
              new_df.at[i, 'PATIENT'] = patient    
            else:
              new_df.at[i, col] = dummy_df[col].tolist()

        rs_df_list.append(new_df)

    #Rename columns so as to have a suffix included for easy traceback to source file
    for m,df in enumerate(rs_df_list):
      columns = df.columns
      if 'PATIENT' not in columns:
        df.columns = [x + s_list[m] if x != 'Id' else x for x in columns]
      else:
        df.columns = [x + s_list[m] if x != 'PATIENT' else x for x in columns]    

    #print('These are the indices for the patient IDs used to generate the dataframe below. \n',index_list)

    #End timer
    #print("Reformatting dataframes took: %.2f seconds" % (time.time() - start_time))

    return rs_df_list  

s_list = ['_als','_cps','_cds','_dvs','_iss','_ims','_mds','_obs','_pts','_prs','_sps']

In [10]:
def patient_df_wrapper(nPatients,df_list,s_list):
  """
  This function simply iterates over the make_patient_df function so as to generate the restructured dataframes for each patient. 
  It concatenates the resulting dataframes into a single dataframe that will be used for subsequent processing/analysis. 
  
  Parameters
  ----------
  nPatients : int,  How many patients to process?
  df_list   : list, list of all dataframes in dataset
  s_list    : list, List of strings to append to column names in the reformatted dataframes 
  ----------

  Returns
  ----------
  df        : df, Pandas dataframe containing EHR for nPatients. One row per patient.
  ----------
  """

  print('Starting process...')

  #Start timer
  start_time = time.time()

  #Initialize array to contain dataframes
  multiple_patients_df_list = []

  #Generate the patient dataframes
  for i in tqdm(range(nPatients)):
    rs_df_list  = make_patient_df(1, df_list[0].compute(),df_list, s_list, init_rand=True)
    dfs = [df.set_index('PATIENT') for df in rs_df_list] #Set the index for each dataframe to be the PATIENT Id
    df = pd.concat(dfs, axis=1) #
    multiple_patients_df_list.append(df)
  
  df = pd.concat(multiple_patients_df_list, axis=0)

  #End timer
  print("\n Multipatient dataframe generated after: %.2f seconds" % (time.time() - start_time))

  return df

#Generate dataframe for 10 patients
s_list = ['_als','_cps','_cds','_dvs','_iss','_ims','_mds','_obs','_pts','_prs','_sps']
multi_patient_df = patient_df_wrapper(10)

#Save resulting dataframe to a .csv file
multi_patient_df.to_csv('multi_veteran_df.csv')

#Visualize dataframe
multi_patient_df

Starting process...


100%|██████████| 10/10 [1:06:36<00:00, 399.65s/it]


 Multipatient dataframe generated after: 3996.51 seconds





Unnamed: 0_level_0,CODE_als,DESCRIPTION_als,Id_cps,CODE_cps,DESCRIPTION_cps,REASONCODE_cps,REASONDESCRIPTION_cps,CODE_cds,DESCRIPTION_cds,CODE_dvs,...,LAT_pts,LON_pts,HEALTHCARE_EXPENSES_pts,HEALTHCARE_COVERAGE_pts,CODE_prs,DESCRIPTION_prs,REASONCODE_prs,REASONDESCRIPTION_prs,CODE_sps,DESCRIPTION_sps
PATIENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
98de4759-8225-4160-adb6-2559305fe1df,"[300916003.0, 419474003.0, 232350006.0, 232347...","[Latex allergy, Allergy to mould, House dust m...","[3015755d-ed8c-4d24-9351-c62a2c70c4cb, af95a5e...","[711282006.0, 384758001.0, 699728000.0, 225358...","[Skin condition care, Self-care interventions ...","[24079001.0, nan, 233678006.0, 284549007.0, na...","[Atopic dermatitis, nan, Childhood asthma, Lac...","[24079001.0, 65363002.0, 233678006.0, 36749800...","[Atopic dermatitis, Otitis media, Childhood as...",[],...,[38.11572396207773],[-122.29443682927533],[1015063.23],[15986.36],"[430193006.0, 430193006.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, nan, nan, nan, 233678006....","[nan, nan, nan, nan, nan, nan, nan, Childhood ...","[409534002.0, 713779008.0, 469673003.0, 706724...",[Disposable air-purifying respirator (physical...
ad5977c9-1260-495b-aa55-dc09860ec783,"[419474003.0, 232350006.0, 232347008.0, 418689...","[Allergy to mould, House dust mite allergy, Da...","[b18fecf0-42c3-4c45-8693-168720f9390a, 9e610fd...","[384758001.0, 699728000.0, 53950000.0, 3856910...","[Self-care interventions (procedure), Asthma s...","[nan, 233678006.0, 10509002.0, 33737001.0, 192...","[nan, Childhood asthma, Acute bronchitis (diso...","[65363002.0, 446096008.0, 444814009.0, 2336780...","[Otitis media, Perennial allergic rhinitis, Vi...",[],...,[34.07864131788067],[-117.68830169526343],[30604.94],[0.0],"[430193006.0, 395142003.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Allerg...","[nan, nan, nan, nan, nan, nan, 233678006.0, na...","[nan, nan, nan, nan, nan, nan, Childhood asthm...",[],[]
af5f7e54-ddd4-4203-833e-d6e2987be0b0,"[419474003.0, 232347008.0, 418689008.0, 419263...","[Allergy to mould, Dander (animal) allergy, Al...","[06cff475-6242-48da-85f8-33400552168f, 5e3a362...","[384758001.0, 47387005.0, 53950000.0, 38569100...","[Self-care interventions (procedure), Head inj...","[nan, 62106007.0, 10509002.0, 65966004.0, 2631...","[nan, Concussion with no loss of consciousness...","[65363002.0, 62106007.0, 10509002.0, 195662009...","[Otitis media, Concussion with no loss of cons...",[],...,[37.34491726453704],[-121.9326999347838],[1424871.8],[6762.9],"[430193006.0, 430193006.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, nan, nan, nan, 10509002.0...","[nan, nan, nan, nan, nan, nan, nan, Acute bron...",[],[]
3d8909a0-651c-4e62-bf3e-1482390bdc58,"[300916003.0, 424213003.0, 419474003.0, 232350...","[Latex allergy, Allergy to bee venom, Allergy ...","[d8250f49-a205-4e93-bcd4-892ca1e42ea0, 508ab50...","[384758001.0, 53950000.0, 711282006.0, 5395000...","[Self-care interventions (procedure), Respirat...","[nan, 10509002.0, 24079001.0, 10509002.0, 6596...","[nan, Acute bronchitis (disorder), Atopic derm...","[10509002.0, 43878008.0, 24079001.0, 65363002....","[Acute bronchitis (disorder), Streptococcal so...",[],...,[38.41288433052555],[-121.47075316015002],[1610676.85],[10712.75],"[430193006.0, 269911007.0, 117015009.0, 430193...","[Medication Reconciliation (procedure), Sputum...","[nan, 10509002.0, 43878008.0, nan, nan, 105090...","[nan, Acute bronchitis (disorder), Streptococc...",[],[]
a8fcc478-a986-47bb-8a49-47387a3e15ab,"[424213003.0, 419474003.0, 232350006.0, 232347...","[Allergy to bee venom, Allergy to mould, House...","[ab529498-2354-4110-aa5a-6f5c8abad668, c609caa...","[384758001.0, 699728000.0, 170836005.0, 539500...","[Self-care interventions (procedure), Asthma s...","[nan, 233678006.0, nan, 10509002.0, 44465007.0...","[nan, Childhood asthma, nan, Acute bronchitis ...","[65363002.0, 195662009.0, 233678006.0, 6536300...","[Otitis media, Acute viral pharyngitis (disord...",[],...,[26.0539759874286],[-80.24179482942706],[1159901.93],[35541.49999999999],"[430193006.0, 430193006.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, 233678006.0, nan, nan, na...","[nan, nan, nan, nan, Childhood asthma, nan, na...",[],[]
797e0e78-121c-4523-bfc5-c1fac631e504,"[232347008.0, 91930004.0]","[Dander (animal) allergy, Allergy to eggs]","[d90cf202-537f-4938-88c2-e8e02627bafe, da6922e...","[384758001.0, 53950000.0, 443402002.0, 7362540...","[Self-care interventions (procedure), Respirat...","[nan, 10509002.0, 59621000.0, 47505003.0, 4750...","[nan, Acute bronchitis (disorder), Hypertensio...","[65363002.0, 65363002.0, 65363002.0, 195662009...","[Otitis media, Otitis media, Otitis media, Acu...",[],...,[33.92408161999087],[-118.05437508893844],[1624346.03],[543863.5699999991],"[430193006.0, 430193006.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, nan, 195662009.0, nan, na...","[nan, nan, nan, nan, nan, Acute viral pharyngi...",[],[]
285ac536-7075-4743-868e-4fb185323cef,[91934008.0],[Allergy to nut],"[e96495b9-0d9e-4031-b5d7-79b389a8ceba, cb2a155...","[384758001.0, 47387005.0, 91251008.0, 53950000...","[Self-care interventions (procedure), Head inj...","[nan, 62106007.0, 44465007.0, 10509002.0, 4446...","[nan, Concussion with no loss of consciousness...","[241929008.0, 65363002.0, 65363002.0, 19566200...","[Acute allergic reaction, Otitis media, Otitis...",[],...,[29.670226410773584],[-82.40880503150285],[1485517.82],[7659.919999999999],"[430193006.0, 430193006.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",[],[]
1e97a5cd-22a2-4205-95af-9567cefbd5d8,"[300916003.0, 419474003.0, 232350006.0, 232347...","[Latex allergy, Allergy to mould, House dust m...","[f79d6834-9f3d-4538-a0a0-3b54ce4f1244, c973520...","[384758001.0, 711282006.0, 53950000.0, 5395000...","[Self-care interventions (procedure), Skin con...","[nan, 24079001.0, 10509002.0, 10509002.0, nan,...","[nan, Atopic dermatitis, Acute bronchitis (dis...","[232353008.0, 24079001.0, 10509002.0, 65363002...",[Perennial allergic rhinitis with seasonal var...,[],...,[27.732385680635296],[-82.69091589800718],[936662.5],[4484.039999999999],"[430193006.0, 430193006.0, 430193006.0, 395142...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, nan, nan, 10509002.0, nan...","[nan, nan, nan, nan, nan, nan, Acute bronchiti...",[],[]
462b6335-9775-456f-8573-5ab03500770b,"[300916003.0, 419474003.0, 232350006.0, 232347...","[Latex allergy, Allergy to mould, House dust m...","[e37cbdd8-bb79-46ed-9f45-8ee33f2751b6, 3e44479...","[384758001.0, 699728000.0, 53950000.0, 2253580...","[Self-care interventions (procedure), Asthma s...","[nan, 233678006.0, 10509002.0, 284549007.0, na...","[nan, Childhood asthma, Acute bronchitis (diso...","[65363002.0, 65363002.0, 195662009.0, 23367800...","[Otitis media, Otitis media, Acute viral phary...",[],...,[32.697143538002976],[-117.1466232011608],[564257.87],[11213.899999999998],"[430193006.0, 430193006.0, 395142003.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, nan, nan, 233678006.0, 43...","[nan, nan, nan, nan, nan, nan, Childhood asthm...",[],[]
57a4c01b-5f84-417f-ae96-61a6537e21f6,"[424213003.0, 419474003.0, 232350006.0, 232347...","[Allergy to bee venom, Allergy to mould, House...","[ef602217-11a9-49ef-b09d-42da7aea1ec8, f1f37e9...","[384758001.0, 53950000.0, 443402002.0, 5395000...","[Self-care interventions (procedure), Respirat...","[nan, 10509002.0, 59621000.0, 10509002.0, 3692...","[nan, Acute bronchitis (disorder), Hypertensio...","[65363002.0, 241929008.0, 444814009.0, 3674980...","[Otitis media, Acute allergic reaction, Viral ...",[],...,[30.35098270891739],[-84.49276761429394],[626520.63],[2617.42],"[430193006.0, 430193006.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",[],[]


In [11]:
#I don't need the Id columns so I'll drop them
df2 = multi_patient_df[multi_patient_df.columns.drop(list(multi_patient_df.filter(regex='Id')))]
df2

Unnamed: 0_level_0,CODE_als,DESCRIPTION_als,CODE_cps,DESCRIPTION_cps,REASONCODE_cps,REASONDESCRIPTION_cps,CODE_cds,DESCRIPTION_cds,CODE_dvs,DESCRIPTION_dvs,...,LAT_pts,LON_pts,HEALTHCARE_EXPENSES_pts,HEALTHCARE_COVERAGE_pts,CODE_prs,DESCRIPTION_prs,REASONCODE_prs,REASONDESCRIPTION_prs,CODE_sps,DESCRIPTION_sps
PATIENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
98de4759-8225-4160-adb6-2559305fe1df,"[300916003.0, 419474003.0, 232350006.0, 232347...","[Latex allergy, Allergy to mould, House dust m...","[711282006.0, 384758001.0, 699728000.0, 225358...","[Skin condition care, Self-care interventions ...","[24079001.0, nan, 233678006.0, 284549007.0, na...","[Atopic dermatitis, nan, Childhood asthma, Lac...","[24079001.0, 65363002.0, 233678006.0, 36749800...","[Atopic dermatitis, Otitis media, Childhood as...",[],[],...,[38.11572396207773],[-122.29443682927533],[1015063.23],[15986.36],"[430193006.0, 430193006.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, nan, nan, nan, 233678006....","[nan, nan, nan, nan, nan, nan, nan, Childhood ...","[409534002.0, 713779008.0, 469673003.0, 706724...",[Disposable air-purifying respirator (physical...
ad5977c9-1260-495b-aa55-dc09860ec783,"[419474003.0, 232350006.0, 232347008.0, 418689...","[Allergy to mould, House dust mite allergy, Da...","[384758001.0, 699728000.0, 53950000.0, 3856910...","[Self-care interventions (procedure), Asthma s...","[nan, 233678006.0, 10509002.0, 33737001.0, 192...","[nan, Childhood asthma, Acute bronchitis (diso...","[65363002.0, 446096008.0, 444814009.0, 2336780...","[Otitis media, Perennial allergic rhinitis, Vi...",[],[],...,[34.07864131788067],[-117.68830169526343],[30604.94],[0.0],"[430193006.0, 395142003.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Allerg...","[nan, nan, nan, nan, nan, nan, 233678006.0, na...","[nan, nan, nan, nan, nan, nan, Childhood asthm...",[],[]
af5f7e54-ddd4-4203-833e-d6e2987be0b0,"[419474003.0, 232347008.0, 418689008.0, 419263...","[Allergy to mould, Dander (animal) allergy, Al...","[384758001.0, 47387005.0, 53950000.0, 38569100...","[Self-care interventions (procedure), Head inj...","[nan, 62106007.0, 10509002.0, 65966004.0, 2631...","[nan, Concussion with no loss of consciousness...","[65363002.0, 62106007.0, 10509002.0, 195662009...","[Otitis media, Concussion with no loss of cons...",[],[],...,[37.34491726453704],[-121.9326999347838],[1424871.8],[6762.9],"[430193006.0, 430193006.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, nan, nan, nan, 10509002.0...","[nan, nan, nan, nan, nan, nan, nan, Acute bron...",[],[]
3d8909a0-651c-4e62-bf3e-1482390bdc58,"[300916003.0, 424213003.0, 419474003.0, 232350...","[Latex allergy, Allergy to bee venom, Allergy ...","[384758001.0, 53950000.0, 711282006.0, 5395000...","[Self-care interventions (procedure), Respirat...","[nan, 10509002.0, 24079001.0, 10509002.0, 6596...","[nan, Acute bronchitis (disorder), Atopic derm...","[10509002.0, 43878008.0, 24079001.0, 65363002....","[Acute bronchitis (disorder), Streptococcal so...",[],[],...,[38.41288433052555],[-121.47075316015002],[1610676.85],[10712.75],"[430193006.0, 269911007.0, 117015009.0, 430193...","[Medication Reconciliation (procedure), Sputum...","[nan, 10509002.0, 43878008.0, nan, nan, 105090...","[nan, Acute bronchitis (disorder), Streptococc...",[],[]
a8fcc478-a986-47bb-8a49-47387a3e15ab,"[424213003.0, 419474003.0, 232350006.0, 232347...","[Allergy to bee venom, Allergy to mould, House...","[384758001.0, 699728000.0, 170836005.0, 539500...","[Self-care interventions (procedure), Asthma s...","[nan, 233678006.0, nan, 10509002.0, 44465007.0...","[nan, Childhood asthma, nan, Acute bronchitis ...","[65363002.0, 195662009.0, 233678006.0, 6536300...","[Otitis media, Acute viral pharyngitis (disord...",[],[],...,[26.0539759874286],[-80.24179482942706],[1159901.93],[35541.49999999999],"[430193006.0, 430193006.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, 233678006.0, nan, nan, na...","[nan, nan, nan, nan, Childhood asthma, nan, na...",[],[]
797e0e78-121c-4523-bfc5-c1fac631e504,"[232347008.0, 91930004.0]","[Dander (animal) allergy, Allergy to eggs]","[384758001.0, 53950000.0, 443402002.0, 7362540...","[Self-care interventions (procedure), Respirat...","[nan, 10509002.0, 59621000.0, 47505003.0, 4750...","[nan, Acute bronchitis (disorder), Hypertensio...","[65363002.0, 65363002.0, 65363002.0, 195662009...","[Otitis media, Otitis media, Otitis media, Acu...",[],[],...,[33.92408161999087],[-118.05437508893844],[1624346.03],[543863.5699999991],"[430193006.0, 430193006.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, nan, 195662009.0, nan, na...","[nan, nan, nan, nan, nan, Acute viral pharyngi...",[],[]
285ac536-7075-4743-868e-4fb185323cef,[91934008.0],[Allergy to nut],"[384758001.0, 47387005.0, 91251008.0, 53950000...","[Self-care interventions (procedure), Head inj...","[nan, 62106007.0, 44465007.0, 10509002.0, 4446...","[nan, Concussion with no loss of consciousness...","[241929008.0, 65363002.0, 65363002.0, 19566200...","[Acute allergic reaction, Otitis media, Otitis...",[],[],...,[29.670226410773584],[-82.40880503150285],[1485517.82],[7659.919999999999],"[430193006.0, 430193006.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",[],[]
1e97a5cd-22a2-4205-95af-9567cefbd5d8,"[300916003.0, 419474003.0, 232350006.0, 232347...","[Latex allergy, Allergy to mould, House dust m...","[384758001.0, 711282006.0, 53950000.0, 5395000...","[Self-care interventions (procedure), Skin con...","[nan, 24079001.0, 10509002.0, 10509002.0, nan,...","[nan, Atopic dermatitis, Acute bronchitis (dis...","[232353008.0, 24079001.0, 10509002.0, 65363002...",[Perennial allergic rhinitis with seasonal var...,[],[],...,[27.732385680635296],[-82.69091589800718],[936662.5],[4484.039999999999],"[430193006.0, 430193006.0, 430193006.0, 395142...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, nan, nan, 10509002.0, nan...","[nan, nan, nan, nan, nan, nan, Acute bronchiti...",[],[]
462b6335-9775-456f-8573-5ab03500770b,"[300916003.0, 419474003.0, 232350006.0, 232347...","[Latex allergy, Allergy to mould, House dust m...","[384758001.0, 699728000.0, 53950000.0, 2253580...","[Self-care interventions (procedure), Asthma s...","[nan, 233678006.0, 10509002.0, 284549007.0, na...","[nan, Childhood asthma, Acute bronchitis (diso...","[65363002.0, 65363002.0, 195662009.0, 23367800...","[Otitis media, Otitis media, Acute viral phary...",[],[],...,[32.697143538002976],[-117.1466232011608],[564257.87],[11213.899999999998],"[430193006.0, 430193006.0, 395142003.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, nan, nan, 233678006.0, 43...","[nan, nan, nan, nan, nan, nan, Childhood asthm...",[],[]
57a4c01b-5f84-417f-ae96-61a6537e21f6,"[424213003.0, 419474003.0, 232350006.0, 232347...","[Allergy to bee venom, Allergy to mould, House...","[384758001.0, 53950000.0, 443402002.0, 5395000...","[Self-care interventions (procedure), Respirat...","[nan, 10509002.0, 59621000.0, 10509002.0, 3692...","[nan, Acute bronchitis (disorder), Hypertensio...","[65363002.0, 241929008.0, 444814009.0, 3674980...","[Otitis media, Acute allergic reaction, Viral ...",[],[],...,[30.35098270891739],[-84.49276761429394],[626520.63],[2617.42],"[430193006.0, 430193006.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",[],[]


In [40]:
#Rename columns to something more intuitive
new_names = ['AllergyCode',
 'Allergy',
 'CareplanCode',
 'Careplan',
 'CareplanReason',
 'CareplanReasonDescription',
 'ConditionCode',
 'Condition',
 'DeviceCode',
 'Device',
 'DeviceUDI',
 'ImageBodysiteCode',
 'ImageBodysiteDescription',
 'ImageModalityCode',
 'ImageModalityDescription',
 'ImageSOPCode',
 'ImageSOPDescription',
 'ImmunizationCode',
 'Immunization',
 'MedicationCode',
 'Medication',
 'MedicationDispenses',
 'MedicationCost',
 'MedicationReason',
 'MedicationReasonDescription',
 'ObservationCode',
 'Observation',
 'ObservationValue',
 'ObservationUnit',
 'PatientBirthday',
 'PatientPrefix',
 'PatientMarital',
 'PatientRace',
 'PatientEthnicity',
 'PatientGender',
 'PatientBirthplace',
 'PatientCity',
 'PatientState',
 'PatientCounty',
 'PatientZIP',
 'PatientLAT',
 'PatientLON',
 'PatientHEALTHCARE_EXPENSES',
 'PatientHEALTHCARE_COVERAGE',
 'ProcedureCode',
 'ProcedureDescription',
 'ProcedureReason',
 'ProcedureReasonDescription',
 'SupplyCode',
 'SupplyDescription']

df2.columns = new_names
df2.head()

Unnamed: 0_level_0,AllergyCode,Allergy,CareplanCode,Careplan,CareplanReason,CareplanReasonDescription,ConditionCode,Condition,DeviceCode,Device,...,PatientLAT,PatientLON,PatientHEALTHCARE_EXPENSES,PatientHEALTHCARE_COVERAGE,ProcedureCode,ProcedureDescription,ProcedureReason,ProcedureReasonDescription,SupplyCode,SupplyDescription
PATIENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
98de4759-8225-4160-adb6-2559305fe1df,"[300916003.0, 419474003.0, 232350006.0, 232347...","[Latex allergy, Allergy to mould, House dust m...","[711282006.0, 384758001.0, 699728000.0, 225358...","[Skin condition care, Self-care interventions ...","[24079001.0, nan, 233678006.0, 284549007.0, na...","[Atopic dermatitis, nan, Childhood asthma, Lac...","[24079001.0, 65363002.0, 233678006.0, 36749800...","[Atopic dermatitis, Otitis media, Childhood as...",[],[],...,[38.11572396207773],[-122.29443682927533],[1015063.23],[15986.36],"[430193006.0, 430193006.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, nan, nan, nan, 233678006....","[nan, nan, nan, nan, nan, nan, nan, Childhood ...","[409534002.0, 713779008.0, 469673003.0, 706724...",[Disposable air-purifying respirator (physical...
ad5977c9-1260-495b-aa55-dc09860ec783,"[419474003.0, 232350006.0, 232347008.0, 418689...","[Allergy to mould, House dust mite allergy, Da...","[384758001.0, 699728000.0, 53950000.0, 3856910...","[Self-care interventions (procedure), Asthma s...","[nan, 233678006.0, 10509002.0, 33737001.0, 192...","[nan, Childhood asthma, Acute bronchitis (diso...","[65363002.0, 446096008.0, 444814009.0, 2336780...","[Otitis media, Perennial allergic rhinitis, Vi...",[],[],...,[34.07864131788067],[-117.68830169526343],[30604.94],[0.0],"[430193006.0, 395142003.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Allerg...","[nan, nan, nan, nan, nan, nan, 233678006.0, na...","[nan, nan, nan, nan, nan, nan, Childhood asthm...",[],[]
af5f7e54-ddd4-4203-833e-d6e2987be0b0,"[419474003.0, 232347008.0, 418689008.0, 419263...","[Allergy to mould, Dander (animal) allergy, Al...","[384758001.0, 47387005.0, 53950000.0, 38569100...","[Self-care interventions (procedure), Head inj...","[nan, 62106007.0, 10509002.0, 65966004.0, 2631...","[nan, Concussion with no loss of consciousness...","[65363002.0, 62106007.0, 10509002.0, 195662009...","[Otitis media, Concussion with no loss of cons...",[],[],...,[37.34491726453704],[-121.9326999347838],[1424871.8],[6762.9],"[430193006.0, 430193006.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, nan, nan, nan, 10509002.0...","[nan, nan, nan, nan, nan, nan, nan, Acute bron...",[],[]
3d8909a0-651c-4e62-bf3e-1482390bdc58,"[300916003.0, 424213003.0, 419474003.0, 232350...","[Latex allergy, Allergy to bee venom, Allergy ...","[384758001.0, 53950000.0, 711282006.0, 5395000...","[Self-care interventions (procedure), Respirat...","[nan, 10509002.0, 24079001.0, 10509002.0, 6596...","[nan, Acute bronchitis (disorder), Atopic derm...","[10509002.0, 43878008.0, 24079001.0, 65363002....","[Acute bronchitis (disorder), Streptococcal so...",[],[],...,[38.41288433052555],[-121.47075316015002],[1610676.85],[10712.75],"[430193006.0, 269911007.0, 117015009.0, 430193...","[Medication Reconciliation (procedure), Sputum...","[nan, 10509002.0, 43878008.0, nan, nan, 105090...","[nan, Acute bronchitis (disorder), Streptococc...",[],[]
a8fcc478-a986-47bb-8a49-47387a3e15ab,"[424213003.0, 419474003.0, 232350006.0, 232347...","[Allergy to bee venom, Allergy to mould, House...","[384758001.0, 699728000.0, 170836005.0, 539500...","[Self-care interventions (procedure), Asthma s...","[nan, 233678006.0, nan, 10509002.0, 44465007.0...","[nan, Childhood asthma, nan, Acute bronchitis ...","[65363002.0, 195662009.0, 233678006.0, 6536300...","[Otitis media, Acute viral pharyngitis (disord...",[],[],...,[26.0539759874286],[-80.24179482942706],[1159901.93],[35541.49999999999],"[430193006.0, 430193006.0, 430193006.0, 430193...","[Medication Reconciliation (procedure), Medica...","[nan, nan, nan, nan, 233678006.0, nan, nan, na...","[nan, nan, nan, nan, Childhood asthma, nan, na...",[],[]


In [41]:
test_str = ",".join(df2['Allergy'][0])
test_str

'Latex allergy,Allergy to mould,House dust mite allergy,Dander (animal) allergy,Allergy to grass pollen,Allergy to eggs,Shellfish allergy'

## Accessing Clinical Trials

In [19]:
#Query for trials
cond = 'allergy'#input('Enter the disease condition to find clinical trials: ')
a = 'https://clinicaltrials.gov/api/query/study_fields?expr='
b = '&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CLeadSponsorName%2CEligibilityCriteria'
c = '&min_rnk=1&max_rnk=1000&fmt=csv'
q=(a + cond + b + c)
print(q)
qtrials = pd.read_csv(q,skiprows=10)

@interact
def show_recruiting_studies(column=['OverallStatus','Condition'], 
                            x = ['Recruiting','Completed','Unknown status'],
                            y = ['Food','Antibiotic']
                            ):
    if column == 'OverallStatus':
      return qtrials.loc[qtrials[column] == x]
    elif column == 'Condition':
      return qtrials[qtrials['Condition'].str.contains(y)]

https://clinicaltrials.gov/api/query/study_fields?expr=allergy&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CLeadSponsorName%2CEligibilityCriteria&min_rnk=1&max_rnk=1000&fmt=csv


interactive(children=(Dropdown(description='column', options=('OverallStatus', 'Condition'), value='OverallSta…

In [42]:
qtrials

Unnamed: 0,Rank,NCTId,BriefTitle,Condition,OverallStatus,LeadSponsorName,EligibilityCriteria
0,1,NCT04827602,Drug Allergy Labels After Drug Allergy Investi...,Drug Hypersensitivity,Completed,"University Hospital, Gentofte, Copenhagen",Inclusion Criteria:||Penicillin allergy tested...
1,2,NCT03826953,Allergy UK Research and Development Nurse Project,Allergy,Unknown status,University of Edinburgh,"Inclusion Criteria:||All children, young peopl..."
2,3,NCT05561777,Penicillin Allergy Risk-Stratification and Del...,Antibiotic Allergy,Recruiting,Vanderbilt University Medical Center,Inclusion Criteria:||Pediatric Hospital Medici...
3,4,NCT01914978,Evaluating the Effectiveness of a Handbook for...,"Hypersensitivity, Food",Completed,Boston Children's Hospital,Inclusion Criteria:||Parents of children ages ...
4,5,NCT03581604,De-labeling of Patients With False Diagnosis o...,Allergy Drug,Recruiting,Oslo University Hospital,Inclusion Criteria:||Adult patients who are re...
...,...,...,...,...,...,...,...
995,996,NCT02378129,Evaluation of Dentin Hypersensitivity Using Gl...,Dentin Hypersensitivity,Completed,Federal University of Pelotas,Inclusion Criteria:||Male and female subjects ...
996,997,NCT02060864,Effect of AN-PEP Enzyme on Gluten Digestion in...,Non-coeliac Gluten Sensitivity,Completed,DSM Food Specialties,Inclusion Criteria:||Male/female|Age ≥18 but <...
997,998,NCT05704218,Hypersensitivity Pneumonitis of Domestic Origin,Sensitisation|Mold or Dust Allergy|Respiratory...,Not yet recruiting,Centre Hospitalier Universitaire de Besancon,Inclusion Criteria:||exposure to mold at home|...
998,999,NCT03703791,"Real World, Open Label, QOL Assessment of Pean...",Peanut Allergy,Terminated,"Aimmune Therapeutics, Inc.",Key Inclusion Criteria:||Age 4 through 17 year...


## NLP
Now that I have data for a patient consolidated I can start to parse it! There's a few different pretrained parser models that I can try fairly easily. I'll be making use of the spacy, medspacy, and scispacy libraries to aid with this. 

In [3]:
#Load the models
ss_sm            = spacy.load("en_core_sci_sm")
ss_md            = spacy.load("en_core_sci_md")
ss_bert          = spacy.load("en_core_sci_scibert")
ss_lg            = spacy.load("en_core_sci_lg")
ss_craft         = spacy.load("en_ner_craft_md")
ss_jnlpba        = spacy.load("en_ner_jnlpba_md")
ss_bionlp13cg_md = spacy.load("en_ner_bionlp13cg_md")
med7             = spacy.load("en_core_med7_lg")
#PubMedBERT
pubmedbert = transformers.AutoModel.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
#List the models
models = [ss_sm, ss_md, ss_bert, ss_lg, ss_craft, ss_jnlpba, ss_bionlp13cg_md, med7, pubmedbert]

In [12]:
models[3].pipe_names

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'ner']

In [5]:
#Check the components of the pipeline
ss_sm.pipe_names

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'ner']

Hmmm, there's no removal of stopwords here. I'll have to incorporate that. 

In [17]:
ss_sm.pipe_labels['ner']

['ENTITY']

In [17]:
# create distinct colours for labels
#col_dict = {}
#seven_colours = ['#e6194B', '#3cb44b', '#ffe119', '#ffd8b1', '#f58231', '#f032e6', '#42d4f4']
#for label, colour in zip(med7.pipe_labels['ner'], seven_colours):
#    col_dict[label] = colour

#options = {'ents': med7.pipe_labels['ner'], 'colors':col_dict}

#text = 'A patient was prescribed Magnesium hydroxide 400mg/5ml suspension PO of total 30ml bid for the next 5 days.'
#doc = med7(text)

#spacy.displacy.render(doc, style='ent', jupyter=True, options=options)

#[(ent.text, ent.label_) for ent in doc.ents]

In [16]:
#Select model to use in pipeline
#nlp = spacy.load("en_core_sci_sm")

#Add the abbreviation detector to the pipeline
#nlp.add_pipe("abbreviation_detector")

##Add the entity linker to the pipeline
#nlp.add_pipe("scispacy_linker", config={"linker_name": "umls","max_entities_per_mention": 6})

#Add negation to the pipeline
#nlp.add_pipe("negex")

In [18]:
#Define function to remove stopwords to be incorporated into the pipeline
@Language.component("remove_stopwords")
def remove_stopwords(doc):
  doc = [token for token in doc if not token.is_stop]
  print(doc)
  return doc

In [3]:
#Add a tokenizer at the beginning of the 
#ss_sm.add_pipe("tokenizer",first=True)

# Add the custom pipeline component to the pipeline after the tokenizer
#ss_sm.add_pipe('remove_stopwords', name='stopwords', after='lemmatizer')

#Check the components of the pipeline to ensure that the stopword removal is now a part of it
ss_sm.pipe_names

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'ner']

In [19]:
# Define regex pattern to match parentheses and punctuation
pattern = r"[\(\)\[\]\{\}\.,:;\!\?]"

# Create custom tokenizer with regex pattern to remove parentheses and punctuation
class CustomTokenizer:
    def __init__(self, ss_sm):
        self.tokenizer = ss_sm.tokenizer
        self.regex_pattern = re.compile(pattern)

    def __call__(self, text):
        tokens = self.tokenizer(text)
        new_tokens = []
        for token in tokens:
            # Remove tokens that match the regex pattern
            if self.regex_pattern.match(token.text) is None:
                new_tokens.append(token)
        return spacy.tokens.Doc(ss_sm.vocab, new_tokens)

# Add custom tokenizer to pipeline
ss_sm.tokenizer = CustomTokenizer(ss_sm)

In [16]:
ss_sm.pipe_names

['tok2vec',
 'tagger',
 'attribute_ruler',
 'lemmatizer',
 'parser',
 'ner',
 'abbreviation_detector',
 'scispacy_linker',
 'negex']

In [30]:
#Text to be analyzed
text = "Myeloid derived suppressor cells (MDSC) are immature myeloid cells with immunosuppressive activity. \
They accumulate in tumor-bearing mice and humans with different types of cancer, including hepatocellular carcinoma (HCC)."

#Load the model
ss_sm = spacy.load("en_core_sci_sm")

#Create a doc object for spacy pipeline
doc = ss_sm(text)

fmt_str = "{:<20}| {:<6}| {:<7}| {:<8}"
print(fmt_str.format("token", "pos", "label", "parent"))

for token in doc:
    print(fmt_str.format(token.text, token.pos_, token.ent_type_, token.head.text))

token               | pos   | label  | parent  
Myeloid             | ADJ   | ENTITY | derived 
derived             | ADJ   | ENTITY | cells   
suppressor          | NOUN  | ENTITY | cells   
cells               | NOUN  | ENTITY | cells   
(                   | PUNCT |        | MDSC    
MDSC                | NOUN  | ENTITY | cells   
)                   | PUNCT |        | MDSC    
are                 | AUX   |        | cells   
immature            | ADJ   | ENTITY | cells   
myeloid             | ADJ   | ENTITY | cells   
cells               | NOUN  | ENTITY | cells   
with                | ADP   |        | activity
immunosuppressive   | ADJ   | ENTITY | activity
activity            | NOUN  | ENTITY | cells   
.                   | PUNCT |        | cells   
They                | PRON  |        | accumulate
accumulate          | VERB  | ENTITY | accumulate
in                  | ADP   |        | mice    
tumor-bearing       | ADJ   | ENTITY | mice    
mice                | NOUN  | ENTITY

In [15]:
#Text to be processed
text = "Myeloid derived suppressor cells (MDSC) are immature myeloid cells with immunosuppressive activity. \
They accumulate in tumor-bearing mice and humans with different types of cancer, including hepatocellular carcinoma (HCC)."

#Add the abbreviation pipe to the spacy pipeline.
if 'abbreviation_detector' not in ss_sm.pipe_names:
   ss_sm.add_pipe("abbreviation_detector")

#Add the EntityLinker pipe to spacy pipeline
if 'scispacy_linker' not in ss_sm.pipe_names:
  ss_sm.add_pipe("scispacy_linker", config={"linker_name": "umls", "max_entities_per_mention": 3})

#Add the Negation pipe to spacy pipeline
if 'negex' not in ss_sm.pipe_names:
  ss_sm.add_pipe("negex")

#Create spacy doc from text
doc = ss_sm(text)

# Create a list of stopwords
stopwords = spacy.lang.en.stop_words.STOP_WORDS

#Remove stopwords and punctuations
filtered_tokens = [token.text      for token in doc if not token.is_stop and not token.is_punct]
filtered_pos    = [token.pos_      for token in doc if not token.is_stop and not token.is_punct]
filtered_label  = [token.ent_type_ for token in doc if not token.is_stop and not token.is_punct]
filtered_parent = [token.head.text for token in doc if not token.is_stop and not token.is_punct]
print('Parsing Text')

#PART 1. Basic NER and removal of punctuations/stopwords
fmt_str = "{:<20}| {:<6}| {:<7}| {:<8}"
print(fmt_str.format("token", "pos", "label", "parent"))

# Print filtered tokens and named entities after removal of punctuations and stopwords
for i in range(len(filtered_tokens)):
    print(fmt_str.format(filtered_tokens[i], filtered_pos[i], filtered_label[i], filtered_parent[i]))

#PART 2. Identification of abbreviations
print('\nAbbreviations')
fmt_str = "{:<6}| {:<35}| {:<6}| {:<6}"
print(fmt_str.format("Short", "Long", "Starts", "Ends"))

for abrv in doc._.abbreviations:
    print(fmt_str.format(abrv.text, str(abrv._.long_form), abrv.start, abrv.end))

#PART 3. Linking to medical concepts via UMLS
print('\nEntity Linking')
fmt_str = "{:<35}| {:<11}| {:<6}"
print(fmt_str.format("Entity", "Concept ID", "Score"))

for i in range(len(doc.ents)): #Print the entities and their associated UMLS codes
  entity = doc.ents[i]
  for kb_entry in entity._.kb_ents:
    cui = kb_entry[0]
    match_score = kb_entry[1]
    print(fmt_str.format(entity.text, cui, match_score))

#PART 4. Parse for negations
print('\nNegations')
fmt_str = "{:<35}| {:<10}"
print(fmt_str.format("Entity", "Is negated"))

for entity in doc.ents:
    print(fmt_str.format(entity.text, entity._.negex))

Parsing Text
token               | pos   | label  | parent  
Myeloid             | ADJ   | ENTITY | derived 
derived             | ADJ   | ENTITY | cells   
suppressor          | NOUN  | ENTITY | cells   
cells               | NOUN  | ENTITY | cells   
MDSC                | NOUN  | ENTITY | cells   
immature            | ADJ   | ENTITY | cells   
myeloid             | ADJ   | ENTITY | cells   
cells               | NOUN  | ENTITY | cells   
immunosuppressive   | ADJ   | ENTITY | activity
activity            | NOUN  | ENTITY | cells   
accumulate          | VERB  | ENTITY | accumulate
tumor-bearing       | ADJ   | ENTITY | mice    
mice                | NOUN  | ENTITY | accumulate
humans              | NOUN  | ENTITY | mice    
different           | ADJ   |        | types   
types               | NOUN  |        | accumulate
cancer              | NOUN  | ENTITY | types   
including           | VERB  |        | carcinoma
hepatocellular      | ADJ   | ENTITY | carcinoma
carcinoma          

In [29]:
token_df = pd.DataFrame(
    {'token': filtered_tokens,
     'pos': filtered_pos,
     'label': filtered_label,
     'parent':filtered_parent
    })
token_df

Unnamed: 0,token,pos,label,parent
0,Myeloid,ADJ,ENTITY,derived
1,derived,ADJ,ENTITY,cells
2,suppressor,NOUN,ENTITY,cells
3,cells,NOUN,ENTITY,cells
4,MDSC,NOUN,ENTITY,cells
5,immature,ADJ,ENTITY,cells
6,myeloid,ADJ,ENTITY,cells
7,cells,NOUN,ENTITY,cells
8,immunosuppressive,ADJ,ENTITY,activity
9,activity,NOUN,ENTITY,cells


In [31]:
ss_sm.pipe_names

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'ner']

## Parse Eligibility Criteria of Clinical Trials
For this I'll first focus on the EligibilityCriteria column on the qtrials dataframe.

In [20]:
qt_ec = qtrials[['EligibilityCriteria']]
qt_ec

Unnamed: 0,EligibilityCriteria
0,Inclusion Criteria:||Penicillin allergy tested...
1,"Inclusion Criteria:||All children, young peopl..."
2,Inclusion Criteria:||Pediatric Hospital Medici...
3,Inclusion Criteria:||Parents of children ages ...
4,Inclusion Criteria:||Adult patients who are re...
...,...
995,Inclusion Criteria:||Male and female subjects ...
996,Inclusion Criteria:||Male/female|Age ≥18 but <...
997,Inclusion Criteria:||exposure to mold at home|...
998,Key Inclusion Criteria:||Age 4 through 17 year...


In [21]:
ec = qt_ec['EligibilityCriteria'][1]
ec

'Inclusion Criteria:||All children, young people and adults who fit the selection criteria from across all the practices can be referred to the allergy clinic.||All patients and parents / carers where appropriate must be deemed capable of giving informed consent to take part in the research project.||Infants under two with suspected food allergy|Infants under two with moderate-to-severe eczema not responding to standard treatment.|Children and young people (up to 16 years of age) with suspected allergic rhinitis symptoms that are unresponsive to a combination of oral antihistamines and nasal steroids|Young people and adults (from 16 years of age) with a history of anaphylaxis or suspected anaphylaxis||Exclusion Criteria:||Over 2 years of age with delayed type food allergy presenting primarily with gastrointestinal symptoms|Over 2 years of age with confirmed non IgE-mediated symptoms including food intolerances, coeliac disease etc.|Single urticarial reactions without an obvious trigger

Each row contains the eligibility criteria for a clinical trials. The eligibility criteria is based on inclusion criteria and exclusion criteria. I think I'll start by splitting the eligibility criteria into two separate dataframes: 1 for the inclusion criteria and 1 for the exclusion criteria. This can be done fairly easily since the Inclusion and Exclusion Criteria segments are clearly demarcated.

In [22]:
sections = ec.split("Exclusion Criteria:||")
inclusion_criteria = sections[0].replace('Inclusion Criteria:||',"")
exclusion_criteria = sections[1]
print("Inclusion Criteria: ", inclusion_criteria)
print("Exclusion Criteria: ", exclusion_criteria)

Inclusion Criteria:  All children, young people and adults who fit the selection criteria from across all the practices can be referred to the allergy clinic.||All patients and parents / carers where appropriate must be deemed capable of giving informed consent to take part in the research project.||Infants under two with suspected food allergy|Infants under two with moderate-to-severe eczema not responding to standard treatment.|Children and young people (up to 16 years of age) with suspected allergic rhinitis symptoms that are unresponsive to a combination of oral antihistamines and nasal steroids|Young people and adults (from 16 years of age) with a history of anaphylaxis or suspected anaphylaxis||
Exclusion Criteria:  Over 2 years of age with delayed type food allergy presenting primarily with gastrointestinal symptoms|Over 2 years of age with confirmed non IgE-mediated symptoms including food intolerances, coeliac disease etc.|Single urticarial reactions without an obvious trigger

In [23]:
print('Inclusion Criteria')
ic_list,ec_list = [],[]
for item in  inclusion_criteria.split('||'):
    ic_list.extend(item.split("|"))
print('\n'.join(ic_list))

print('Exclusion Criteria')
for item in  exclusion_criteria.split('||'):
    ec_list.extend(item.split("|"))
print('\n'.join(ec_list))

Inclusion Criteria
All children, young people and adults who fit the selection criteria from across all the practices can be referred to the allergy clinic.
All patients and parents / carers where appropriate must be deemed capable of giving informed consent to take part in the research project.
Infants under two with suspected food allergy
Infants under two with moderate-to-severe eczema not responding to standard treatment.
Children and young people (up to 16 years of age) with suspected allergic rhinitis symptoms that are unresponsive to a combination of oral antihistamines and nasal steroids
Young people and adults (from 16 years of age) with a history of anaphylaxis or suspected anaphylaxis

Exclusion Criteria
Over 2 years of age with delayed type food allergy presenting primarily with gastrointestinal symptoms
Over 2 years of age with confirmed non IgE-mediated symptoms including food intolerances, coeliac disease etc.
Single urticarial reactions without an obvious triggers
Non-a

Nice! Now that I have these criteria separated, I'll place them into dataframes

In [24]:
#Dataframe containing inclusion criteria for current trial
ic_df = pd.DataFrame(ic_list,columns=['InclusionCriteria'])
ic_df

Unnamed: 0,InclusionCriteria
0,"All children, young people and adults who fit ..."
1,All patients and parents / carers where approp...
2,Infants under two with suspected food allergy
3,Infants under two with moderate-to-severe ecze...
4,Children and young people (up to 16 years of a...
5,Young people and adults (from 16 years of age)...
6,


In [48]:
ic_df.isnull().values.any()

False

In [50]:
len(ic_df['InclusionCriteria'][6])

0

In [25]:
#Dataframe containing exclusion criteria for current trial
ec_df = pd.DataFrame(ec_list,columns=['ExclusionCriteria'])
ec_df

Unnamed: 0,ExclusionCriteria
0,Over 2 years of age with delayed type food all...
1,Over 2 years of age with confirmed non IgE-med...
2,Single urticarial reactions without an obvious...
3,Non-allergic chronic urticaria
4,Drug allergy
5,"Well controlled allergic rhinitis, asthma or a..."
6,Mild-to-moderate atopic eczema without an obvi...
7,Localised insect sting reactions


In [26]:
#Create spacy doc from text
doc = ss_sm(text)

#Tokenize and remove stopwords and punctuation from each row 
ic_df['tokens'] = ic_df['InclusionCriteria'].apply(lambda x: [token.text      for token in ss_sm(x)  if not token.is_stop and not token.is_punct])
ic_df['pos']    = ic_df['InclusionCriteria'].apply(lambda x: [token.pos_      for token in ss_sm(x)  if not token.is_stop and not token.is_punct])
ic_df['entity'] = ic_df['InclusionCriteria'].apply(lambda x: [token.ent_type_ for token in ss_sm(x)  if not token.is_stop and not token.is_punct])
ic_df['parent'] = ic_df['InclusionCriteria'].apply(lambda x: [token.head.text for token in ss_sm(x)  if not token.is_stop and not token.is_punct])
ic_df

Unnamed: 0,InclusionCriteria,tokens,pos,entity,parent
0,"All children, young people and adults who fit ...","[children, young, people, adults, fit, selecti...","[NOUN, ADJ, NOUN, NOUN, VERB, NOUN, NOUN, NOUN...","[ENTITY, ENTITY, ENTITY, ENTITY, , ENTITY, ENT...","[referred, people, referred, people, adults, c..."
1,All patients and parents / carers where approp...,"[patients, parents, carers, appropriate, deeme...","[NOUN, NOUN, NOUN, NOUN, VERB, ADJ, VERB, ADJ,...","[ENTITY, ENTITY, ENTITY, , , , , , ENTITY, ENT...","[patients, patients, patients, where, carers, ..."
2,Infants under two with suspected food allergy,"[Infants, suspected, food, allergy]","[NOUN, VERB, NOUN, NOUN]","[ENTITY, ENTITY, ENTITY, ENTITY]","[Infants, allergy, allergy, Infants]"
3,Infants under two with moderate-to-severe ecze...,"[Infants, moderate-to-severe, eczema, respondi...","[NOUN, ADJ, NOUN, VERB, ADJ, NOUN]","[ENTITY, , ENTITY, , ENTITY, ENTITY]","[Infants, eczema, two, Infants, treatment, res..."
4,Children and young people (up to 16 years of a...,"[Children, young, people, 16, years, age, susp...","[NOUN, ADJ, NOUN, NUM, NOUN, NOUN, ADJ, ADJ, N...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, , E...","[Children, people, Children, years, Children, ..."
5,Young people and adults (from 16 years of age)...,"[Young, people, adults, 16, years, age, histor...","[ADJ, NOUN, NOUN, NUM, NOUN, NOUN, NOUN, ADJ, ...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, ENT...","[people, people, people, years, people, years,..."
6,,[],[],[],[]


In [28]:
#ic_df['InclusionCriteria'][0] = "Myeloid derived suppressor cells (MDSC) are immature myeloid cells with immunosuppressive activity."
ic_df['AbbreviationsLong']  = ic_df['InclusionCriteria'].apply(lambda x: [abrv.text             for abrv in ss_sm(x)._.abbreviations])
ic_df['AbbreviationsShort'] = ic_df['InclusionCriteria'].apply(lambda x: [str(abrv._.long_form) for abrv in ss_sm(x)._.abbreviations])
ic_df

Unnamed: 0,InclusionCriteria,tokens,pos,entity,parent,AbbreviationsLong,AbbreviationsShort
0,"All children, young people and adults who fit ...","[children, young, people, adults, fit, selecti...","[NOUN, ADJ, NOUN, NOUN, VERB, NOUN, NOUN, NOUN...","[ENTITY, ENTITY, ENTITY, ENTITY, , ENTITY, ENT...","[referred, people, referred, people, adults, c...",[],[]
1,All patients and parents / carers where approp...,"[patients, parents, carers, appropriate, deeme...","[NOUN, NOUN, NOUN, NOUN, VERB, ADJ, VERB, ADJ,...","[ENTITY, ENTITY, ENTITY, , , , , , ENTITY, ENT...","[patients, patients, patients, where, carers, ...",[],[]
2,Infants under two with suspected food allergy,"[Infants, suspected, food, allergy]","[NOUN, VERB, NOUN, NOUN]","[ENTITY, ENTITY, ENTITY, ENTITY]","[Infants, allergy, allergy, Infants]",[],[]
3,Infants under two with moderate-to-severe ecze...,"[Infants, moderate-to-severe, eczema, respondi...","[NOUN, ADJ, NOUN, VERB, ADJ, NOUN]","[ENTITY, , ENTITY, , ENTITY, ENTITY]","[Infants, eczema, two, Infants, treatment, res...",[],[]
4,Children and young people (up to 16 years of a...,"[Children, young, people, 16, years, age, susp...","[NOUN, ADJ, NOUN, NUM, NOUN, NOUN, ADJ, ADJ, N...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, , E...","[Children, people, Children, years, Children, ...",[],[]
5,Young people and adults (from 16 years of age)...,"[Young, people, adults, 16, years, age, histor...","[ADJ, NOUN, NOUN, NUM, NOUN, NOUN, NOUN, ADJ, ...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, ENT...","[people, people, people, years, people, years,...",[],[]


In [27]:
ic_df = ic_df.drop([6])
ic_df

Unnamed: 0,InclusionCriteria,tokens,pos,entity,parent
0,"All children, young people and adults who fit ...","[children, young, people, adults, fit, selecti...","[NOUN, ADJ, NOUN, NOUN, VERB, NOUN, NOUN, NOUN...","[ENTITY, ENTITY, ENTITY, ENTITY, , ENTITY, ENT...","[referred, people, referred, people, adults, c..."
1,All patients and parents / carers where approp...,"[patients, parents, carers, appropriate, deeme...","[NOUN, NOUN, NOUN, NOUN, VERB, ADJ, VERB, ADJ,...","[ENTITY, ENTITY, ENTITY, , , , , , ENTITY, ENT...","[patients, patients, patients, where, carers, ..."
2,Infants under two with suspected food allergy,"[Infants, suspected, food, allergy]","[NOUN, VERB, NOUN, NOUN]","[ENTITY, ENTITY, ENTITY, ENTITY]","[Infants, allergy, allergy, Infants]"
3,Infants under two with moderate-to-severe ecze...,"[Infants, moderate-to-severe, eczema, respondi...","[NOUN, ADJ, NOUN, VERB, ADJ, NOUN]","[ENTITY, , ENTITY, , ENTITY, ENTITY]","[Infants, eczema, two, Infants, treatment, res..."
4,Children and young people (up to 16 years of a...,"[Children, young, people, 16, years, age, susp...","[NOUN, ADJ, NOUN, NUM, NOUN, NOUN, ADJ, ADJ, N...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, , E...","[Children, people, Children, years, Children, ..."
5,Young people and adults (from 16 years of age)...,"[Young, people, adults, 16, years, age, histor...","[ADJ, NOUN, NOUN, NUM, NOUN, NOUN, NOUN, ADJ, ...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, ENT...","[people, people, people, years, people, years,..."


In [36]:
#ic_df['Entities']   = ic_df['InclusionCriteria'].apply(lambda x: [ss_sm(x).ents[0].text for kb_entry in ss_sm(x).ents[0]._.kb_ents])
ic_df['Concept_ID'] = ic_df['InclusionCriteria'].apply(lambda x: [kb_entry[0]      for kb_entry in ss_sm(x).ents[i]._.kb_ents])
ic_df['MatchScore'] = ic_df['InclusionCriteria'].apply(lambda x: [kb_entry[1]      for kb_entry in ss_sm(x).ents[i]._.kb_ents])

#PART 3. Linking to medical concepts via UMLS
ic_df

Unnamed: 0,InclusionCriteria,tokens,pos,entity,parent,AbbreviationsLong,AbbreviationsShort,Entities,Concept_ID,MatchScore
0,Myeloid derived suppressor cells (MDSC) are im...,"[children, young, people, adults, fit, selecti...","[NOUN, ADJ, NOUN, NOUN, VERB, NOUN, NOUN, NOUN...","[ENTITY, ENTITY, ENTITY, ENTITY, , ENTITY, ENT...","[referred, people, referred, people, adults, c...",[MDSC],[Myeloid derived suppressor cells],"[Myeloid derived suppressor cells, Myeloid der...","[C4277543, C1513790, C0887899]","[0.9999999403953552, 0.7051023840904236, 0.704..."
1,All patients and parents / carers where approp...,"[patients, parents, carers, appropriate, deeme...","[NOUN, NOUN, NOUN, NOUN, VERB, ADJ, VERB, ADJ,...","[ENTITY, ENTITY, ENTITY, , , , , , ENTITY, ENT...","[patients, patients, patients, where, carers, ...",[],[],"[Myeloid derived suppressor cells, Myeloid der...","[C0030705, C0025360, C0017313]","[1.0, 0.8390243649482727, 0.8104863166809082]"
2,Infants under two with suspected food allergy,"[Infants, suspected, food, allergy]","[NOUN, VERB, NOUN, NOUN]","[ENTITY, ENTITY, ENTITY, ENTITY]","[Infants, allergy, allergy, Infants]",[],[],"[Myeloid derived suppressor cells, Myeloid der...","[C0021270, C3813607, C0021272]","[1.0, 0.8641207814216614, 0.856170117855072]"
3,Infants under two with moderate-to-severe ecze...,"[Infants, moderate-to-severe, eczema, respondi...","[NOUN, ADJ, NOUN, VERB, ADJ, NOUN]","[ENTITY, , ENTITY, , ENTITY, ENTITY]","[Infants, eczema, two, Infants, treatment, res...",[],[],"[Myeloid derived suppressor cells, Myeloid der...","[C0021270, C3813607, C0021272]","[1.0, 0.8641207814216614, 0.856170117855072]"
4,Children and young people (up to 16 years of a...,"[Children, young, people, 16, years, age, susp...","[NOUN, ADJ, NOUN, NUM, NOUN, NOUN, ADJ, ADJ, N...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, , E...","[Children, people, Children, years, Children, ...",[],[],"[Myeloid derived suppressor cells, Myeloid der...","[C0008059, C0680063, C1456649]","[0.9999998807907104, 0.9999998807907104, 0.907..."
5,Young people and adults (from 16 years of age)...,"[Young, people, adults, 16, years, age, histor...","[ADJ, NOUN, NOUN, NUM, NOUN, NOUN, NOUN, ADJ, ...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, ENT...","[people, people, people, years, people, years,...",[],[],"[Myeloid derived suppressor cells, Myeloid der...",[C2963163],[0.7737775444984436]


In [33]:
test_list = ic_df['InclusionCriteria'][1]
ss_sm(test_list).ents

(patients, parents, carers, consent, research project)

In [None]:
test_list = ic_df['InclusionCriteria'][1]
entities = ss_sm(test_list).ents
for entity in entities:
  print(entity)
  for kb_entry in entity._.kb_ents:
    cui = kb_entry[0]
    match_score = kb_entry[1]
    

In [40]:
entities = ss_sm(ic_df['InclusionCriteria'][0]).ents
entities

(Myeloid derived suppressor cells,
 MDSC,
 immature,
 myeloid cells,
 immunosuppressive activity)

In [31]:
ss_sm(ic_df['InclusionCriteria'][0]).ents[0]._.kb_ents

[('C4277543', 0.9999999403953552),
 ('C1513790', 0.7051023840904236),
 ('C0887899', 0.7045407295227051)]