<a href="https://colab.research.google.com/github/victormurcia/CTS_Test/blob/main/CTS_Unifying_Routines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
#I need to import locale to ensure that the encoding is set to UTF-8 (weird Google Colab bug)
import locale
locale.getpreferredencoding = lambda: "UTF-8"

#Check the current build in Google Colab
!cat /etc/*release
print('\n')

#Check CUDA version
!nvcc --version
print('\n')

#Ensure that the required packages are installed in the current environment
install_pckgs = True
if install_pckgs == True:
  !pip install numpy --quiet
  !pip install pandas --quiet
  !pip install spacy==3.4.4 --quiet
  !pip install scispacy --quiet
  !pip install medspacy --quiet
  !pip install negspacy --quiet
  !pip install transformers
  !pip install seaborn --quiet
  !pip install matplotlib --quiet
  !pip install "dask[complete]" --quiet
  !pip install ipywidgets --quiet
  !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --quiet 
  print('\n')

#Spacy models used for processing biomedical, scientific, or clinical text 
#Spacy pipeline for biomedical data.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz --quiet
#Spacy pipeline for biomedical data. Has a larger vocabulary and 50k word vectors
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz --quiet
#This one is another spacy pipeline with 785k vocabulary and uses scibert-base as a transformer model
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz --quiet
#Spacy pipeline for biomedical data with 600k word vectors
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz --quiet
#A spaCy NER model trained on the CRAFT corpus.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_craft_md-0.5.1.tar.gz --quiet
#A spaCy NER model trained on the JNLPBA corpus.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_jnlpba_md-0.5.1.tar.gz --quiet
#A spaCy NER model trained on the BC5CDR corpus.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz --quiet
#A spaCy NER model trained on the BIONLP13CG corpus.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bionlp13cg_md-0.5.1.tar.gz --quiet
#This is the med7 transformer model found here: https://github.com/kormilitzin/med7
!pip install https://huggingface.co/kormilitzin/en_core_med7_trf/resolve/main/en_core_med7_trf-any-py3-none-any.whl --quiet
#This is the med7 vector model 
!pip install https://huggingface.co/kormilitzin/en_core_med7_lg/resolve/main/en_core_med7_lg-any-py3-none-any.whl --quiet

DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=20.04
DISTRIB_CODENAME=focal
DISTRIB_DESCRIPTION="Ubuntu 20.04.5 LTS"
NAME="Ubuntu"
VERSION="20.04.5 LTS (Focal Fossa)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 20.04.5 LTS"
VERSION_ID="20.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=focal
UBUNTU_CODENAME=focal


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l

In [18]:
#Import the required libraries/packages
#General utilities
import numpy as np
import pandas as pd
import dask.dataframe as dd
import seaborn as sns
import matplotlib.pyplot as plt
import os, random, time,sys, re
from ipywidgets import widgets, interact, interactive, fixed, interact_manual
from tqdm import tqdm
import urllib.error

#NLP Stuff
#Spacy
import spacy
from spacy.lang.en.stop_words import STOP_WORDS #Load stopwords
from spacy.language import Language
from spacy.tokenizer import Tokenizer
#Scispacy
import scispacy
from scispacy.linking import EntityLinker
from scispacy.abbreviation import AbbreviationDetector
from scispacy.hyponym_detector import HyponymDetector
#Medspacy
import medspacy
from medspacy.ner import TargetRule
from medspacy.visualization import visualize_ent
from negspacy.negation import Negex

#To use Transformers models from HuggingFace
import transformers
from transformers import AutoTokenizer, AutoModel,AutoModelForTokenClassification
#NLTK

#Enable data to be extracted from my Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
#Load Patient Dataframe
url ='https://raw.githubusercontent.com/victormurcia/CTS_Test/main/test_data/multi_veteran_df.csv'
patients_df = pd.read_csv(url)
#patients_df

#Load the inclusion criteria
url ='https://raw.githubusercontent.com/victormurcia/CTS_Test/main/test_data/parsed_ct_ic.csv'
parsed_ct_ic = pd.read_csv(url)
#parsed_ct_ic

#Load the exclusion criteria
url ='https://raw.githubusercontent.com/victormurcia/CTS_Test/main/test_data/parsed_ct_ec.csv'
parsed_ct_ec = pd.read_csv(url)

In [20]:
#Order of operations
# 1. Load the patients dataframe
# 2. Extract the EHR for the current patient
# 3. Preprocess each patient feature column
# 4. Run the NER model on each patient feature
# 5. Run a query on clinicaltrials.gov for all the conditions present in the patient profile
# 6. Extract the eligibility criteria for each queried clinical trial for each condition
# 7. Split eligibility criteria into inclusion/exclusion sections
# 8. Run NER model on both inclusion/exclusion sections
# 9. Determine Sorensen-Dice index between inclusion/exclusion sections and the patient EHR
# 10. Return the complete list of clinical trials stating whether the patient would qualify or not for each clinical trial

In [21]:
########################################################################################################
def create_patient_df_for_NER(patients_df):
  """
  This function creates a dataframe for a single patient from the 
  """
  #Select a single row from the DataFrame
  row_data = patients_df.loc[0]

  #Create a new DataFrame with the single row
  patient_df = pd.DataFrame([row_data], columns=row_data.index)

  #Select columns that summarize patient profile and put them into a list
  allergies     = patient_df['DESCRIPTION_als']
  condition     = patient_df['DESCRIPTION_cds']
  devices       = patient_df['DESCRIPTION_dvs']
  immunizations = patient_df['DESCRIPTION_ims']
  medications   = patient_df['DESCRIPTION_mds']
  observations  = patient_df['DESCRIPTION_obs']
  values        = patient_df['VALUE_obs']
  units         = patient_df['UNITS_obs']
  procedures    = patient_df['DESCRIPTION_prs']
  birthday      = patient_df['BIRTHDATE_pts']
  marital       = patient_df['MARITAL_pts']
  race          = patient_df['RACE_pts']
  ethnicity     = patient_df['ETHNICITY_pts']
  gender        = patient_df['GENDER_pts']
  city          = patient_df['CITY_pts']
  county        = patient_df['COUNTY_pts']

  #Make list for patient profile
  patient_prof_list = [allergies, condition, devices, immunizations, medications, observations, 
                       procedures,birthday,marital, race, ethnicity, gender, city, county]
  patient_prof_cols = ['allergies', 'condition', 'devices', 'immunizations', 
                       'medications', 'observations', 'procedures', 'birthday', 
                       'marital', 'race', 'ethnicity', 'gender', 'city', 'county']

  #Create a dictionary with column names and Series data
  data_dict = dict(zip(patient_prof_cols, patient_prof_list))

  #Create a new DataFrame with the single column
  patient_prof = pd.DataFrame(data_dict)

  #Concatenate the Series data into a single Series
  combined_series = pd.concat(patient_prof_list)

  #Create a DataFrame with a single column using the combined Series
  final_patient_df = pd.DataFrame({'Patient_Profile': combined_series})

  #Get column names of patient df
  final_patient_df['aspects'] = patient_prof_cols

  #Change the order of columns,reset the index, and drop the index column
  final_patient_df = final_patient_df.reindex(columns=['aspects', 'Patient_Profile']).reset_index().drop('index',axis=1)

  #Convert the list column to a string column separated by a single space
  final_patient_df['Patient_Profile'] = final_patient_df['Patient_Profile'].apply(lambda x: ' '.join(map(str, eval(x))))

  return final_patient_df
########################################################################################################
def get_umls_codes(text: str,model,suffix):
  """
  Extracts UMLS codes from a given text using a pre-trained spaCy model.

  Args:
  - text: input text to extract UMLS codes from
  - model: pre-trained spaCy model to use for NLP tasks

  Returns:
  - list of dictionaries containing UMLS code information for each entity in the text
  """
  # Process the text and extract UMLS codes
  doc = model(text)
  umls_codes = [
      {
          "text" + suffix: entity.text,
          #"start": entity.start_char,
          #"end": entity.end_char,
          "umls_id" + suffix: umls_ent[0],
          "score" + suffix: umls_ent[1]
      }
      for entity in doc.ents
      for umls_ent in entity._.kb_ents
  ]
  
  return umls_codes
########################################################################################################
def extract_values(dicts, key):
  """
  """
  return [d.get(key, None) for d in dicts]
########################################################################################################
def run_ner(df,col_name,model):
  """
  Extracts UMLS codes from text data in a given DataFrame column using a pre-trained spaCy model.

  Args:
  - df: pandas DataFrame containing the text data
  - col_name: string representing the name of the column in df that contains the text data
  - model: pre-trained spaCy model to use for NLP tasks

  Returns:
  - pandas DataFrame with new columns for each UMLS code extracted from the text data
  """

  if col_name == 'Patient_Profile':
    ner_results_col = 'umls_codes_patient'
    suffix = '_pat'
  elif col_name == 'inclusion_criteria':
    ner_results_col = 'umls_codes_ic'
    suffix = '_ic'
  elif col_name == 'exclusion_criteria':
    ner_results_col = 'umls_codes_ec'
    suffix = '_ec'
  else:
    print('Invalid column name')

  #Run the NER model on patient df and get UMLS codes after parsing and entity linking
  df[ner_results_col] = df[col_name].apply(get_umls_codes, args = (model,suffix))

  # Create new columns from the keys in the dictionaries within the 'info' column lists
  unique_keys = set().union(*(d.keys() for dicts in df[ner_results_col] for d in dicts))

  #Unpack the dictionary keys into separate columns
  for key in unique_keys:
    df[key] = df[ner_results_col].apply(lambda dicts: extract_values(dicts, key))

  return df
########################################################################################################
def contains_multiple_words(s):
  """
  This function determines if a string (s) is composed of multiple space separated words. It returns a boolean based on the result
  """
  if len(s.split()) > 1:
    return True
  else:
    return False
########################################################################################################
def get_list_of_conditions_from_patient_df(patient_ner_df):
  """
  """

  #Get list of conditions from parsed patient dataframe and remove duplicates
  list_of_conditions = list(set(patient_ner_df['text'].explode().tolist()))

  # Remove nan values from the list using list comprehension and sort elements in alphabetical order
  list_of_conditions = sorted([x for x in list_of_conditions if not pd.isna(x)])

  # Define a regular expression pattern to match brackets and parentheses
  pattern = r"[\[\]\(\)]"

  # Remove brackets and parentheses from each element in the list
  list_of_conditions = [re.sub(pattern, "", str(elem)) for elem in list_of_conditions]

  return list_of_conditions
########################################################################################################
def get_clinical_trials(condition):
  """
  Creates a pandas dataframe from a query to clinicaltrials.gov

  Args:
  - condition: string representing the condition to search for

  Returns:
  - pandas dataframe with clinical trial data
  """
  #Check that condition is only 1 word else concatenate the string with a + sign
  multiword_condition = contains_multiple_words(condition)
  if multiword_condition == True:
    condition = "+".join(condition.split())
  print(condition)
  a = 'https://clinicaltrials.gov/api/query/study_fields?expr='
  b = '&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CEligibilityCriteria'
  c = '&min_rnk=1&max_rnk=1000&fmt=csv'
  q=(a + condition + b + c)
  print(q)
  #qtrials_df = pd.read_csv(q,skiprows=10)
  # Read CSV data from URL and handle errors
  for i in range(5):
    try:
      qtrials_df = pd.read_csv(q, skiprows=10)
      break
    except urllib.error.URLError as e:
      print("URLError:", e.reason)
      print("Retrying in 5 seconds...")
      time.sleep(5)
    except urllib.error.HTTPError as e:
      if e.code == 500:
        print(condition,"HTTP Error 500: Internal Server Error")
      else:
        print("HTTP Error:", e.code)
      break
  else:
    raise ValueError("Unable to connect to the server after 5 attempts")
  return qtrials_df 
########################################################################################################
#PART 4. Query for clinical trials based on patient conditions
def query_trials_wrapper(list_of_conditions):
  """
  """
  list_of_cts = []
  for condition in list_of_conditions:
    #print(condition)
    qtrials_df = get_clinical_trials(condition)
    list_of_cts.append(qtrials_df)

  # Concatenate all DataFrames in the list into a single DataFrame
  all_trials_df = pd.concat(list_of_cts, ignore_index=True)

  return all_trials_df
########################################################################################################
def split_criteria(eligibility_criteria):
    # Split the input string
    substrings = eligibility_criteria.split('||')
    # Initialize the inclusion and exclusion criteria sections
    inclusion_criteria = ''
    exclusion_criteria = ''

    # Loop through the substrings to build the inclusion and exclusion criteria sections
    for i in range(len(substrings)):
        if 'Key Inclusion Criteria:' in substrings[i] or 'Inclusion Criteria:' in substrings[i]:
            if i+1 < len(substrings):
                inclusion_criteria += substrings[i+1]
        elif 'Key Exclusion Criteria:' in substrings[i] or 'Exclusion Criteria:' in substrings[i]:
            if i+1 < len(substrings):
                exclusion_criteria += substrings[i+1]

    # Return the inclusion and exclusion criteria sections as a tuple
    return pd.Series([inclusion_criteria, exclusion_criteria])

########################################################################################################
def cts_parser(patients_df,model):
  """
  This function parses the electronic health records for a patient extracted from the Synthetic Veteran Suicide Dataset and a set of Clinical Trials queried using the clinicaltrials.gov API to 
  determine how good of a match a patient is to a clinical trial. The output of this function will be a dataframe containing a list of clinical trials, their 
  """

  #Start timer
  start_time = time.time()

  #PART 1. Create and prepare single patient dataframe for NER
  print('Step 1/9 : Creating dataframe for patient')
  final_patient_df = create_patient_df_for_NER(patients_df)

  #PART 2. Run the NER model on the patient EHR
  print('Step 2/9 : Run NER on patient data')
  patient_ner_df = run_ner(final_patient_df,'Patient_Profile',model)

  #Save parsed patient EHR to .csv
  print('Save parsed patient data') 
  patient_ner_df.to_csv('parsed_patient_ehr.csv', index=False)

  #PART 3. Get list of patient conditions after parsing EHR while also removing duplicates
  print('Step 3/9 : Creating list of patient conditions ')
  list_of_conditions = get_list_of_conditions_from_patient_df(patient_ner_df)
  
  #PART 4. Query for clinical trials based on patient conditions and generate CT dataframe
  print('Step 4/9 : Querying for clinical trials')
  all_trials_df = query_trials_wrapper(list_of_conditions)

  #PART 5. Get dataframe containing only clinical trials that are actively recruiting
  print('Step 5/9 : Only include clinical trials that are actively recruiting')
  recruiting_trials_df = all_trials_df[all_trials_df['OverallStatus']=='Recruiting'].reset_index()

  #PART 7. Create inclusion and inclusion criteria columns on dataframe for subsequent parsing
  print('Step 6/9 : Split eligibility criteria into inclusion and exclusion sections')
  recruiting_trials_df[['inclusion_criteria', 'exclusion_criteria']] = recruiting_trials_df['EligibilityCriteria'].astype(str).apply(split_criteria)

  #PART 8. Run the NER model on the eligibility criteria from the clinical trial df
  parsed_trials_df = run_ner(recruiting_trials_df,'inclusion_criteria',model)
  parsed_trials_df = run_ner(parsed_trials_df,'exclusion_criteria',model)

  #End timer
  print("Parsing patient records took: %.2f seconds" % (time.time() - start_time))

  return patient_ner_df,all_trials_df,recruiting_trials_df

In [22]:
#List of available models
models = ["en_core_sci_sm","en_core_sci_md","en_core_sci_scibert","en_core_sci_lg","en_ner_craft_md","en_ner_jnlpba_md","en_ner_bionlp13cg_md","en_core_med7_lg"]

#Load the pre-trained spaCy NER model with sci-spaCy
model = spacy.load(models[0])

#Add the EntityLinker pipe to spacy pipeline
if 'scispacy_linker' not in model.pipe_names:
  model.add_pipe("scispacy_linker", config={"linker_name": "umls", "max_entities_per_mention": 1})

#Add the Negation pipe to spacy pipeline
if 'negex' not in model.pipe_names:
  model.add_pipe("negex")

#Add the abbreviation pipe to the spacy pipeline.
if 'abbreviation_detector' not in model.pipe_names:
  model.add_pipe("abbreviation_detector")

In [23]:
patient_ner_df,all_trials_df,recruiting_trials_df = cts_parser(patients_df,model)

Creating dataframe for patient
Run NER on patient data
Creating list of patient conditions 
Querying for clinical trials
ACTUAT
https://clinicaltrials.gov/api/query/study_fields?expr=ACTUAT&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CEligibilityCriteria&min_rnk=1&max_rnk=1000&fmt=csv
Ab
https://clinicaltrials.gov/api/query/study_fields?expr=Ab&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CEligibilityCriteria&min_rnk=1&max_rnk=1000&fmt=csv
Acetaminophen
https://clinicaltrials.gov/api/query/study_fields?expr=Acetaminophen&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CEligibilityCriteria&min_rnk=1&max_rnk=1000&fmt=csv
Acetaminophen+325+MG
https://clinicaltrials.gov/api/query/study_fields?expr=Acetaminophen+325+MG&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CEligibilityCriteria&min_rnk=1&max_rnk=1000&fmt=csv
Acute+bronchitis
https://clinicaltrials.gov/api/query/study_fields?expr=Acute+bronchitis&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatu

In [24]:
recruiting_trials_df

Unnamed: 0,index,Rank,NCTId,BriefTitle,Condition,OverallStatus,EligibilityCriteria,inclusion_criteria,exclusion_criteria
0,5,4,NCT05617755,"AB-1015, an Integrated Circuit T (ICT) Cell Th...","Carcinoma, Ovarian Epithelial|Ovarian Neoplasm...",Recruiting,"Inclusion Criteria:||Recurrent, advanced, plat...","Recurrent, advanced, platinum resistant ovaria...",Cytotoxic chemotherapy within 14 days of time ...
1,7,6,NCT05013086,177Lu-AB-3PRGD2 in Patients With Non Small Cel...,Non Small Cell Lung Cancer,Recruiting,Inclusion Criteria:||confirmed NSCLC patients;...,confirmed NSCLC patients;|tumor lesions with h...,the exclusion criteria were a serum creatinine...
2,18,17,NCT05211570,AB8939 in Patients With Relapsed/Refractory Ac...,Acute Myeloid Leukemia Refractory|Acute Myeloi...,Recruiting,DOSE ESCALATION STUDY||Key Inclusion Criteria:...,Patients with documented diagnosis of acute my...,Patients eligible to a standard of care|Patien...
3,23,22,NCT04943185,"A Prospective, Single Surgeon, Randomized Cont...",The Objective of the Study is to Prospectively...,Recruiting,Inclusion Criteria:||Patients with maximally t...,Patients with maximally tolerated medically tr...,Non-Ocular
4,24,23,NCT05577416,A Study of AB-218 in Patients With IDH1 Mutate...,Glioma,Recruiting,Inclusion Criteria:||Patients will have a radi...,Patients will have a radiological diagnosis of...,Patients who meet any of the following criteri...
...,...,...,...,...,...,...,...,...,...
20154,144320,980,NCT04909528,LLTS to Treat Premature Ventricular Contractions,Premature Ventricular Contraction,Recruiting,"Inclusion Criteria:||Age >18, <80 of age|Sympt...","Age >18, <80 of age|Symptomatic PVCs refractor...",Left ventricular ejection fraction (LVEF) < 45...
20155,144322,982,NCT04932668,Home Based Electrical Stimulation on Post-stro...,Spasticity as Sequela of Stroke,Recruiting,Inclusion Criteria:||Post stroke (hemorrhagic ...,Post stroke (hemorrhagic or ischemic) with ank...,Introduction or changes in anti-spastic medica...
20156,144327,987,NCT05321693,Neuromodulatory Effects of Transcranial Pulsed...,Fibromyalgia,Recruiting,Inclusion Criteria:||Women from 30 to 65 years...,Women from 30 to 65 years old|FM diagnosis acc...,Pregnancy or lack of contraceptive use;|Histor...
20157,144332,992,NCT05630911,"Conscious Movement Processing, Postural Stabil...",Fall Injury|Postural; Defect|Old Age; Debility,Recruiting,Inclusion Criteria:||65 years old or above;|ab...,65 years old or above;|able to stand independe...,a history of any major cerebrovascular and/or ...


In [25]:
#DataFrame containing eligility criteria for all queried trials
qt_ec = recruiting_trials_df[['EligibilityCriteria']][:10]
qt_ec

Unnamed: 0,EligibilityCriteria
0,"Inclusion Criteria:||Recurrent, advanced, plat..."
1,Inclusion Criteria:||confirmed NSCLC patients;...
2,DOSE ESCALATION STUDY||Key Inclusion Criteria:...
3,Inclusion Criteria:||Patients with maximally t...
4,Inclusion Criteria:||Patients will have a radi...
5,Inclusion Criteria:||Capable of giving signed ...
6,Inclusion Criteria:||Grade IV glioma (glioblas...
7,Inclusion Criteria:||Patients must meet all of...
8,Inclusion Criteria:||Age ≥18 years of age at t...
9,"Key Inclusion Criteria:||Clinically diagnosed,..."


In [31]:
def run_ner(df,col_name,model):
  """
  """

  if col_name == 'Patient_Profile':
    ner_results_col = 'umls_codes_patient'
    suffix = '_pat'
  elif col_name == 'inclusion_criteria':
    ner_results_col = 'umls_codes_ic'
    suffix = '_ic'
  elif col_name == 'exclusion_criteria':
    ner_results_col = 'umls_codes_ec'
    suffix = '_ec'
  else:
    print('Invalid column name')

  #Run the NER model on patient df and get UMLS codes after parsing and entity linking
  df[ner_results_col] = df[col_name].apply(get_umls_codes, args = (model,suffix))

  # Create new columns from the keys in the dictionaries within the 'info' column lists
  unique_keys = set().union(*(d.keys() for dicts in df[ner_results_col] for d in dicts))

  #Unpack the dictionary keys into separate columns
  for key in unique_keys:
    df[key] = df[ner_results_col].apply(lambda dicts: extract_values(dicts, key))

  return df

def get_umls_codes(text: str,model,suffix):
  """
  Extracts UMLS codes from a given text using a pre-trained spaCy model.

  Args:
  - text: input text to extract UMLS codes from
  - model: pre-trained spaCy model to use for NLP tasks

  Returns:
  - list of dictionaries containing UMLS code information for each entity in the text
  """
  # Process the text and extract UMLS codes
  doc = model(text)
  umls_codes = [
      {
          "text" + suffix: entity.text,
          #"start": entity.start_char,
          #"end": entity.end_char,
          "umls_id" + suffix: umls_ent[0],
          "score" + suffix: umls_ent[1]
      }
      for entity in doc.ents
      for umls_ent in entity._.kb_ents
  ]
  
  return umls_codes

In [32]:
test_df = recruiting_trials_df[:10]
test_df

Unnamed: 0,index,Rank,NCTId,BriefTitle,Condition,OverallStatus,EligibilityCriteria,inclusion_criteria,exclusion_criteria
0,5,4,NCT05617755,"AB-1015, an Integrated Circuit T (ICT) Cell Th...","Carcinoma, Ovarian Epithelial|Ovarian Neoplasm...",Recruiting,"Inclusion Criteria:||Recurrent, advanced, plat...","Recurrent, advanced, platinum resistant ovaria...",Cytotoxic chemotherapy within 14 days of time ...
1,7,6,NCT05013086,177Lu-AB-3PRGD2 in Patients With Non Small Cel...,Non Small Cell Lung Cancer,Recruiting,Inclusion Criteria:||confirmed NSCLC patients;...,confirmed NSCLC patients;|tumor lesions with h...,the exclusion criteria were a serum creatinine...
2,18,17,NCT05211570,AB8939 in Patients With Relapsed/Refractory Ac...,Acute Myeloid Leukemia Refractory|Acute Myeloi...,Recruiting,DOSE ESCALATION STUDY||Key Inclusion Criteria:...,Patients with documented diagnosis of acute my...,Patients eligible to a standard of care|Patien...
3,23,22,NCT04943185,"A Prospective, Single Surgeon, Randomized Cont...",The Objective of the Study is to Prospectively...,Recruiting,Inclusion Criteria:||Patients with maximally t...,Patients with maximally tolerated medically tr...,Non-Ocular
4,24,23,NCT05577416,A Study of AB-218 in Patients With IDH1 Mutate...,Glioma,Recruiting,Inclusion Criteria:||Patients will have a radi...,Patients will have a radiological diagnosis of...,Patients who meet any of the following criteri...
5,25,24,NCT04104672,A Study to Evaluate the Safety and Tolerabilit...,Advanced Pancreatic Cancer,Recruiting,Inclusion Criteria:||Capable of giving signed ...,Capable of giving signed informed consent|Male...,Use of any live attenuated vaccines against in...
6,26,25,NCT04656535,AB154 Combined With AB122 for Recurrent Gliobl...,Glioblastoma,Recruiting,Inclusion Criteria:||Grade IV glioma (glioblas...,Grade IV glioma (glioblastoma and its variants...,Patients who have been treated with bevacizuma...
7,28,27,NCT04395677,A Study of AB-106 in Subjects With Advanced NS...,Non Small Cell Lung Cancer,Recruiting,Inclusion Criteria:||Patients must meet all of...,Patients must meet all of the following criter...,Patient presenting with any of the following c...
8,33,32,NCT05653882,A Study Evaluating AB248 Alone or in Combinati...,Solid Tumor|Non Small Cell Lung Cancer|Melanom...,Recruiting,Inclusion Criteria:||Age ≥18 years of age at t...,Age ≥18 years of age at the time consent is si...,Has a diagnosis of immunodeficiency.|Has a his...
9,34,33,NCT04895215,AB-2004 in Treatment of Irritability Associate...,Autism Spectrum Disorder (ASD),Recruiting,"Key Inclusion Criteria:||Clinically diagnosed,...","Clinically diagnosed, documented ASD (Diagnost...","Use of an oral, injected, or inhaled antibioti..."


In [34]:
parsed_trials_df = run_ner(test_df,'inclusion_criteria',model)
parsed_trials_df = run_ner(parsed_trials_df,'exclusion_criteria',model)
parsed_trials_df

Unnamed: 0,index,Rank,NCTId,BriefTitle,Condition,OverallStatus,EligibilityCriteria,inclusion_criteria,exclusion_criteria,umls_codes_ic,score_ic,text_ic,umls_id_ic,umls_codes_ec,score_ec,umls_id_ec,text_ec
0,5,4,NCT05617755,"AB-1015, an Integrated Circuit T (ICT) Cell Th...","Carcinoma, Ovarian Epithelial|Ovarian Neoplasm...",Recruiting,"Inclusion Criteria:||Recurrent, advanced, plat...","Recurrent, advanced, platinum resistant ovaria...",Cytotoxic chemotherapy within 14 days of time ...,"[{'text_ic': 'Recurrent', 'umls_id_ic': 'C2945...","[0.9999998807907104, 1.0, 0.8688523173332214, ...","[Recurrent, advanced, platinum resistant, ovar...","[C2945760, C0205179, C4688006, C0205065, C0015...","[{'text_ec': 'Cytotoxic chemotherapy', 'umls_i...","[1.0, 1.0, 0.7621618509292603, 1.0, 1.0, 0.958...","[C0677881, C0439228, C0677881, C0439228, C0031...","[Cytotoxic chemotherapy, days, cell collection..."
1,7,6,NCT05013086,177Lu-AB-3PRGD2 in Patients With Non Small Cel...,Non Small Cell Lung Cancer,Recruiting,Inclusion Criteria:||confirmed NSCLC patients;...,confirmed NSCLC patients;|tumor lesions with h...,the exclusion criteria were a serum creatinine...,"[{'text_ic': 'NSCLC', 'umls_id_ic': 'C0007131'...","[1.0, 1.0, 1.0, 1.0, 1.0, 0.802409827709198]","[NSCLC, lesions, PET/CT, week, injection, writ...","[C0007131, C0221198, C1699633, C0439230, C0021...","[{'text_ec': 'exclusion criteria', 'umls_id_ec...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9999999403953...","[C0680251, C0201976, C0441889, C0019046, C0441...","[exclusion criteria, serum creatinine, level, ..."
2,18,17,NCT05211570,AB8939 in Patients With Relapsed/Refractory Ac...,Acute Myeloid Leukemia Refractory|Acute Myeloi...,Recruiting,DOSE ESCALATION STUDY||Key Inclusion Criteria:...,Patients with documented diagnosis of acute my...,Patients eligible to a standard of care|Patien...,"[{'text_ic': 'Patients', 'umls_id_ic': 'C00307...","[1.0, 1.0, 0.9999998807907104, 1.0, 1.0, 1.0, ...","[Patients, documented, diagnosis, acute myeloi...","[C0030705, C1301725, C0011900, C0023467, C0023...","[{'text_ec': 'Patients', 'umls_id_ec': 'C00307...","[1.0, 1.0, 0.8145785927772522, 1.0, 1.0, 0.725...","[C0030705, C1442989, C0030705, C0472699, C0472...","[Patients, standard, care|Patients, hematopoie..."
3,23,22,NCT04943185,"A Prospective, Single Surgeon, Randomized Cont...",The Objective of the Study is to Prospectively...,Recruiting,Inclusion Criteria:||Patients with maximally t...,Patients with maximally tolerated medically tr...,Non-Ocular,"[{'text_ic': 'Patients', 'umls_id_ic': 'C00307...","[1.0, 0.7951071858406067, 1.0, 1.0, 1.0, 1.0, ...","[Patients, maximally tolerated medically treat...","[C0030705, C0752079, C0339573, C0271829, C0043...",[],[],[],[]
4,24,23,NCT05577416,A Study of AB-218 in Patients With IDH1 Mutate...,Glioma,Recruiting,Inclusion Criteria:||Patients will have a radi...,Patients will have a radiological diagnosis of...,Patients who meet any of the following criteri...,"[{'text_ic': 'Patients', 'umls_id_ic': 'C00307...","[1.0, 0.7240345478057861, 1.0, 1.0, 1.0, 1.0, ...","[Patients, radiological diagnosis, LGG, LGG, e...","[C0030705, C0597349, C1629836, C1629836, C0206...","[{'text_ec': 'Patients', 'umls_id_ec': 'C00307...","[1.0, 0.9999998807907104, 1.0, 1.0]","[C0030705, C0243161, C0679823, C0008972]","[Patients, criteria, participation, study]"
5,25,24,NCT04104672,A Study to Evaluate the Safety and Tolerabilit...,Advanced Pancreatic Cancer,Recruiting,Inclusion Criteria:||Capable of giving signed ...,Capable of giving signed informed consent|Male...,Use of any live attenuated vaccines against in...,"[{'text_ic': 'signed', 'umls_id_ic': 'C1519316...","[1.0, 0.7758122086524963, 1.0, 1.0, 1.0, 1.0, ...","[signed, consent|Male, female, participants, y...","[C1519316, C1511481, C0043210, C0679646, C0439...","[{'text_ec': 'live', 'umls_id_ec': 'C1548795',...","[1.0, 1.0, 0.9999999403953552, 0.9999998807907...","[C1548795, C0042210, C0009450, C0008049, C0439...","[live, vaccines, infectious diseases, varicell..."
6,26,25,NCT04656535,AB154 Combined With AB122 for Recurrent Gliobl...,Glioblastoma,Recruiting,Inclusion Criteria:||Grade IV glioma (glioblas...,Grade IV glioma (glioblastoma and its variants...,Patients who have been treated with bevacizuma...,"[{'text_ic': 'Grade IV glioma', 'umls_id_ic': ...","[0.8439853191375732, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[Grade IV glioma, glioblastoma, variants, Worl...","[C0017636, C0017636, C0205419, C0043237, C0040...","[{'text_ec': 'Patients', 'umls_id_ec': 'C00307...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8670050501823...","[C0030705, C0332293, C0796392, C1701901, C0282...","[Patients, treated with, bevacizumab, continge..."
7,28,27,NCT04395677,A Study of AB-106 in Subjects With Advanced NS...,Non Small Cell Lung Cancer,Recruiting,Inclusion Criteria:||Patients must meet all of...,Patients must meet all of the following criter...,Patient presenting with any of the following c...,"[{'text_ic': 'Patients', 'umls_id_ic': 'C00307...","[1.0, 0.9999998807907104, 1.0, 1.0]","[Patients, criteria, enrollment, study]","[C0030705, C0243161, C1516879, C0008972]","[{'text_ec': 'Patient', 'umls_id_ec': 'C003070...","[1.0, 0.9999998807907104, 1.0]","[C0030705, C0243161, C0008972]","[Patient, criteria, study]"
8,33,32,NCT05653882,A Study Evaluating AB248 Alone or in Combinati...,Solid Tumor|Non Small Cell Lung Cancer|Melanom...,Recruiting,Inclusion Criteria:||Age ≥18 years of age at t...,Age ≥18 years of age at the time consent is si...,Has a diagnosis of immunodeficiency.|Has a his...,"[{'text_ic': 'Age', 'umls_id_ic': 'C0001779', ...","[1.0, 1.0, 1.0, 0.746544361114502, 1.0, 0.7230...","[Age, years, age, time consent, adequate, end ...","[C0001779, C0439234, C0001779, C1511481, C0205...","[{'text_ec': 'diagnosis', 'umls_id_ec': 'C0011...","[0.9999998807907104, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[C0011900, C0019664, C0006826, C1273390, C3887...","[diagnosis, history, malignancy, curative trea..."
9,34,33,NCT04895215,AB-2004 in Treatment of Irritability Associate...,Autism Spectrum Disorder (ASD),Recruiting,"Key Inclusion Criteria:||Clinically diagnosed,...","Clinically diagnosed, documented ASD (Diagnost...","Use of an oral, injected, or inhaled antibioti...","[{'text_ic': 'Clinically diagnosed', 'umls_id_...","[0.7474782466888428, 1.0, 0.9999999403953552, ...","[Clinically diagnosed, documented, ASD, Diagno...","[C0332140, C1301725, C0018817, C0011900, C0004...","[{'text_ec': 'oral', 'umls_id_ec': 'C0226896',...","[0.9999998807907104, 0.817918062210083, 1.0, 0...","[C0226896, C2986768, C0004048, C0003232, C0439...","[oral, injected, inhaled, antibiotic, days, sc..."


In [None]:
run_ner(df,col_name,model)