<a href="https://colab.research.google.com/github/victormurcia/CTS_Test/blob/main/CTS_Unifying_Routines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
#I need to import locale to ensure that the encoding is set to UTF-8 (weird Google Colab bug)
import locale
locale.getpreferredencoding = lambda: "UTF-8"

#Check the current build in Google Colab
!cat /etc/*release
print('\n')

#Check CUDA version
!nvcc --version
print('\n')

#Ensure that the required packages are installed in the current environment
install_pckgs = False
if install_pckgs == True:
  !pip install numpy --quiet
  !pip install pandas --quiet
  !pip install spacy==3.4.4 --quiet
  !pip install scispacy --quiet
  !pip install medspacy --quiet
  !pip install negspacy --quiet
  !pip install transformers
  !pip install seaborn --quiet
  !pip install matplotlib --quiet
  !pip install "dask[complete]" --quiet
  !pip install ipywidgets --quiet
  !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --quiet 
  print('\n')

#Spacy models used for processing biomedical, scientific, or clinical text 
#Spacy pipeline for biomedical data.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz --quiet
#Spacy pipeline for biomedical data. Has a larger vocabulary and 50k word vectors
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz --quiet
#This one is another spacy pipeline with 785k vocabulary and uses scibert-base as a transformer model
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz --quiet
#Spacy pipeline for biomedical data with 600k word vectors
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz --quiet
#A spaCy NER model trained on the CRAFT corpus.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_craft_md-0.5.1.tar.gz --quiet
#A spaCy NER model trained on the JNLPBA corpus.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_jnlpba_md-0.5.1.tar.gz --quiet
#A spaCy NER model trained on the BC5CDR corpus.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz --quiet
#A spaCy NER model trained on the BIONLP13CG corpus.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bionlp13cg_md-0.5.1.tar.gz --quiet
#This is the med7 transformer model found here: https://github.com/kormilitzin/med7
!pip install https://huggingface.co/kormilitzin/en_core_med7_trf/resolve/main/en_core_med7_trf-any-py3-none-any.whl --quiet
#This is the med7 vector model 
!pip install https://huggingface.co/kormilitzin/en_core_med7_lg/resolve/main/en_core_med7_lg-any-py3-none-any.whl --quiet

DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=20.04
DISTRIB_CODENAME=focal
DISTRIB_DESCRIPTION="Ubuntu 20.04.5 LTS"
NAME="Ubuntu"
VERSION="20.04.5 LTS (Focal Fossa)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 20.04.5 LTS"
VERSION_ID="20.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=focal
UBUNTU_CODENAME=focal


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... 

In [35]:
#Import the required libraries/packages
#General utilities
import numpy as np
import pandas as pd
import dask.dataframe as dd
import seaborn as sns
import matplotlib.pyplot as plt
import os, random, time,sys, re
from ipywidgets import widgets, interact, interactive, fixed, interact_manual
from tqdm import tqdm

#NLP Stuff
#Spacy
import spacy
from spacy.lang.en.stop_words import STOP_WORDS #Load stopwords
from spacy.language import Language
from spacy.tokenizer import Tokenizer
#Scispacy
import scispacy
from scispacy.linking import EntityLinker
from scispacy.abbreviation import AbbreviationDetector
from scispacy.hyponym_detector import HyponymDetector
#Medspacy
import medspacy
from medspacy.ner import TargetRule
from medspacy.visualization import visualize_ent
from negspacy.negation import Negex

#To use Transformers models from HuggingFace
import transformers
from transformers import AutoTokenizer, AutoModel,AutoModelForTokenClassification
#NLTK

#Enable data to be extracted from my Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
#Load Patient Dataframe
url ='https://raw.githubusercontent.com/victormurcia/CTS_Test/main/test_data/multi_veteran_df.csv'
patients_df = pd.read_csv(url)
#patients_df

#Load the inclusion criteria
url ='https://raw.githubusercontent.com/victormurcia/CTS_Test/main/test_data/parsed_ct_ic.csv'
parsed_ct_ic = pd.read_csv(url)
#parsed_ct_ic

#Load the exclusion criteria
url ='https://raw.githubusercontent.com/victormurcia/CTS_Test/main/test_data/parsed_ct_ec.csv'
parsed_ct_ec = pd.read_csv(url)

In [37]:
#Order of operations
# 1. Load the patients dataframe
# 2. Extract the EHR for the current patient
# 3. Preprocess each patient feature column
# 4. Run the NER model on each patient feature
# 5. Run a query on clinicaltrials.gov for all the conditions present in the patient profile
# 6. Extract the eligibility criteria for each queried clinical trial for each condition
# 7. Split eligibility criteria into inclusion/exclusion sections
# 8. Run NER model on both inclusion/exclusion sections
# 9. Determine Sorensen-Dice index between inclusion/exclusion sections and the patient EHR
# 10. Return the complete list of clinical trials stating whether the patient would qualify or not for each clinical trial

In [137]:
########################################################################################################
def create_patient_df_for_NER(patients_df):
  """
  This function creates a dataframe for a single patient from the 
  """
  #Select a single row from the DataFrame
  row_data = patients_df.loc[0]

  #Create a new DataFrame with the single row
  patient_df = pd.DataFrame([row_data], columns=row_data.index)

  #Select columns that summarize patient profile and put them into a list
  allergies     = patient_df['DESCRIPTION_als']
  condition     = patient_df['DESCRIPTION_cds']
  devices       = patient_df['DESCRIPTION_dvs']
  immunizations = patient_df['DESCRIPTION_ims']
  medications   = patient_df['DESCRIPTION_mds']
  observations  = patient_df['DESCRIPTION_obs']
  values        = patient_df['VALUE_obs']
  units         = patient_df['UNITS_obs']
  procedures    = patient_df['DESCRIPTION_prs']
  birthday      = patient_df['BIRTHDATE_pts']
  marital       = patient_df['MARITAL_pts']
  race          = patient_df['RACE_pts']
  ethnicity     = patient_df['ETHNICITY_pts']
  gender        = patient_df['GENDER_pts']
  city          = patient_df['CITY_pts']
  county        = patient_df['COUNTY_pts']

  #Make list for patient profile
  patient_prof_list = [allergies, condition, devices, immunizations, medications, observations, 
                       procedures,birthday,marital, race, ethnicity, gender, city, county]
  patient_prof_cols = ['allergies', 'condition', 'devices', 'immunizations', 
                       'medications', 'observations', 'procedures', 'birthday', 
                       'marital', 'race', 'ethnicity', 'gender', 'city', 'county']

  #Create a dictionary with column names and Series data
  data_dict = dict(zip(patient_prof_cols, patient_prof_list))

  #Create a new DataFrame with the single column
  patient_prof = pd.DataFrame(data_dict)

  #Concatenate the Series data into a single Series
  combined_series = pd.concat(patient_prof_list)

  #Create a DataFrame with a single column using the combined Series
  final_patient_df = pd.DataFrame({'Patient_Profile': combined_series})

  #Get column names of patient df
  final_patient_df['aspects'] = patient_prof_cols

  #Change the order of columns,reset the index, and drop the index column
  final_patient_df = final_patient_df.reindex(columns=['aspects', 'Patient_Profile']).reset_index().drop('index',axis=1)

  #Convert the list column to a string column separated by a single space
  final_patient_df['Patient_Profile'] = final_patient_df['Patient_Profile'].apply(lambda x: ' '.join(map(str, eval(x))))

  return final_patient_df
########################################################################################################
def get_umls_codes(text: str,model):
  """

  """
  # Process the text and extract UMLS codes
  doc = model(text)
  umls_codes = [
      {
          "text": entity.text,
          #"start": entity.start_char,
          #"end": entity.end_char,
          "umls_id": umls_ent[0],
          "score": umls_ent[1]
      }
      for entity in doc.ents
      for umls_ent in entity._.kb_ents
  ]
  
  return umls_codes
########################################################################################################
def extract_values(dicts, key):
  """
  """
  return [d.get(key, None) for d in dicts]
########################################################################################################
def run_ner(df,col_name,model):
  """
  """
  #Run the NER model on patient df and get UMLS codes after parsing and entity linking
  df['umls_codes'] = df[col_name].apply(get_umls_codes, args = (model,))

  # Create new columns from the keys in the dictionaries within the 'info' column lists
  unique_keys = set().union(*(d.keys() for dicts in df['umls_codes'] for d in dicts))

  #Unpack the dictionary keys into separate columns
  for key in unique_keys:
    df[key] = df['umls_codes'].apply(lambda dicts: extract_values(dicts, key))

  return df
########################################################################################################
def contains_multiple_words(s):
  """
  This function determines if a string (s) is composed of multiple space separated words. It returns a boolean based on the result
  """
  if len(s.split()) > 1:
    return True
  else:
    return False
########################################################################################################
def get_list_of_conditions_from_patient_df(patient_df):
  """
  """

  #Get list of conditions from parsed patient dataframe and remove duplicates
  list_of_conditions = list(set(final_patient_df['text'].explode().tolist()))

  # Remove nan values from the list using list comprehension and sort elements in alphabetical order
  list_of_conditions = sorted([x for x in list_of_conditions if not pd.isna(x)])

  # Define a regular expression pattern to match brackets and parentheses
  pattern = r"[\[\]\(\)]"

  # Remove brackets and parentheses from each element in the list
  list_of_conditions = [re.sub(pattern, "", str(elem)) for elem in list_of_conditions]

  return list_of_conditions
########################################################################################################
def get_clinical_trials(condition):
  """
  """
  #Check that condition is only 1 word else concatenate the string with a + sign
  multiword_condition = contains_multiple_words(condition)
  if multiword_condition == True:
    condition = "+".join(condition.split())
  #cond = 'allergy'#input('Enter the disease condition to find clinical trials: ')
  a = 'https://clinicaltrials.gov/api/query/study_fields?expr='
  b = '&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CEligibilityCriteria'
  c = '&min_rnk=1&max_rnk=1000&fmt=csv'
  q=(a + condition + b + c)
  print(q)
  qtrials_df = pd.read_csv(q,skiprows=10)
  return qtrials_df 
########################################################################################################
#PART 4. Query for clinical trials based on patient conditions
def query_trials_wrapper(list_of_conditions):
  """
  """
  list_of_cts = []
  for condition in list_of_conditions:
    print(condition)
    qtrials_df = get_clinical_trials(condition)
    list_of_cts.append(qtrials_df)

  # Concatenate all DataFrames in the list into a single DataFrame
  all_trials_df = pd.concat(list_of_cts, ignore_index=True)

  return all_trials_df
########################################################################################################
def split_criteria(eligibility_criteria):
    # Split the input string
    substrings = eligibility_criteria.split('||')
    # Initialize the inclusion and exclusion criteria sections
    inclusion_criteria = ''
    exclusion_criteria = ''

    # Loop through the substrings to build the inclusion and exclusion criteria sections
    for i in range(len(substrings)):
        if 'Key Inclusion Criteria:' in substrings[i] or 'Inclusion Criteria:' in substrings[i]:
            if i+1 < len(substrings):
                inclusion_criteria += substrings[i+1]
        elif 'Key Exclusion Criteria:' in substrings[i] or 'Exclusion Criteria:' in substrings[i]:
            if i+1 < len(substrings):
                exclusion_criteria += substrings[i+1]

    # Return the inclusion and exclusion criteria sections as a tuple
    return pd.Series([inclusion_criteria, exclusion_criteria])

########################################################################################################
def cts_parser(patients_df,model):
  """
  This function parses the electronic health records for a patient extracted from the Synthetic Veteran Suicide Dataset and a set of Clinical Trials queried using the clinicaltrials.gov API to 
  determine how good of a match a patient is to a clinical trial. The output of this function will be a dataframe containing a list of clinical trials, their 
  """

  #Start timer
  start_time = time.time()

  #PART 1. Create and prepare single patient dataframe for NER
  final_patient_df = create_patient_df_for_NER(patients_df)

  #PART 2. Run the NER model on the patient EHR
  patient_ner_df = run_ner(final_patient_df,'Patient_Profile',model)

  #Save parsed patient EHR to .csv 
  #patient_ner_df.to_csv('parsed_patient_ehr.csv', index=False)

  #PART 3. Get list of patient conditions after parsing EHR while also removing duplicates
  list_of_conditions = get_list_of_conditions_from_patient_df(patient_ner_df)
  
  #PART 4. Query for clinical trials based on patient conditions
  query_trials_wrapper(list_of_conditions)

  #PART 5. Get dataframe of clinical trials found based on parsed patient health record
  all_trials_df = pd.concat(list_of_cts, ignore_index=True)

  #PART 6. Get dataframe containing only clinical trials that are actively recruiting
  recruiting_trials_df = all_trials_df[all_trials_df['OverallStatus']=='Recruiting'].reset_index()

  #PART 7. Create inclusion and inclusion criteria columns on dataframe for subsequent parsing
  recruiting_trials_df[['inclusion_criteria', 'exclusion_criteria']] = recruiting_trials_df['EligibilityCriteria'].astype(str).apply(split_criteria)

  #PART 2. Run the NER model on the patient EHR
  recruiting_trials_df = run_ner(final_patient_df,'Patient_Profile',model)

  #End timer
  print("Parsing patient records took: %.2f seconds" % (time.time() - start_time))

  return final_patient_df,patient_ner_df#,qtrials_df

In [39]:
#List of available models
models = ["en_core_sci_sm","en_core_sci_md","en_core_sci_scibert","en_core_sci_lg","en_ner_craft_md","en_ner_jnlpba_md","en_ner_bionlp13cg_md","en_core_med7_lg"]

#Load the pre-trained spaCy NER model with sci-spaCy
model = spacy.load(models[0])

#Add the EntityLinker pipe to spacy pipeline
if 'scispacy_linker' not in model.pipe_names:
  model.add_pipe("scispacy_linker", config={"linker_name": "umls", "max_entities_per_mention": 1})

#Add the Negation pipe to spacy pipeline
if 'negex' not in model.pipe_names:
  model.add_pipe("negex")

#Add the abbreviation pipe to the spacy pipeline.
if 'abbreviation_detector' not in model.pipe_names:
  model.add_pipe("abbreviation_detector")

In [40]:
final_patient_df,patient_ner_df = cts_parser(patients_df,model)
patient_ner_df

Parsing patient records took: 5.33 seconds


Unnamed: 0,aspects,Patient_Profile,umls_codes,umls_id,score,text
0,allergies,Latex allergy Allergy to mould House dust mite...,"[{'text': 'Latex allergy Allergy', 'umls_id': ...","[C0577628, C0339808, C0222058, C0740919, C0013...","[0.9578602910041809, 1.0, 1.0, 0.9999998807907...","[Latex allergy Allergy, House dust mite allerg..."
1,condition,Atopic dermatitis Otitis media Childhood asthm...,"[{'text': 'Atopic dermatitis', 'umls_id': 'C00...","[C0011615, C0029882, C0264408, C0018621, C0396...","[1.0, 1.0, 1.0, 1.0, 0.9999999403953552, 1.0, ...","[Atopic dermatitis, Otitis media, Childhood as..."
2,devices,,[],[],[],[]
3,immunizations,Hep B adolescent or pediatric Hep B adolesce...,"[{'text': 'Hep B', 'umls_id': 'C0162569', 'sco...","[C0162569, C1521725, C0162569, C0694742, C0276...","[0.8482469916343689, 1.0, 0.8482469916343689, ...","[Hep B, pediatric, Hep B, pediatric Hib (PRP-O..."
4,medications,Astemizole 10 MG Oral Tablet Amoxicillin 250 M...,"[{'text': 'Astemizole', 'umls_id': 'C0085170',...","[C0085170, C0024443, C1244600, C0024443, C0991...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.81247895...","[Astemizole, MG, Oral Tablet Amoxicillin, MG, ..."
5,observations,Body Height Pain severity - 0-10 verbal numeri...,"[{'text': 'Body Height Pain', 'umls_id': 'C000...","[C0005890, C0449820, C0684224, C0005910, C0001...","[0.8759792447090149, 1.0, 1.0, 1.0, 0.81690454...","[Body Height Pain, Score, Reported, Body Weigh..."
6,procedures,Medication Reconciliation (procedure) Medicati...,"[{'text': 'Medication Reconciliation', 'umls_i...","[C2317067, C1273434, C2317067, C0184661, C0420...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[Medication Reconciliation, Allergy screening ..."
7,birthday,1983-08-07,[],[],[],[]
8,marital,M,[],[],[],[]
9,race,white,"[{'text': 'white', 'umls_id': 'C0007457', 'sco...",[C0007457],[1.0],[white]


In [41]:
final_patient_df

Unnamed: 0,aspects,Patient_Profile,umls_codes,umls_id,score,text
0,allergies,Latex allergy Allergy to mould House dust mite...,"[{'text': 'Latex allergy Allergy', 'umls_id': ...","[C0577628, C0339808, C0222058, C0740919, C0013...","[0.9578602910041809, 1.0, 1.0, 0.9999998807907...","[Latex allergy Allergy, House dust mite allerg..."
1,condition,Atopic dermatitis Otitis media Childhood asthm...,"[{'text': 'Atopic dermatitis', 'umls_id': 'C00...","[C0011615, C0029882, C0264408, C0018621, C0396...","[1.0, 1.0, 1.0, 1.0, 0.9999999403953552, 1.0, ...","[Atopic dermatitis, Otitis media, Childhood as..."
2,devices,,[],[],[],[]
3,immunizations,Hep B adolescent or pediatric Hep B adolesce...,"[{'text': 'Hep B', 'umls_id': 'C0162569', 'sco...","[C0162569, C1521725, C0162569, C0694742, C0276...","[0.8482469916343689, 1.0, 0.8482469916343689, ...","[Hep B, pediatric, Hep B, pediatric Hib (PRP-O..."
4,medications,Astemizole 10 MG Oral Tablet Amoxicillin 250 M...,"[{'text': 'Astemizole', 'umls_id': 'C0085170',...","[C0085170, C0024443, C1244600, C0024443, C0991...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.81247895...","[Astemizole, MG, Oral Tablet Amoxicillin, MG, ..."
5,observations,Body Height Pain severity - 0-10 verbal numeri...,"[{'text': 'Body Height Pain', 'umls_id': 'C000...","[C0005890, C0449820, C0684224, C0005910, C0001...","[0.8759792447090149, 1.0, 1.0, 1.0, 0.81690454...","[Body Height Pain, Score, Reported, Body Weigh..."
6,procedures,Medication Reconciliation (procedure) Medicati...,"[{'text': 'Medication Reconciliation', 'umls_i...","[C2317067, C1273434, C2317067, C0184661, C0420...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[Medication Reconciliation, Allergy screening ..."
7,birthday,1983-08-07,[],[],[],[]
8,marital,M,[],[],[],[]
9,race,white,"[{'text': 'white', 'umls_id': 'C0007457', 'sco...",[C0007457],[1.0],[white]


In [89]:
list_of_conditions = list(set(final_patient_df['text'].explode().tolist()))

# Remove nan values from the list using list comprehension and math.isnan()
list_of_conditions = sorted([x for x in list_of_conditions if not pd.isna(x)])

# Define a regular expression pattern to match brackets and parentheses
pattern = r"[\[\]\(\)]"

# Remove brackets and parentheses from each element in the list
list_of_conditions = [re.sub(pattern, "", str(elem)) for elem in list_of_conditions]

print(list_of_conditions)

['ACTUAT', 'Ab', 'Acetaminophen', 'Acetaminophen 325 MG', 'Acute bronchitis', 'Acute viral pharyngitis', 'Adenovirus', 'Admission', 'Alanine aminotransferase', 'Alkaline phosphatase', 'Allergy screening test', 'Anemia', 'Arterial blood Diastolic Blood Pressure', 'Aspartate aminotransferase', 'Astemizole', 'Asthma screening', 'Atopic dermatitis', 'Auto', 'Automated', 'Automated count', 'Automated count Basophils', 'Automated count Eosinophils', 'Automated count Erythrocytes', 'Automated count Glucose', 'Automated count Hemoglobin', 'Automated count Lymphocytes', 'Automated count MCH', 'Automated count MCHC', 'Automated count MCV', 'Automated count Monocytes', 'Automated count Neutrophils', 'Automated count Platelet distribution width', 'Automated count Platelet mean volume', 'Automated count Platelets', 'Automated count RDW', 'Automated count Tobacco smoking status', 'BMI', 'Blood', 'Blood Body temperature', 'Blood Hematocrit', 'Blood Hemoglobin', 'Blood Pain', 'Blood Pressure', 'Blood 

In [91]:
#PART 4. Query for clinical trials based on patient conditions
def query_trials_wrapper(list_of_conditions):
  """
  """
  #Start timer
  start_time = time.time()
  
  list_of_cts = []
  for condition in list_of_conditions:
    print(condition)
    qtrials_df = get_clinical_trials(condition)
    list_of_cts.append(qtrials_df)

  # Concatenate all DataFrames in the list into a single DataFrame
  all_trials = pd.concat(list_of_cts, ignore_index=True)
  #End timer
  print("Querying for clinical trials took: %.2f seconds" % (time.time() - start_time))
  return all_trials
all_trials = query_trials_wrapper(list_of_conditions)
all_trials

ACTUAT
https://clinicaltrials.gov/api/query/study_fields?expr=ACTUAT&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CEligibilityCriteria&min_rnk=1&max_rnk=1000&fmt=csv
Ab
https://clinicaltrials.gov/api/query/study_fields?expr=Ab&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CEligibilityCriteria&min_rnk=1&max_rnk=1000&fmt=csv
Acetaminophen
https://clinicaltrials.gov/api/query/study_fields?expr=Acetaminophen&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CEligibilityCriteria&min_rnk=1&max_rnk=1000&fmt=csv
Acetaminophen 325 MG
https://clinicaltrials.gov/api/query/study_fields?expr=Acetaminophen+325+MG&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CEligibilityCriteria&min_rnk=1&max_rnk=1000&fmt=csv
Acute bronchitis
https://clinicaltrials.gov/api/query/study_fields?expr=Acute+bronchitis&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CEligibilityCriteria&min_rnk=1&max_rnk=1000&fmt=csv
Acute viral pharyngitis
https://clinicaltrials.gov/api/query/study

Unnamed: 0,Rank,NCTId,BriefTitle,Condition,OverallStatus,EligibilityCriteria
0,1,NCT03599791,DYmista NAsal Spray in CHInese Patients,Allergic Rhinitis,Completed,General Inclusion Criteria:||To be eligible fo...
1,2,NCT03394989,BE Study of Fluticasone Propionate/Salmeterol ...,Bronchial Asthma,Completed,Inclusion Criteria||Patients who have signed i...
2,1,NCT01649713,Yearly Licence Tolerability and Immunogenicity...,Influenza,Completed,Inclusion Criteria:||Adult persons aged 18 to ...
3,2,NCT01323933,First-in-human Study of AB0024 to Evaluate Saf...,Neoplasms,Completed,Inclusion Criteria:||Histologically or cytolog...
4,3,NCT03793946,A Digital Antimicrobial Stewardship Smartphone...,Anti-Infective Agents|Mobile Applications|Qual...,Unknown status,Inclusion Criteria:||Cluster level (wards):||•...
...,...,...,...,...,...,...
144322,998,NCT01999465,NMES Efficacy on Patients With NBPP,Neonatal Brachial Plexus Palsy,Completed,Inclusion Criteria:||Children ages 3-9 months ...
144323,999,NCT02479542,Addition of Negative Pressure Wound Therapy to...,Wounds and Injuries,Withdrawn,Inclusion Criteria:||Patients must be at least...
144324,1000,NCT05561348,The Effect and Mechanism of Transcutaneous Aur...,Parkinson Disease,Recruiting,Inclusion Criteria:||Idiopathic Parkinson's di...
144325,1,NCT01872910,A Study of LY3023703 Testing Pain Relief After...,Acute Pain,Completed,Inclusion Criteria:||Have at least 2 third mol...


In [98]:
recruiting_trials_df = all_trials[all_trials['OverallStatus']=='Recruiting'].reset_index()
recruiting_trials_df

Unnamed: 0,index,Rank,NCTId,BriefTitle,Condition,OverallStatus,EligibilityCriteria
0,5,4,NCT05617755,"AB-1015, an Integrated Circuit T (ICT) Cell Th...","Carcinoma, Ovarian Epithelial|Ovarian Neoplasm...",Recruiting,"Inclusion Criteria:||Recurrent, advanced, plat..."
1,7,6,NCT05013086,177Lu-AB-3PRGD2 in Patients With Non Small Cel...,Non Small Cell Lung Cancer,Recruiting,Inclusion Criteria:||confirmed NSCLC patients;...
2,18,17,NCT05211570,AB8939 in Patients With Relapsed/Refractory Ac...,Acute Myeloid Leukemia Refractory|Acute Myeloi...,Recruiting,DOSE ESCALATION STUDY||Key Inclusion Criteria:...
3,23,22,NCT04943185,"A Prospective, Single Surgeon, Randomized Cont...",The Objective of the Study is to Prospectively...,Recruiting,Inclusion Criteria:||Patients with maximally t...
4,24,23,NCT05577416,A Study of AB-218 in Patients With IDH1 Mutate...,Glioma,Recruiting,Inclusion Criteria:||Patients will have a radi...
...,...,...,...,...,...,...,...
20150,144304,980,NCT04932668,Home Based Electrical Stimulation on Post-stro...,Spasticity as Sequela of Stroke,Recruiting,Inclusion Criteria:||Post stroke (hemorrhagic ...
20151,144309,985,NCT05321693,Neuromodulatory Effects of Transcranial Pulsed...,Fibromyalgia,Recruiting,Inclusion Criteria:||Women from 30 to 65 years...
20152,144313,989,NCT05630911,"Conscious Movement Processing, Postural Stabil...",Fall Injury|Postural; Defect|Old Age; Debility,Recruiting,Inclusion Criteria:||65 years old or above;|ab...
20153,144321,997,NCT05405738,Cleft Palate Technique and Maxillary Growth,Cleft Palate|Growth,Recruiting,Inclusion Criteria:||5-6 years old childreen w...


In [100]:
#DataFrame containing eligility criteria for all queried trials
qt_ec = recruiting_trials_df[['EligibilityCriteria']][:10]
qt_ec

Unnamed: 0,EligibilityCriteria
0,"Inclusion Criteria:||Recurrent, advanced, plat..."
1,Inclusion Criteria:||confirmed NSCLC patients;...
2,DOSE ESCALATION STUDY||Key Inclusion Criteria:...
3,Inclusion Criteria:||Patients with maximally t...
4,Inclusion Criteria:||Patients will have a radi...
5,Inclusion Criteria:||Capable of giving signed ...
6,Inclusion Criteria:||Grade IV glioma (glioblas...
7,Inclusion Criteria:||Patients must meet all of...
8,Inclusion Criteria:||Age ≥18 years of age at t...
9,"Key Inclusion Criteria:||Clinically diagnosed,..."


In [120]:
input_string = qt_ec['EligibilityCriteria'][3]
input_string

"Inclusion Criteria:||Patients with maximally tolerated medically treated (MTMT) uncontrolled glaucoma (POAG, PEX G, PDS G)|Male or female who was in good general health and >18 years of age at the time of the preoperative exam|The subject complied with post-operative instructions and made their scheduled office appointments||Exclusion Criteria:||Non-Ocular||Known or suspected allergy or sensitivity to any medications/diagnostic agents (eg, topical anesthetic, dilating drops, fluorescein) required for this protocol or any of the XEN components (eg, porcine products or glutaraldehyde)|Known history of bleeding disorder or prolonged bleeding after surgery (in the opinion of the investigator) or those on pharmacologic blood thinners other than aspirin (up to 100 mg/day)|Chemotherapy for cancer treatment within 6 months of screening|History of dermatologic keloid formation|Participation in another drug/device/observational clinical trial concurrently or concluding within 30 days of screeni

In [136]:
# Define a function to split the criteria column into two new columns
def split_criteria(row):
    # Split the input string
    substrings = row.split('||')
    #print(substrings)
    # Initialize the inclusion and exclusion criteria sections
    inclusion_criteria = ''
    exclusion_criteria = ''

    # Loop through the substrings to build the inclusion and exclusion criteria sections
    for i in range(len(substrings)):
        if 'Key Inclusion Criteria:' in substrings[i] or 'Inclusion Criteria:' in substrings[i]:
            if i+1 < len(substrings):
                inclusion_criteria += substrings[i+1]
        elif 'Key Exclusion Criteria:' in substrings[i] or 'Exclusion Criteria:' in substrings[i]:
            if i+1 < len(substrings):
                exclusion_criteria += substrings[i+1]

    # Return the inclusion and exclusion criteria sections as a tuple
    return pd.Series([inclusion_criteria, exclusion_criteria])

# Apply the split_criteria function to the criteria column and create new columns for the inclusion and exclusion criteria
#Start timer
start_time = time.time()
recruiting_trials_df[['inclusion_criteria', 'exclusion_criteria']] = recruiting_trials_df['EligibilityCriteria'].astype(str).apply(split_criteria)
print("Splitting took: %.2f seconds" % (time.time() - start_time))
recruiting_trials_df

Splitting took: 2.50 seconds


Unnamed: 0,index,Rank,NCTId,BriefTitle,Condition,OverallStatus,EligibilityCriteria,inclusion_criteria,exclusion_criteria
0,5,4,NCT05617755,"AB-1015, an Integrated Circuit T (ICT) Cell Th...","Carcinoma, Ovarian Epithelial|Ovarian Neoplasm...",Recruiting,"Inclusion Criteria:||Recurrent, advanced, plat...","Recurrent, advanced, platinum resistant ovaria...",Cytotoxic chemotherapy within 14 days of time ...
1,7,6,NCT05013086,177Lu-AB-3PRGD2 in Patients With Non Small Cel...,Non Small Cell Lung Cancer,Recruiting,Inclusion Criteria:||confirmed NSCLC patients;...,confirmed NSCLC patients;|tumor lesions with h...,the exclusion criteria were a serum creatinine...
2,18,17,NCT05211570,AB8939 in Patients With Relapsed/Refractory Ac...,Acute Myeloid Leukemia Refractory|Acute Myeloi...,Recruiting,DOSE ESCALATION STUDY||Key Inclusion Criteria:...,Patients with documented diagnosis of acute my...,Patients eligible to a standard of care|Patien...
3,23,22,NCT04943185,"A Prospective, Single Surgeon, Randomized Cont...",The Objective of the Study is to Prospectively...,Recruiting,Inclusion Criteria:||Patients with maximally t...,Patients with maximally tolerated medically tr...,Non-Ocular
4,24,23,NCT05577416,A Study of AB-218 in Patients With IDH1 Mutate...,Glioma,Recruiting,Inclusion Criteria:||Patients will have a radi...,Patients will have a radiological diagnosis of...,Patients who meet any of the following criteri...
...,...,...,...,...,...,...,...,...,...
20150,144304,980,NCT04932668,Home Based Electrical Stimulation on Post-stro...,Spasticity as Sequela of Stroke,Recruiting,Inclusion Criteria:||Post stroke (hemorrhagic ...,Post stroke (hemorrhagic or ischemic) with ank...,Introduction or changes in anti-spastic medica...
20151,144309,985,NCT05321693,Neuromodulatory Effects of Transcranial Pulsed...,Fibromyalgia,Recruiting,Inclusion Criteria:||Women from 30 to 65 years...,Women from 30 to 65 years old|FM diagnosis acc...,Pregnancy or lack of contraceptive use;|Histor...
20152,144313,989,NCT05630911,"Conscious Movement Processing, Postural Stabil...",Fall Injury|Postural; Defect|Old Age; Debility,Recruiting,Inclusion Criteria:||65 years old or above;|ab...,65 years old or above;|able to stand independe...,a history of any major cerebrovascular and/or ...
20153,144321,997,NCT05405738,Cleft Palate Technique and Maxillary Growth,Cleft Palate|Growth,Recruiting,Inclusion Criteria:||5-6 years old childreen w...,5-6 years old childreen with previous treated ...,Patients with syndromic cleft lip.|Redu cases|...


In [132]:
recruiting_trials_df['EligibilityCriteria'][326]

'Inclusion Criteria:||Signed informed consent, demonstrating that the patient understands the procedures required for the study and the purpose of the study|Male or female of 18 years of age or older|Diagnosis of COVID-19 infection by RT- PCR|Recovery from COVID-19||Exclusion Criteria:'