<a href="https://colab.research.google.com/github/victormurcia/CTS_Test/blob/main/CTS_Unifying_Routines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
#Environment setup cell (Google Colab).
#Patch locale.getpreferredencoding so it always reports "UTF-8" (works around a Google Colab bug)
import locale
locale.getpreferredencoding = lambda: "UTF-8"

#Print the OS release information of the current Google Colab build
!cat /etc/*release
print('\n')

#Print the CUDA compiler (nvcc) version
!nvcc --version
print('\n')

#Optionally install the required packages into the current environment.
#The flag is False by default, so the block below is skipped unless it is flipped to True.
#Only spacy is pinned to a version (3.4.4); every other package installs whatever pip resolves.
install_pckgs = False
if install_pckgs == True:
  !pip install numpy --quiet
  !pip install pandas --quiet
  !pip install spacy==3.4.4 --quiet
  !pip install scispacy --quiet
  !pip install medspacy --quiet
  !pip install negspacy --quiet
  !pip install transformers
  !pip install seaborn --quiet
  !pip install matplotlib --quiet
  !pip install "dask[complete]" --quiet
  !pip install ipywidgets --quiet
  !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --quiet 
  print('\n')

#spaCy models used for processing biomedical, scientific, or clinical text.
#NOTE: these installs are NOT guarded by install_pckgs, so they run every time this cell is executed.
#spaCy pipeline for biomedical data.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz --quiet
#spaCy pipeline for biomedical data. Has a larger vocabulary and 50k word vectors
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz --quiet
#Another spaCy pipeline with a 785k vocabulary that uses scibert-base as its transformer model
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz --quiet
#spaCy pipeline for biomedical data with 600k word vectors
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz --quiet
#A spaCy NER model trained on the CRAFT corpus.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_craft_md-0.5.1.tar.gz --quiet
#A spaCy NER model trained on the JNLPBA corpus.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_jnlpba_md-0.5.1.tar.gz --quiet
#A spaCy NER model trained on the BC5CDR corpus.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz --quiet
#A spaCy NER model trained on the BIONLP13CG corpus.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bionlp13cg_md-0.5.1.tar.gz --quiet
#The med7 transformer model, see https://github.com/kormilitzin/med7
!pip install https://huggingface.co/kormilitzin/en_core_med7_trf/resolve/main/en_core_med7_trf-any-py3-none-any.whl --quiet
#The med7 vector model
!pip install https://huggingface.co/kormilitzin/en_core_med7_lg/resolve/main/en_core_med7_lg-any-py3-none-any.whl --quiet

DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=20.04
DISTRIB_CODENAME=focal
DISTRIB_DESCRIPTION="Ubuntu 20.04.5 LTS"
NAME="Ubuntu"
VERSION="20.04.5 LTS (Focal Fossa)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 20.04.5 LTS"
VERSION_ID="20.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=focal
UBUNTU_CODENAME=focal


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.2/120.2 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for en-core-sci-md (setup.py) ... [?25l[?25hdone


In [2]:
#Import the required libraries/packages
#General utilities
import numpy as np
import pandas as pd
import dask.dataframe as dd
import seaborn as sns
import matplotlib.pyplot as plt
import os, random, time,sys, re
from ipywidgets import widgets, interact, interactive, fixed, interact_manual
from tqdm import tqdm

#NLP Stuff
#Spacy
import spacy
from spacy.lang.en.stop_words import STOP_WORDS #Load stopwords
from spacy.language import Language
from spacy.tokenizer import Tokenizer
#Scispacy
import scispacy
from scispacy.linking import EntityLinker
from scispacy.abbreviation import AbbreviationDetector
from scispacy.hyponym_detector import HyponymDetector
#Medspacy
import medspacy
from medspacy.ner import TargetRule
from medspacy.visualization import visualize_ent
from negspacy.negation import Negex

#To use Transformers models from HuggingFace
import transformers
from transformers import AutoTokenizer, AutoModel,AutoModelForTokenClassification
#NLTK

#Enable data to be extracted from my Google Drive.
#Mounts the drive at /content/drive; relies on the google.colab package (presumably only present on Colab).
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [3]:
#Remote CSV files, listed in the order they are read:
#  1. the patient records, 2. the parsed inclusion criteria, 3. the parsed exclusion criteria
csv_urls = [
    'https://raw.githubusercontent.com/victormurcia/CTS_Test/main/test_data/multi_veteran_df.csv',
    'https://raw.githubusercontent.com/victormurcia/CTS_Test/main/test_data/parsed_ct_ic.csv',
    'https://raw.githubusercontent.com/victormurcia/CTS_Test/main/test_data/parsed_ct_ec.csv',
]

#Download each file into its own dataframe (url ends up holding the last address that was read)
loaded_frames = []
for url in csv_urls:
  loaded_frames.append(pd.read_csv(url))

#Unpack into the names used by the rest of the notebook
patients_df, parsed_ct_ic, parsed_ct_ec = loaded_frames

In [8]:
#Order of operations
# 1. Load the patients dataframe
# 2. Extract the EHR for the current patient
# 3. Preprocess each patient feature column
# 4. Run the NER model on each patient feature
# 5. Run a query on clinicaltrials.gov for all the conditions present in the patient profile
# 6. Extract the eligibility criteria for each queried clinical trial for each condition
# 7. Split eligibility criteria into inclusion/exclusion sections
# 8. Run NER model on both inclusion/exclusion sections
# 9. Determine Sorensen-Dice index between inclusion/exclusion sections and the patient EHR
# 10. Return the complete list of clinical trials stating whether the patient would qualify or not for each clinical trial

In [25]:
########################################################################################################
def create_patient_df_for_NER(patients_df, patient_idx=0):
  """
  Build a two-column, one-row-per-aspect profile dataframe for a single patient.

  One row of patients_df is selected and fourteen of its columns (allergies,
  conditions, devices, immunizations, medications, observations, procedures and
  the demographic fields) are turned into rows. Each selected cell is expected to
  hold the string representation of a Python list (e.g. "['Anemia', 'Fever']");
  its items are joined with single spaces to form the text handed to the NER model.

  Parameters
  ----------
  patients_df : pandas.DataFrame
      Patient table containing the DESCRIPTION_* and *_pts columns listed below.
  patient_idx : index label, default 0
      Label (as used by DataFrame.loc) of the patient row to extract. The default
      keeps the previous behaviour of always using the row labelled 0.

  Returns
  -------
  pandas.DataFrame
      Columns 'aspects' (name of the profile aspect) and 'Patient_Profile'
      (space-joined text), indexed 0..13 in the order of the mapping below.

  Raises
  ------
  KeyError
      If patient_idx or one of the required columns is missing.
  ValueError, SyntaxError
      If a string cell is not a valid Python literal.
  """
  import ast  #Local import so the function does not depend on a notebook-level import

  #Profile aspect -> column of patients_df that stores it (order defines the row order of the result)
  aspect_to_column = {
      'allergies':     'DESCRIPTION_als',
      'condition':     'DESCRIPTION_cds',
      'devices':       'DESCRIPTION_dvs',
      'immunizations': 'DESCRIPTION_ims',
      'medications':   'DESCRIPTION_mds',
      'observations':  'DESCRIPTION_obs',
      'procedures':    'DESCRIPTION_prs',
      'birthday':      'BIRTHDATE_pts',
      'marital':       'MARITAL_pts',
      'race':          'RACE_pts',
      'ethnicity':     'ETHNICITY_pts',
      'gender':        'GENDER_pts',
      'city':          'CITY_pts',
      'county':        'COUNTY_pts',
  }

  def _cell_to_text(cell):
    #Turn one raw cell into the space-separated text used for NER
    if isinstance(cell, str):
      #literal_eval only accepts Python literals, so unlike eval() it cannot
      #execute code embedded in the CSV data
      cell = ast.literal_eval(cell)
    elif pd.api.types.is_scalar(cell) and pd.isna(cell):
      #Missing value (None/NaN): nothing to feed to the NER model
      return ''
    return ' '.join(map(str, cell))

  #Select the requested patient row
  row_data = patients_df.loc[patient_idx]

  #Assemble the long-format profile: one row per aspect
  final_patient_df = pd.DataFrame({
      'aspects': list(aspect_to_column),
      'Patient_Profile': [_cell_to_text(row_data[column]) for column in aspect_to_column.values()],
  })

  return final_patient_df
########################################################################################################
def get_umls_codes(text: str,model):
  """
  Run a pipeline on a piece of text and list the knowledge-base candidates of its entities.

  Parameters
  ----------
  text : str
      Text to process.
  model : callable
      Called as model(text); the result must expose .ents, where every entity has
      a .text attribute and a ._.kb_ents sequence of (identifier, score) pairs.

  Returns
  -------
  list of dict
      One dictionary per (entity, candidate) pair with the keys 'text', 'umls_id'
      and 'score'. Entities without candidates contribute nothing.
  """
  parsed = model(text)

  linked_mentions = []
  for mention in parsed.ents:
    #Every candidate attached to the mention becomes its own record
    for candidate in mention._.kb_ents:
      record = {}
      record["text"] = mention.text
      record["umls_id"] = candidate[0]
      record["score"] = candidate[1]
      linked_mentions.append(record)

  return linked_mentions
########################################################################################################
def extract_values(dicts, key):
  """
  Collect the value stored under key in each dictionary of dicts.

  Dictionaries that do not contain key contribute None, so the returned list
  always has the same length and order as dicts.
  """
  collected = []
  for entry in dicts:
    collected.append(entry.get(key))
  return collected
########################################################################################################
def run_ner(df,col_name,model):
  """
  Run the NER / entity-linking model on one text column and unpack the results.

  Adds a 'umls_codes' column holding, per row, the list of dictionaries returned
  by get_umls_codes, plus one column per dictionary key (e.g. 'text', 'umls_id',
  'score') holding the list of that key's values for the row.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing col_name. It is modified in place.
  col_name : str
      Name of the column with the text to process.
  model : callable
      Pipeline forwarded to get_umls_codes.

  Returns
  -------
  pandas.DataFrame
      The same object that was passed in as df (not a copy), with the new columns.
  """
  #Run the NER model on every row and get UMLS codes after parsing and entity linking
  df['umls_codes'] = df[col_name].apply(get_umls_codes, args = (model,))

  #Gather the dictionary keys in first-seen order. Iterating a set of strings would
  #make the order of the new columns vary between interpreter sessions.
  ordered_keys = list(dict.fromkeys(
      key for dicts in df['umls_codes'] for d in dicts for key in d
  ))

  #Unpack each dictionary key into its own column of per-row value lists
  for key in ordered_keys:
    df[key] = df['umls_codes'].apply(extract_values, args = (key,))

  return df
########################################################################################################
def get_clinical_trials(condition, min_rnk=1, max_rnk=1000):
  """
  Query the ClinicalTrials.gov study_fields endpoint for trials matching a condition.

  Parameters
  ----------
  condition : str
      Plain-text search expression (e.g. 'breast cancer'). It is percent-encoded
      before being placed in the URL, so pass it unencoded.
  min_rnk, max_rnk : int, default 1 and 1000
      Rank window of the results to request. The defaults reproduce the
      previously hard-coded values.

  Returns
  -------
  pandas.DataFrame
      The CSV response parsed by pandas, with the requested fields NCTId,
      BriefTitle, Condition, OverallStatus and EligibilityCriteria.

  Notes
  -----
  NOTE(review): this targets the classic 'api/query/study_fields' endpoint;
  confirm it is still served before relying on this function.
  """
  from urllib.parse import quote  #Local import: keeps the function self-contained

  base = 'https://clinicaltrials.gov/api/query/study_fields?expr='
  fields = '&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CEligibilityCriteria'
  window = '&min_rnk=%d&max_rnk=%d&fmt=csv' % (min_rnk, max_rnk)

  #Percent-encode the expression so spaces and reserved characters do not break the URL
  q = base + quote(condition, safe='') + fields + window
  print(q)

  #The first 10 lines of the response are skipped before the CSV header
  #(presumably API metadata - TODO confirm against a live response)
  qtrials_df = pd.read_csv(q, skiprows=10)
  return qtrials_df
########################################################################################################
def get_eligiblity_criteria():
  """
  Placeholder for the eligibility-criteria extraction step (not implemented yet).

  Returns
  -------
  int
      Always -1.
  """
  not_implemented_marker = -1
  return not_implemented_marker

########################################################################################################
def cts_parser(patients_df,model):
  """
  Parse the electronic health record of one patient and run NER on it.

  This is the entry point of the matching workflow between a patient (from the
  Synthetic Veteran Suicide Dataset) and clinical trials queried through the
  clinicaltrials.gov API. Only the first two stages are active at the moment:

    1. build the single-patient profile dataframe (create_patient_df_for_NER)
    2. run the NER / entity-linking model on the profile text (run_ner)

  The clinical-trial query stage is not enabled yet.

  Parameters
  ----------
  patients_df : pandas.DataFrame
      Patient table handed to create_patient_df_for_NER.
  model : callable
      Pipeline handed to run_ner.

  Returns
  -------
  tuple
      (profile dataframe, dataframe returned by run_ner). The elapsed time is
      printed as a side effect.
  """
  #Reference point for the runtime report
  t_start = time.time()

  #Stage 1: create and prepare the single-patient dataframe for NER
  profile_df = create_patient_df_for_NER(patients_df)

  #Stage 2: run the NER model on the 'Patient_Profile' text of the patient EHR
  ner_df = run_ner(profile_df,'Patient_Profile',model)

  #Stage 3 (querying clinical trials for the patient's conditions) is not enabled yet

  #Report how long the parsing took
  elapsed = time.time() - t_start
  print("Parsing patient records took: %.2f seconds" % elapsed)

  return profile_df,ner_df

In [12]:
#Names of the installed spaCy pipelines that can be loaded here
models = ["en_core_sci_sm","en_core_sci_md","en_core_sci_scibert","en_core_sci_lg","en_ner_craft_md","en_ner_jnlpba_md","en_ner_bionlp13cg_md","en_core_med7_lg"]

#Load the first pipeline of the list (the small sci-spaCy model)
model = spacy.load(models[0])

#Extra pipes to append, in order, together with the keyword arguments for add_pipe:
#  - scispacy_linker: entity linker against UMLS, keeping one candidate per mention
#  - negex: negation detection
#  - abbreviation_detector: abbreviation detection
extra_pipes = [
    ("scispacy_linker", {"config": {"linker_name": "umls", "max_entities_per_mention": 1}}),
    ("negex", {}),
    ("abbreviation_detector", {}),
]

#Only add a pipe when it is not already present, so re-running the cell does not fail
for pipe_name, pipe_kwargs in extra_pipes:
  if pipe_name in model.pipe_names:
    continue
  model.add_pipe(pipe_name, **pipe_kwargs)

In [26]:
#Parse the record of the patient in row 0 of patients_df with the configured pipeline.
#run_ner adds its columns to the frame it receives and returns that same frame,
#so final_patient_df and patient_ner_df refer to the same object here.
final_patient_df,patient_ner_df = cts_parser(patients_df,model)
patient_ner_df

Parsing patient records took: 4.99 seconds


Unnamed: 0,aspects,Patient_Profile,umls_codes,umls_id,score,text
0,allergies,Latex allergy Allergy to mould House dust mite...,"[{'text': 'Latex allergy Allergy', 'umls_id': ...","[C0577628, C0339808, C0222058, C0740919, C0013...","[0.9578602910041809, 1.0, 1.0, 0.9999998807907...","[Latex allergy Allergy, House dust mite allerg..."
1,condition,Atopic dermatitis Otitis media Childhood asthm...,"[{'text': 'Atopic dermatitis', 'umls_id': 'C00...","[C0011615, C0029882, C0264408, C0018621, C0396...","[1.0, 1.0, 1.0, 1.0, 0.9999999403953552, 1.0, ...","[Atopic dermatitis, Otitis media, Childhood as..."
2,devices,,[],[],[],[]
3,immunizations,Hep B adolescent or pediatric Hep B adolesce...,"[{'text': 'Hep B', 'umls_id': 'C0162569', 'sco...","[C0162569, C1521725, C0162569, C0694742, C0276...","[0.8482469916343689, 1.0, 0.8482469916343689, ...","[Hep B, pediatric, Hep B, pediatric Hib (PRP-O..."
4,medications,Astemizole 10 MG Oral Tablet Amoxicillin 250 M...,"[{'text': 'Astemizole', 'umls_id': 'C0085170',...","[C0085170, C0024443, C1244600, C0024443, C0991...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.81247895...","[Astemizole, MG, Oral Tablet Amoxicillin, MG, ..."
5,observations,Body Height Pain severity - 0-10 verbal numeri...,"[{'text': 'Body Height Pain', 'umls_id': 'C000...","[C0005890, C0449820, C0684224, C0005910, C0001...","[0.8759792447090149, 1.0, 1.0, 1.0, 0.81690454...","[Body Height Pain, Score, Reported, Body Weigh..."
6,procedures,Medication Reconciliation (procedure) Medicati...,"[{'text': 'Medication Reconciliation', 'umls_i...","[C2317067, C1273434, C2317067, C0184661, C0420...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[Medication Reconciliation, Allergy screening ..."
7,birthday,1983-08-07,[],[],[],[]
8,marital,M,[],[],[],[]
9,race,white,"[{'text': 'white', 'umls_id': 'C0007457', 'sco...",[C0007457],[1.0],[white]


In [27]:
#Shows the NER columns too, because run_ner modified this frame in place
final_patient_df

Unnamed: 0,aspects,Patient_Profile,umls_codes,umls_id,score,text
0,allergies,Latex allergy Allergy to mould House dust mite...,"[{'text': 'Latex allergy Allergy', 'umls_id': ...","[C0577628, C0339808, C0222058, C0740919, C0013...","[0.9578602910041809, 1.0, 1.0, 0.9999998807907...","[Latex allergy Allergy, House dust mite allerg..."
1,condition,Atopic dermatitis Otitis media Childhood asthm...,"[{'text': 'Atopic dermatitis', 'umls_id': 'C00...","[C0011615, C0029882, C0264408, C0018621, C0396...","[1.0, 1.0, 1.0, 1.0, 0.9999999403953552, 1.0, ...","[Atopic dermatitis, Otitis media, Childhood as..."
2,devices,,[],[],[],[]
3,immunizations,Hep B adolescent or pediatric Hep B adolesce...,"[{'text': 'Hep B', 'umls_id': 'C0162569', 'sco...","[C0162569, C1521725, C0162569, C0694742, C0276...","[0.8482469916343689, 1.0, 0.8482469916343689, ...","[Hep B, pediatric, Hep B, pediatric Hib (PRP-O..."
4,medications,Astemizole 10 MG Oral Tablet Amoxicillin 250 M...,"[{'text': 'Astemizole', 'umls_id': 'C0085170',...","[C0085170, C0024443, C1244600, C0024443, C0991...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.81247895...","[Astemizole, MG, Oral Tablet Amoxicillin, MG, ..."
5,observations,Body Height Pain severity - 0-10 verbal numeri...,"[{'text': 'Body Height Pain', 'umls_id': 'C000...","[C0005890, C0449820, C0684224, C0005910, C0001...","[0.8759792447090149, 1.0, 1.0, 1.0, 0.81690454...","[Body Height Pain, Score, Reported, Body Weigh..."
6,procedures,Medication Reconciliation (procedure) Medicati...,"[{'text': 'Medication Reconciliation', 'umls_i...","[C2317067, C1273434, C2317067, C0184661, C0420...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[Medication Reconciliation, Allergy screening ..."
7,birthday,1983-08-07,[],[],[],[]
8,marital,M,[],[],[],[]
9,race,white,"[{'text': 'white', 'umls_id': 'C0007457', 'sco...",[C0007457],[1.0],[white]


In [32]:
#Number of entries in the 'text' list of row 1 (the 'condition' aspect)
len(final_patient_df['text'][1])

44

In [31]:
#Entity texts stored for row 1 (the 'condition' aspect)
final_patient_df['text'][1]

['Atopic dermatitis',
 'Otitis media',
 'Childhood asthma',
 'Seasonal allergic rhinitis',
 'Acute viral pharyngitis',
 'disorder',
 'Laceration',
 'Acute viral pharyngitis',
 'disorder',
 'Chronic intractable migraine',
 'aura',
 'Impacted molars',
 'Chronic pain',
 'Streptococcal sore throat',
 'Acute viral pharyngitis',
 'disorder',
 'Drug overdose',
 'Viral sinusitis',
 'Laceration',
 'forearm',
 'Viral sinusitis',
 'disorder',
 'Acute bronchitis',
 'disorder',
 'Anemia',
 'disorder',
 'Viral sinusitis',
 'disorder',
 'finding',
 'Sore throat symptom',
 'Sputum finding',
 'finding',
 'Fatigue',
 'finding',
 'Diarrhea symptom',
 'Fever',
 'Loss of taste',
 'finding',
 'Suspected',
 'COVID-19',
 'COVID-19',
 'Pneumonia',
 'Respiratory distress',
 'finding']