<a href="https://colab.research.google.com/github/victormurcia/CTS_Test/blob/main/CTS_Unifying_Routines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#I need to import locale to ensure that the encoding is set to UTF-8 (weird Google Colab bug)
import locale
locale.getpreferredencoding = lambda: "UTF-8"

#Check the current build in Google Colab
!cat /etc/*release
print('\n')

#Check CUDA version
!nvcc --version
print('\n')

#Ensure that the required packages are installed in the current environment
!pip install numpy --quiet
!pip install pandas --quiet
!pip install spacy==3.4.4 --quiet
!pip install scispacy --quiet
!pip install medspacy --quiet
!pip install negspacy --quiet
!pip install transformers
!pip install seaborn --quiet
!pip install matplotlib --quiet
!pip install "dask[complete]" --quiet
!pip install ipywidgets --quiet
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --quiet 
print('\n')

#Spacy models used for processing biomedical, scientific, or clinical text 
#Spacy pipeline for biomedical data.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz --quiet

DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=20.04
DISTRIB_CODENAME=focal
DISTRIB_DESCRIPTION="Ubuntu 20.04.5 LTS"
NAME="Ubuntu"
VERSION="20.04.5 LTS (Focal Fossa)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 20.04.5 LTS"
VERSION_ID="20.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=focal
UBUNTU_CODENAME=focal


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflict

In [2]:
#Import the required libraries/packages
#General utilities
import numpy as np
import pandas as pd
import dask.dataframe as dd
import seaborn as sns
import matplotlib.pyplot as plt
import os, random, time,sys, re
from ipywidgets import widgets, interact, interactive, fixed, interact_manual
from tqdm import tqdm

#NLP Stuff
#Spacy
import spacy
from spacy.lang.en.stop_words import STOP_WORDS #Load stopwords
from spacy.language import Language
from spacy.tokenizer import Tokenizer
#Scispacy
import scispacy
from scispacy.linking import EntityLinker
from scispacy.abbreviation import AbbreviationDetector
from scispacy.hyponym_detector import HyponymDetector
#Medspacy
import medspacy
from medspacy.ner import TargetRule
from medspacy.visualization import visualize_ent
from negspacy.negation import Negex

#To use Transformers models from HuggingFace
import transformers
from transformers import AutoTokenizer, AutoModel,AutoModelForTokenClassification
#NLTK

#Enable data to be extracted from my Google Drive
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [3]:
#Load the patient dataframe (multi_veteran_df.csv) from the project's GitHub test_data folder
url ='https://raw.githubusercontent.com/victormurcia/CTS_Test/main/test_data/multi_veteran_df.csv'
patients_df = pd.read_csv(url)
#patients_df

#Load the inclusion criteria (parsed_ct_ic.csv)
url ='https://raw.githubusercontent.com/victormurcia/CTS_Test/main/test_data/parsed_ct_ic.csv'
parsed_ct_ic = pd.read_csv(url)
#parsed_ct_ic

#Load the exclusion criteria (parsed_ct_ec.csv)
#NOTE: `url` is rebound for every file, so after this cell it holds the exclusion-criteria URL
url ='https://raw.githubusercontent.com/victormurcia/CTS_Test/main/test_data/parsed_ct_ec.csv'
parsed_ct_ec = pd.read_csv(url)

In [4]:
#Order of operations
# 1. Load the patients dataframe
# 2. Extract the EHR for the current patient
# 3. Preprocess each patient feature column
# 4. Run the NER model on each patient feature
# 5. Run a query on clinicaltrials.gov for all the conditions present in the patient profile
# 6. Extract the eligibility criteria for each queried clinical trial for each condition
# 7. Split eligibility criteria into inclusion/exclusion sections
# 8. Run NER model on both inclusion/exclusion sections
# 9. Determine Sorensen-Dice index between inclusion/exclusion sections and the patient EHR
# 10. Return the complete list of clinical trials stating whether the patient would qualify or not for each clinical trial

In [4]:
def cts_parser(patients_df):
  """
  Entry point of the patient / clinical-trial matching pipeline (currently a stub).

  The intended purpose is to compare a patient's electronic health record from
  the Synthetic Veteran Suicide Dataset against clinical trials queried through
  the clinicaltrials.gov API and report how well the patient matches each trial.

  What the function does today:
    1. builds the single-patient NER dataframe with create_patient_df_for_NER,
    2. loads the "en_core_sci_sm" sci-spaCy model,
    3. discards both results and returns the placeholder value -1.

  Parameters
  ----------
  patients_df : dataframe handed unchanged to create_patient_df_for_NER.

  Returns
  -------
  int
      Always -1.
  """
  #Step 1: single-patient profile prepared for NER (not used yet)
  ner_ready_profile = create_patient_df_for_NER(patients_df)

  #Step 2: pre-trained sci-spaCy pipeline; local to this function (not used yet)
  sci_pipeline = spacy.load("en_core_sci_sm")

  #Placeholder result until the matching logic is implemented
  placeholder_result = -1
  return placeholder_result

cts_parser(patients_df)

15


-1

In [None]:
def create_patient_df_for_NER(patients_df, patient_row=0):
  """
  Build a long-format profile (one row per aspect) for a single patient.

  The row labelled `patient_row` is read from `patients_df`. For every profile
  aspect the matching source column is parsed from its string form into a
  Python sequence, and the items of that sequence are concatenated into a
  single string.

  Parameters
  ----------
  patients_df : pandas.DataFrame
      Must contain every source column listed in `column_by_aspect` below.
      Each cell is expected to hold the string form of a Python literal
      sequence, e.g. "['a', 'b']".
  patient_row : index label, default 0
      Label (as understood by `.loc`) of the patient row to extract.

  Returns
  -------
  pandas.DataFrame
      Columns 'aspects' and 'Patient_Profile', one row per aspect, with a
      fresh 0..n-1 index.

  Raises
  ------
  KeyError
      If `patient_row` or one of the required columns is missing.
  ValueError, SyntaxError
      If a cell is not a valid Python literal (this includes non-string cells
      such as NaN).
  TypeError
      If a cell parses to something that is not iterable.
  """
  #Local import so the function stays self-contained within this cell
  import ast

  #Profile aspect -> source column in the patients dataframe (order defines row order)
  column_by_aspect = {
      'allergies':     'DESCRIPTION_als',
      'condition':     'DESCRIPTION_cds',
      'devices':       'DESCRIPTION_dvs',
      'immunizations': 'DESCRIPTION_ims',
      'medications':   'DESCRIPTION_mds',
      'observations':  'DESCRIPTION_obs',
      'procedures':    'DESCRIPTION_prs',
      'birthday':      'BIRTHDATE_pts',
      'marital':       'MARITAL_pts',
      'race':          'RACE_pts',
      'ethnicity':     'ETHNICITY_pts',
      'gender':        'GENDER_pts',
      'city':          'CITY_pts',
      'county':        'COUNTY_pts',
  }

  #Select the requested patient as a Series indexed by column name
  patient_record = patients_df.loc[patient_row]

  def _cell_to_text(cell):
    #ast.literal_eval only accepts Python literals, so text coming from the CSV
    #can never execute code (a plain eval() would run arbitrary expressions).
    items = ast.literal_eval(cell)
    #NOTE(review): items are concatenated with no separator, so multi-item
    #lists run together - confirm this is what the NER step expects.
    return ''.join(map(str, items))

  profile_texts = [_cell_to_text(patient_record[column])
                   for column in column_by_aspect.values()]

  return pd.DataFrame({'aspects': list(column_by_aspect),
                       'Patient_Profile': profile_texts})

def get_umls_codes(text: str, nlp=None):
    """
    Extract the entities of `text` together with their linked UMLS concepts.

    Parameters
    ----------
    text : str
        Text to process.
    nlp : optional
        spaCy-style pipeline to use. It must expose `pipe_names`, `add_pipe`
        and be callable on a string. When omitted, the notebook-level pipeline
        bound to the global name `ss_sm` is used; a NameError is raised if no
        such global exists.

    Returns
    -------
    list of dict
        One dict per (entity, linked concept) pair with the keys
        'text' (entity text), 'umls_id' (first item of the kb_ents entry) and
        'score' (second item of the kb_ents entry).

    Side effects
    ------------
    Adds the "scispacy_linker" pipe (UMLS linker, one candidate per mention)
    to the pipeline if it is not already present.
    """
    if nlp is None:
        #Backward-compatible fallback: use the pipeline stored in the global `ss_sm`
        nlp = ss_sm

    #Add the EntityLinker pipe only once; later calls reuse the same pipe
    if 'scispacy_linker' not in nlp.pipe_names:
        nlp.add_pipe("scispacy_linker", config={"linker_name": "umls", "max_entities_per_mention": 1})

    #Process the text and collect one record per linked concept
    doc = nlp(text)
    umls_codes = [
        {
            "text": entity.text,
            "umls_id": umls_ent[0],
            "score": umls_ent[1]
        }
        for entity in doc.ents
        for umls_ent in entity._.kb_ents
    ]

    return umls_codes

def extract_values(dicts, key):
    """
    Collect the value stored under `key` in each mapping of `dicts`.

    Mappings that do not contain `key` contribute None, so the returned list
    always has one entry per input mapping, in the same order.
    """
    collected = []
    for mapping in dicts:
        collected.append(mapping.get(key))
    return collected