<a href="https://colab.research.google.com/github/victormurcia/CTS_Test/blob/main/Testing_Parsing_of_Clinical_Trial_Eligibility.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Libraries/Packages and Models into Environment


In [1]:
#I need to import locale to ensure that the encoding is set to UTF-8 (weird Google Colab bug)
import locale
locale.getpreferredencoding = lambda: "UTF-8"

#Check the current build in Google Colab
!cat /etc/*release
print('\n')

#Check CUDA version
!nvcc --version
print('\n')

#Ensure that the required packages are installed in the current environment
!pip install ipywidgets --quiet
!pip install spacy==3.4.4 --quiet
!pip install scispacy --quiet
!pip install medspacy --quiet
!pip install negspacy --quiet
!pip install transformers
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --quiet 
print('\n')

#Spacy models used for processing biomedical, scientific, or clinical text 
#Spacy pipeline for biomedical data.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz --quiet

print('\n')

DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=20.04
DISTRIB_CODENAME=focal
DISTRIB_DESCRIPTION="Ubuntu 20.04.5 LTS"
NAME="Ubuntu"
VERSION="20.04.5 LTS (Focal Fossa)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 20.04.5 LTS"
VERSION_ID="20.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=focal
UBUNTU_CODENAME=focal


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently t

##Import the required libraries/packages

In [2]:
#Import the required libraries/packages
#General utilities
import numpy as np
import pandas as pd
import dask.dataframe as dd
import seaborn as sns
from ipywidgets import widgets, interact, interactive, fixed, interact_manual
import random, string

#NLP Stuff
#Spacy
import spacy
from spacy.lang.en.stop_words import STOP_WORDS #Load stopwords
from spacy.language import Language
from spacy.tokenizer import Tokenizer
#Scispacy
import scispacy
from scispacy.linking import EntityLinker
from scispacy.abbreviation import AbbreviationDetector
from scispacy.hyponym_detector import HyponymDetector
#Medspacy
import medspacy
from medspacy.ner import TargetRule
from medspacy.visualization import visualize_ent
from negspacy.negation import Negex

#To use Transformers models from HuggingFace
import transformers
from transformers import AutoTokenizer, AutoModel,AutoModelForTokenClassification
#NLTK

#Enable data to be extracted from my Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##Query for Clinical Trials
I'll start by querying for clinical trials pertinent to the 'allergy' condition. I'll expand and test this eventually for other conditions.

In [3]:
#Query for trials
cond = 'allergy'#input('Enter the disease condition to find clinical trials: ')
a = 'https://clinicaltrials.gov/api/query/study_fields?expr='
b = '&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CLeadSponsorName%2CEligibilityCriteria'
c = '&min_rnk=1&max_rnk=1000&fmt=csv'
q=(a + cond + b + c)
print(q)
qtrials = pd.read_csv(q,skiprows=10)

@interact
def show_recruiting_studies(column=['OverallStatus','Condition'], 
                            x = ['Recruiting','Completed','Unknown status'],
                            y = ['Food','Antibiotic']
                            ):
    if column == 'OverallStatus':
      return qtrials.loc[qtrials[column] == x]
    elif column == 'Condition':
      return qtrials[qtrials['Condition'].str.contains(y)]

https://clinicaltrials.gov/api/query/study_fields?expr=allergy&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CLeadSponsorName%2CEligibilityCriteria&min_rnk=1&max_rnk=1000&fmt=csv


interactive(children=(Dropdown(description='column', options=('OverallStatus', 'Condition'), value='OverallSta…

##Parsing Eligibility Criteria of Clinical Trials with NLP
I'll start by using a small NER model just to test out the functionality of the parser and get a feel of what functions and routines I'll need to implement for a full pipeline.

In [4]:
#Load the models
ss_sm = spacy.load("en_core_sci_sm")

In the following cell, I'll play around with removing stopwords and punctuations in order to ultimately separate the eligibility criteria of each clinical trial into the inclusion and exclusion criteria.

In [5]:
#DataFrame containing eligility criteria for all queried trials
qt_ec = qtrials[['EligibilityCriteria']]

#Eligibility criteria for current trial
ec = qt_ec['EligibilityCriteria'][1]

#Split eligibility criteria into exclusion and inclusion criteria sections
sections = ec.split("||Exclusion Criteria:||")
inclusion_criteria = [criteria for criteria in sections[0].replace("Inclusion Criteria:||", "").split("|") if len(criteria) > 0]
exclusion_criteria = [criteria for criteria in sections[1].split("|") if len(criteria) > 0]

print("Inclusion Criteria:")
for criteria in inclusion_criteria:
    print("-", criteria)

print("\nExclusion Criteria:")
for criteria in exclusion_criteria:
    print("-", criteria)

# remove stopwords and punctuations from the inclusion criteria
inclusion_criteria_sw = [' '.join([token.text for token in ss_sm(criteria) if not token.is_stop and not token.is_punct]) for criteria in inclusion_criteria]

# remove stopwords and punctuations from the exclusion criteria
exclusion_criteria_sw = [' '.join([token.text for token in ss_sm(criteria) if not token.is_stop and not token.is_punct]) for criteria in exclusion_criteria]

print('\n-------Remove Stopwords from Eligibility Criteria-------')
print("\nInclusion Criteria:")
for criteria in inclusion_criteria_sw:
    print("-", criteria)

print("\nExclusion Criteria:")
for criteria in exclusion_criteria_sw:
    print("-", criteria)

Inclusion Criteria:
- All children, young people and adults who fit the selection criteria from across all the practices can be referred to the allergy clinic.
- All patients and parents / carers where appropriate must be deemed capable of giving informed consent to take part in the research project.
- Infants under two with suspected food allergy
- Infants under two with moderate-to-severe eczema not responding to standard treatment.
- Children and young people (up to 16 years of age) with suspected allergic rhinitis symptoms that are unresponsive to a combination of oral antihistamines and nasal steroids
- Young people and adults (from 16 years of age) with a history of anaphylaxis or suspected anaphylaxis

Exclusion Criteria:
- Over 2 years of age with delayed type food allergy presenting primarily with gastrointestinal symptoms
- Over 2 years of age with confirmed non IgE-mediated symptoms including food intolerances, coeliac disease etc.
- Single urticarial reactions without an ob

Nice! That worked! Let me now place the inclusion and exclusion criteria into dataframes

In [6]:
#Dataframe containing inclusion criteria for current trial
ic_df = pd.DataFrame({'InclusionCriteria': inclusion_criteria, 'InclusionCriteria_NoSW': inclusion_criteria_sw})
ic_df

Unnamed: 0,InclusionCriteria,InclusionCriteria_NoSW
0,"All children, young people and adults who fit ...",children young people adults fit selection cri...
1,All patients and parents / carers where approp...,patients parents carers appropriate deemed cap...
2,Infants under two with suspected food allergy,Infants suspected food allergy
3,Infants under two with moderate-to-severe ecze...,Infants moderate-to-severe eczema responding s...
4,Children and young people (up to 16 years of a...,Children young people 16 years age suspected a...
5,Young people and adults (from 16 years of age)...,Young people adults 16 years age history anaph...


In [7]:
#Dataframe containing exclusion criteria for current trial
ec_df = pd.DataFrame({'ExclusionCriteria': exclusion_criteria, 'ExclusionCriteria_NoSW': exclusion_criteria_sw})
ec_df

Unnamed: 0,ExclusionCriteria,ExclusionCriteria_NoSW
0,Over 2 years of age with delayed type food all...,2 years age delayed type food allergy presenti...
1,Over 2 years of age with confirmed non IgE-med...,2 years age confirmed non IgE-mediated symptom...
2,Single urticarial reactions without an obvious...,Single urticarial reactions obvious triggers
3,Non-allergic chronic urticaria,Non-allergic chronic urticaria
4,Drug allergy,Drug allergy
5,"Well controlled allergic rhinitis, asthma or a...",controlled allergic rhinitis asthma atopic eczema
6,Mild-to-moderate atopic eczema without an obvi...,Mild-to-moderate atopic eczema obvious allergi...
7,Localised insect sting reactions,Localised insect sting reactions


Now I'll place some of the intermediate results of the parser (i.e. tokens, POS, Entity, Parent) into each criteria dataframe

In [8]:
#Tokenize, identify part of speech, entity and parent word, and remove stopwords/punctuation from each row 
ic_df['tokens'] = ic_df['InclusionCriteria'].apply(lambda x: [token.text      for token in ss_sm(x)  if not token.is_stop and not token.is_punct])
ic_df['pos']    = ic_df['InclusionCriteria'].apply(lambda x: [token.pos_      for token in ss_sm(x)  if not token.is_stop and not token.is_punct])
ic_df['entity'] = ic_df['InclusionCriteria'].apply(lambda x: [token.ent_type_ for token in ss_sm(x)  if not token.is_stop and not token.is_punct])
ic_df['parent'] = ic_df['InclusionCriteria'].apply(lambda x: [token.head.text for token in ss_sm(x)  if not token.is_stop and not token.is_punct])

#Tokenize, identify part of speech, entity and parent word, and remove stopwords/punctuation from each row 
ec_df['tokens'] = ec_df['ExclusionCriteria'].apply(lambda x: [token.text      for token in ss_sm(x)  if not token.is_stop and not token.is_punct])
ec_df['pos']    = ec_df['ExclusionCriteria'].apply(lambda x: [token.pos_      for token in ss_sm(x)  if not token.is_stop and not token.is_punct])
ec_df['entity'] = ec_df['ExclusionCriteria'].apply(lambda x: [token.ent_type_ for token in ss_sm(x)  if not token.is_stop and not token.is_punct])
ec_df['parent'] = ec_df['ExclusionCriteria'].apply(lambda x: [token.head.text for token in ss_sm(x)  if not token.is_stop and not token.is_punct])

In [9]:
ic_df

Unnamed: 0,InclusionCriteria,InclusionCriteria_NoSW,tokens,pos,entity,parent
0,"All children, young people and adults who fit ...",children young people adults fit selection cri...,"[children, young, people, adults, fit, selecti...","[NOUN, ADJ, NOUN, NOUN, VERB, NOUN, NOUN, NOUN...","[ENTITY, ENTITY, ENTITY, ENTITY, , ENTITY, ENT...","[referred, people, referred, people, adults, c..."
1,All patients and parents / carers where approp...,patients parents carers appropriate deemed cap...,"[patients, parents, carers, appropriate, deeme...","[NOUN, NOUN, NOUN, NOUN, VERB, ADJ, VERB, ADJ,...","[ENTITY, ENTITY, ENTITY, , , , , , ENTITY, ENT...","[patients, patients, patients, where, carers, ..."
2,Infants under two with suspected food allergy,Infants suspected food allergy,"[Infants, suspected, food, allergy]","[NOUN, VERB, NOUN, NOUN]","[ENTITY, ENTITY, ENTITY, ENTITY]","[Infants, allergy, allergy, Infants]"
3,Infants under two with moderate-to-severe ecze...,Infants moderate-to-severe eczema responding s...,"[Infants, moderate-to-severe, eczema, respondi...","[NOUN, ADJ, NOUN, VERB, ADJ, NOUN]","[ENTITY, , ENTITY, , ENTITY, ENTITY]","[Infants, eczema, two, Infants, treatment, res..."
4,Children and young people (up to 16 years of a...,Children young people 16 years age suspected a...,"[Children, young, people, 16, years, age, susp...","[NOUN, ADJ, NOUN, NUM, NOUN, NOUN, ADJ, ADJ, N...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, , E...","[Children, people, Children, years, Children, ..."
5,Young people and adults (from 16 years of age)...,Young people adults 16 years age history anaph...,"[Young, people, adults, 16, years, age, histor...","[ADJ, NOUN, NOUN, NUM, NOUN, NOUN, NOUN, ADJ, ...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, ENT...","[people, people, people, years, people, years,..."


In [10]:
ec_df

Unnamed: 0,ExclusionCriteria,ExclusionCriteria_NoSW,tokens,pos,entity,parent
0,Over 2 years of age with delayed type food all...,2 years age delayed type food allergy presenti...,"[2, years, age, delayed, type, food, allergy, ...","[NUM, NOUN, NOUN, VERB, NOUN, NOUN, NOUN, VERB...","[, ENTITY, ENTITY, ENTITY, ENTITY, ENTITY, ENT...","[years, years, years, allergy, allergy, allerg..."
1,Over 2 years of age with confirmed non IgE-med...,2 years age confirmed non IgE-mediated symptom...,"[2, years, age, confirmed, non, IgE-mediated, ...","[NUM, NOUN, NOUN, VERB, ADJ, ADJ, NOUN, VERB, ...","[, ENTITY, ENTITY, , ENTITY, ENTITY, ENTITY, ,...","[years, years, years, symptoms, IgE-mediated, ..."
2,Single urticarial reactions without an obvious...,Single urticarial reactions obvious triggers,"[Single, urticarial, reactions, obvious, trigg...","[ADJ, ADJ, NOUN, ADJ, NOUN]","[ENTITY, ENTITY, ENTITY, , ENTITY]","[reactions, reactions, reactions, triggers, re..."
3,Non-allergic chronic urticaria,Non-allergic chronic urticaria,"[Non-allergic, chronic, urticaria]","[ADJ, ADJ, NOUN]","[ENTITY, ENTITY, ENTITY]","[urticaria, urticaria, urticaria]"
4,Drug allergy,Drug allergy,"[Drug, allergy]","[NOUN, NOUN]","[ENTITY, ENTITY]","[allergy, allergy]"
5,"Well controlled allergic rhinitis, asthma or a...",controlled allergic rhinitis asthma atopic eczema,"[controlled, allergic, rhinitis, asthma, atopi...","[VERB, ADJ, NOUN, NOUN, ADJ, NOUN]","[ENTITY, ENTITY, ENTITY, ENTITY, ENTITY, ENTITY]","[rhinitis, rhinitis, rhinitis, rhinitis, eczem..."
6,Mild-to-moderate atopic eczema without an obvi...,Mild-to-moderate atopic eczema obvious allergi...,"[Mild-to-moderate, atopic, eczema, obvious, al...","[ADJ, ADJ, NOUN, ADJ, ADJ, NOUN]","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY]","[eczema, eczema, eczema, trigger, trigger, ecz..."
7,Localised insect sting reactions,Localised insect sting reactions,"[Localised, insect, sting, reactions]","[ADJ, NOUN, NOUN, NOUN]","[ENTITY, ENTITY, ENTITY, ENTITY]","[insect, insect, reactions, insect]"


##Linking Entities to Medical Concepts
Cool! Now I can start linking these entities to medical concepts via UMLS

In [11]:
# Load the pre-trained spaCy model with sci-spaCy
ss_sm = spacy.load("en_core_sci_sm")

def get_umls_codes(text: str):
    
    #Add the EntityLinker pipe to spacy pipeline
    if 'scispacy_linker' not in ss_sm.pipe_names:
      ss_sm.add_pipe("scispacy_linker", config={"linker_name": "umls", "max_entities_per_mention": 1})
    
    # Process the text and extract UMLS codes
    doc = ss_sm(text)
    umls_codes = [
        {
            "text": entity.text,
            "start": entity.start_char,
            "end": entity.end_char,
            "umls_id": umls_ent[0],
            "score": umls_ent[1]
        }
        for entity in doc.ents
        for umls_ent in entity._.kb_ents
    ]
    
    return umls_codes

The function above takes a piece of text and returns the UMLS links for each entity in the text as a list of dictionaries. 

In [12]:
ic_df['umls_codes'] = ic_df['InclusionCriteria'].apply(get_umls_codes)
ic_df

https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/tfidf_vectors_sparse.npz not found in cache, downloading to /tmp/tmp70v6n91n
Finished download, copying /tmp/tmp70v6n91n to cache at /root/.scispacy/datasets/e9f7327283e43f0482f7c0c71b71dec278a58ccb3ffdd03c2c2350159e7ef146.f2a350ad19015b2591545f7feeed6a6d6d2fffcd635d868a5d7fc0dfc3cadfd8.tfidf_vectors_sparse.npz
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/nmslib_index.bin not found in cache, downloading to /tmp/tmppmki_9w0
Finished download, copying /tmp/tmppmki_9w0 to cache at /root/.scispacy/datasets/f48455d6c79262057cce66b4619123c2b558b21092d42fac97f47bb99a5b8f9f.dd70d3dffe7d90d7ac8914460e16a48375dab32485fb6313a34e6fbcaf53218b.nmslib_index.bin
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/tfidf_vectorizer.joblib not found in cache, downloading to /tmp/tmp6anonjzr
Finished download, copying /tmp/tmp6anonjzr to cache at /root/.scispacy/da

Unnamed: 0,InclusionCriteria,InclusionCriteria_NoSW,tokens,pos,entity,parent,umls_codes
0,"All children, young people and adults who fit ...",children young people adults fit selection cri...,"[children, young, people, adults, fit, selecti...","[NOUN, ADJ, NOUN, NOUN, VERB, NOUN, NOUN, NOUN...","[ENTITY, ENTITY, ENTITY, ENTITY, , ENTITY, ENT...","[referred, people, referred, people, adults, c...","[{'text': 'children', 'start': 4, 'end': 12, '..."
1,All patients and parents / carers where approp...,patients parents carers appropriate deemed cap...,"[patients, parents, carers, appropriate, deeme...","[NOUN, NOUN, NOUN, NOUN, VERB, ADJ, VERB, ADJ,...","[ENTITY, ENTITY, ENTITY, , , , , , ENTITY, ENT...","[patients, patients, patients, where, carers, ...","[{'text': 'patients', 'start': 4, 'end': 12, '..."
2,Infants under two with suspected food allergy,Infants suspected food allergy,"[Infants, suspected, food, allergy]","[NOUN, VERB, NOUN, NOUN]","[ENTITY, ENTITY, ENTITY, ENTITY]","[Infants, allergy, allergy, Infants]","[{'text': 'Infants', 'start': 0, 'end': 7, 'um..."
3,Infants under two with moderate-to-severe ecze...,Infants moderate-to-severe eczema responding s...,"[Infants, moderate-to-severe, eczema, respondi...","[NOUN, ADJ, NOUN, VERB, ADJ, NOUN]","[ENTITY, , ENTITY, , ENTITY, ENTITY]","[Infants, eczema, two, Infants, treatment, res...","[{'text': 'Infants', 'start': 0, 'end': 7, 'um..."
4,Children and young people (up to 16 years of a...,Children young people 16 years age suspected a...,"[Children, young, people, 16, years, age, susp...","[NOUN, ADJ, NOUN, NUM, NOUN, NOUN, ADJ, ADJ, N...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, , E...","[Children, people, Children, years, Children, ...","[{'text': 'Children', 'start': 0, 'end': 8, 'u..."
5,Young people and adults (from 16 years of age)...,Young people adults 16 years age history anaph...,"[Young, people, adults, 16, years, age, histor...","[ADJ, NOUN, NOUN, NUM, NOUN, NOUN, NOUN, ADJ, ...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, ENT...","[people, people, people, years, people, years,...","[{'text': 'Young people', 'start': 0, 'end': 1..."


In [13]:
df = ic_df['umls_codes'][0]
df

[{'text': 'children',
  'start': 4,
  'end': 12,
  'umls_id': 'C0008059',
  'score': 0.9999998807907104},
 {'text': 'young people',
  'start': 14,
  'end': 26,
  'umls_id': 'C2963163',
  'score': 0.7737775444984436},
 {'text': 'adults',
  'start': 31,
  'end': 37,
  'umls_id': 'C0001675',
  'score': 1.0},
 {'text': 'selection criteria',
  'start': 50,
  'end': 68,
  'umls_id': 'C0242801',
  'score': 1.0},
 {'text': 'practices',
  'start': 89,
  'end': 98,
  'umls_id': 'C0237607',
  'score': 1.0},
 {'text': 'allergy clinic',
  'start': 122,
  'end': 136,
  'umls_id': 'C3810819',
  'score': 1.0}]

Now, I can unpack the contents of the dictionaries into individual columns in my dataframe

In [14]:
def extract_values(dicts, key):
    return [d.get(key, None) for d in dicts]

# Create new columns from the keys in the dictionaries within the 'info' column lists
unique_keys = set().union(*(d.keys() for dicts in ic_df['umls_codes'] for d in dicts))

for key in unique_keys:
    ic_df[key] = ic_df['umls_codes'].apply(lambda dicts: extract_values(dicts, key))

ic_df

Unnamed: 0,InclusionCriteria,InclusionCriteria_NoSW,tokens,pos,entity,parent,umls_codes,score,text,end,start,umls_id
0,"All children, young people and adults who fit ...",children young people adults fit selection cri...,"[children, young, people, adults, fit, selecti...","[NOUN, ADJ, NOUN, NOUN, VERB, NOUN, NOUN, NOUN...","[ENTITY, ENTITY, ENTITY, ENTITY, , ENTITY, ENT...","[referred, people, referred, people, adults, c...","[{'text': 'children', 'start': 4, 'end': 12, '...","[0.9999998807907104, 0.7737775444984436, 1.0, ...","[children, young people, adults, selection cri...","[12, 26, 37, 68, 98, 136]","[4, 14, 31, 50, 89, 122]","[C0008059, C2963163, C0001675, C0242801, C0237..."
1,All patients and parents / carers where approp...,patients parents carers appropriate deemed cap...,"[patients, parents, carers, appropriate, deeme...","[NOUN, NOUN, NOUN, NOUN, VERB, ADJ, VERB, ADJ,...","[ENTITY, ENTITY, ENTITY, , , , , , ENTITY, ENT...","[patients, patients, patients, where, carers, ...","[{'text': 'patients', 'start': 4, 'end': 12, '...","[1.0, 1.0, 1.0, 0.9999998807907104, 0.99999994...","[patients, parents, carers, consent, research ...","[12, 24, 33, 101, 138]","[4, 17, 27, 94, 122]","[C0030705, C0030551, C0085537, C1511481, C0700..."
2,Infants under two with suspected food allergy,Infants suspected food allergy,"[Infants, suspected, food, allergy]","[NOUN, VERB, NOUN, NOUN]","[ENTITY, ENTITY, ENTITY, ENTITY]","[Infants, allergy, allergy, Infants]","[{'text': 'Infants', 'start': 0, 'end': 7, 'um...","[1.0, 1.0, 1.0]","[Infants, suspected, food allergy]","[7, 32, 45]","[0, 23, 33]","[C0021270, C0332147, C0016470]"
3,Infants under two with moderate-to-severe ecze...,Infants moderate-to-severe eczema responding s...,"[Infants, moderate-to-severe, eczema, respondi...","[NOUN, ADJ, NOUN, VERB, ADJ, NOUN]","[ENTITY, , ENTITY, , ENTITY, ENTITY]","[Infants, eczema, two, Infants, treatment, res...","[{'text': 'Infants', 'start': 0, 'end': 7, 'um...","[1.0, 1.0, 1.0, 1.0]","[Infants, eczema, standard, treatment]","[7, 48, 75, 85]","[0, 42, 67, 76]","[C0021270, C0013595, C1442989, C0039798]"
4,Children and young people (up to 16 years of a...,Children young people 16 years age suspected a...,"[Children, young, people, 16, years, age, susp...","[NOUN, ADJ, NOUN, NUM, NOUN, NOUN, ADJ, ADJ, N...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, , E...","[Children, people, Children, years, Children, ...","[{'text': 'Children', 'start': 0, 'end': 8, 'u...","[0.9999998807907104, 0.7737775444984436, 1.0, ...","[Children, young people, years, age, allergic ...","[8, 25, 41, 48, 82, 91, 130, 153, 172]","[0, 13, 36, 45, 65, 83, 119, 134, 158]","[C0008059, C2963163, C0439234, C0001779, C1334..."
5,Young people and adults (from 16 years of age)...,Young people adults 16 years age history anaph...,"[Young, people, adults, 16, years, age, histor...","[ADJ, NOUN, NOUN, NUM, NOUN, NOUN, NOUN, ADJ, ...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, ENT...","[people, people, people, years, people, years,...","[{'text': 'Young people', 'start': 0, 'end': 1...","[0.7737775444984436, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[Young people, adults, years, age, history, an...","[12, 23, 38, 45, 61, 76, 89, 101]","[0, 17, 33, 42, 54, 65, 80, 90]","[C2963163, C0001675, C0439234, C0001779, C0019..."


Nice! Let me do it on the Exclusion Criteria dataframe as well for completion

In [15]:
ec_df['umls_codes'] = ec_df['ExclusionCriteria'].apply(get_umls_codes)

for key in unique_keys:
    ec_df[key] = ec_df['umls_codes'].apply(lambda dicts: extract_values(dicts, key))

ec_df

Unnamed: 0,ExclusionCriteria,ExclusionCriteria_NoSW,tokens,pos,entity,parent,umls_codes,score,text,end,start,umls_id
0,Over 2 years of age with delayed type food all...,2 years age delayed type food allergy presenti...,"[2, years, age, delayed, type, food, allergy, ...","[NUM, NOUN, NOUN, VERB, NOUN, NOUN, NOUN, VERB...","[, ENTITY, ENTITY, ENTITY, ENTITY, ENTITY, ENT...","[years, years, years, allergy, allergy, allerg...","[{'text': 'years', 'start': 7, 'end': 12, 'uml...","[1.0, 1.0, 0.7075120806694031, 1.0]","[years, age, delayed type food allergy, gastro...","[12, 19, 50, 102]","[7, 16, 25, 77]","[C0439234, C0001779, C0016470, C0426576]"
1,Over 2 years of age with confirmed non IgE-med...,2 years age confirmed non IgE-mediated symptom...,"[2, years, age, confirmed, non, IgE-mediated, ...","[NUM, NOUN, NOUN, VERB, ADJ, ADJ, NOUN, VERB, ...","[, ENTITY, ENTITY, , ENTITY, ENTITY, ENTITY, ,...","[years, years, years, symptoms, IgE-mediated, ...","[{'text': 'years', 'start': 7, 'end': 12, 'uml...","[1.0, 1.0, 1.0, 0.7329829335212708, 1.0, 1.0, ...","[years, age, non, IgE-mediated, symptoms, food...","[12, 19, 38, 51, 60, 88, 105]","[7, 16, 35, 39, 52, 71, 90]","[C0439234, C0001779, C1513853, C0020523, C0683..."
2,Single urticarial reactions without an obvious...,Single urticarial reactions obvious triggers,"[Single, urticarial, reactions, obvious, trigg...","[ADJ, ADJ, NOUN, ADJ, NOUN]","[ENTITY, ENTITY, ENTITY, , ENTITY]","[reactions, reactions, reactions, triggers, re...","[{'text': 'triggers', 'start': 47, 'end': 55, ...",[1.0],[triggers],[55],[47],[C0032930]
3,Non-allergic chronic urticaria,Non-allergic chronic urticaria,"[Non-allergic, chronic, urticaria]","[ADJ, ADJ, NOUN]","[ENTITY, ENTITY, ENTITY]","[urticaria, urticaria, urticaria]","[{'text': 'Non-allergic chronic urticaria', 's...",[0.7297990918159485],[Non-allergic chronic urticaria],[30],[0],[C0263338]
4,Drug allergy,Drug allergy,"[Drug, allergy]","[NOUN, NOUN]","[ENTITY, ENTITY]","[allergy, allergy]","[{'text': 'Drug allergy', 'start': 0, 'end': 1...",[1.0],[Drug allergy],[12],[0],[C0013182]
5,"Well controlled allergic rhinitis, asthma or a...",controlled allergic rhinitis asthma atopic eczema,"[controlled, allergic, rhinitis, asthma, atopi...","[VERB, ADJ, NOUN, NOUN, ADJ, NOUN]","[ENTITY, ENTITY, ENTITY, ENTITY, ENTITY, ENTITY]","[rhinitis, rhinitis, rhinitis, rhinitis, eczem...","[{'text': 'Well controlled', 'start': 0, 'end'...","[1.0, 1.0, 1.0, 1.0]","[Well controlled, allergic rhinitis, asthma, a...","[15, 33, 41, 58]","[0, 16, 35, 45]","[C3853142, C1334103, C0004096, C0011615]"
6,Mild-to-moderate atopic eczema without an obvi...,Mild-to-moderate atopic eczema obvious allergi...,"[Mild-to-moderate, atopic, eczema, obvious, al...","[ADJ, ADJ, NOUN, ADJ, ADJ, NOUN]","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY]","[eczema, eczema, eczema, trigger, trigger, ecz...","[{'text': 'atopic eczema', 'start': 17, 'end':...",[1.0],[atopic eczema],[30],[17],[C0011615]
7,Localised insect sting reactions,Localised insect sting reactions,"[Localised, insect, sting, reactions]","[ADJ, NOUN, NOUN, NOUN]","[ENTITY, ENTITY, ENTITY, ENTITY]","[insect, insect, reactions, insect]",[],[],[],[],[],[]


##Test out Similarity Metrics
I need a way to assess how well a clinical trial matches up with a patient profile in a quantitative way. This could be potentially achieved via the Sorensen-Dice Index or the Jaccard Index.

The Jaccard index and the Sørensen-Dice index are both similarity coefficients that measure the similarity between two sets. They return values between 0 and 1, where 0 means the sets have no common elements, and 1 means the sets are identical. The higher the value, the more similar the sets are.

You can compare the values of the Jaccard index and the Sørensen-Dice index directly, keeping in mind that they have slightly different properties:

    Jaccard index: It is the ratio of the size of the intersection of the two sets to the size of their union.

    J(A, B) = |A ∩ B| / |A ∪ B|

    Sørensen-Dice index: It is the ratio of twice the size of the intersection of the two sets to the sum of the sizes of the two sets.

    SD(A, B) = 2 × |A ∩ B| / (|A| + |B|)

Both indices are valid measures of similarity, but the Sørensen-Dice index generally gives higher values than the Jaccard index for the same sets, because it's more sensitive to the size of the intersection. This makes the Sørensen-Dice index more appropriate when small intersections are important, while the Jaccard index is more appropriate when the focus is on the overall similarity between the two sets, including both their intersection and their differences.

When comparing the values of the Jaccard index and the Sørensen-Dice index, consider the following:

    If both values are high, it means the sets have a high degree of similarity.
    If both values are low, it means the sets have a low degree of similarity.
    If one index has a high value while the other has a low value, it may indicate that the sets have a small intersection but the size of the union is large (Jaccard index is low) or that the sets have a small intersection with small individual set sizes (Sørensen-Dice index is high).

Below I show how these two indices can be used to gauge similarity: the higher the value the more similar those things are.

In [122]:
# Define the Jaccard index function
def jaccard_index(a, b):
    set_a = set(a)
    set_b = set(b)
    intersection = set_a.intersection(set_b)
    union = set_a.union(set_b)
    return len(intersection) / len(union)

# Define the Sørensen-Dice index function
def sorensen_dice_index(a, b):
    set_a = set(a)
    set_b = set(b)
    intersection = set_a.intersection(set_b)
    return 2 * len(intersection) / (len(set_a) + len(set_b))

In [124]:
# Generate random data for testing
def random_string(length):
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(length))

def random_list(num_elements, max_length):
    return [random_string(random.randint(1, max_length)) for _ in range(num_elements)]

num_rows = 10
data = {
    'column_A': [random_list(random.randint(1, 7), 5) for _ in range(num_rows)],
    'column_B': [random_list(random.randint(1, 7), 5) for _ in range(num_rows)]
}

df = pd.DataFrame(data)

# Calculate the Jaccard index between the two columns
df['jaccard_index'] = df.apply(lambda row: jaccard_index(row['column_A'], row['column_B']), axis=1)

# Calculate the Sørensen-Dice index between the two columns
df['sorensen_dice_index'] = df.apply(lambda row: sorensen_dice_index(row['column_A'], row['column_B']), axis=1)

df

Unnamed: 0,column_A,column_B,jaccard_index,sorensen_dice_index
0,"[nxbj, ycdqt, g, nmw, c, gj]",[ir],0.0,0.0
1,"[s, uycim]",[eq],0.0,0.0
2,[sgo],"[q, hzmz, jfi, lmdue]",0.0,0.0
3,"[pyal, bzj, td, d, jokpl, llwlz]","[wvnmc, ww, p, opx]",0.0,0.0
4,"[skevc, hihq, zhy, esr, zy, rrtfy, pbsez]","[et, ikgc, zz, skw, hkpi, kofks, pl]",0.0,0.0
5,"[kbgt, kfet, ajljq, acpvu, ktt, b]","[y, pa]",0.0,0.0
6,"[o, jkbl, qmg, nqtd, d]","[erfbk, m, vkk, vs, kg]",0.0,0.0
7,"[xe, d, eae, kt, h, qzu]","[uyftp, eed, oazs, riodh, idp, oweno]",0.0,0.0
8,"[pxo, w, x, wlo, q, fyf]","[y, m]",0.0,0.0
9,"[wj, nes, lyvn, bv]","[fd, kswit, yu, zlj, xttf]",0.0,0.0


In [127]:
# Generate similar random strings
def generate_similar_string(s, num_changes, mode):
    s_list = list(s)

    for _ in range(num_changes):
        if mode == 'swap' and len(s) > 1:
            i, j = random.sample(range(len(s)), 2)
            s_list[i], s_list[j] = s_list[j], s_list[i]
        elif mode == 'change':
            i = random.randrange(len(s))
            s_list[i] = random.choice(string.ascii_lowercase)
        elif mode == 'add_remove':
            if random.choice([True, False]):  # Add a character
                i = random.randrange(len(s) + 1)
                s_list.insert(i, random.choice(string.ascii_lowercase))
            else:  # Remove a character
                if len(s) > 1:
                    i = random.randrange(len(s))
                    s_list.pop(i)

    return ''.join(s_list)

def generate_similar_list(base_list, num_changes, mode):
    return [generate_similar_string(s, num_changes, mode) for s in base_list]

num_rows = 5
num_changes = 1
mode = 'change'  # You can try 'swap', 'change', or 'add_remove'
data = {
    'column_A': [random_list(random.randint(1, 7), 5) for _ in range(num_rows)],
    'column_B': [generate_similar_list(row, num_changes, mode) for row in data['column_A']]
}

df = pd.DataFrame(data)

#Ensure that one of the rows equals one another for sanity check 
df['column_A'][4] = df['column_B'][4]

# Calculate the Jaccard index between the two columns
df['jaccard_index'] = df.apply(lambda row: jaccard_index(row['column_A'], row['column_B']), axis=1)

# Calculate the Sørensen-Dice index between the two columns
df['sorensen_dice_index'] = df.apply(lambda row: sorensen_dice_index(row['column_A'], row['column_B']), axis=1)

df

Unnamed: 0,column_A,column_B,jaccard_index,sorensen_dice_index
0,"[rvl, d]","[nlh, d, wnj, nozv, a, teja]",0.142857,0.25
1,"[ofpnn, aq, aaie, p, oosl, avdpk, tlrgk]","[jyccz, vmg, zlmxd, guk]",0.0,0.0
2,"[kjj, ylrv, d]","[x, gdgjw, t, igws, czm]",0.0,0.0
3,"[fvgyb, vg, xzppx, rjf, gk]","[ejip, a, xbeot, zb, hwy]",0.0,0.0
4,"[iw, qkpf]","[iw, qkpf]",1.0,1.0


That seems to be working properly! I think this will work nicely as an initial match quality metric. I'll just need to make some adjustments based on the output my routine will be producing.