<a href="https://colab.research.google.com/github/victormurcia/CTS_Test/blob/main/Testing_Parsing_of_Clinical_Trial_Eligibility.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#I need to import locale to ensure that the encoding is set to UTF-8 (weird Google Colab bug)
import locale
locale.getpreferredencoding = lambda: "UTF-8"

#Check the current build in Google Colab
!cat /etc/*release
print('\n')

#Check CUDA version
!nvcc --version
print('\n')

#Ensure that the required packages are installed in the current environment
!pip install ipywidgets --quiet
!pip install spacy==3.4.4 --quiet
!pip install scispacy --quiet
!pip install medspacy --quiet
!pip install negspacy --quiet
!pip install transformers
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --quiet 
print('\n')

#Spacy models used for processing biomedical, scientific, or clinical text 
#Spacy pipeline for biomedical data.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz --quiet

print('\n')

DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=20.04
DISTRIB_CODENAME=focal
DISTRIB_DESCRIPTION="Ubuntu 20.04.5 LTS"
NAME="Ubuntu"
VERSION="20.04.5 LTS (Focal Fossa)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 20.04.5 LTS"
VERSION_ID="20.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=focal
UBUNTU_CODENAME=focal


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently t

In [2]:
#Import the required libraries/packages
#General utilities
import numpy as np
import pandas as pd
import dask.dataframe as dd
import seaborn as sns
from ipywidgets import widgets, interact, interactive, fixed, interact_manual

#NLP Stuff
#Spacy
import spacy
from spacy.lang.en.stop_words import STOP_WORDS #Load stopwords
from spacy.language import Language
from spacy.tokenizer import Tokenizer
#Scispacy
import scispacy
from scispacy.linking import EntityLinker
from scispacy.abbreviation import AbbreviationDetector
from scispacy.hyponym_detector import HyponymDetector
#Medspacy
import medspacy
from medspacy.ner import TargetRule
from medspacy.visualization import visualize_ent
from negspacy.negation import Negex

#To use Transformers models from HuggingFace
import transformers
from transformers import AutoTokenizer, AutoModel,AutoModelForTokenClassification
#NLTK

#Enable data to be extracted from my Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#Query for trials
cond = 'allergy'#input('Enter the disease condition to find clinical trials: ')
a = 'https://clinicaltrials.gov/api/query/study_fields?expr='
b = '&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CLeadSponsorName%2CEligibilityCriteria'
c = '&min_rnk=1&max_rnk=1000&fmt=csv'
q=(a + cond + b + c)
print(q)
qtrials = pd.read_csv(q,skiprows=10)

@interact
def show_recruiting_studies(column=['OverallStatus','Condition'], 
                            x = ['Recruiting','Completed','Unknown status'],
                            y = ['Food','Antibiotic']
                            ):
    if column == 'OverallStatus':
      return qtrials.loc[qtrials[column] == x]
    elif column == 'Condition':
      return qtrials[qtrials['Condition'].str.contains(y)]

https://clinicaltrials.gov/api/query/study_fields?expr=allergy&fields=NCTId%2CBriefTitle%2CCondition%2COverallStatus%2CLeadSponsorName%2CEligibilityCriteria&min_rnk=1&max_rnk=1000&fmt=csv


interactive(children=(Dropdown(description='column', options=('OverallStatus', 'Condition'), value='OverallSta…

In [4]:
#Load the models
ss_sm = spacy.load("en_core_sci_sm")

In [5]:
#DataFrame containing eligility criteria for all queried trials
qt_ec = qtrials[['EligibilityCriteria']]

#Eligibility criteria for current trial
ec = qt_ec['EligibilityCriteria'][1]

#Split eligibility criteria into exclusion and inclusion criteria sections
sections = ec.split("||Exclusion Criteria:||")
inclusion_criteria = [criteria for criteria in sections[0].replace("Inclusion Criteria:||", "").split("|") if len(criteria) > 0]
exclusion_criteria = [criteria for criteria in sections[1].split("|") if len(criteria) > 0]

print("Inclusion Criteria:")
for criteria in inclusion_criteria:
    print("-", criteria)

print("\nExclusion Criteria:")
for criteria in exclusion_criteria:
    print("-", criteria)

# remove stopwords from the inclusion criteria
inclusion_criteria_sw = [' '.join([token.text for token in ss_sm(criteria) if not token.is_stop]) for criteria in inclusion_criteria]

# remove stopwords from the exclusion criteria
exclusion_criteria_sw = [' '.join([token.text for token in ss_sm(criteria) if not token.is_stop]) for criteria in exclusion_criteria]

print('\n-------Remove Stopwords from Eligibility Criteria-------')
print("\nInclusion Criteria:")
for criteria in inclusion_criteria_sw:
    print("-", criteria)

print("\nExclusion Criteria:")
for criteria in exclusion_criteria_sw:
    print("-", criteria)

Inclusion Criteria:
- All children, young people and adults who fit the selection criteria from across all the practices can be referred to the allergy clinic.
- All patients and parents / carers where appropriate must be deemed capable of giving informed consent to take part in the research project.
- Infants under two with suspected food allergy
- Infants under two with moderate-to-severe eczema not responding to standard treatment.
- Children and young people (up to 16 years of age) with suspected allergic rhinitis symptoms that are unresponsive to a combination of oral antihistamines and nasal steroids
- Young people and adults (from 16 years of age) with a history of anaphylaxis or suspected anaphylaxis

Exclusion Criteria:
- Over 2 years of age with delayed type food allergy presenting primarily with gastrointestinal symptoms
- Over 2 years of age with confirmed non IgE-mediated symptoms including food intolerances, coeliac disease etc.
- Single urticarial reactions without an ob

In [6]:
#Dataframe containing inclusion criteria for current trial
ic_df = pd.DataFrame({'InclusionCriteria': inclusion_criteria, 'InclusionCriteria_NoSW': inclusion_criteria_sw})
ic_df

Unnamed: 0,InclusionCriteria,InclusionCriteria_NoSW
0,"All children, young people and adults who fit ...","children , young people adults fit selection c..."
1,All patients and parents / carers where approp...,patients parents / carers appropriate deemed c...
2,Infants under two with suspected food allergy,Infants suspected food allergy
3,Infants under two with moderate-to-severe ecze...,Infants moderate-to-severe eczema responding s...
4,Children and young people (up to 16 years of a...,Children young people ( 16 years age ) suspect...
5,Young people and adults (from 16 years of age)...,Young people adults ( 16 years age ) history a...


In [7]:
#Dataframe containing exclusion criteria for current trial
ec_df = pd.DataFrame({'ExclusionCriteria': exclusion_criteria, 'ExclusionCriteria_NoSW': exclusion_criteria_sw})
ec_df

Unnamed: 0,ExclusionCriteria,ExclusionCriteria_NoSW
0,Over 2 years of age with delayed type food all...,2 years age delayed type food allergy presenti...
1,Over 2 years of age with confirmed non IgE-med...,2 years age confirmed non IgE-mediated symptom...
2,Single urticarial reactions without an obvious...,Single urticarial reactions obvious triggers
3,Non-allergic chronic urticaria,Non-allergic chronic urticaria
4,Drug allergy,Drug allergy
5,"Well controlled allergic rhinitis, asthma or a...","controlled allergic rhinitis , asthma atopic e..."
6,Mild-to-moderate atopic eczema without an obvi...,Mild-to-moderate atopic eczema obvious allergi...
7,Localised insect sting reactions,Localised insect sting reactions


In [8]:
#Tokenize, identify part of speech, entity and parent word, and remove stopwords/punctuation from each row 
ic_df['tokens'] = ic_df['InclusionCriteria'].apply(lambda x: [token.text      for token in ss_sm(x)  if not token.is_stop and not token.is_punct])
ic_df['pos']    = ic_df['InclusionCriteria'].apply(lambda x: [token.pos_      for token in ss_sm(x)  if not token.is_stop and not token.is_punct])
ic_df['entity'] = ic_df['InclusionCriteria'].apply(lambda x: [token.ent_type_ for token in ss_sm(x)  if not token.is_stop and not token.is_punct])
ic_df['parent'] = ic_df['InclusionCriteria'].apply(lambda x: [token.head.text for token in ss_sm(x)  if not token.is_stop and not token.is_punct])

#Tokenize, identify part of speech, entity and parent word, and remove stopwords/punctuation from each row 
ec_df['tokens'] = ec_df['ExclusionCriteria'].apply(lambda x: [token.text      for token in ss_sm(x)  if not token.is_stop and not token.is_punct])
ec_df['pos']    = ec_df['ExclusionCriteria'].apply(lambda x: [token.pos_      for token in ss_sm(x)  if not token.is_stop and not token.is_punct])
ec_df['entity'] = ec_df['ExclusionCriteria'].apply(lambda x: [token.ent_type_ for token in ss_sm(x)  if not token.is_stop and not token.is_punct])
ec_df['parent'] = ec_df['ExclusionCriteria'].apply(lambda x: [token.head.text for token in ss_sm(x)  if not token.is_stop and not token.is_punct])

In [9]:
ic_df

Unnamed: 0,InclusionCriteria,InclusionCriteria_NoSW,tokens,pos,entity,parent
0,"All children, young people and adults who fit ...","children , young people adults fit selection c...","[children, young, people, adults, fit, selecti...","[NOUN, ADJ, NOUN, NOUN, VERB, NOUN, NOUN, NOUN...","[ENTITY, ENTITY, ENTITY, ENTITY, , ENTITY, ENT...","[referred, people, referred, people, adults, c..."
1,All patients and parents / carers where approp...,patients parents / carers appropriate deemed c...,"[patients, parents, carers, appropriate, deeme...","[NOUN, NOUN, NOUN, NOUN, VERB, ADJ, VERB, ADJ,...","[ENTITY, ENTITY, ENTITY, , , , , , ENTITY, ENT...","[patients, patients, patients, where, carers, ..."
2,Infants under two with suspected food allergy,Infants suspected food allergy,"[Infants, suspected, food, allergy]","[NOUN, VERB, NOUN, NOUN]","[ENTITY, ENTITY, ENTITY, ENTITY]","[Infants, allergy, allergy, Infants]"
3,Infants under two with moderate-to-severe ecze...,Infants moderate-to-severe eczema responding s...,"[Infants, moderate-to-severe, eczema, respondi...","[NOUN, ADJ, NOUN, VERB, ADJ, NOUN]","[ENTITY, , ENTITY, , ENTITY, ENTITY]","[Infants, eczema, two, Infants, treatment, res..."
4,Children and young people (up to 16 years of a...,Children young people ( 16 years age ) suspect...,"[Children, young, people, 16, years, age, susp...","[NOUN, ADJ, NOUN, NUM, NOUN, NOUN, ADJ, ADJ, N...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, , E...","[Children, people, Children, years, Children, ..."
5,Young people and adults (from 16 years of age)...,Young people adults ( 16 years age ) history a...,"[Young, people, adults, 16, years, age, histor...","[ADJ, NOUN, NOUN, NUM, NOUN, NOUN, NOUN, ADJ, ...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, ENT...","[people, people, people, years, people, years,..."


In [10]:
ec_df

Unnamed: 0,ExclusionCriteria,ExclusionCriteria_NoSW,tokens,pos,entity,parent
0,Over 2 years of age with delayed type food all...,2 years age delayed type food allergy presenti...,"[2, years, age, delayed, type, food, allergy, ...","[NUM, NOUN, NOUN, VERB, NOUN, NOUN, NOUN, VERB...","[, ENTITY, ENTITY, ENTITY, ENTITY, ENTITY, ENT...","[years, years, years, allergy, allergy, allerg..."
1,Over 2 years of age with confirmed non IgE-med...,2 years age confirmed non IgE-mediated symptom...,"[2, years, age, confirmed, non, IgE-mediated, ...","[NUM, NOUN, NOUN, VERB, ADJ, ADJ, NOUN, VERB, ...","[, ENTITY, ENTITY, , ENTITY, ENTITY, ENTITY, ,...","[years, years, years, symptoms, IgE-mediated, ..."
2,Single urticarial reactions without an obvious...,Single urticarial reactions obvious triggers,"[Single, urticarial, reactions, obvious, trigg...","[ADJ, ADJ, NOUN, ADJ, NOUN]","[ENTITY, ENTITY, ENTITY, , ENTITY]","[reactions, reactions, reactions, triggers, re..."
3,Non-allergic chronic urticaria,Non-allergic chronic urticaria,"[Non-allergic, chronic, urticaria]","[ADJ, ADJ, NOUN]","[ENTITY, ENTITY, ENTITY]","[urticaria, urticaria, urticaria]"
4,Drug allergy,Drug allergy,"[Drug, allergy]","[NOUN, NOUN]","[ENTITY, ENTITY]","[allergy, allergy]"
5,"Well controlled allergic rhinitis, asthma or a...","controlled allergic rhinitis , asthma atopic e...","[controlled, allergic, rhinitis, asthma, atopi...","[VERB, ADJ, NOUN, NOUN, ADJ, NOUN]","[ENTITY, ENTITY, ENTITY, ENTITY, ENTITY, ENTITY]","[rhinitis, rhinitis, rhinitis, rhinitis, eczem..."
6,Mild-to-moderate atopic eczema without an obvi...,Mild-to-moderate atopic eczema obvious allergi...,"[Mild-to-moderate, atopic, eczema, obvious, al...","[ADJ, ADJ, NOUN, ADJ, ADJ, NOUN]","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY]","[eczema, eczema, eczema, trigger, trigger, ecz..."
7,Localised insect sting reactions,Localised insect sting reactions,"[Localised, insect, sting, reactions]","[ADJ, NOUN, NOUN, NOUN]","[ENTITY, ENTITY, ENTITY, ENTITY]","[insect, insect, reactions, insect]"


In [21]:
# Load the pre-trained spaCy model with sci-spaCy
ss_sm = spacy.load("en_core_sci_sm")

def get_umls_codes(text: str):
    
    #Add the EntityLinker pipe to spacy pipeline
    if 'scispacy_linker' not in ss_sm.pipe_names:
      ss_sm.add_pipe("scispacy_linker", config={"linker_name": "umls", "max_entities_per_mention": 1})
    
    # Process the text and extract UMLS codes
    doc = ss_sm(text)
    umls_codes = [
        {
            "text": entity.text,
            "start": entity.start_char,
            "end": entity.end_char,
            "umls_id": umls_ent[0],
            "score": umls_ent[1]
        }
        for entity in doc.ents
        for umls_ent in entity._.kb_ents
    ]
    
    return umls_codes

In [22]:
ic_df['umls_codes'] = ic_df['InclusionCriteria'].apply(get_umls_codes)
ic_df

Unnamed: 0,InclusionCriteria,InclusionCriteria_NoSW,tokens,pos,entity,parent,umls_codes
0,"All children, young people and adults who fit ...","children , young people adults fit selection c...","[children, young, people, adults, fit, selecti...","[NOUN, ADJ, NOUN, NOUN, VERB, NOUN, NOUN, NOUN...","[ENTITY, ENTITY, ENTITY, ENTITY, , ENTITY, ENT...","[referred, people, referred, people, adults, c...","[{'text': 'children', 'start': 4, 'end': 12, '..."
1,All patients and parents / carers where approp...,patients parents / carers appropriate deemed c...,"[patients, parents, carers, appropriate, deeme...","[NOUN, NOUN, NOUN, NOUN, VERB, ADJ, VERB, ADJ,...","[ENTITY, ENTITY, ENTITY, , , , , , ENTITY, ENT...","[patients, patients, patients, where, carers, ...","[{'text': 'patients', 'start': 4, 'end': 12, '..."
2,Infants under two with suspected food allergy,Infants suspected food allergy,"[Infants, suspected, food, allergy]","[NOUN, VERB, NOUN, NOUN]","[ENTITY, ENTITY, ENTITY, ENTITY]","[Infants, allergy, allergy, Infants]","[{'text': 'Infants', 'start': 0, 'end': 7, 'um..."
3,Infants under two with moderate-to-severe ecze...,Infants moderate-to-severe eczema responding s...,"[Infants, moderate-to-severe, eczema, respondi...","[NOUN, ADJ, NOUN, VERB, ADJ, NOUN]","[ENTITY, , ENTITY, , ENTITY, ENTITY]","[Infants, eczema, two, Infants, treatment, res...","[{'text': 'Infants', 'start': 0, 'end': 7, 'um..."
4,Children and young people (up to 16 years of a...,Children young people ( 16 years age ) suspect...,"[Children, young, people, 16, years, age, susp...","[NOUN, ADJ, NOUN, NUM, NOUN, NOUN, ADJ, ADJ, N...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, , E...","[Children, people, Children, years, Children, ...","[{'text': 'Children', 'start': 0, 'end': 8, 'u..."
5,Young people and adults (from 16 years of age)...,Young people adults ( 16 years age ) history a...,"[Young, people, adults, 16, years, age, histor...","[ADJ, NOUN, NOUN, NUM, NOUN, NOUN, NOUN, ADJ, ...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, ENT...","[people, people, people, years, people, years,...","[{'text': 'Young people', 'start': 0, 'end': 1..."


In [17]:
df = ic_df['umls_codes'][0]
df

[{'text': 'children',
  'start': 4,
  'end': 12,
  'umls_id': 'C0008059',
  'score': 0.9999998807907104},
 {'text': 'children',
  'start': 4,
  'end': 12,
  'umls_id': 'C0680063',
  'score': 0.9999998807907104},
 {'text': 'young people',
  'start': 14,
  'end': 26,
  'umls_id': 'C2963163',
  'score': 0.7737775444984436},
 {'text': 'adults',
  'start': 31,
  'end': 37,
  'umls_id': 'C0001675',
  'score': 1.0},
 {'text': 'adults',
  'start': 31,
  'end': 37,
  'umls_id': 'C0220615',
  'score': 0.8224515914916992},
 {'text': 'selection criteria',
  'start': 50,
  'end': 68,
  'umls_id': 'C0242801',
  'score': 1.0},
 {'text': 'selection criteria',
  'start': 50,
  'end': 68,
  'umls_id': 'C4684631',
  'score': 0.8625878691673279},
 {'text': 'practices',
  'start': 89,
  'end': 98,
  'umls_id': 'C0237607',
  'score': 1.0},
 {'text': 'practices',
  'start': 89,
  'end': 98,
  'umls_id': 'C3245512',
  'score': 0.8198214173316956},
 {'text': 'allergy clinic',
  'start': 122,
  'end': 136,
  'u

In [26]:
def extract_values(dicts, key):
    return [d.get(key, None) for d in dicts]

# Create new columns from the keys in the dictionaries within the 'info' column lists
unique_keys = set().union(*(d.keys() for dicts in ic_df['umls_codes'] for d in dicts))

for key in unique_keys:
    ic_df[key] = ic_df['umls_codes'].apply(lambda dicts: extract_values(dicts, key))

ic_df

Unnamed: 0,InclusionCriteria,InclusionCriteria_NoSW,tokens,pos,entity,parent,umls_codes,score,text,end,umls_id,start
0,"All children, young people and adults who fit ...","children , young people adults fit selection c...","[children, young, people, adults, fit, selecti...","[NOUN, ADJ, NOUN, NOUN, VERB, NOUN, NOUN, NOUN...","[ENTITY, ENTITY, ENTITY, ENTITY, , ENTITY, ENT...","[referred, people, referred, people, adults, c...","[{'text': 'children', 'start': 4, 'end': 12, '...","[0.9999998807907104, 0.7737775444984436, 1.0, ...","[children, young people, adults, selection cri...","[12, 26, 37, 68, 98, 136]","[C0008059, C2963163, C0001675, C0242801, C0237...","[4, 14, 31, 50, 89, 122]"
1,All patients and parents / carers where approp...,patients parents / carers appropriate deemed c...,"[patients, parents, carers, appropriate, deeme...","[NOUN, NOUN, NOUN, NOUN, VERB, ADJ, VERB, ADJ,...","[ENTITY, ENTITY, ENTITY, , , , , , ENTITY, ENT...","[patients, patients, patients, where, carers, ...","[{'text': 'patients', 'start': 4, 'end': 12, '...","[1.0, 1.0, 1.0, 0.9999998807907104, 0.99999994...","[patients, parents, carers, consent, research ...","[12, 24, 33, 101, 138]","[C0030705, C0030551, C0085537, C1511481, C0700...","[4, 17, 27, 94, 122]"
2,Infants under two with suspected food allergy,Infants suspected food allergy,"[Infants, suspected, food, allergy]","[NOUN, VERB, NOUN, NOUN]","[ENTITY, ENTITY, ENTITY, ENTITY]","[Infants, allergy, allergy, Infants]","[{'text': 'Infants', 'start': 0, 'end': 7, 'um...","[1.0, 1.0, 1.0]","[Infants, suspected, food allergy]","[7, 32, 45]","[C0021270, C0332147, C0016470]","[0, 23, 33]"
3,Infants under two with moderate-to-severe ecze...,Infants moderate-to-severe eczema responding s...,"[Infants, moderate-to-severe, eczema, respondi...","[NOUN, ADJ, NOUN, VERB, ADJ, NOUN]","[ENTITY, , ENTITY, , ENTITY, ENTITY]","[Infants, eczema, two, Infants, treatment, res...","[{'text': 'Infants', 'start': 0, 'end': 7, 'um...","[1.0, 1.0, 1.0, 1.0]","[Infants, eczema, standard, treatment]","[7, 48, 75, 85]","[C0021270, C0013595, C1442989, C0039798]","[0, 42, 67, 76]"
4,Children and young people (up to 16 years of a...,Children young people ( 16 years age ) suspect...,"[Children, young, people, 16, years, age, susp...","[NOUN, ADJ, NOUN, NUM, NOUN, NOUN, ADJ, ADJ, N...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, , E...","[Children, people, Children, years, Children, ...","[{'text': 'Children', 'start': 0, 'end': 8, 'u...","[0.9999998807907104, 0.7737775444984436, 1.0, ...","[Children, young people, years, age, allergic ...","[8, 25, 41, 48, 82, 91, 130, 153, 172]","[C0008059, C2963163, C0439234, C0001779, C1334...","[0, 13, 36, 45, 65, 83, 119, 134, 158]"
5,Young people and adults (from 16 years of age)...,Young people adults ( 16 years age ) history a...,"[Young, people, adults, 16, years, age, histor...","[ADJ, NOUN, NOUN, NUM, NOUN, NOUN, NOUN, ADJ, ...","[ENTITY, ENTITY, ENTITY, , ENTITY, ENTITY, ENT...","[people, people, people, years, people, years,...","[{'text': 'Young people', 'start': 0, 'end': 1...","[0.7737775444984436, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[Young people, adults, years, age, history, an...","[12, 23, 38, 45, 61, 76, 89, 101]","[C2963163, C0001675, C0439234, C0001779, C0019...","[0, 17, 33, 42, 54, 65, 80, 90]"


In [27]:
# Define the Jaccard index function
def jaccard_index(a, b):
    set_a = set(a)
    set_b = set(b)
    intersection = set_a.intersection(set_b)
    union = set_a.union(set_b)
    return len(intersection) / len(union)

# Define the Sørensen-Dice index function
def sorensen_dice_index(a, b):
    set_a = set(a)
    set_b = set(b)
    intersection = set_a.intersection(set_b)
    return 2 * len(intersection) / (len(set_a) + len(set_b))


In [None]:
# Calculate the Jaccard index between the two columns
df['jaccard_index'] = df.apply(lambda row: jaccard_index(row['column_A'], row['column_B']), axis=1)

# Calculate the Sørensen-Dice index between the two columns
df['sorensen_dice_index'] = df.apply(lambda row: sorensen_dice_indaex(row['column_A'], row['column_B']), axis=1)
