In [8]:
from datasets import load_dataset
import pandas as pd
import huggingface_hub
import matplotlib.pyplot as plt
import numpy as np
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [9]:
# process_text() relies on three NLTK resources: the Punkt models behind
# word_tokenize, the English stop-word list, and WordNet for the lemmatizer.
# Newer NLTK releases look up 'punkt_tab' instead of 'punkt', so fetch both
# to keep the notebook runnable on a fresh environment.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# General-purpose English pipeline and the Med7 pipeline; both are used
# further down for named-entity recognition.
nlp = spacy.load("en_core_web_sm")
med7 = spacy.load("en_core_med7_lg")

# Lazily-filled cache for the stop-word set and the lemmatizer.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return ``(stop_words, lemmatizer)``, building both on the first call.

    They are built lazily so that defining ``process_text`` does not require
    the NLTK corpora, and cached so they are not rebuilt for every document.
    """
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def process_text(text):
    """Normalize a raw document into a space-separated string of lemmas.

    Steps: collapse whitespace, tokenize with NLTK, lowercase, drop English
    stop words, lemmatize with WordNet, then keep only purely alphabetic
    tokens (this removes punctuation as well as numbers and mixed tokens).

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    # Collapse every run of whitespace (newlines, tabs, repeated spaces)
    # into a single space.
    text = ' '.join(text.split())

    # Word tokenization and lowercasing using NLTK
    tokens = [token.lower() for token in word_tokenize(text)]

    stop_words, lemmatizer = _get_text_resources()

    # Remove stop words BEFORE lemmatizing: the WordNet lemmatizer turns
    # e.g. "was" into "wa" and "has" into "ha", which would no longer match
    # the stop-word list and would leak into the output.
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Filter once more after lemmatization (a lemma can itself be a stop
    # word) and drop every token that is not purely alphabetic.
    tokens = [
        token for token in tokens
        if token not in stop_words and token.isalpha()
    ]

    return ' '.join(tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
# Fetch the medical-domain corpus through the `datasets` library.
DATASET_NAME = "argilla/medical-domain"
dataset = load_dataset(DATASET_NAME)

In [11]:
# Show the raw text of the first training record.
first_record = dataset['train'][0]
first_record['text']

'PREOPERATIVE DIAGNOSIS:,  Iron deficiency anemia.,POSTOPERATIVE DIAGNOSIS:,  Diverticulosis.,PROCEDURE:,  Colonoscopy.,MEDICATIONS: , MAC.,PROCEDURE: , The Olympus pediatric variable colonoscope was introduced into the rectum and advanced carefully through the colon to the cecum identified by the ileocecal valve and the appendiceal orifice.  Preparation was good, although there was some residual material in the cecum that was difficult to clear completely.  The mucosa was normal throughout the colon.  No polyps or other lesions were identified, and no blood was noted.  Some diverticula were seen of the sigmoid colon with no luminal narrowing or evidence of inflammation.  A retroflex view of the anorectal junction showed no hemorrhoids.  The patient tolerated the procedure well and was sent to the recovery room.,FINAL DIAGNOSES:,1.  Diverticulosis in the sigmoid.,2.  Otherwise normal colonoscopy to the cecum.,RECOMMENDATIONS:,1.  Follow up with Dr. X as needed.,2.  Screening colonoscop

In [12]:
# preprocess the data
# Clean every training text first and build the frame in a single step:
# enlarging a DataFrame one `.loc` row at a time re-allocates on each insert
# and gets slower as the frame grows.
processed_texts = [process_text(record['text']) for record in dataset['train']]
dataset_p = pd.DataFrame({'tokens': processed_texts})

In [13]:
# Preview the preprocessed corpus: one row per document, cleaned text in 'tokens'.
dataset_p

Unnamed: 0,tokens
0,preoperative diagnosis iron deficiency postope...
1,clinical indication normal stress procedure pe...
2,finding scan performed reformatted image obtai...
3,preoperative diagnosis blood loss postoperativ...
4,reason visit elevated psa nocturia occasional ...
...,...
4961,single chamber pacemaker implantation preopera...
4962,procedure caudal epidural steroid injection wi...
4963,preoperative diagnosis hemangioma nasal postop...
4964,preoperative diagnosis right trigger postopera...


In [15]:
# First preprocessed document, selected by its index label.
dataset_p.loc[0, :]

tokens    preoperative diagnosis iron deficiency postope...
Name: 0, dtype: object

In [39]:
# Run the Med7 pipeline on one preprocessed note.
# `med7` and `nlp` are already loaded in the setup cell, so they are not
# reloaded here. Language.add_pipe() registers a pipeline component by
# factory name; it does not accept a Doc, so passing `nlp(text)` to it
# raises. Calling the pipeline on the text is what produces a Doc.
text = dataset_p.loc[120, 'tokens']
doc = med7(text)

In [47]:
# Highlight palette for entity labels. The name is historical: the list holds
# 25 colours. Every entry must be distinct, otherwise two different labels
# are rendered with the same highlight and cannot be told apart.
seven_colours = ['#e6194B', '#3cb44b', '#ffe119', '#ffd8b1', '#f58231', '#f032e6', '#42d4f4', '#fabebe', '#469990',
                 '#e6beff', '#9A6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#4363d8', '#000075', '#a9a9a9',
                 '#ffffff', '#000000', '#ff69b4', '#911eb4', '#46f0f0', '#6a5acd', '#bcf60c']
assert len(set(seven_colours)) == len(seven_colours), "duplicate colour in palette"

# NER label sets of the two pipelines, Med7 labels first.
med7_ner = list(med7.pipe_labels['ner'])
nlp_ner = list(nlp.pipe_labels['ner'])
combined_ner = med7_ner + nlp_ner

# zip() stops at the shorter input, so labels beyond the end of the palette
# get no entry here and fall back to displaCy's default colour.
col_dict = dict(zip(combined_ner, seven_colours))

options = {'ents': combined_ner, 'colors': col_dict}

# Run both pipelines on the same preprocessed note.
text = dataset_p.loc[120, 'tokens']
doc_med7 = med7(text)
doc_web_sm = nlp(text)

entities_med7 = [(ent.text, ent.label_) for ent in doc_med7.ents]
entities_web_sm = [(ent.text, ent.label_) for ent in doc_web_sm.ents]

# Render each Doc's entities with the shared colour map.
docs = [doc_med7, doc_web_sm]
spacy.displacy.render(docs, style='ent', jupyter=True, options=options)

# Same order as `docs`: Med7 entities first, then en_core_web_sm entities.
combined_entities = entities_med7 + entities_web_sm

print(combined_entities)

[('chlorhexidine', 'DRUG'), ('phenol', 'DRUG'), ('phenol', 'DRUG'), ('injection', 'FORM'), ('injection', 'FORM'), ('injected', 'ROUTE'), ('botulinum toxin', 'DRUG'), ('botulinum toxin', 'DRUG'), ('injected', 'ROUTE'), ('vastus medialis', 'PERSON'), ('one', 'CARDINAL'), ('two', 'CARDINAL')]


In [48]:
# Number of preprocessed documents (rows in the frame).
dataset_p.shape[0]

4966

In [None]:
# CUSTOMIZE NER MODEL
import random
from spacy.training.example import Example

nlp = spacy.load("en_core_web_sm")

# Language.get_pipe() raises KeyError for a missing component instead of
# returning None, and spaCy v3's add_pipe() takes a factory name rather than
# a component object, so check pipe_names explicitly.
if "ner" in nlp.pipe_names:
    ner = nlp.get_pipe("ner")
else:
    # NOTE(review): a freshly added component is untrained and would need
    # initializing before nlp.update() can be used on it.
    ner = nlp.add_pipe("ner")

ner.add_label("CUSTOM_LABEL")

# Training data in spaCy's (text, annotations) format, e.g.
#   ("some text", {"entities": [(start_char, end_char, "CUSTOM_LABEL")]})
# TODO: fill in real examples; with an empty list the loop below does nothing.
TRAIN_DATA = []

# Update only the NER component so the other components of the pretrained
# pipeline are left untouched.
with nlp.select_pipes(enable="ner"):
    for epoch in range(20):  # Number of training epochs
        # Re-shuffle every epoch so the examples are seen in a new order.
        random.shuffle(TRAIN_DATA)
        for texts, annotations in TRAIN_DATA:
            doc = nlp.make_doc(texts)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.5)  # Drop is a dropout rate