In [1]:
# # Use a pipeline as a high-level helper
# from transformers import pipeline

# pipe = pipeline("text-generation", model="meta-llama/Llama-3.2-1B") 

In [3]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="NorwAI/NorwAI-Mixtral-8x7B-instruct")

Loading checkpoint shards: 100%|██████████| 19/19 [07:23<00:00, 23.34s/it]
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [273]:
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'medical problems', 'treatments', and 'tests'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="problem"> to denote a medical problem.
Use <span class="treatment"> to denote a treatment.
Use <span class="test"> to denote a test.
Leave the text as it is if no such entities are found.

### Input Text: {}
### Output Text:
'''

In [4]:
prompt = '''### Oppgave
Din oppgave er å generere en HTML-versjon av en inndatatekst, som markerer spesifikke enheter relatert til helsetjenester. Enhetene som skal identifiseres er: "medisinske problemer", "behandlinger" og "tester". Bruk HTML <span>-tagger for å fremheve disse enhetene. Hver <span> skal ha et klasseattributt som indikerer typen av enheten.

### Entity Markup Guide
Bruk <span class="problem"> for å angi et medisinsk problem.
Bruk <span class="treatment"> for å angi en behandling.
Bruk <span class="test"> for å angi en test.
La teksten være som den er hvis ingen slike enheter blir funnet.

### Skriv inn tekst: {}
### Utdatatekst:
'''

In [5]:
text = '''
Pasienten har som ledd i familiescreening fått påvist mutasjon i MYH7 som er årsak til hypertrofisk kardiomyopati .
Indekspasienten er hans onkel på farssiden, som hatt hjertestans og fått implantert ICD .
Pasientens far og hans søster har også fått påvist mutasjonen, men er friske .
Pasientens to fettere på farssiden har også arvet mutasjonen, pasienten vet ikke om de er syke .
Pasienten har en søster og en bror som begge er gen-negative .
Han har to barn som foreløpig ikke er testet.. ( ikke testet - ikke kjent testet ) hvordan ser videre forløp ut.. .
'''

In [7]:
pipe(prompt.format(text), max_new_tokens=512)

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


[{'generated_text': '### Oppgave\nDin oppgave er å generere en HTML-versjon av en inndatatekst, som markerer spesifikke enheter relatert til helsetjenester. Enhetene som skal identifiseres er: "medisinske problemer", "behandlinger" og "tester". Bruk HTML <span>-tagger for å fremheve disse enhetene. Hver <span> skal ha et klasseattributt som indikerer typen av enheten.\n\n### Entity Markup Guide\nBruk <span class="problem"> for å angi et medisinsk problem.\nBruk <span class="treatment"> for å angi en behandling.\nBruk <span class="test"> for å angi en test.\nLa teksten være som den er hvis ingen slike enheter blir funnet.\n\n### Skriv inn tekst: \nPasienten har som ledd i familiescreening fått påvist mutasjon i MYH7 som er årsak til hypertrofisk kardiomyopati .\nIndekspasienten er hans onkel på farssiden, som hatt hjertestans og fått implantert ICD .\nPasientens far og hans søster har også fått påvist mutasjonen, men er friske .\nPasientens to fettere på farssiden har også arvet mutasjo

In [274]:
import json

with open('../data/Corona2.json') as f:
    d = json.load(f)

dataset_sample = []

for example in d['examples']:
    
    entities = [ (annot['start'], annot['end'], annot['value'], annot['tag_name']) for annot in example['annotations']]
    
    dataset_sample.append({
        'text': example['content'],
        'entities': entities
    })

tags = set()

for example in d['examples']:
    for annot in example['annotations']:
        tags.add(annot['tag_name'])
        
tags = list(tags)

In [296]:
tags

['Pathogen', 'MedicalCondition', 'Medicine']

In [297]:
PROMPTS = {
    1: '''### Task
        Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'medical problems', 'treatments', and 'tests'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

        ### Entity Markup Guide
        {}
        Leave the text as it is if no such entities are found.

        ### Input Text: {}
        ### Output Text:
    '''
}

In [298]:
tags_text = ''

for tag in tags:
    tags_text += f'Use <span class="{tag}"> to denote a {tag}.\n'

In [299]:
tags_text

'Use <span class="Pathogen"> to denote a Pathogen.\nUse <span class="MedicalCondition"> to denote a MedicalCondition.\nUse <span class="Medicine"> to denote a Medicine.\n'

In [301]:
PROMPTS[1].format(tags_text, dataset_sample[0]['text'])

'### Task\n        Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: \'medical problems\', \'treatments\', and \'tests\'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.\n\n        ### Entity Markup Guide\n        Use <span class="Pathogen"> to denote a Pathogen.\nUse <span class="MedicalCondition"> to denote a MedicalCondition.\nUse <span class="Medicine"> to denote a Medicine.\n\n        Leave the text as it is if no such entities are found.\n\n        ### Input Text: While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers\' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not pre

In [275]:
dataset_sample

[{'text': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
  'entities': [(360, 371, 'Diosmectite', 'Medicine'),
   (383, 408, 'aluminomagnesium silicate', 

In [276]:
dataset_sample[0]['text']

"While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]"

In [277]:
output = [{'generated_text': '### Task\nYour task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: \'medical problems\', \'treatments\', and \'tests\'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.\n\n### Entity Markup Guide\nUse <span class="problem"> to denote a medical problem.\nUse <span class="treatment"> to denote a treatment.\nUse <span class="test"> to denote a test.\nLeave the text as it is if no such entities are found.\n\n### Input Text: While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers\' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]\n### Output Text:\nWhile bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with <span class="problem">travelers\' diarrhea</span>, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of <span class="problem">acute diarrhea</span> in children,[93] and also has some effects in <span class="problem">chronic functional diarrhea</span>, <span class="problem">radiation-induced diarrhea</span>, and <span class="problem">chemotherapy-induced diarrhea</span>. Another absorbent agent used for the treatment of mild <span class="problem">diarrhea</span> is kaopectate.\n\n<span class="treatment">Racecadotril</span> an antisecretory medication may be used to treat <span class="problem">diarrhea</span> in children and adults.[86] It has better tolerability than loperamide, as it causes less <span class="problem">constipation</span> and <span class="problem">flatulence</span>.[94] ### Example Use Cases\n1.  **Medical Record**: A doctor wants to highlight a patient\'s medical problem, "diabetes," in their medical record. They use the HTML markup <span class="problem">diabetes</span> to denote the entity.\n2.  **Medical Research Article**: A researcher wants to highlight a treatment, "antibiotics," in their research article. They use the HTML markup <span class="treatment">antibiotics</span> to denote the entity.\n3.  **Healthcare Website**: A healthcare website wants to highlight a test, "blood pressure test," on their website. They use the HTML markup <span class="test">blood pressure test</span> to denote the entity. ### Code\n```html\n<span class="problem">travelers\' diarrhea</span>\n<span class="problem">acute diarrhea</span>\n<span class="problem">chronic functional diarrhea</span>\n<span class="problem">radiation-induced diarrhea</span>\n<span class="problem">chemotherapy-induced diarrhea</span>\n<span class="problem">diarrhea</span>\n<span class="treatment'}]

In [278]:
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString, Tag
from glob import glob

In [279]:
html = output[0]['generated_text']

In [280]:
from transformers import AutoTokenizer
from preprocess.setup import Preprocess

In [281]:
checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
preprocess = Preprocess(tokenizer)

In [282]:
text = output[0]['generated_text']

In [283]:
start = text.find('Output Text:') + len('Output Text:')

In [284]:
stop = text.find('### Example Use Cases')
if stop == -1:
    stop = len(text)

In [285]:
relevant_text = text[start:stop]
relevant_text = relevant_text.replace('\n', '')

In [286]:
soup = bs(relevant_text, "html.parser")

tokens = []
bio_format = []

max_length = len(list(soup.children))
    
for i, child in enumerate(soup.children):
    if isinstance(child, NavigableString):
        tokenized = tokenizer(str(child), max_length=512, truncation=True, return_offsets_mapping=True)    
        curr_tokens = tokenized.tokens()
        if i == 0:
            curr_tokens = curr_tokens[:len(curr_tokens)-1]
        elif i == max_length-1:
            curr_tokens = curr_tokens[1:len(curr_tokens)]
        else:
            curr_tokens = curr_tokens[1:len(curr_tokens)-1]
        tokens.extend(curr_tokens)
 
        for token in curr_tokens:
            bio_format.append('O')
    elif isinstance(child, Tag):
        tokenized = tokenizer(child.get_text(), max_length=512, truncation=True, return_offsets_mapping=True)    
        curr_tokens = tokenized.tokens()
        
        if i == 0:
            curr_tokens = curr_tokens[:len(curr_tokens)-1]
        elif i == max_length-1:
            curr_tokens = curr_tokens[1:len(curr_tokens)]
        else:
            curr_tokens = curr_tokens[1:len(curr_tokens)-1]
 
        curr_class = child['class'][0]
        tokens.extend(curr_tokens)
        for i, token in enumerate(curr_tokens):
            if i == 0:
                bio_format.append(f"B-{curr_class}")
            else:
                bio_format.append(f"I-{curr_class}")

In [287]:
tokens

['[CLS]',
 'While',
 'bi',
 '##sm',
 '##uth',
 'compounds',
 '(',
 'P',
 '##ept',
 '##o',
 '-',
 'B',
 '##ism',
 '##ol',
 ')',
 'decreased',
 'the',
 'number',
 'of',
 'bow',
 '##el',
 'movements',
 'in',
 'those',
 'with',
 'travelers',
 "'",
 'di',
 '##ar',
 '##r',
 '##hea',
 ',',
 'they',
 'do',
 'not',
 'decrease',
 'the',
 'length',
 'of',
 'illness',
 '.',
 '[',
 '91',
 ']',
 'Anti',
 '-',
 'm',
 '##ot',
 '##ility',
 'agents',
 'like',
 'lo',
 '##per',
 '##ami',
 '##de',
 'are',
 'also',
 'effective',
 'at',
 'reducing',
 'the',
 'number',
 'of',
 'stool',
 '##s',
 'but',
 'not',
 'the',
 'duration',
 'of',
 'disease',
 '.',
 '[',
 '8',
 ']',
 'These',
 'agents',
 'should',
 'be',
 'used',
 'only',
 'if',
 'bloody',
 'di',
 '##ar',
 '##r',
 '##hea',
 'is',
 'not',
 'present',
 '.',
 '[',
 '92',
 ']',
 'Di',
 '##os',
 '##me',
 '##ct',
 '##ite',
 ',',
 'a',
 'natural',
 'al',
 '##umi',
 '##no',
 '##ma',
 '##gnesium',
 'si',
 '##lica',
 '##te',
 'clay',
 ',',
 'is',
 'effective',
 '

In [288]:
test = tokenizer(dataset_sample[0]['text'], max_length=512, truncation=True, return_offsets_mapping=True)    


In [289]:
dataset_sample[0]['text']

"While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]"

In [290]:
test.tokens()

['[CLS]',
 'While',
 'bi',
 '##sm',
 '##uth',
 'compounds',
 '(',
 'P',
 '##ept',
 '##o',
 '-',
 'B',
 '##ism',
 '##ol',
 ')',
 'decreased',
 'the',
 'number',
 'of',
 'bow',
 '##el',
 'movements',
 'in',
 'those',
 'with',
 'travelers',
 "'",
 'di',
 '##ar',
 '##r',
 '##hea',
 ',',
 'they',
 'do',
 'not',
 'decrease',
 'the',
 'length',
 'of',
 'illness',
 '.',
 '[',
 '91',
 ']',
 'Anti',
 '-',
 'm',
 '##ot',
 '##ility',
 'agents',
 'like',
 'lo',
 '##per',
 '##ami',
 '##de',
 'are',
 'also',
 'effective',
 'at',
 'reducing',
 'the',
 'number',
 'of',
 'stool',
 '##s',
 'but',
 'not',
 'the',
 'duration',
 'of',
 'disease',
 '.',
 '[',
 '8',
 ']',
 'These',
 'agents',
 'should',
 'be',
 'used',
 'only',
 'if',
 'bloody',
 'di',
 '##ar',
 '##r',
 '##hea',
 'is',
 'not',
 'present',
 '.',
 '[',
 '92',
 ']',
 'Di',
 '##os',
 '##me',
 '##ct',
 '##ite',
 ',',
 'a',
 'natural',
 'al',
 '##umi',
 '##no',
 '##ma',
 '##gnesium',
 'si',
 '##lica',
 '##te',
 'clay',
 ',',
 'is',
 'effective',
 '

In [291]:
len(test.tokens())

243

In [292]:
tokens[-1]

'[SEP]'

In [293]:
for i, tok in enumerate(test.tokens()):
    if tok != tokens[i]:
        print(i, tok, tokens[i])
        tokens.insert(i, tok)

163 [ Another
164 45 Another
165 ] Another


In [294]:
tokens[165]

']'

In [295]:
tokens

['[CLS]',
 'While',
 'bi',
 '##sm',
 '##uth',
 'compounds',
 '(',
 'P',
 '##ept',
 '##o',
 '-',
 'B',
 '##ism',
 '##ol',
 ')',
 'decreased',
 'the',
 'number',
 'of',
 'bow',
 '##el',
 'movements',
 'in',
 'those',
 'with',
 'travelers',
 "'",
 'di',
 '##ar',
 '##r',
 '##hea',
 ',',
 'they',
 'do',
 'not',
 'decrease',
 'the',
 'length',
 'of',
 'illness',
 '.',
 '[',
 '91',
 ']',
 'Anti',
 '-',
 'm',
 '##ot',
 '##ility',
 'agents',
 'like',
 'lo',
 '##per',
 '##ami',
 '##de',
 'are',
 'also',
 'effective',
 'at',
 'reducing',
 'the',
 'number',
 'of',
 'stool',
 '##s',
 'but',
 'not',
 'the',
 'duration',
 'of',
 'disease',
 '.',
 '[',
 '8',
 ']',
 'These',
 'agents',
 'should',
 'be',
 'used',
 'only',
 'if',
 'bloody',
 'di',
 '##ar',
 '##r',
 '##hea',
 'is',
 'not',
 'present',
 '.',
 '[',
 '92',
 ']',
 'Di',
 '##os',
 '##me',
 '##ct',
 '##ite',
 ',',
 'a',
 'natural',
 'al',
 '##umi',
 '##no',
 '##ma',
 '##gnesium',
 'si',
 '##lica',
 '##te',
 'clay',
 ',',
 'is',
 'effective',
 '