In [1]:
import os
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate

# Tag inventory of the token classifier: "O" followed by a B-/I- pair for each
# entity type. A tag's position in the list is its integer class id.
id2label = dict(enumerate([
    "O",
    "B-ap_name1", "I-ap_name1",
    "B-vz1", "I-vz1",
    "B-coordx1", "I-coordx1",
    "B-coordy1", "I-coordy1",
    "B-type1", "I-type1",
]))

# Inverse lookup (tag string -> class id), derived from id2label.
label2id = {tag: class_id for class_id, tag in id2label.items()}

def get_html_text(f, plength=100):
    """Return the concatenated text of the long paragraphs of an HTML file.

    Every ``<p>`` element of the document is considered. A paragraph is kept
    when its text is longer than ``plength`` characters and its first
    character is alphabetic; kept paragraphs are joined with no separator.

    f: path of the HTML file to read.
    plength: minimum length (exclusive) a paragraph's text must exceed.

    Returns a single string (empty when no paragraph qualifies).
    """
    # Read raw bytes so BeautifulSoup detects the document's own encoding
    # instead of depending on the platform's default text encoding.
    with open(f, 'rb') as file:
        soup = BeautifulSoup(file, 'html.parser')

    kept = []
    for para in soup.find_all('p'):
        p = para.get_text()
        # p[:1] rather than p[0]: an empty paragraph must not raise when the
        # caller passes a negative plength.
        if len(p) > plength and p[:1].isalpha():
            kept.append(p)
    return ''.join(kept)

def format_pred_for_print(pred, paragraph):
    '''
    Return `paragraph` with every predicted span highlighted in red.

    pred: prediction output from a pipeline -- an iterable of dicts with
          'start', 'end', 'score' and either 'entity' (per-token output) or
          'entity_group' (aggregated output). Entries are consumed in the
          order given and are expected to be sorted by position and
          non-overlapping; this is not checked.
    paragraph: the original text the predictions were made on

    Each span is rendered as "[<text> (<label> <score, 2 decimals>)]" wrapped
    in ANSI red escape codes; text outside the spans is copied unchanged.
    '''

    RED_START = '\x1b[31m'
    RED_END = '\x1b[0m'

    pieces = []
    end = 0

    for entry in pred:
        start = entry['start']
        # untouched text between the previous span and this one
        pieces.append(paragraph[end:start])
        end = entry['end']
        # aggregated pipelines report the label under 'entity_group'
        label = entry['entity'] if 'entity' in entry else entry['entity_group']
        score = ' {:.2f}'.format(entry['score'])
        pieces.append(
            RED_START + '[' + paragraph[start:end] + ' (' + label + score + ')]' + RED_END
        )

    # trailing text after the last span (the whole paragraph if pred is empty)
    pieces.append(paragraph[end:])
    return ''.join(pieces)
import warnings
# Silence all warnings for the rest of the session. The UserWarning filter is
# already covered by the blanket one above it; it is kept so the installed
# filter list stays as it was.
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=UserWarning)

# Adapter checkpoint directory (presumably LoRA weights, going by its name --
# confirm). Its PEFT config records which base model the adapter belongs to.
peft_model_id = 'NER-BERT-lora-token-classification/checkpoint-182580/'
config = PeftConfig.from_pretrained(peft_model_id)
tokenizerpeft = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Base model with a token-classification head sized from the label table, so
# the number of classes cannot drift from id2label/label2id.
inference_model = AutoModelForTokenClassification.from_pretrained(
    config.base_model_name_or_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# Attach the adapter weights to the base model and wrap it in an NER pipeline.
modelpeft = PeftModel.from_pretrained(inference_model, peft_model_id)
nlpeft = pipeline("ner", model=modelpeft, tokenizer=tokenizerpeft)


ModuleNotFoundError: No module named 'nltk'

In [None]:
# Run the NER pipeline over one HTML article, sentence by sentence, and print
# each sentence with its predicted entities highlighted.
# NOTE(review): htmldir and prepfilename are not defined in the visible cells;
# they must be set before this cell runs.
# The HTML file name is the first 19 characters of prepfilename plus '.html'.
htmlfilepath = os.path.join(htmldir, prepfilename[0:19]+'.html')
texts = get_html_text(htmlfilepath)
# Split the extracted text into sentences so the model sees one at a time.
sentences = sent_tokenize(texts)
for s in sentences:
    pred = nlpeft(s)
    print(format_pred_for_print(pred,s))
