In [2]:

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Checkpoint identifier shared by the tokenizer and the model. Because
# local_files_only=True is passed below, nothing is fetched over the network:
# the files must already be available locally.
MODEL_DIR = "all-mpnet-base-v2-finetuned-NER"

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR, local_files_only=True)

In [49]:
def process_text(text):
    """Tag every token of ``text`` with the label predicted by the model.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: Input string to tokenize and classify.

    Returns:
        A list of ``(token, label)`` tuples, one per token id produced by the
        tokenizer (whatever special tokens the tokenizer adds are included).
        Labels named ``LABEL_0`` / ``LABEL_1`` / ``LABEL_2`` in
        ``model.config.id2label`` are reported as ``'Others'`` /
        ``'abbreviations'`` / ``'Longform'``; any other label name is returned
        unchanged.

    Note:
        ``truncation=True`` is passed to the tokenizer, so text longer than the
        tokenizer's configured maximum length is cut and the surplus tokens do
        not appear in the result.
    """
    # Generic config label names -> human-readable tag names.
    label_mapping = {
        "LABEL_0": 'Others',
        "LABEL_1": 'abbreviations',
        "LABEL_2": 'Longform',
    }

    # Encode to a (1, seq_len) tensor of input ids. The ids are moved to the
    # model's device so the forward pass also works if the model was moved
    # off the CPU after loading.
    inputs = tokenizer.encode(text, return_tensors="pt", truncation=True)
    inputs = inputs.to(model.device)

    # Inference only: no gradients needed. Element 0 of the model output is
    # the per-token logits.
    with torch.no_grad():
        logits = model(inputs)[0]

    # Highest-scoring class index for each token position.
    predictions = logits.argmax(dim=2)

    # Plain Python ints keep the id -> token lookup independent of the
    # tensor's device.
    tokens = tokenizer.convert_ids_to_tokens(inputs[0].tolist())
    labels = [model.config.id2label[p.item()] for p in predictions[0]]

    # Fall back to the raw label name instead of raising KeyError when the
    # model config uses names outside the mapping above.
    converted_list = [label_mapping.get(label, label) for label in labels]

    return list(zip(tokens, converted_list))

In [50]:
# Alternative input, kept for reference (not executed):
# process_text("Abbreviations : GEMS  Global Enteric Multicenter Study ; VIP  ventilated improved")

# The cell's last expression is the tagged token list, which the notebook displays.
sample_sentence = "This is a sentence about PET, also known as Polyethylene terephthalate."
process_text(sample_sentence)

[('<s>', 'Others'),
 ('this', 'Others'),
 ('is', 'Others'),
 ('a', 'Others'),
 ('sentence', 'Others'),
 ('about', 'Others'),
 ('pet', 'abbreviations'),
 (',', 'Others'),
 ('also', 'Others'),
 ('known', 'Others'),
 ('as', 'Others'),
 ('poly', 'Longform'),
 ('##eth', 'Longform'),
 ('##yle', 'Longform'),
 ('##ne', 'Longform'),
 ('ter', 'Longform'),
 ('##ep', 'Longform'),
 ('##ht', 'Longform'),
 ('##hala', 'Longform'),
 ('##te', 'Longform'),
 ('.', 'Others'),
 ('</s>', 'Others')]

In [5]:
# Count the scalar elements across every parameter tensor of the model.
pytorch_total_params = 0
for param in model.parameters():
    pytorch_total_params += param.numel()
print(pytorch_total_params)

108898179
