## Fine-Tuning Language Model üíª

In [10]:
# Setup Labels for Download XLM Roberta Base
labels_to_id = {
    "O": 0,
    "B-DRUG_NAME": 1, "I-DRUG_NAME": 2,
    "B-DOSAGE": 3, "I-DOSAGE": 4,
    "B-FORM": 5, "I-FORM": 6,
    "B-WARNINGS": 7, "I-WARNINGS": 8,
    "B-INDICATIONS": 9, "I-INDICATIONS": 10,
    "B-USAGE_INSTRUCTIONS": 11, "I-USAGE_INSTRUCTIONS": 12
}
id_to_labels = {v: k for k, v in labels_to_id.items()}

In [11]:
# XLM Roberta Base - Pretrained Model Download From HuggingFace
from transformers import XLMRobertaForTokenClassification
xlm_roberta_model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=len(labels_to_id), id2label=id_to_labels, label2id=labels_to_id)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Custom Tokenizer - Tokenizer Download From HuggingFace
from transformers import XLMRobertaTokenizer
xlm_roberta_tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

In [13]:
# Resize Embeddings of Model
xlm_roberta_model.resize_token_embeddings(len(xlm_roberta_tokenizer))

Embedding(250002, 768, padding_idx=1)

In [14]:
# Import Custom Tokenizer Dataset to fine-tuning
import json
with open('../data/xlm_roberta_tokenizer_format.json', 'r', encoding='utf-8') as f:
    xlm_roberta_tokenizer_dataset = json.load(f)

In [15]:
# Mapping dataset
from datasets import Dataset
xlm_roberta_tokenizer_dataset = Dataset.from_dict({
    "tokens": [item["tokens"] for item in xlm_roberta_tokenizer_dataset],
    "labels": [item["ner_tags"] for item in xlm_roberta_tokenizer_dataset]
})
print(xlm_roberta_tokenizer_dataset)

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 2000
})


In [16]:
# Function to align dataset
def tokenize_and_align_labels(examples):
    tokenized_inputs = xlm_roberta_tokenizer(examples['tokens'], padding="longest", truncation=True, return_tensors="pt", is_split_into_words=True)
    aligned_labels = []
    for i, labels in enumerate(examples['labels']):
        label_ids = [labels_to_id[label] for label in labels]
        padding_length = len(tokenized_inputs['input_ids'][i]) - len(label_ids)
        label_ids += [-100] * padding_length
        aligned_labels.append(label_ids)
        
    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs

In [17]:
# Align labels of Custom Tokenizer Dataset
xlm_roberta_tokenizer_dataset = xlm_roberta_tokenizer_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [00:01<00:00, 1966.59 examples/s]


In [18]:
# Display number of XLM Roberta Base - Tokenizer Dataset
print(xlm_roberta_tokenizer_dataset)

Dataset({
    features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 2000
})


In [19]:
# Setup Traning Arguments
train_dataset = xlm_roberta_tokenizer_dataset.select(range(0, 800))
eval_dataset = xlm_roberta_tokenizer_dataset.select(range(800, 1000))

from transformers import Trainer, TrainingArguments
training_arguments = TrainingArguments(
    output_dir='../output/model/original_tokenizer',         
    num_train_epochs=10,            
    per_device_train_batch_size=4,   
    per_device_eval_batch_size=4, 
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)
trainer = Trainer(
    model=xlm_roberta_model, 
    args=training_arguments, 
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=xlm_roberta_tokenizer
)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.586,0.519944
2,0.4528,0.385887
3,0.3905,0.366439
4,0.2744,0.290832
5,0.2155,0.239205
6,0.1782,0.248815
7,0.1296,0.168967
8,0.1026,0.16341
9,0.0729,0.196725
10,0.0546,0.176007


TrainOutput(global_step=2000, training_loss=0.2952613136768341, metrics={'train_runtime': 8364.8925, 'train_samples_per_second': 0.956, 'train_steps_per_second': 0.239, 'total_flos': 428732624880000.0, 'train_loss': 0.2952613136768341, 'epoch': 10.0})

In [12]:
# ‡πÉ‡∏ä‡πâ‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÅ‡∏•‡∏∞ tokenizer ‡∏à‡∏≤‡∏Å trainer ‡πÇ‡∏î‡∏¢‡∏ï‡∏£‡∏á
model = trainer.model
tokenizer = trainer.tokenizer

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


In [33]:
print(model.config.id2label)



In [None]:
# CUSTOM TOKENIZER | 10 EPORCH | 1000 DATASET
from transformers import pipeline

# ‡∏™‡∏£‡πâ‡∏≤‡∏á NER pipeline
ner_pipeline = pipeline("ner", model='../output/model/custom_tokenizer/v2', tokenizer='../output/model/custom_tokenizer/v2', aggregation_strategy="simple")

# ‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢
text = """‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏ú‡∏•‡∏Å‡∏£‡∏∞‡∏ó‡∏ö‡∏à‡∏≤‡∏Å‡∏Å‡∏≤‡∏£‡∏ú‡πà‡∏≤‡∏ï‡∏±‡∏î ‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏°‡∏µ‡∏≠‡∏≤‡∏Å‡∏≤‡∏£‡∏õ‡∏ß‡∏î"""
ner_results = ner_pipeline(text)
print(ner_results)

for entity in ner_results:
    print(f"{entity['word']} ({entity['entity_group']}): {entity['score']:.3f}")


Device set to use mps:0


[{'entity_group': 'USAGE_INSTRUCTIONS', 'score': 0.98609334, 'word': '‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏ú‡∏•‡∏Å‡∏£‡∏∞‡∏ó‡∏ö‡∏à‡∏≤‡∏Å‡∏Å‡∏≤‡∏£‡∏ú‡πà‡∏≤‡∏ï‡∏±‡∏î ', 'start': 0, 'end': 25}, {'entity_group': 'INDICATIONS', 'score': 0.680932, 'word': '‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏°‡∏µ‡∏≠‡∏≤‡∏Å‡∏≤‡∏£‡∏õ‡∏ß‡∏î', 'start': 25, 'end': 49}]
‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏ú‡∏•‡∏Å‡∏£‡∏∞‡∏ó‡∏ö‡∏à‡∏≤‡∏Å‡∏Å‡∏≤‡∏£‡∏ú‡πà‡∏≤‡∏ï‡∏±‡∏î  (USAGE_INSTRUCTIONS): 0.986
‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏°‡∏µ‡∏≠‡∏≤‡∏Å‡∏≤‡∏£‡∏õ‡∏ß‡∏î (INDICATIONS): 0.681


In [None]:
# ORIGINAL TOKENIZER | 10 EPORCH | 1000 DATASET
from transformers import pipeline

# ‡∏™‡∏£‡πâ‡∏≤‡∏á NER pipeline
ner_pipeline = pipeline("ner", model='../output/model/original_tokenizer/checkpoint-2000', tokenizer='../output/model/original_tokenizer/checkpoint-2000', aggregation_strategy="simple")

# ‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢
text = """‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏ú‡∏•‡∏Å‡∏£‡∏∞‡∏ó‡∏ö‡∏à‡∏≤‡∏Å‡∏Å‡∏≤‡∏£‡∏ú‡πà‡∏≤‡∏ï‡∏±‡∏î ‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏°‡∏µ‡∏≠‡∏≤‡∏Å‡∏≤‡∏£‡∏õ‡∏ß‡∏î ‡πÑ‡∏°‡πà‡πÄ‡∏Å‡∏¥‡∏ô 3 ‡∏ß‡∏±‡∏ô"""
ner_results = ner_pipeline(text)
print(ner_results)

for entity in ner_results:
    print(f"{entity['word']} ({entity['entity_group']}): {entity['score']:.3f}")


Device set to use mps:0


[{'entity_group': 'USAGE_INSTRUCTIONS', 'score': 0.9908006, 'word': '‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏ú‡∏•‡∏Å‡∏£‡∏∞‡∏ó‡∏ö‡∏à‡∏≤‡∏Å‡∏Å‡∏≤‡∏£‡∏ú‡πà‡∏≤‡∏ï‡∏±‡∏î ‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏°‡∏µ‡∏≠‡∏≤‡∏Å‡∏≤‡∏£‡∏õ‡∏ß‡∏î ‡πÑ‡∏°‡πà‡πÄ‡∏Å‡∏¥‡∏ô 3 ‡∏ß‡∏±‡∏ô', 'start': 0, 'end': 63}]
‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏ú‡∏•‡∏Å‡∏£‡∏∞‡∏ó‡∏ö‡∏à‡∏≤‡∏Å‡∏Å‡∏≤‡∏£‡∏ú‡πà‡∏≤‡∏ï‡∏±‡∏î ‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏°‡∏µ‡∏≠‡∏≤‡∏Å‡∏≤‡∏£‡∏õ‡∏ß‡∏î ‡πÑ‡∏°‡πà‡πÄ‡∏Å‡∏¥‡∏ô 3 ‡∏ß‡∏±‡∏ô (USAGE_INSTRUCTIONS): 0.991


In [73]:
# CUSTOM TOKENIZER | 10 EPORCH | 1000 DATASET
from transformers import pipeline

# ‡∏™‡∏£‡πâ‡∏≤‡∏á NER pipeline
ner_pipeline = pipeline("ner", model='../output/model/custom_tokenizer/v2', tokenizer='../output/model/custom_tokenizer/v2', aggregation_strategy="simple")

# ‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢
text = """‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô 10 ‡πÄ‡∏°‡πá‡∏î‡∏Å‡πà‡∏≠‡∏ô‡∏ô‡∏≠‡∏ô"""
ner_results = ner_pipeline(text)
print(ner_results)

for entity in ner_results:
    print(f"{entity['word']} ({entity['entity_group']}): {entity['score']:.3f}")


Device set to use mps:0


[{'entity_group': 'INDICATIONS', 'score': 0.67953825, 'word': '‡∏Å‡πà‡∏≠‡∏ô‡∏ô‡∏≠‡∏ô', 'start': 17, 'end': 24}]
‡∏Å‡πà‡∏≠‡∏ô‡∏ô‡∏≠‡∏ô (INDICATIONS): 0.680


In [78]:
# ORIGINAL TOKENIZER | 10 EPORCH | 1000 DATASET
from transformers import pipeline

# ‡∏™‡∏£‡πâ‡∏≤‡∏á NER pipeline
ner_pipeline = pipeline("ner", model='../output/model/original_tokenizer/checkpoint-2000', tokenizer='../output/model/original_tokenizer/checkpoint-2000', aggregation_strategy="simple")

# ‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢
text = """‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô 10 ‡πÄ‡∏°‡πá‡∏î‡∏Å‡πà‡∏≠‡∏ô‡∏ô‡∏≠‡∏ô"""
ner_results = ner_pipeline(text)
print(ner_results)

for entity in ner_results:
    print(f"{entity['word']} ({entity['entity_group']}): {entity['score']:.3f}")


Device set to use mps:0


[{'entity_group': 'USAGE_INSTRUCTIONS', 'score': 0.94189125, 'word': '‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô 10 ‡πÄ‡∏°‡πá‡∏î‡∏Å‡πà‡∏≠‡∏ô‡∏ô‡∏≠‡∏ô', 'start': 0, 'end': 24}]
‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô 10 ‡πÄ‡∏°‡πá‡∏î‡∏Å‡πà‡∏≠‡∏ô‡∏ô‡∏≠‡∏ô (USAGE_INSTRUCTIONS): 0.942


In [64]:
# CUSTOM TOKENIZER | 10 EPORCH | 1000 DATASET
from transformers import pipeline

# ‡∏™‡∏£‡πâ‡∏≤‡∏á NER pipeline
ner_pipeline = pipeline("ner", model='../output/model/custom_tokenizer/v2', tokenizer='../output/model/custom_tokenizer/v2', aggregation_strategy="simple")

# ‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢
text = """‡∏ä‡πà‡∏ß‡∏¢‡∏ü‡∏∑‡πâ‡∏ô‡∏ü‡∏π‡∏´‡∏•‡∏±‡∏á‡∏Å‡∏≤‡∏£‡∏≠‡∏≠‡∏Å‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏Å‡∏≤‡∏¢ ‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô 2 ‡πÄ‡∏°‡πá‡∏î‡∏Å‡πà‡∏≠‡∏ô‡∏ô‡∏≠‡∏ô"""
ner_results = ner_pipeline(text)
print(ner_results)

for entity in ner_results:
    print(f"{entity['word']} ({entity['entity_group']}): {entity['score']:.3f}")

Device set to use mps:0


[{'entity_group': 'USAGE_INSTRUCTIONS', 'score': 0.9944506, 'word': '‡∏ä‡πà‡∏ß‡∏¢‡∏ü‡∏∑‡πâ‡∏ô‡∏ü‡∏π‡∏´‡∏•‡∏±‡∏á‡∏Å‡∏≤‡∏£‡∏≠‡∏≠‡∏Å‡∏Å‡πç‡∏≤‡∏•‡∏±‡∏á‡∏Å‡∏≤‡∏¢ ', 'start': 0, 'end': 29}, {'entity_group': 'INDICATIONS', 'score': 0.7897528, 'word': '‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô 2 ‡πÄ‡∏°‡πá‡∏î‡∏Å‡πà‡∏≠‡∏ô‡∏ô‡∏≠‡∏ô', 'start': 29, 'end': 52}]
‡∏ä‡πà‡∏ß‡∏¢‡∏ü‡∏∑‡πâ‡∏ô‡∏ü‡∏π‡∏´‡∏•‡∏±‡∏á‡∏Å‡∏≤‡∏£‡∏≠‡∏≠‡∏Å‡∏Å‡πç‡∏≤‡∏•‡∏±‡∏á‡∏Å‡∏≤‡∏¢  (USAGE_INSTRUCTIONS): 0.994
‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô 2 ‡πÄ‡∏°‡πá‡∏î‡∏Å‡πà‡∏≠‡∏ô‡∏ô‡∏≠‡∏ô (INDICATIONS): 0.790


In [None]:
# ORIGINAL TOKENIZER | 10 EPORCH | 1000 DATASET
from transformers import pipeline

# ‡∏™‡∏£‡πâ‡∏≤‡∏á NER pipeline
ner_pipeline = pipeline("ner", model='../output/model/original_tokenizer/checkpoint-1600', tokenizer='../output/model/original_tokenizer/checkpoint-1600', aggregation_strategy="simple")

# ‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢
text = """‡∏ä‡πà‡∏ß‡∏¢‡∏ü‡∏∑‡πâ‡∏ô‡∏ü‡∏π‡∏´‡∏•‡∏±‡∏á‡∏Å‡∏≤‡∏£‡∏≠‡∏≠‡∏Å‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏Å‡∏≤‡∏¢ ‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô 2 ‡πÄ‡∏°‡πá‡∏î‡∏Å‡πà‡∏≠‡∏ô‡∏ô‡∏≠‡∏ô"""
ner_results = ner_pipeline(text)
print(ner_results)

for entity in ner_results:
    print(f"{entity['word']} ({entity['entity_group']}): {entity['score']:.3f}")


Device set to use mps:0


[{'entity_group': 'USAGE_INSTRUCTIONS', 'score': 0.95436054, 'word': '‡∏ä‡πà‡∏ß‡∏¢‡∏ü‡∏∑‡πâ‡∏ô‡∏ü‡∏π‡∏´‡∏•‡∏±‡∏á‡∏Å‡∏≤‡∏£‡∏≠‡∏≠‡∏Å‡∏Å‡πç‡∏≤‡∏•‡∏±‡∏á‡∏Å‡∏≤‡∏¢ ‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô 2 ‡πÄ‡∏°‡πá‡∏î‡∏Å‡πà‡∏≠‡∏ô‡∏ô‡∏≠‡∏ô', 'start': 0, 'end': 52}]
‡∏ä‡πà‡∏ß‡∏¢‡∏ü‡∏∑‡πâ‡∏ô‡∏ü‡∏π‡∏´‡∏•‡∏±‡∏á‡∏Å‡∏≤‡∏£‡∏≠‡∏≠‡∏Å‡∏Å‡πç‡∏≤‡∏•‡∏±‡∏á‡∏Å‡∏≤‡∏¢ ‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô 2 ‡πÄ‡∏°‡πá‡∏î‡∏Å‡πà‡∏≠‡∏ô‡∏ô‡∏≠‡∏ô (USAGE_INSTRUCTIONS): 0.954


In [None]:
# ORIGINAL TOKENIZER | 10 EPORCH | 1000 DATASET
from transformers import pipeline

# ‡∏™‡∏£‡πâ‡∏≤‡∏á NER pipeline
ner_pipeline = pipeline("ner", model='../output/model/original_tokenizer/checkpoint-2000', tokenizer='../output/model/original_tokenizer/checkpoint-2000', aggregation_strategy="simple")

# ‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢
text = """
‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô‡∏Å‡πà‡∏≠‡∏ô‡∏≠‡∏≤‡∏´‡∏≤‡∏£ 30 ‡∏ô‡∏≤‡∏ó‡∏µ
‡∏ä‡πà‡∏ß‡∏¢‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏ú‡∏∑‡πà‡∏ô‡∏†‡∏π‡∏°‡∏¥‡πÅ‡∏û‡πâ
‡∏´‡∏≤‡∏Å‡∏°‡∏µ‡∏≠‡∏≤‡∏Å‡∏≤‡∏£‡∏ö‡∏ß‡∏°‡∏ó‡∏µ‡πà‡πÉ‡∏ö‡∏´‡∏ô‡πâ‡∏≤‡∏´‡∏£‡∏∑‡∏≠‡∏£‡∏¥‡∏°‡∏ù‡∏µ‡∏õ‡∏≤‡∏Å ‡∏Ñ‡∏ß‡∏£‡∏´‡∏¢‡∏∏‡∏î‡∏¢‡∏≤
"""
ner_results = ner_pipeline(text)
print(ner_results)

for entity in ner_results:
    print(f"{entity['word']} ({entity['entity_group']}): {entity['score']:.3f}")


Device set to use mps:0


[{'entity_group': 'USAGE_INSTRUCTIONS', 'score': 0.9677924, 'word': '‡πÄ‡∏°‡πá‡∏î ‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô‡∏Å‡πà‡∏≠‡∏ô‡∏≠‡∏≤‡∏´‡∏≤‡∏£ 30 ‡∏ô‡∏≤‡∏ó‡∏µ ‡∏ä‡πà‡∏ß‡∏¢‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏ú‡∏∑‡πà‡∏ô‡∏†‡∏π‡∏°‡∏¥‡πÅ‡∏û‡πâ', 'start': 15, 'end': 67}, {'entity_group': 'INDICATIONS', 'score': 0.33644873, 'word': '‡∏´‡∏≤‡∏Å', 'start': 67, 'end': 71}, {'entity_group': 'USAGE_INSTRUCTIONS', 'score': 0.46490714, 'word': '‡∏°‡∏µ‡∏≠‡∏≤‡∏Å‡∏≤‡∏£', 'start': 71, 'end': 78}, {'entity_group': 'FORM', 'score': 0.6127798, 'word': '‡∏ö‡∏ß‡∏°', 'start': 78, 'end': 81}]
‡πÄ‡∏°‡πá‡∏î ‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô‡∏Å‡πà‡∏≠‡∏ô‡∏≠‡∏≤‡∏´‡∏≤‡∏£ 30 ‡∏ô‡∏≤‡∏ó‡∏µ ‡∏ä‡πà‡∏ß‡∏¢‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏ú‡∏∑‡πà‡∏ô‡∏†‡∏π‡∏°‡∏¥‡πÅ‡∏û‡πâ (USAGE_INSTRUCTIONS): 0.968
‡∏´‡∏≤‡∏Å (INDICATIONS): 0.336
‡∏°‡∏µ‡∏≠‡∏≤‡∏Å‡∏≤‡∏£ (USAGE_INSTRUCTIONS): 0.465
‡∏ö‡∏ß‡∏° (FORM): 0.613


In [79]:
# CUSTOM TOKENIZER | 10 EPORCH | 1000 DATASET
from transformers import pipeline

# ‡∏™‡∏£‡πâ‡∏≤‡∏á NER pipeline
ner_pipeline = pipeline("ner", model='../output/model/custom_tokenizer/v2', tokenizer='../output/model/custom_tokenizer/v2', aggregation_strategy="simple")

# ‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢
text = """
‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô‡∏Å‡πà‡∏≠‡∏ô‡∏≠‡∏≤‡∏´‡∏≤‡∏£ 30 ‡∏ô‡∏≤‡∏ó‡∏µ
‡∏ä‡πà‡∏ß‡∏¢‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏ú‡∏∑‡πà‡∏ô‡∏†‡∏π‡∏°‡∏¥‡πÅ‡∏û‡πâ
‡∏´‡∏≤‡∏Å‡∏°‡∏µ‡∏≠‡∏≤‡∏Å‡∏≤‡∏£‡∏ö‡∏ß‡∏°‡∏ó‡∏µ‡πà‡πÉ‡∏ö‡∏´‡∏ô‡πâ‡∏≤‡∏´‡∏£‡∏∑‡∏≠‡∏£‡∏¥‡∏°‡∏ù‡∏µ‡∏õ‡∏≤‡∏Å ‡∏Ñ‡∏ß‡∏£‡∏´‡∏¢‡∏∏‡∏î‡∏¢‡∏≤
"""
ner_results = ner_pipeline(text)
print(ner_results)

for entity in ner_results:
    print(f"{entity['word']} ({entity['entity_group']}): {entity['score']:.3f}")


Device set to use mps:0


[{'entity_group': 'INDICATIONS', 'score': 0.6580069, 'word': '‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô', 'start': 0, 'end': 10}, {'entity_group': 'USAGE_INSTRUCTIONS', 'score': 0.4204036, 'word': '‡∏Å‡πà‡∏≠‡∏ô', 'start': 10, 'end': 14}, {'entity_group': 'USAGE_INSTRUCTIONS', 'score': 0.86365354, 'word': '‡∏≠‡∏≤‡∏´‡∏≤‡∏£ 30 ‡∏ô‡∏≤‡∏ó‡∏µ ‡∏ä‡πà‡∏ß‡∏¢‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏ú‡∏∑‡πà‡∏ô', 'start': 14, 'end': 41}, {'entity_group': 'FORM', 'score': 0.63934684, 'word': '‡∏†‡∏π‡∏°‡∏¥‡πÅ‡∏û‡πâ', 'start': 41, 'end': 48}, {'entity_group': 'USAGE_INSTRUCTIONS', 'score': 0.5501631, 'word': '‡∏´‡∏≤‡∏Å‡∏°‡∏µ‡∏≠‡∏≤‡∏Å‡∏≤‡∏£‡∏ö‡∏ß‡∏°‡∏ó‡∏µ‡πà‡πÉ‡∏ö‡∏´‡∏ô‡πâ‡∏≤', 'start': 48, 'end': 71}, {'entity_group': 'USAGE_INSTRUCTIONS', 'score': 0.41016936, 'word': '‡∏õ‡∏≤‡∏Å ‡∏Ñ‡∏ß‡∏£', 'start': 80, 'end': 87}]
‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∞‡∏ó‡∏≤‡∏ô (INDICATIONS): 0.658
‡∏Å‡πà‡∏≠‡∏ô (USAGE_INSTRUCTIONS): 0.420
‡∏≠‡∏≤‡∏´‡∏≤‡∏£ 30 ‡∏ô‡∏≤‡∏ó‡∏µ ‡∏ä‡πà‡∏ß‡∏¢‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏ú‡∏∑‡πà‡∏ô (USAGE_INSTRUCTIONS): 0.864
‡∏†‡∏π‡∏°‡∏¥‡πÅ‡∏û‡πâ (FORM): 0.639
