## Amharic Named Entity Recognition (NER) system

### Importing the necessary libraries

In [32]:
# !pip install seqeval
# !pip install transformers
# !pip install datasets

import os
import sys

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

In [None]:
# Make the helper modules in ../scripts importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
scripts_dir = os.path.abspath('../scripts')
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

from tunning import Tunning, Prepocess

#### Loading the CoNLL-format data with `datasets`

In [34]:
# Path to the CoNLL-formatted annotations, relative to the notebook's working directory.
filepath= 'conll_output.conll'

preprocesss = Prepocess()
# Raw parse of the file; it is iterated later as sentences of per-token records
# whose element at index 1 is the NER label.
data = preprocesss.read_conll_file(filepath)
# Train/validation/test splits with 'tokens' and 'ner_tags' columns (see the cell output).
# NOTE(review): process() is given the path again, so it presumably re-reads the file — confirm.
datasets = preprocesss.process(filepath)
datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 52304
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 13076
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 16346
    })
})

## Fine-tuning `rasyosef/bert-tiny-amharic`

In [43]:
# Collect every distinct NER tag in the corpus; sorting keeps the label order deterministic.
label_list = sorted({record[1] for sent in data for record in sent})

# Load the pretrained checkpoint's tokenizer and attach a token-classification
# head with one output per tag.
model_name = "rasyosef/bert-tiny-amharic"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at rasyosef/bert-tiny-amharic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Configure fine-tuning through the project helper: 5 epochs, evaluating once per epoch.
fine_tune = Tunning()
fine_tune.tokenize_train_args(datasets, epochs=5, eval_strategy='epoch')
# Despite its name, the result of train() is used below as a trainer object
# (trainer.train(), trainer.save_model()); presumably it only builds the Trainer — TODO confirm.
trainer = fine_tune.train(tokenizer, model)

In [46]:
# Run fine-tuning; the returned summary is kept so it can be inspected in a later cell.
train_output = trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.135,0.09269,0.897238,0.902729,0.899975
2,0.0925,0.078733,0.924227,0.920889,0.922555
3,0.0753,0.072548,0.92799,0.932878,0.930428
4,0.0656,0.065159,0.930907,0.938146,0.934513
5,0.06,0.064301,0.929114,0.939601,0.934328




In [50]:
# Show the training summary (global step, average training loss, runtime metrics).
train_output

TrainOutput(global_step=32690, training_loss=0.08567363917554106, metrics={'train_runtime': 4814.6389, 'train_samples_per_second': 54.318, 'train_steps_per_second': 6.79, 'total_flos': 54003490227072.0, 'train_loss': 0.08567363917554106, 'epoch': 5.0})

In [47]:
# Persist the fine-tuned model and the tokenizer in separate directories under ./Models.
# NOTE(review): the recorded cell output lists paths without the './Models/' prefix,
# so it appears to come from an earlier run of this cell — re-run to refresh it.
trainer.save_model('./Models/Bert_tiny_amharic_model')
# The tuple returned by save_pretrained (paths of the written files) is the cell's output.
tokenizer.save_pretrained('./Models/Bert_tiny_amharic_tokenizer')

('Bert_tiny_amharic_tokenizer/tokenizer_config.json',
 'Bert_tiny_amharic_tokenizer/special_tokens_map.json',
 'Bert_tiny_amharic_tokenizer/vocab.txt',
 'Bert_tiny_amharic_tokenizer/added_tokens.json',
 'Bert_tiny_amharic_tokenizer/tokenizer.json')