## Amharic Named Entity Recognition (NER) system

### Importing the necessary libraries

In [16]:
# !pip install seqeval
# !pip install transformers
# !pip install datasets

import os
import sys

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

In [None]:
# Make the project's local `scripts` directory importable. The relative path is
# resolved against the kernel's current working directory. The membership check
# keeps the cell idempotent: re-running it no longer appends a duplicate entry
# to sys.path each time.
scripts_dir = os.path.abspath('../scripts')
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

# Project-local helpers used by the later cells (data loading and fine-tuning).
from tunning import Tunning, Prepocess

#### Loading the CoNLL-format data into a `datasets.DatasetDict`

In [4]:
# Path to the CoNLL-formatted annotation file, relative to the kernel's working directory.
filepath= 'conll_output.conll'

# Project helper (imported from the local `tunning` module) that reads and splits the data.
preprocesss = Prepocess()
# Parsed contents of the file. A later cell iterates this as sentences of token
# records and reads index 1 of each record as the label string.
data = preprocesss.read_conll_file(filepath)
# Train/validation/test splits built from the same file path.
# NOTE(review): `process` is given the path rather than `data`, so it presumably
# re-reads the file — confirm in tunning.py.
datasets = preprocesss.process(filepath)
# Last expression of the cell: display the resulting splits.
datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 52304
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 13076
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 16346
    })
})

## Fine-tuning xlm-roberta-base

In [None]:
# Hub identifier of the pretrained checkpoint that is fine-tuned below.
model_name = "FacebookAI/xlm-roberta-base"

# Distinct label strings (index 1 of every token record in `data`), sorted.
# In this cell only the number of labels is used, to size the classifier head.
label_list = sorted({record[1] for sentence in data for record in sentence})

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
)

In [None]:
# Project helper (imported from the local `tunning` module) that wraps the fine-tuning setup.
fine_tune = Tunning()
# Hand over the dataset splits and the run settings: 3 epochs, eval_strategy='epoch'.
# NOTE(review): these are presumably forwarded to the Hugging Face training
# arguments (evaluation once per epoch) — confirm in tunning.py.
fine_tune.tokenize_train_args(datasets, epochs=3, eval_strategy='epoch')
# NOTE(review): despite its name, `train` appears to only build and return the
# trainer object; the actual run is started by `trainer.train()` in the next cell.
trainer = fine_tune.train(tokenizer, model)

In [8]:
# Run the fine-tuning loop. The returned summary is kept in `train_output`
# so that a later cell can display it.
train_output = trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1312,0.073673,0.938125,0.946606,0.942346
2,0.0591,0.050339,0.954839,0.959942,0.957384
3,0.0334,0.042276,0.969026,0.970381,0.969703


In [14]:
# Display the summary returned by trainer.train() (global step, training loss, runtime metrics).
train_output

TrainOutput(global_step=19614, training_loss=0.0745938762899348, metrics={'train_runtime': 4113.4563, 'train_samples_per_second': 38.146, 'train_steps_per_second': 4.768, 'total_flos': 5622383310004896.0, 'train_loss': 0.0745938762899348, 'epoch': 3.0})

In [11]:
# Persist the fine-tuned model and the tokenizer. They are written to two
# separate directories under ./Models, so both paths are needed when reloading.
# NOTE(review): the recorded output of this cell lists paths without the
# ./Models/ prefix, so it appears to predate these paths — re-run to refresh it.
trainer.save_model('./Models/xlm_roberta_base_model')
# Last expression of the cell: displays the tuple of files written for the tokenizer.
tokenizer.save_pretrained('./Models/xlm_roberta_base_tokenizer')

('xlm_roberta_base_tokenizer/tokenizer_config.json',
 'xlm_roberta_base_tokenizer/special_tokens_map.json',
 'xlm_roberta_base_tokenizer/sentencepiece.bpe.model',
 'xlm_roberta_base_tokenizer/added_tokens.json',
 'xlm_roberta_base_tokenizer/tokenizer.json')