In [1]:
import sys
from src.database import MongoDB

from sklearn.model_selection import train_test_split

In [5]:
db_name = 'clean_data'
collection_name = 'alain_news_clean'
connection_string = 'mongodb://localhost:27017/'
clean_db = MongoDB(db_name=db_name, collection_name=collection_name, connection_string=connection_string)

In [6]:
clean_db.remove_duplicates('article_url', 'alain_news_clean')

In [7]:
# Fetch all documents
documents = list(clean_db.collection.find({}))

In [5]:

# Split documents into training and test sets
train_docs, test_docs = train_test_split(documents, test_size=0.2)  # adjust the test_size as needed

In [6]:
def write_to_file(docs, filename):
  with open(filename, 'w', encoding='utf8') as f:
    for doc in docs:
      f.write(doc['title'] + '\n')
      f.write(doc['summary'] + '\n')
      f.write(doc['content'] + '\n')

write_to_file(train_docs, 'amharic_train.txt')
write_to_file(test_docs, 'amharic_test.txt')

In [11]:
import sentencepiece as spm

# My Amharic corpus is in 'amharic_corpus.txt'
spm.SentencePieceTrainer.train('--input=amharic_train.txt --model_prefix=mine --vocab_size=2000')

# 'm.model' and 'm.vocab' files will be created after training

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=amharic_train.txt --model_prefix=mine --vocab_size=2000
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: amharic_train.txt
  input_format: 
  model_prefix: mine
  model_type: UNIGRAM
  vocab_size: 2000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piec

##### Load the model and use it to tokenize new text

In [13]:
sp = spm.SentencePieceProcessor()
sp.load('mine.model')  # replace 'm.model' with the path to your model

# Tokenize Amharic text
text = "በ በአለማቀፍ ደረጃ የተፈፀመው የሞት ቅጣት"

# replace with your Amharic text
tokens = sp.encode_as_pieces(text)
ids = sp.encode_as_ids(text)
print(tokens)
print(ids)

['▁በ', '▁በ', 'አለማቀፍ', '▁ደረጃ', '▁የተ', 'ፈፀመ', 'ው', '▁የ', 'ሞ', 'ት', '▁ቅጣት']
[8, 8, 1657, 324, 44, 504, 9, 5, 89, 7, 1466]


In [15]:
# Encode the text
encoded_text = sp.encode(text, out_type=int)
print(encoded_text)

[8, 8, 1657, 324, 44, 504, 9, 5, 89, 7, 1466]


### Load the pre-trained model that has been trained on Amharic


In [16]:
from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [19]:
# Assume 'encoded_text' is your tokenized text
inputs = tokenizer(tokens, return_tensors='pt', padding=True, truncation=True)

In [20]:
inputs

{'input_ids': tensor([[101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102]]), 'token_type_ids': tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]])}