#### Load data from mongodb

In [38]:
import pandas as pd
from src.database import MongoDB

# Connect to MongoDB
db_name = 'clean_data'
collection_name = 'alain_news_clean'
connection_string = 'mongodb://localhost:27017/'

clean_db = MongoDB(db_name=db_name, collection_name=collection_name, connection_string=connection_string)



In [39]:
clean_db.remove_duplicates(field='article_url', collection_name=collection_name)

In [40]:
# Load data into a pandas DataFrame
df = pd.DataFrame(list(clean_db.collection.find()))

In [41]:
df.head()

Unnamed: 0,_id,image_url,title,article_url,category,author,summary,content,source,published_date
0,6657afc42d8a020654183dab,https://cdn.al-ain.com/sm/images/2024/4/15/252...,ኢራን በእስራኤል ላይ የፈፀመችውን ጥቃት ተከትሎ የአፍሪካ ሀገራት ምን አሉ,https://am.al-ain.com/article/african-nations-...,politics,አል-ዐይን,በርካታ የአፍሪካ ሀገራት ሁለቱ ሀገራት ውጥረትን ከሚያባብሱ ተግባራት እን...,በርካታ የአፍሪካ ሀገራት ሁለቱ ሀገራት ውጥረትን ከሚያባብሱ ተግባራት እን...,https://am.al-ain.com/,2024/4/15 10:20 GMT
1,6657afc52d8a020654183dac,https://cdn.al-ain.com/sm/images/2024/4/15/273...,አሜሪካ እስራኤል በኢራን ላይ በምትወስደው የአፃፋ እርምጃ እጄን አላስገባ...,https://am.al-ain.com/article/us-israeli-retal...,politics,አል-ዐይን,የእስራኤል የጦር ካቢኔ በኢራን ላይ እርምጃ እንዲወሰድ ተስማምቷል,የእስራኤል የጦር ካቢኔ በኢራን ላይ እርምጃ እንዲወሰድ ተስማምቷል\nአሜሪ...,https://am.al-ain.com/,2024/4/15 7:21 GMT
2,6657afc62d8a020654183dad,https://cdn.al-ain.com/sm/images/2024/4/15/243...,ኢራን በእስራኤል ላይ ጥቃት ካደረሰች በኋላ በመካከለኛው ምስራቅ የበረራ ...,https://am.al-ain.com/article/iran-attack-caus...,politics,አል-ዐይን,ባለፉት ሁለት ቀናት ቢያንስ ከ የሚሆኑ አየር መንገዶች በረራዎችን ሰርዘዋ...,ባለፉት ሁለት ቀናት ቢያንስ ከ የሚሆኑ አየር መንገዶች በረራዎችን ሰርዘዋ...,https://am.al-ain.com/,2024/4/15 6:59 GMT
3,6657afc82d8a020654183dae,https://cdn.al-ain.com/sm/images/2024/4/14/252...,ኢራን በእስራኤል ላይ የፈፀመችውን ጥቃት ተከትሎ ሀገራት ምን አሉ,https://am.al-ain.com/article/iran-israel-atta...,politics,አል-ዐይን,የተመድ ዋና ፀሀፊ አለም ሌላ ተጨማሪ ጦርነት ማስተናገድ አትችልም ብለዋል,የተመድ ዋና ፀሀፊ አለም ሌላ ተጨማሪ ጦርነት ማስተናገድ አትችልም ብለዋል...,https://am.al-ain.com/,2024/4/14 15:19 GMT
4,6657afc92d8a020654183daf,https://cdn.al-ain.com/sm/images/2024/4/14/258...,አሜሪካ ጦሯን ከኒጀር እንድታስወጣ ተጠየቀች,https://am.al-ain.com/article/us-requested-to-...,politics,አል-ዐይን,የሩሲያ ቅጥረኛ ወታደሮች ከሰሞኑ ወደ ኒያሚ ማምራታቸው ይታወሳል,የሩሲያ ቅጥረኛ ወታደሮች ከሰሞኑ ወደ ኒያሚ ማምራታቸው ይታወሳል\nአሜሪካ...,https://am.al-ain.com/,2024/4/14 14:24 GMT


#### Convert your DataFrame into a Hugging Face Dataset:


In [1]:
from datasets import Dataset

# Given the `data` is the MongoDB data
# remove the unnecessary columns
df = df.drop(columns=['_id', 'image_url', 'article_url', 'published_date', 'author', 'source', 'summary'])
df.head()

NameError: name 'df' is not defined

In [43]:

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

In [31]:
dataset

Dataset({
    features: ['title', 'category', 'content'],
    num_rows: 7856
})

#### Split the data into training and test set

In [44]:
# Split the dataset into a train and test set
dataset = dataset.train_test_split(test_size=0.2)

In [45]:
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'category', 'content'],
        num_rows: 6284
    })
    test: Dataset({
        features: ['title', 'category', 'content'],
        num_rows: 1572
    })
})

## Tokenization

In [41]:
import sentencepiece as spm

# Concatenate all the text in your training set into a single file
with open('training_text.txt', 'w') as f:
  for example in dataset['train']:
    f.write(example['title'] + ' ' + example['summary'] + ' ' + example['content'] + '\n')

# Train the SentencePiece model on this file
spm.SentencePieceTrainer.train('--input=training_text.txt --model_prefix=m --vocab_size=2000')

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=training_text.txt --model_prefix=m --vocab_size=2000
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: training_text.txt
  input_format: 
  model_prefix: m
  model_type: UNIGRAM
  vocab_size: 2000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <un

In [43]:

# Load the trained SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load('m.model')

def tokenize_function(batch_of_articles):
  batch_of_texts = [title + ' ' + summary + ' ' + content 
            for title, summary, content 
            in zip(batch_of_articles['title'], batch_of_articles['summary'], batch_of_articles['content'])]
  return {'input_ids': [sp.encode_as_ids(text) for text in batch_of_texts]}

# Tokenize the datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/6284 [00:00<?, ? examples/s]

Map:   0%|          | 0/1572 [00:00<?, ? examples/s]

In [45]:
from torch.utils.data import DataLoader
from transformers import AdamW, LlamaForCausalLM, LlamaTokenizer

checkpoint = "iocuydi/llama-2-amharic-3784m"
commit_hash = "04fcac974701f1dab0b8e39af9d3ecfce07b3773"

cache_dir = "cache_dir"
# The commit hash is needed, because the model repo was rearranged after this commit (files -> finetuned/files),
# and I couldn't load the model from the new structure

# Load the pre-trained model
llama_model = LlamaForCausalLM.from_pretrained(
  "/home/hillary_kipkemoi/hugging_face_models/garri/llama-2-amharic-3784m",
)

# Resize the model's token embeddings
llama_model.resize_token_embeddings(len(sp))

# Create a data loader for your tokenized dataset
# Here, 'input_ids' is assumed to be the key in your tokenized dataset for the tokenized texts
data_loader = DataLoader(tokenized_dataset['train']['input_ids'], batch_size=32)

# Define the optimizer
optimizer = AdamW(llama_model.parameters())

# Define the training loop
for epoch in range(10):  # Number of epochs
  for batch in data_loader:
    # Move the batch tensors to the same device as the model
    batch = batch.to(llama_model.device)

    # Forward pass
    outputs = llama_model(input_ids=batch)

    # Compute the loss
    loss = outputs.loss

    # Backward pass
    loss.backward()

    # Update the model's parameters
    optimizer.step()
    optimizer.zero_grad()

  print(f'Epoch {epoch+1} completed')

OSError: /home/hillary_kipkemoi/hugging_face_models/garri/llama-2-amharic-3784m does not appear to have a file named config.json. Checkout 'https://huggingface.co//home/hillary_kipkemoi/hugging_face_models/garri/llama-2-amharic-3784m/tree/main' for available files.

In [None]:

# Define the training arguments
training_args = TrainingArguments(
  output_dir="./results",
  num_train_epochs=3,
  per_device_train_batch_size=16,
  per_device_eval_batch_size=64,
  warmup_steps=500,
  weight_decay=0.01,
)

# Create the Trainer and train
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=tokenized_dataset["train"],
  eval_dataset=tokenized_dataset["test"],
)

trainer.train()