In [1]:
from src.database import MongoDB
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

In [2]:
db_name = 'clean_data'
collection_name = 'alain_news_clean'
connection_string = 'mongodb://localhost:27017/'
amharic_db = MongoDB(db_name=db_name, collection_name=collection_name, connection_string=connection_string)

In [3]:
data = list(amharic_db.collection.find({}))

In [4]:
texts = [item['content'] for item in data]
labels = [item['category'] for item in data]

In [5]:
# Split the data into training and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

 Now we have our training and test data:
- train_texts, train_labels: training data
- test_texts, test_labels: test data

In [14]:
model = AutoModelForSequenceClassification.from_pretrained('iocuydi/llama-2-amharic-3784m', num_labels=3)  # Adjust num_labels according to your classification task




OSError: iocuydi/llama-2-amharic-3784m does not appear to have a file named config.json. Checkout 'https://huggingface.co/iocuydi/llama-2-amharic-3784m/tree/main' for available files.

In [6]:
# Load the pre-trained model and tokenizer
# model = AutoModelForSequenceClassification.from_pretrained('allenai/led-base-16384', num_labels=3)  # Adjust num_labels according to your classification task


  return self.fget.__get__(instance, owner)()
Some weights of LEDForSequenceClassification were not initialized from the model checkpoint at allenai/led-base-16384 and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
tokenizer = AutoTokenizer.from_pretrained('allenai/led-base-16384')

In [8]:
# Tokenize the training data
train_encodings = tokenizer(train_texts, truncation=True, padding=True)

# Tokenize the test data
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

Then, we can create PyTorch Datasets for our training and test sets:

In [9]:
class CustomDataset(Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

In [10]:
from sklearn.preprocessing import LabelEncoder

# Instantiate the encoder
le = LabelEncoder()

# Fit the encoder and transform the labels
train_labels = le.fit_transform(train_labels)
test_labels = le.transform(test_labels)

This CustomDataset class is a PyTorch Dataset that takes in the tokenized encodings and the labels as input, and outputs the corresponding encoding and label for a given index when accessed. It also provides the total length of the dataset.

In [11]:
train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

After creating our training and test datasets, the next steps are to define our training arguments, create a Trainer, and then train our model. Here's how we might do this:

In [12]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:500'

In [13]:
from transformers import TrainingArguments, Trainer

import torch
import gc

# Clear PyTorch's CUDA cache
torch.cuda.empty_cache()

# Collect Python's garbage
gc.collect()

# Define the training arguments
training_args = TrainingArguments(
  output_dir='./results',          # output directory
  num_train_epochs=3,              # total number of training epochs
  per_device_train_batch_size=2,  # batch size per device during training
  warmup_steps=500,                # number of warmup steps for learning rate scheduler
  weight_decay=0.01,               # strength of weight decay
  logging_dir='./logs',            # directory for storing logs
)

# Create the Trainer and train
trainer = Trainer(
  model=model,                         # the instantiated 🤗 Transformers model to be trained
  args=training_args,                  # training arguments, defined above
  train_dataset=train_dataset,         # training dataset
  eval_dataset=test_dataset,           # evaluation dataset
)

trainer.train()

# Clear PyTorch's CUDA cache
torch.cuda.empty_cache()

# Collect Python's garbage
gc.collect()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.91 GiB (GPU 0; 14.76 GiB total capacity; 12.66 GiB already allocated; 1.25 GiB free; 12.84 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

This will train the model on your training data and evaluate it on your test data. You can adjust the parameters as needed for your specific use case.

After training, you can save your model using trainer.save_model(). If you want to make predictions on new data, you can load your trained model using AutoModelForSequenceClassification.from_pretrained(), tokenize your new data, and then use model.predict() to make predictions.