In [1]:
import pandas as pd
from src.database import MongoDB

# Connect to MongoDB
db_name = 'clean_data'
collection_name = 'kaggle_amharic_corpus_cleaned'
connection_string = 'mongodb://localhost:27017/'

clean_db = MongoDB(db_name=db_name, collection_name=collection_name, connection_string=connection_string)



In [2]:
# Load data into a pandas DataFrame
df = pd.DataFrame(list(clean_db.collection.find()))

df_subset = df[['category', 'content']]

df_subset = df_subset.dropna()

In [3]:
def preprocess(df, max_words):
    # This is where you put your code to preprocess the DataFrame.
    # For example, you might want to limit the number of words in each document.
    df['category'] = df['category'].apply(lambda x: ' '.join(x.split()[:max_words]))
    df = df.dropna()
    return df

In [4]:
import torch

from functools import partial

# from ft_datasets import get_amharic_dataset

DATASET_PREPROC = {
    "amharic_dataset": partial(preprocess, max_words=672)
}

def get_preprocessed_dataset(
    tokenizer, dataset_config, split: str = "train"
) -> torch.utils.data.Dataset:
    if not dataset_config.dataset in DATASET_PREPROC:
        raise NotImplementedError(f"{dataset_config.dataset} is not (yet) implemented")

    def get_split():
        return (
            dataset_config.train_split
            if split == "train"
            else dataset_config.test_split
        )
    
    return DATASET_PREPROC[dataset_config.dataset](
        dataset_config,
        tokenizer,
        get_split(),
    )


In [5]:
from transformers import (
    LlamaForCausalLM, 
    LlamaTokenizer, 
    TrainerCallback, 
    default_data_collator, 
    Trainer, 
    TrainingArguments
)

In [6]:
# Defin the Garri model name
model_name_llama2_7b = '/home/hillary_kipkemoi/hugging_face_models/garri/Llama-2-7b-hf/'

# model_name_garri = '/home/hillary_kipkemoi/hugging_face_models/garri/llama-2-amharic-3784m/finetuned'

model_name_garri = "iocuydi/llama-2-amharic-3784m"
commit_hash = "04fcac974701f1dab0b8e39af9d3ecfce07b3773"


tokenizer = LlamaTokenizer.from_pretrained(model_name_garri, revision=commit_hash)

In [7]:
# Set the padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Now you can use the tokenizer for training

In [8]:
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder

class AmharicDataset(Dataset):
  def __init__(self, data, tokenizer, max_len):
    self.data = data
    self.tokenizer = tokenizer
    self.max_len = max_len

    # Initialize the label encoder
    self.le = LabelEncoder()

    # Fit the label encoder and transform the 'category' column
    self.data['category'] = self.le.fit_transform(self.data['category'])

  def __len__(self):
    return len(self.data)

  def preprocess(self, text):
    inputs = self.tokenizer.encode_plus(
      text,
      None,
      add_special_tokens=True,
      max_length=self.max_len,
      pad_to_max_length=True,
      return_token_type_ids=True
    )
    return inputs

  def __getitem__(self, index):
    if isinstance(index, slice):
      texts = self.data.iloc[index]['content'].tolist()
      inputs = [self.preprocess(text) for text in texts]

      
      labels = torch.tensor(self.data.iloc[index]['category'], dtype=torch.long)
      print(f'Labels shape: {labels.shape}')
      print(f'Labels values: {labels}')

      return [
        {
          'input_ids': torch.tensor(input['input_ids'], dtype=torch.long),
          'attention_mask': torch.tensor(input['attention_mask'], dtype=torch.long),
          'labels': torch.tensor(self.data.iloc[i]['category'], dtype=torch.long)
        }
        for i, input in zip(range(*index.indices(len(self.data))), inputs)
      ]
    else:
      text = str(self.data.iloc[index]['content'])
      inputs = self.preprocess(text)
      return {
        'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
        'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        'labels': torch.tensor(self.data.iloc[index]['category'], dtype=torch.long)
      }
# Load the tokenizer
# tokenizer = LlamaTokenizer.from_pretrained('model_name')  # Replace 'model_name' with the name of the model

# Preprocess the DataFrame
df = DATASET_PREPROC["amharic_dataset"](df)

# Create the dataset
max_length = 672  # Or whatever maximum length you want
train_dataset = AmharicDataset(df, tokenizer, max_length)

In [9]:
# TODO - preprocess by tokenizing on the train dataset
model = LlamaForCausalLM.from_pretrained(
    model_name_llama2_7b, 
    load_in_8bit=True,
    device_map="auto",
    trust_remote_code=True,
    # cache_dir= cache_dir, # optional
)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
from torch.utils.data import DataLoader

# Create your dataset
# dataset = AmharicDataset(data, tokenizer, max_len)

# Create a DataLoader
dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)


# model = ...  # Your model
optimizer = torch.optim.Adam(model.parameters())
loss_fn = torch.nn.CrossEntropyLoss()

for epoch in range(5):
  for batch in dataloader:
    inputs = batch['input_ids']
    labels = batch['labels']

    # Forward pass
    outputs = model(inputs)
    loss = loss_fn(outputs, labels)

    # Backward pass and optimization
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    print(f'Inputs shape: {inputs.shape}')
    print(f'Labels shape: {labels.shape}')
    print(f'Outputs shape: {outputs.shape}')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [34,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [34,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [34,0,0], thread: [2,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [34,0,0], thread: [3,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [10]:
model.train()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (

In [11]:
embedding_size = model.get_input_embeddings().weight.shape[0]

In [12]:
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training, # because prepare_model_for_int8_training is deprectated.
    PeftModel
)

In [13]:

if len(tokenizer) != embedding_size:
    print("resize the embedding size by the size of the tokenizer")
    model.resize_token_embeddings(len(tokenizer))

resize the embedding size by the size of the tokenizer


In [14]:
model = prepare_model_for_kbit_training(model)

In [15]:
model_name_garri = '/home/hillary_kipkemoi/hugging_face_models/garri/llama-2-amharic-3784m/finetuned'


# model = PeftModel.from_pretrained(model, model_name_garri)# this is the model we want:
model = PeftModel.from_pretrained(model, "iocuydi/llama-2-amharic-3784m",revision =commit_hash)

In [16]:
model.print_trainable_parameters()

trainable params: 417,857,536 || all params: 7,331,975,168 || trainable%: 5.6991


In [17]:
lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules = ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
        modules_to_save = ["embed_tokens","lm_head"]
    )

In [18]:
config = {
    'lora_config': lora_config,
    'learning_rate': 1e-4,
    'num_train_epochs': 1,
    'gradient_accumulation_steps': 1,
    'per_device_train_batch_size': 2,
    'gradient_checkpointing': False,
}

In [19]:
OUTPUT_DIR = "./output"

In [20]:
enable_profiler = False

In [21]:
from contextlib import nullcontext


In [22]:
# Set up profiler
if enable_profiler:
    wait, warmup, active, repeat = 1, 1, 2, 1
    total_steps = (wait + warmup + active) * (1 + repeat)
    schedule =  torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=repeat)
    profiler = torch.profiler.profile(
        schedule=schedule,
        on_trace_ready=torch.profiler.tensorboard_trace_handler(f"{OUTPUT_DIR}/logs/tensorboard"),
        record_shapes=True,
        profile_memory=True,
        with_stack=True)

    class ProfilerCallback(TrainerCallback):
        def __init__(self, profiler):
            self.profiler = profiler

        def on_step_end(self, *args, **kwargs):
            self.profiler.step()

    profiler_callback = ProfilerCallback(profiler)
else:
    profiler = nullcontext()

In [23]:
# Define training args
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # overwrite_OUTPUT_DIR=True,
    fp16=True,  # Use BF16 if available
    # logging strategies
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=1,
    warmup_ratio=0.03,
    optim="adamw_torch_fused",
    max_steps=total_steps if enable_profiler else -1,
    **{k:v for k,v in config.items() if k != 'lora_config'}
)

In [24]:
def print_trainable_parameters(model):
    print("Trainable Parameters:")
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f" - {name}")

In [25]:
print

<function print>

In [26]:
print(train_dataset[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': tensor([    1, 45948, 35989, 32943, 50737, 45852, 34170, 36216, 44133, 40116,
        50184, 46251, 41020, 34332, 32185, 41164, 29871, 32029, 45611, 32023,
        33940, 32076, 32304, 32276, 39452, 37397, 34089, 37958, 50738, 50809,
        50813, 33518, 47248, 40159, 37088, 33749, 32043, 32304, 32276, 33178,
        46891, 50740, 32468, 41482, 36956, 33222, 35184, 50744, 32853, 32018,
        48128, 43886, 33156, 32146, 42712, 50740, 32180, 50792, 46021, 33573,
        32043, 40159, 48694, 37831, 34626, 46228, 49327, 35315, 36504, 47562,
        50926, 50737, 46151, 50820, 32470, 35832, 32273, 39355, 35593, 43345,
        50757, 44384, 43799, 40836, 32180, 50792, 40016, 50747, 32380, 40582,
        50820, 32147, 50766, 41314, 36831, 33098, 32398, 32941, 36126, 35553,
        32263, 37078, 38056, 33020, 32522, 44899, 32415, 32027, 32020, 47839,
        34983, 37074, 50737, 32934, 40849, 34400, 32144, 41177, 33457, 32370,
        32401, 34979, 49500, 42412, 32200, 33829, 



In [27]:
samples = [train_dataset[i] for i in range(5)]
print(default_data_collator(samples))

{'input_ids': tensor([[    1, 45948, 35989,  ...,     2,     2,     2],
        [    1, 32176, 50745,  ...,     2,     2,     2],
        [    1, 32304, 32276,  ...,     2,     2,     2],
        [    1, 47900, 32304,  ..., 50799, 34343, 32547],
        [    1, 39480, 32557,  ...,     2,     2,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([6, 3, 3, 1, 5])}


In [28]:
print(default_data_collator(train_dataset[:5]))

Labels shape: torch.Size([5])
Labels values: tensor([6, 3, 3, 1, 5])
{'input_ids': tensor([[    1, 45948, 35989,  ...,     2,     2,     2],
        [    1, 32176, 50745,  ...,     2,     2,     2],
        [    1, 32304, 32276,  ...,     2,     2,     2],
        [    1, 47900, 32304,  ..., 50799, 34343, 32547],
        [    1, 39480, 32557,  ...,     2,     2,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([6, 3, 3, 1, 5])}


In [29]:
print(train_dataset[:5])

Labels shape: torch.Size([5])
Labels values: tensor([6, 3, 3, 1, 5])
[{'input_ids': tensor([    1, 45948, 35989, 32943, 50737, 45852, 34170, 36216, 44133, 40116,
        50184, 46251, 41020, 34332, 32185, 41164, 29871, 32029, 45611, 32023,
        33940, 32076, 32304, 32276, 39452, 37397, 34089, 37958, 50738, 50809,
        50813, 33518, 47248, 40159, 37088, 33749, 32043, 32304, 32276, 33178,
        46891, 50740, 32468, 41482, 36956, 33222, 35184, 50744, 32853, 32018,
        48128, 43886, 33156, 32146, 42712, 50740, 32180, 50792, 46021, 33573,
        32043, 40159, 48694, 37831, 34626, 46228, 49327, 35315, 36504, 47562,
        50926, 50737, 46151, 50820, 32470, 35832, 32273, 39355, 35593, 43345,
        50757, 44384, 43799, 40836, 32180, 50792, 40016, 50747, 32380, 40582,
        50820, 32147, 50766, 41314, 36831, 33098, 32398, 32941, 36126, 35553,
        32263, 37078, 38056, 33020, 32522, 44899, 32415, 32027, 32020, 47839,
        34983, 37074, 50737, 32934, 40849, 34400, 32144, 4

In [30]:
with profiler:
    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=default_data_collator,
        callbacks=[profiler_callback] if enable_profiler else [],
    )

    print_trainable_parameters(model)

    # Start training
    trainer.train()

model.save_pretrained(OUTPUT_DIR)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Trainable Parameters:
 - base_model.model.model.embed_tokens.modules_to_save.default.weight
 - base_model.model.lm_head.modules_to_save.default.weight




ValueError: Expected input batch_size (1342) to match target batch_size (1).