In [1]:
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

from typing import List
# Prefer the CUDA device when one is usable; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [2]:
# Load the tokenizer and the Llama 3.2 (3B) base model.
# The tokenizer is loaded from a different location than the model checkpoint,
# so its vocabulary is not guaranteed to match the model's embedding matrix.
tokenizer_name = "new-llama-tokenizer"
model_name = "meta-llama/Llama-3.2-3b"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Padding (at tokenization time and in the data collator) requires a pad token.
# Fall back to EOS when the tokenizer does not define one. Note that the
# language-modeling collator masks pad positions in the labels, so with this
# fallback EOS positions are masked as well.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Grow the embedding matrix if the tokenizer can emit ids beyond it; otherwise
# the embedding lookup would index out of range during training. The matrix is
# never shrunk here, so a setup that already works is left untouched.
embedding_rows = model.get_input_embeddings().num_embeddings
if len(tokenizer) > embedding_rows:
    model.resize_token_embeddings(len(tokenizer))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
def prepare_datasets(
    datasets_list: List[str],
    splits: tuple = ("train", "test", "validation"),
    text_column: str = "text",
):
    """Load several datasets and merge their text into one dataset.

    Every requested split of every dataset that loads successfully is reduced
    to its ``text_column`` and the pieces are concatenated in the order they
    were encountered.

    Args:
        datasets_list: Dataset identifiers passed one by one to ``load_dataset``.
        splits: Split names to collect from each dataset; splits a dataset does
            not have are ignored.
        text_column: Name of the only column that is kept.

    Returns:
        The result of ``concatenate_datasets`` over all collected splits.

    Raises:
        ValueError: If no split with a ``text_column`` could be collected.
    """
    text_only_parts = []
    for dataset_name in datasets_list:
        try:
            data = load_dataset(dataset_name)
        except Exception as exc:
            # Catch `Exception` rather than using a bare `except` so that
            # Ctrl-C / kernel interrupts still stop the cell, and report the
            # real cause instead of always claiming "not found".
            print(f"dataset: `{dataset_name}` could not be loaded ({exc!r}), skipping...")
            continue

        for split in splits:
            if split not in data:
                continue
            part = data[split]
            if text_column not in part.column_names:
                # Dropping every column would leave nothing useful to
                # concatenate, so skip the split and say so.
                print(
                    f"dataset: `{dataset_name}` split `{split}` has no "
                    f"`{text_column}` column, skipping..."
                )
                continue
            extra_columns = [col for col in part.column_names if col != text_column]
            text_only_parts.append(part.remove_columns(extra_columns))

    if not text_only_parts:
        raise ValueError(
            f"no usable `{text_column}` data found in datasets: {list(datasets_list)}"
        )

    return concatenate_datasets(text_only_parts)

In [4]:
# Load your dataset (replace with your dataset names)
hf_datasets = ["yakhyo/uz-wiki", "yakhyo/uz-news"]

dataset = prepare_datasets(hf_datasets)

# Hold out 10% for evaluation. The split shuffles, so pin the seed to get the
# same train/test partition on every Restart & Run All.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Tokenize the data
def tokenize_function(examples):
    """Tokenize a batch of examples, truncating each text to 512 tokens.

    No padding is applied here: padding every example to the maximum length
    would store and process 512 tokens per row regardless of the actual text
    length. Batches are padded later by the data collator instead.

    Args:
        examples: A batch from the dataset; only its "text" field is read.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Drop the raw text once it is tokenized so only model inputs are kept.
tokenized_datasets = split_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

Map:   0%|          | 0/731325 [00:00<?, ? examples/s]

Map:   0%|          | 0/81259 [00:00<?, ? examples/s]

In [5]:
# Collator that assembles batches for causal language modeling
from transformers import DataCollatorForLanguageModeling

# mlm=False selects the causal (non-masked) language-modeling objective
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

# Training arguments - adjusted for single-device setup
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # current name; `evaluation_strategy` is deprecated
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,  # Adjusted for smaller memory
    per_device_eval_batch_size=2,   # Adjusted for smaller memory
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
    push_to_hub=False,  # Disable this if not using Hugging Face Hub
    fp16=torch.cuda.is_available(),  # Enable mixed precision if using GPU
    gradient_accumulation_steps=8,  # Use gradient accumulation for smaller batches
    # Recompute activations in the backward pass instead of keeping them all in
    # memory: slower steps, lower peak GPU memory.
    # NOTE(review): this only reduces activation memory. Weights, gradients and
    # optimizer state of a fully fine-tuned 3B model are unaffected, so on a
    # small GPU this alone may not be enough to avoid out-of-memory errors.
    gradient_checkpointing=True,
)

# Build the Trainer from the model, its arguments, both data splits,
# the batching collator and the tokenizer
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_dataset=tokenized_datasets["test"],
    train_dataset=tokenized_datasets["train"],
)

# Run the fine-tuning loop
trainer.train()

# Write the resulting model to disk
trainer.save_model(output_dir="./llama-3.2-3b-finetuned")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/yakhyo/.netrc


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 