In [None]:
!pip install --upgrade transformers



In [None]:
# Install required packages
!pip install transformers datasets torch accelerate bitsandbytes wandb



Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Hugging Face login
import os
from getpass import getpass

from huggingface_hub import login

# Take the access token from the HF_TOKEN environment variable, or prompt for
# it without echoing, so the secret is never written into the notebook itself.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [None]:
# import torch
# from transformers import pipeline

# model_id = "meta-llama/Llama-3.2-3B-Instruct"
# pipe = pipeline(
#     "text-generation",
#     model=model_id,
#     torch_dtype=torch.bfloat16,
#     device_map="auto",
# )
# messages = [
#     {"role": "system", "content": "You are the AI chatbot for Veritas University. You will reply formally and give helpful advice and directions to students and staff!"},
#     {"role": "user", "content": "Who are you?"},
# ]
# outputs = pipe(
#     messages,
#     max_new_tokens=256,
# )
# print(outputs[0]["generated_text"][-1])


In [None]:
# Required imports for handling data and model training
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import torch
import pandas as pd
from typing import Dict
import os

# Function to load and preprocess the parquet dataset
def load_parquet_data(parquet_path: str) -> Dataset:
    """
    Load a parquet file and convert it to a HuggingFace ``Dataset``.

    Args:
        parquet_path: Path to the parquet file. It must contain a 'text'
            column holding the training data.

    Returns:
        A ``Dataset`` built from the full contents of the parquet file.

    Raises:
        ValueError: If the parquet file has no 'text' column.
    """
    # Read parquet file using pandas
    df = pd.read_parquet(parquet_path)

    # Fail fast with a readable message instead of a KeyError raised later
    # from inside the tokenization step.
    if "text" not in df.columns:
        raise ValueError(
            f"{parquet_path!r} has no 'text' column; "
            f"found columns: {list(df.columns)}"
        )

    # Convert DataFrame to HuggingFace Dataset
    return Dataset.from_pandas(df)


In [None]:

# Function to tokenize the dataset
def tokenize_function(examples: Dict, tokenizer: AutoTokenizer) -> Dict:
    """
    Run the given tokenizer over the ``"text"`` entry of ``examples``.

    Every sequence is padded to, and truncated at, 512 tokens, and the
    tokenizer is asked to return PyTorch tensors. Whatever the tokenizer
    returns is passed straight back to the caller.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,  # Adjust based on your GPU memory
        "return_tensors": "pt",
    }
    texts = examples["text"]
    return tokenizer(texts, **encoding_options)

# Main training setup function
def setup_training(
    model_name: str = "meta-llama/Llama-3.2-3B-Instruct",
    parquet_path: str = "/content/drive/MyDrive/Colab Notebooks/AI_ML/Veritas_Ai/veritas_university_data.parquet",
    output_dir: str = "fine_tuned_model"
):
    """
    Build a ``Trainer`` that fine-tunes a causal language model on a parquet dataset.

    Args:
        model_name: Hub id or local path of the model and tokenizer to load.
        parquet_path: Parquet file with a 'text' column of training examples.
        output_dir: Directory passed to ``TrainingArguments`` for checkpoints
            and the saved model.

    Returns:
        A configured ``Trainer``; nothing is trained until ``.train()`` is called.

    NOTE(review): this is full-parameter fine-tuning. The run recorded in this
    notebook ended with a CUDA out-of-memory error on a ~15 GiB GPU using the
    default 3B model, so a larger GPU or a parameter-efficient method is
    probably required for the defaults to succeed.
    """
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        padding_side="right",
        use_fast=True
    )

    # Add padding token if not present
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Choose ONE precision and use it for both the weights and the Trainer.
    # fp16 mixed precision scales/unscales gradients and expects float32
    # master weights; combining it with bfloat16 weights is inconsistent and
    # can fail when gradients are unscaled. So: bf16 weights + bf16 training
    # where the GPU supports it, otherwise float32 weights with fp16 autocast
    # (or plain float32 when no GPU is present).
    cuda_available = torch.cuda.is_available()
    use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
    use_fp16 = cuda_available and not use_bf16
    model_dtype = torch.bfloat16 if use_bf16 else torch.float32

    # Load the model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=model_dtype,
        device_map="auto"  # Automatically handle model placement on available devices
    )

    # Load and preprocess the dataset
    dataset = load_parquet_data(parquet_path)

    # Tokenize the dataset; the raw columns are dropped so only the
    # tokenizer's outputs reach the data collator.
    tokenized_dataset = dataset.map(
        lambda x: tokenize_function(x, tokenizer),
        batched=True,
        remove_columns=dataset.column_names
    )

    # Setup training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,  # Adjust based on your GPU memory
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        learning_rate=2e-5,
        # NOTE(review): warmup/logging/save intervals are counted in optimizer
        # steps; with a very small dataset the run may finish before any of
        # them is reached - confirm these suit the dataset size.
        warmup_steps=100,
        logging_steps=100,
        save_steps=500,
        bf16=use_bf16,  # Mixed precision flags match the weight dtype chosen above
        fp16=use_fp16,
        gradient_checkpointing=True,  # Enable gradient checkpointing for memory efficiency
        save_total_limit=2,  # Keep only the last 2 checkpoints
    )

    # Initialize the trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False  # We're doing causal language modeling, not masked
        )
    )

    return trainer


In [None]:

# Main execution
def main():
    """
    Run the fine-tuning workflow end to end.

    Starts a Weights & Biases run when ``wandb`` is importable, builds the
    trainer with ``setup_training()``, trains, saves the model (and the
    tokenizer, when the data collator exposes one) to the trainer's output
    directory, and always closes the W&B run afterwards.
    """
    # Initialize wandb for experiment tracking (optional but recommended)
    try:
        import wandb
    except ImportError:
        wandb = None
        print("Wandb not installed. Skipping logging.")
    else:
        wandb.init(project="llama-finetuning")

    exit_code = 1  # Reported to W&B unless every step below succeeds
    try:
        # Setup the training
        trainer = setup_training()

        # Start training
        trainer.train()

        # Save the final model
        trainer.save_model()

        # The Trainer was not handed the tokenizer directly, so save_model()
        # writes only the model. Save the collator's tokenizer next to it so
        # the output directory can be loaded on its own.
        tokenizer = getattr(trainer.data_collator, "tokenizer", None)
        if tokenizer is not None:
            tokenizer.save_pretrained(trainer.args.output_dir)

        exit_code = 0
    finally:
        # In a notebook the run otherwise stays open for the kernel's lifetime.
        if wandb is not None:
            wandb.finish(exit_code=exit_code)

if __name__ == "__main__":
    main()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 502.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 415.06 MiB is free. Process 2281 has 14.34 GiB memory in use. Of the allocated memory 13.45 GiB is allocated by PyTorch, and 776.62 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)