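This notebook fine-tunes a Llama chat model on a counseling Q&A dataset using low-memory training: the base model is loaded with 4-bit quantization and only small LoRA adapters are trained (QLoRA).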

🧱 1️⃣ Install Dependencies

In [1]:
# Add the fine-tuning stack to the current uv project (shell escape; runs outside the kernel).
! uv add transformers datasets accelerate bitsandbytes peft sentencepiece

[2mResolved [1m198 packages[0m [2min 70ms[0m[0m
[2mAudited [1m187 packages[0m [2min 20ms[0m[0m


🧠 2️⃣ Import Libraries

In [2]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
)
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

  from .autonotebook import tqdm as notebook_tqdm


🧩 3️⃣ Inspect and Load the Data

In [None]:
# Create ./data if needed, then copy the combined dataset JSON from the GCS bucket into it.
! mkdir -p data && gsutil -m cp gs://tusharwagh.appspot.com/data/combined_dataset.json data

In [3]:
from datasets import load_dataset

# Parse the local JSON export with the generic "json" loader; the result is a
# DatasetDict whose rows live under the "train" split.
dataset = load_dataset(path="json", data_files="data/combined_dataset.json")

# Sanity check: split/column summary first, then the first raw record.
print(dataset, dataset["train"][0], sep="\n")

DatasetDict({
    train: Dataset({
        features: ['Context', 'Response'],
        num_rows: 3512
    })
})
{'Context': "I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.\n   How can I change my feeling of being worthless to everyone?", 'Response': "If everyone thinks you're worthless, then maybe you need to find new people to hang out with.Seriously, the social context in which a person lives is a big influence in self-esteem.Otherwise, you can go round and round trying to understand why you're not worthless, then go back to the same crowd and be knocked down again.There are many inspirational messages you can find in social media. \xa0Maybe read some of the ones which state that no person is worthless, and that everyone has a good purpose to their life.Also, since our

🧩 Prepare for Language Modeling

Combine context and response into one conversational string so the model learns counselor-style replies.

In [4]:
def format_conversation(example):
    """Merge one dataset record into a single training string.

    Args:
        example: Mapping with "Context" (client message) and "Response"
            (counselor reply) entries. Either value may be None, e.g. a JSON
            null in the source file; it is then treated as an empty string
            instead of raising AttributeError on `.strip()`.

    Returns:
        dict with a single "text" key holding
        "Client: <context>\nCounselor: <response>", both parts stripped of
        surrounding whitespace.

    Raises:
        KeyError: if the record lacks the "Context" or "Response" column.
    """
    # `or ""` maps None (and other falsy placeholders) to an empty string.
    context = (example["Context"] or "").strip()
    response = (example["Response"] or "").strip()
    return {"text": f"Client: {context}\nCounselor: {response}"}

# Add the combined "text" column to every row of the train split.
# NOTE(review): this rebinds `dataset` from the DatasetDict to its "train"
# split, so the cell is not re-runnable without reloading the data first.
dataset = dataset["train"].map(function=format_conversation)


🦙 6️⃣ Choose Base Llama Model

You can choose any open-source variant:

| Model | Parameters | Notes |
|---|---|---|
| `meta-llama/Meta-Llama-3-8B` | 8B | Best balance of quality/performance |
| `meta-llama/Llama-2-7b-hf` | 7B | Older but lightweight |
| `meta-llama/Meta-Llama-3-70B` | 70B | For multi-GPU clusters |

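For Colab or any other single-GPU setup, stick with the 7B or 8B versions. This notebook loads `NousResearch/Llama-2-7b-chat-hf`, a publicly available Llama-2 7B chat checkpoint.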

⚙️ 8️⃣ Load Model

In [5]:
# Publicly available Llama-2 7B chat checkpoint used as the base model.
model_name = "NousResearch/Llama-2-7b-chat-hf"

# Tokenizer: Llama has no pad_token by default, so EOS doubles as padding.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit quantization settings: NF4 weights, double quantization, bf16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Load the quantized base weights (dtype is governed by bnb_config, so no
# torch_dtype is passed) and prepare the model for QLoRA fine-tuning.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=bnb_config,
    )
)

# Low-rank adapters on the attention query/value projections only.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, lora_config)

Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.10s/it]


🧾 7️⃣ Tokenize Data

In [6]:
def tokenize_function(examples, max_length=1024):
    """Tokenize a batch of formatted conversations.

    Args:
        examples: Batch mapping with a "text" entry (list of strings), as
            passed by `Dataset.map(..., batched=True)`.
        max_length: Maximum number of tokens kept per example; longer texts
            are truncated. Defaults to 1024. Lower it (for example via
            `fn_kwargs={"max_length": 512}` in `Dataset.map`) to cut
            per-step memory on small GPUs.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

# Tokenize in batches and drop every source column (raw Context/Response as
# well as the combined text), leaving only tokenizer outputs. The collator
# then never sees raw string columns, regardless of Trainer's column pruning.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)



⚙️ Data Collator

In [7]:
# Causal-LM batching: mlm=False disables masked-token corruption.
# NOTE(review): the tokenizer's pad token is set to EOS when it is loaded, so
# any label masking keyed on pad_token_id would also cover EOS positions -
# confirm that is intended.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

🚀 🔟 🧮 Training Configuration

In [11]:
# Hyper-parameters tuned for a small GPU: micro-batches of 1 accumulated over
# 32 steps, gradient checkpointing, and bf16 autocast where available.
training_args = TrainingArguments(
    output_dir="./llama3_counseling_domain",
    per_device_train_batch_size=1,   # smallest micro-batch to limit activation memory
    gradient_accumulation_steps=32,  # 32 micro-batches per optimizer step, per device
    learning_rate=1e-5,
    num_train_epochs=2,
    # Run forward/backward under bf16 autocast when the GPU supports it, so
    # activations are not kept in fp32. Falls back to full precision
    # otherwise, because bf16=True raises on unsupported hardware.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
    save_strategy="epoch",
    logging_steps=50,
    report_to="none",
    gradient_checkpointing=True,     # recompute activations in backward to save memory
    optim="adamw_torch",
)

🚀 7️⃣ Start Training

In [12]:
# Wire the LoRA-wrapped model, hyper-parameters, tokenized data and collator
# into the Hugging Face Trainer, then launch fine-tuning.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

trainer.train()


OutOfMemoryError: CUDA out of memory. Tried to allocate 92.00 MiB. GPU 0 has a total capacity of 5.67 GiB of which 73.31 MiB is free. Including non-PyTorch memory, this process has 5.53 GiB memory in use. Of the allocated memory 4.97 GiB is allocated by PyTorch, and 451.39 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

💾 8️⃣ Save the Fine-Tuned Model

In [None]:
# Save the trained weights and the tokenizer side by side. The directory is
# taken from the Trainer's own configuration so it cannot drift from the
# `output_dir` used during training.
save_dir = trainer.args.output_dir
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)


🧪 9️⃣ Test Generation

In [None]:
from transformers import pipeline

# Reuse the 4-bit PEFT model that is already in memory. Building the pipeline
# from the saved directory path passes no quantization config, so the 7B base
# weights would be loaded again unquantized.
# eval() turns off LoRA dropout (lora_dropout=0.05) for generation.
model.eval()
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "Client: I feel anxious and worthless lately. What should I do?\nCounselor:"
print(pipe(prompt, max_new_tokens=100)[0]["generated_text"])
