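This notebook fine-tunes a Llama-2 chat model with 4-bit quantization and LoRA adapters (QLoRA) so that training fits in limited GPU memory.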

🧱 1️⃣ Install Dependencies

In [1]:
!pip install -q transformers datasets accelerate bitsandbytes peft sentencepiece

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25h

🧠 2️⃣ Import Libraries

In [8]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
)
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

🧩 3️⃣ Inspect and Load the Data

In [4]:
from datasets import load_dataset

# Read the uploaded JSON file through the generic "json" loader.
data_files = "/content/combined_dataset.json"
dataset = load_dataset("json", data_files=data_files)

# Sanity check: print the split layout, then the first training record.
print(dataset)
print(dataset["train"][0])

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Context', 'Response'],
        num_rows: 3512
    })
})
{'Context': "I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.\n   How can I change my feeling of being worthless to everyone?", 'Response': "If everyone thinks you're worthless, then maybe you need to find new people to hang out with.Seriously, the social context in which a person lives is a big influence in self-esteem.Otherwise, you can go round and round trying to understand why you're not worthless, then go back to the same crowd and be knocked down again.There are many inspirational messages you can find in social media. \xa0Maybe read some of the ones which state that no person is worthless, and that everyone has a good purpose to their life.Also, since our

🧩 4️⃣ Prepare for Language Modeling

Combine context and response into one conversational string so the model learns counselor-style replies.

In [5]:
def format_conversation(example):
    """Merge one record into a single "Client: ... / Counselor: ..." training string.

    Args:
        example: Mapping with "Context" (the client's message) and "Response"
            (the counselor's reply). Either value may be None, which is treated
            as an empty string instead of raising AttributeError on `.strip()`.

    Returns:
        A dict with one key, "text", holding the formatted conversation with
        surrounding whitespace stripped from both parts.
    """
    context = (example["Context"] or "").strip()
    response = (example["Response"] or "").strip()
    return {"text": f"Client: {context}\nCounselor: {response}"}

# `dataset` is rebound below, so make this cell safe to re-run: on the first run it is
# the DatasetDict returned by load_dataset (a dict subclass); on later runs it is
# already the train split and must not be indexed with "train" again.
train_split = dataset["train"] if isinstance(dataset, dict) else dataset

# Drop records whose question or answer is missing/blank: they would either crash
# `.strip()` or yield a training example with nothing to learn from.
train_split = train_split.filter(
    lambda row: bool((row["Context"] or "").strip()) and bool((row["Response"] or "").strip())
)

dataset = train_split.map(format_conversation)


Map:   0%|          | 0/3512 [00:00<?, ? examples/s]

🦙 5️⃣ Choose Base Llama Model

You can choose any open-source variant:

    Model	                         Parameters	   Notes
    "meta-llama/Meta-Llama-3-8B"	  8B	      Best balance of quality/performance
    "meta-llama/Llama-2-7b-hf"	      7B	      Older but lightweight
    "meta-llama/Meta-Llama-3-70B"	  70B	      For multi-GPU clusters

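For Colab, stick with the 8B or 7B versions. This notebook loads "NousResearch/Llama-2-7b-chat-hf", a publicly available Llama-2 7B chat checkpoint.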

⚙️ 6️⃣ Load Model

In [11]:


model_name = "NousResearch/Llama-2-7b-chat-hf"  # publicly available Llama-2 chat checkpoint

# bf16 is only native on GPUs with compute capability >= 8.0 (Ampere or newer).
# On older cards (e.g. a Colab T4) compute in fp16 instead, which also matches
# fp16 mixed-precision training. The capability check is used rather than
# torch.cuda.is_bf16_supported(), which may report emulated support.
_has_native_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
compute_dtype = torch.bfloat16 if _has_native_bf16 else torch.float16

# 4-bit quantization config: NF4 weights with double quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)


# Tokenizer: reuse EOS as the padding token so batches can be padded
# (the original author notes Llama has no pad_token by default).
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Base model: loaded with the 4-bit quantization settings from bnb_config;
# device placement is left to device_map="auto". No explicit torch_dtype is
# passed because the dtype is handled by the BitsAndBytesConfig.
load_kwargs = dict(quantization_config=bnb_config, device_map="auto")
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

# Get the quantized model ready for QLoRA fine-tuning.
model = prepare_model_for_kbit_training(model)

# Low-rank adapter settings: rank-8 adapters (alpha 16, dropout 0.05, no bias
# training) attached to the modules named "q_proj" and "v_proj".
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)

# Wrap the prepared base model with the adapters.
model = get_peft_model(model, lora_config)

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

🧾 7️⃣ Tokenize Data

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of formatted conversations.

    Args:
        examples: Batch mapping whose "text" entry holds the conversation strings.

    Returns:
        The tokenizer output for those strings, truncated to at most 1024 tokens.

    NOTE(review): no EOS token is appended here and the pad token is set to EOS
    elsewhere in this notebook, so the model may never see an end-of-reply
    target -- confirm whether that is intended.
    """
    texts = examples["text"]
    return tokenizer(texts, max_length=1024, truncation=True)

# Remove every original column (not just "text") so only the tokenizer outputs
# remain. The raw "Context"/"Response" strings cannot be collated into tensors,
# and keeping them would rely on the Trainer silently pruning unused columns.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)



⚙️ 8️⃣ Data Collator

In [13]:
# Collator for language modeling with masked-LM disabled (mlm=False).
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

🧮 9️⃣ Training Configuration

In [14]:
# Use the mixed-precision mode the GPU supports natively: bf16 on compute
# capability >= 8.0 (Ampere or newer), otherwise fp16 as before. This keeps the
# training precision in line with a bf16 quantization compute dtype on GPUs
# that actually support it.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

training_args = TrainingArguments(
    # NOTE: the directory name says "llama3", but the base model loaded in this
    # notebook is a Llama-2 checkpoint.
    output_dir="./llama3_counseling_domain",
    per_device_train_batch_size=1,   # one sequence per step to keep memory low
    gradient_accumulation_steps=32,  # 1 x 32 sequences per optimizer step per device
    # NOTE(review): LoRA adapters are often trained with a larger learning rate
    # than this -- confirm 1e-5 is intentional.
    learning_rate=1e-5,
    num_train_epochs=2,
    fp16=not use_bf16,
    bf16=use_bf16,
    save_strategy="epoch",
    logging_steps=50,
    report_to="none",
    gradient_checkpointing=True,     # recompute activations to save memory
    optim="adamw_torch",
)

🚀 🔟 Start Training

In [None]:
# Bundle the adapter-wrapped model, training arguments, tokenized data and
# collator into a Trainer, then run the training loop.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
trainer = Trainer(**trainer_kwargs)

trainer.train()


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
50,2.4471


💾 1️⃣1️⃣ Save the Fine-Tuned Model

In [None]:
# Write the trained model and the tokenizer files into the same directory.
save_dir = "./llama3_counseling_domain"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)


🧪 1️⃣2️⃣ Test Generation

In [None]:
from transformers import pipeline

# Generate with the fine-tuned model that is already in memory. Pointing the
# pipeline at the output directory would instantiate another model from disk
# (without the 4-bit quantization config used here) while the trained one still
# occupies the GPU.
model.eval()                   # turn off dropout (including LoRA dropout) for inference
model.config.use_cache = True  # training with gradient checkpointing set use_cache=False

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Same "Client: ... / Counselor:" layout as the training text, left open for the reply.
prompt = "Client: I feel anxious and worthless lately. What should I do?\nCounselor:"
print(pipe(prompt, max_new_tokens=100)[0]["generated_text"])
