This version performs adapter-based “continued pretraining” (unsupervised QLoRA fine-tuning) — it will run even on small GPUs like a T4 (Colab/Kaggle).

1️⃣ Install dependencies

In [1]:
# Install the libraries this notebook needs (-q keeps pip output quiet).
!pip install -q --upgrade pip
!pip install -q transformers datasets bitsandbytes accelerate peft huggingface_hub sentencepiece
# NOTE(review): no package versions are pinned above, so every run installs
# whatever pip resolves at that time. TODO: pin transformers / bitsandbytes /
# peft to known-good versions for reproducible runs.

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m45.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25h

 2️⃣ Imports and Hugging Face login

In [4]:
import os
from huggingface_hub import login, whoami
from google.colab import userdata

In [5]:
# Read the Hugging Face token from the Colab secret store (google.colab.userdata)
# so it never has to be pasted into the notebook itself.
hf_token = userdata.get('HF_TOKEN')
if hf_token:
    login(token=hf_token)
    print("✅ Logged in as:", whoami().get("name"))
else:
    # Previously an empty secret was skipped without any message, so the
    # problem only surfaced later as a download/authentication error.
    print("⚠️ HF_TOKEN is empty — continuing without Hugging Face login.")

✅ Logged in as: twagh


In [6]:
# Core libraries
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch


3️⃣ Configurations

In [7]:
# ---------------- USER CONFIG ----------------
# All tunable settings for the run live in this cell.
model_name = "facebook/opt-1.3b"       # <- Use an open model (fits on small GPUs)
# NOTE(review): dataset_path is reassigned in the dataset-loading cell, so the
# value set here is not the path that actually gets loaded.
dataset_path = "combined_dataset.json" # <- Your uploaded dataset
# NOTE(review): output_dir is reassigned to "model" in the save cell further down.
output_dir = "./q_lora_continued_pretraining"

block_size = 1024          # tokens per chunk
per_device_train_batch_size = 2
gradient_accumulation_steps = 8   # 2 x 8 = 16 sequences per optimizer step per device
num_train_epochs = 1
learning_rate = 2e-4
save_steps = 500           # checkpoint interval, in optimizer steps
logging_steps = 50         # loss logging interval, in optimizer steps

print("✅ Config loaded for model:", model_name)


✅ Config loaded for model: facebook/opt-1.3b


 4️⃣ Load and prepare dataset

In [8]:
# Create ./data (relative to the current working directory) and copy the dataset JSON into it from the GCS bucket.
! mkdir -p data && gsutil -m cp -r gs://tusharwagh.appspot.com/data/combined_dataset.json data

Copying gs://tusharwagh.appspot.com/data/combined_dataset.json...
- [1/1 files][  4.6 MiB/  4.6 MiB] 100% Done                                    
Operation completed over 1 objects/4.6 MiB.                                      


In [12]:
# Load the labeled dataset (Context + Response records) from the JSON file.
# The path is relative so it always points at the same ./data folder the
# download cell writes to, instead of assuming the working directory is
# Colab's /content.
dataset_path = "data/combined_dataset.json"
ds = load_dataset("json", data_files=dataset_path)
print("Dataset splits:", ds.keys())

def merge(example):
    """Join one record's context and response into a single text field.

    Looks up "Context"/"context" and "Response"/"response" (capitalised key
    first) and treats a missing or falsy value as an empty string. Both parts
    are stripped, joined with a newline, and the result is stripped again so
    that a missing part leaves no stray newline.

    Returns:
        A dict with the single key "text".
    """
    def _pick(*keys):
        # First truthy value among the candidate keys, otherwise "".
        for key in keys:
            value = example.get(key)
            if value:
                return value
        return ""

    context = _pick("Context", "context").strip()
    response = _pick("Response", "response").strip()
    return {"text": f"{context}\n{response}".strip()}

# Replace every original column with the single merged "text" column.
# The list of columns to drop is read from the first split.
ds = ds.map(
    merge,
    remove_columns=ds[next(iter(ds.keys()))].column_names,
)
print(ds)


Generating train split: 0 examples [00:00, ? examples/s]

Dataset splits: dict_keys(['train'])


Map:   0%|          | 0/3512 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 3512
    })
})


5️⃣ Load model + tokenizer (4-bit quantized)

In [13]:
# Tokenizer: fall back to EOS as the padding token if the checkpoint defines none.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16"
)

print("⏳ Loading model in 4-bit...")
# trust_remote_code is deliberately left at its default (disabled): the
# configured model is supported natively, and enabling the flag would let a
# model repository execute its own Python code on this machine. Only turn it
# on for a model you have reviewed and that actually requires it.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Training runs with gradient checkpointing, which cannot use the generation
# KV cache (the trainer otherwise logs "`use_cache=True` is incompatible with
# gradient checkpointing"). Re-enable use_cache before running inference.
model.config.use_cache = False

model = prepare_model_for_kbit_training(model)
print("✅ Model prepared for k-bit training.")


tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

⏳ Loading model in 4-bit...


pytorch_model.bin:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

✅ Model prepared for k-bit training.


 6️⃣ Add LoRA adapters

In [14]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: rank 16, alpha 32, applied only to the modules named
# q_proj and v_proj, with 5% dropout and no bias training.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the prepared model with the adapters.
# NOTE(review): this rebinds `model`, so re-running this cell without
# reloading the base model wraps an already-wrapped model.
model = get_peft_model(model, lora_config)

# Report how many parameters require gradients versus the total that
# .numel() counts on the loaded model.
# NOTE(review): with 4-bit loading the total presumably counts packed storage
# elements rather than the nominal parameter count — confirm before quoting it.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"✅ LoRA adapters added: {trainable:,} / {total:,} trainable parameters")


✅ LoRA adapters added: 3,145,728 / 714,924,032 trainable parameters


7️⃣ Tokenize + create LM chunks

In [16]:
def tokenize_function(examples):
    """Tokenize a batch of texts without truncation.

    Long texts are kept whole here; cutting into block_size chunks is done
    afterwards by group_texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=False)

# Tokenize in batches and drop the raw "text" column so only token columns remain.
tokenized = ds.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

def group_texts(examples):
    """Concatenate a batch of tokenized examples and cut it into fixed-size chunks.

    Args:
        examples: mapping from column name (must include "input_ids") to a
            list of per-example token lists, as passed by a batched ``map``.

    Returns:
        A dict with the same keys, where each value is a list of chunks of
        exactly ``block_size`` items, plus a "labels" key holding an
        independent copy of every "input_ids" chunk. Trailing tokens that do
        not fill a whole chunk are dropped, so the result can be empty.
    """
    # Flatten with a comprehension: sum(v, []) builds a new list on every
    # addition and is quadratic in the number of examples in the batch.
    concatenated = {
        k: [token for sequence in v for token in sequence]
        for k, v in examples.items()
    }
    total_length = len(concatenated["input_ids"])
    # Round down to a whole number of blocks.
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i:i+block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    # Causal LM: labels are the inputs themselves (copied so they can be
    # modified independently).
    result["labels"] = [ids.copy() for ids in result["input_ids"]]
    return result

# Pack the tokenized examples into block_size-token training chunks (group_texts also adds "labels").
# With batched=True, group_texts sees one batch at a time, so leftover tokens
# that do not fill a whole block at the end of each batch are dropped.
lm_dataset = tokenized.map(group_texts, batched=True)
print("✅ LM dataset ready:", lm_dataset)


Map:   0%|          | 0/3512 [00:00<?, ? examples/s]

Map:   0%|          | 0/3512 [00:00<?, ? examples/s]

✅ LM dataset ready: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1028
    })
})


8️⃣ Trainer setup

In [17]:
# Collator for causal language modeling (mlm=False: no masked-token objective).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# All values come from the USER CONFIG cell; fp16 mixed precision is enabled
# and external experiment reporting is turned off.
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    fp16=True,
    logging_steps=logging_steps,
    save_steps=save_steps,
    save_total_limit=2,
    remove_unused_columns=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    # Train on the first (here: only) split of the chunked dataset.
    train_dataset=lm_dataset[list(lm_dataset.keys())[0]],
    data_collator=data_collator,
    # `processing_class` replaces the deprecated `tokenizer=` argument, which
    # emitted a warning when this cell ran.
    processing_class=tokenizer,
)

print("✅ Trainer initialized — ready to train!")


✅ Trainer initialized — ready to train!


  trainer = Trainer(


9️⃣ Train

In [18]:
# Run training with the settings from TrainingArguments; the returned
# TrainOutput (shown as the cell result) carries the step count, loss and metrics.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
50,2.6312


TrainOutput(global_step=65, training_loss=2.611324721116286, metrics={'train_runtime': 599.2037, 'train_samples_per_second': 1.716, 'train_steps_per_second': 0.108, 'total_flos': 7653441367179264.0, 'train_loss': 2.611324721116286, 'epoch': 1.0})

🔟 Save adapters + tokenizer

In [19]:
# Create the folder that will hold the saved adapter (-p: no error if it already exists).
! mkdir -p model

In [20]:
# NOTE(review): this rebinds the config variable output_dir. Re-running the
# trainer-setup cell afterwards (without re-running the config cell) would
# make "model" the training checkpoint directory as well.
output_dir= "model"
# Write the model (adapter files, per the zip listing below) and the tokenizer files into the same folder.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print("✅ Adapters + tokenizer saved to:", output_dir)


✅ Adapters + tokenizer saved to: model


In [23]:
rm -rf q_lora_continued_pretraining/

In [22]:

# Bundle the saved adapter + tokenizer folder into model.zip for download.
!zip -r model.zip model

  adding: model/ (stored 0%)
  adding: model/vocab.json (deflated 68%)
  adding: model/adapter_model.safetensors (deflated 8%)
  adding: model/merges.txt (deflated 53%)
  adding: model/README.md (deflated 65%)
  adding: model/tokenizer_config.json (deflated 62%)
  adding: model/special_tokens_map.json (deflated 79%)
  adding: model/adapter_config.json (deflated 56%)
