In [None]:
!pip install -U bitsandbytes transformers accelerate peft datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets, bitsandbytes
  Attempting uninstall: pyarrow
  

In [None]:
import os

# Never paste a real token into the notebook. If HF_TOKEN is already provided
# by the environment it is left untouched; otherwise it is initialised to the
# same empty placeholder as before.
os.environ.setdefault('HF_TOKEN', '')

In [None]:
import pandas as pd
import gzip
import json

def parse(path):
  """Yield one record per non-blank line of a gzip-compressed text file.

  Every line is expected to hold a single Python literal (``getDF`` stores
  each yielded object as one DataFrame row). Lines are parsed with
  ``ast.literal_eval`` rather than ``eval`` so that the data file can never
  execute code, and the file handle is closed when iteration finishes.

  Args:
    path: Path to the ``.gz`` file to read.

  Yields:
    The Python object described by each non-blank line.

  Raises:
    ValueError, SyntaxError: If a line is not a valid Python literal.
  """
  import ast  # local import keeps this cell self-contained

  with gzip.open(path, 'rt', encoding='utf-8') as g:
    for l in g:
      text = l.strip()
      if not text:
        # Tolerate blank / trailing empty lines instead of crashing on them.
        continue
      yield ast.literal_eval(text)

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Records are keyed by their 0-based position in the file; that position
  becomes the row index and the record's keys become the columns.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')
# Load the Q&A records from the gzip dump into a DataFrame (one row per record in the file).
df_qa = getDF('qa_Beauty.json.gz')

In [None]:
from sklearn.model_selection import train_test_split

# Build a balanced sample: the same number of rows is drawn from each
# question type (a fixed 15000 per type, with a fixed seed).
is_open_ended = df_qa['questionType'].eq('open-ended')
is_yes_no = df_qa['questionType'].eq('yes/no')

df_open_ended = df_qa.loc[is_open_ended].sample(n=15000, random_state=42)
df_yes_no = df_qa.loc[is_yes_no].sample(n=15000, random_state=42)

# Stack the two halves, then shuffle so the question types are interleaved,
# and renumber the rows from 0.
df_sample = (
    pd.concat([df_open_ended, df_yes_no])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

display(df_sample['questionType'].value_counts())

Unnamed: 0_level_0,count
questionType,Unnamed: 1_level_1
open-ended,15000
yes/no,15000


In [None]:
from datasets import Dataset

# Keep only the question text (as the prompt) and the answer text of each row.
training_data = [
    {'prompt': question, 'answer': answer}
    for question, answer in zip(df_sample['question'], df_sample['answer'])
]

dataset = Dataset.from_list(training_data)

# 70 % goes to training; the held-out 30 % is halved into validation and test.
dataset_split = dataset.train_test_split(test_size=0.3, seed=42)
test_valid_split = dataset_split['test'].train_test_split(test_size=0.5, seed=42)

train_dataset = dataset_split['train']
val_dataset = test_valid_split['train']
test_dataset = test_valid_split['test']


In [None]:
import wandb

# Log in to your W&B account
# You will be prompted to enter your API key
wandb.login()
# Start a run in the "new1" project; the training arguments later in this
# notebook set report_to="wandb".
wandb.init(project="new1")

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33me1083039[0m ([33me1083039-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive: the checkpoint archive is read from it
# and the training output directory is located on it.
drive.mount('/content/drive')

In [None]:
import os
import zipfile

zip_file_path = '/content/drive/MyDrive/checkpoint-700.zip'
# Destination for the unpacked archive. Presumably the archive unpacks to the
# /content/checkpoint-700 directory that the training cell loads from — confirm.
extract_dir = '/content'

# Make sure the destination exists before unpacking.
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_file_path, 'r') as archive:
    archive.extractall(extract_dir)

print(f'Files extracted to: {extract_dir}')

Files extracted to: /content


In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

In [None]:


# -------------------------------
# Paths and checkpoint
# -------------------------------
# Local checkpoint directory: the tokenizer and model are loaded from it and it
# is also passed to trainer.train(resume_from_checkpoint=...).
checkpoint_path = "/content/checkpoint-700"
# Directory handed to TrainingArguments(output_dir=...); lives on the mounted Drive.
output_dir = "/content/drive/MyDrive/llama1b_instruct"

# -------------------------------
# Tokenizer
# -------------------------------
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
# Reuse the end-of-sequence token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

def tokenize_with_labels(example, max_length=128):
    """
    Tokenize one prompt/answer pair and build loss-masked labels.

    The pair is rendered through the tokenizer's chat template twice: once as
    the full user + assistant conversation, and once as the user turn followed
    by the generation prompt (the assistant header). The token count of the
    second rendering marks where the answer starts inside the first.

    - Prompt tokens have label = -100 (ignored in loss)
    - Answer tokens have label = token_id (used in loss)

    Args:
        example: Mapping with the string fields "prompt" and "answer".
        max_length: Maximum number of tokens kept per sequence; longer
            sequences are truncated by the tokenizer.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels"; "labels" always
        has the same length as "input_ids".

    Note:
        If the prompt alone reaches ``max_length`` tokens, every label is -100
        and the example contributes nothing to the loss.
    """

    def _encode(text):
        # The text comes from apply_chat_template(tokenize=False). Chat
        # templates normally render their own special tokens (e.g. BOS) into
        # the text, so the tokenizer must not add them a second time.
        return tokenizer(
            text,
            truncation=True,
            max_length=max_length,
            padding=False,
            add_special_tokens=False,
        )

    user_turn = {"role": "user", "content": example["prompt"]}
    assistant_turn = {"role": "assistant", "content": example["answer"]}

    # Full conversation: prompt followed by the answer.
    full_text = tokenizer.apply_chat_template(
        [user_turn, assistant_turn],
        tokenize=False,
        add_generation_prompt=False
    )
    full_tokenized = _encode(full_text)

    # Prompt only, ending with the assistant header, to find where the answer starts.
    prompt_text = tokenizer.apply_chat_template(
        [user_turn],
        tokenize=False,
        add_generation_prompt=True  # This adds the assistant header
    )
    prompt_tokenized = _encode(prompt_text)

    input_ids = full_tokenized["input_ids"]

    # Never mask more positions than exist (the full text may have been truncated).
    answer_start = min(len(prompt_tokenized["input_ids"]), len(input_ids))
    labels = [-100] * answer_start + list(input_ids[answer_start:])

    return {
        "input_ids": input_ids,
        "attention_mask": full_tokenized["attention_mask"],
        "labels": labels
    }
def _tokenize_split(split, label):
    """Announce and tokenize one dataset split, dropping its original columns."""
    print(f"Tokenizing {label} dataset...")
    return split.map(
        tokenize_with_labels,
        batched=False,  # tokenize_with_labels handles one example at a time
        remove_columns=split.column_names,
        desc=f"Tokenizing {label} data"
    )


train_dataset_tokenized = _tokenize_split(train_dataset, "train")

val_dataset_tokenized = _tokenize_split(val_dataset, "validation")

test_dataset_tokenized = _tokenize_split(test_dataset, "test")

# -------------------------------
# Model + QLoRA + LoRA setup
# -------------------------------
# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load pretrained checkpoint (4-bit + LoRA)
# `dtype` replaces `torch_dtype`, which this notebook's own output reports as
# deprecated ("`torch_dtype` is deprecated! Use `dtype` instead!").
# NOTE(review): trust_remote_code=True allows model code shipped with the
# checkpoint to run; confirm it is needed for a locally extracted checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint_path,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.bfloat16,
    trust_remote_code=True
)

# Prepare model for k-bit training (gradient checkpointing, etc.)
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# LoRA configuration (should match previous)
# NOTE(review): the same checkpoint_path is loaded above and later passed to
# trainer.train(resume_from_checkpoint=...). If that directory holds LoRA
# adapter weights, confirm that wrapping the loaded model with a fresh
# get_peft_model() does not reset or duplicate the adapter — TODO verify.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=64,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    init_lora_weights=True
)

model = get_peft_model(model, peft_config)
model.enable_input_require_grads()

# -------------------------------
# Data collator
# -------------------------------
# Pads each batch dynamically (to a multiple of 8); padded label positions get
# -100, the same value tokenize_with_labels uses for tokens excluded from the loss.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    pad_to_multiple_of=8,
    label_pad_token_id=-100
)

# -------------------------------
# Training arguments
# -------------------------------
training_args = TrainingArguments(
    # --- run bookkeeping ---
    output_dir=output_dir,
    num_train_epochs=1.5,
    report_to="wandb",
    logging_steps=50,

    # --- batching / data loading (8 per device x 4 accumulation steps) ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    group_by_length=True,
    remove_unused_columns=False,
    dataloader_num_workers=2,
    dataloader_pin_memory=True,

    # --- optimisation ---
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    weight_decay=0.01,
    max_grad_norm=1,
    optim="paged_adamw_8bit",

    # --- precision / memory ---
    bf16=True,
    fp16=False,
    gradient_checkpointing=True,

    # --- evaluation, checkpointing and best-model selection ---
    # Evaluate and save on the same 100-step cadence; the best model is the
    # one with the lowest eval_loss.
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# -------------------------------
# Custom Trainer for logging
# -------------------------------
class MetricsTracker(Trainer):
    """Trainer that additionally keeps logged metrics in memory.

    Every call to ``log`` is forwarded to ``Trainer.log`` and then inspected:

    - ``train_losses`` / ``steps``: each logged "loss" and the global step at
      which it was logged (the two lists stay aligned).
    - ``learning_rates``: each logged "learning_rate".
    - ``eval_losses`` / ``eval_logged_steps``: each logged "eval_loss" and the
      global step at which it was logged (the two lists stay aligned).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.train_losses = []
        self.eval_losses = []
        self.learning_rates = []
        self.steps = []
        # Steps matching eval_losses, so evaluation loss can be plotted against
        # training progress as well.
        self.eval_logged_steps = []

    def log(self, logs, start_time=None):
        """Forward ``logs`` to the base Trainer, then record the tracked metrics.

        Args:
            logs: Dict of metric name -> value, as passed to ``Trainer.log``.
            start_time: Optional value forwarded to ``Trainer.log`` when given.
        """
        # Only forward start_time when one was supplied, so the override also
        # works with Trainer.log signatures that accept just `logs`.
        if start_time is None:
            super().log(logs)
        else:
            super().log(logs, start_time)

        current_step = self.state.global_step
        if "loss" in logs:
            self.train_losses.append(logs["loss"])
            self.steps.append(current_step)
        if "learning_rate" in logs:
            self.learning_rates.append(logs["learning_rate"])
        if "eval_loss" in logs:
            self.eval_losses.append(logs["eval_loss"])
            self.eval_logged_steps.append(current_step)

# -------------------------------
# Trainer
# -------------------------------
# Stop once the monitored metric has failed to improve by at least 0.01 for
# 2 consecutive evaluations.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.01,
)

trainer = MetricsTracker(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset_tokenized,
    eval_dataset=val_dataset_tokenized,
    callbacks=[early_stopping],
)

# -------------------------------
# Resume training from checkpoint
# -------------------------------
trainer.train(resume_from_checkpoint=checkpoint_path)


Tokenizing train dataset...


Tokenizing train data:   0%|          | 0/21000 [00:00<?, ? examples/s]

Tokenizing validation dataset...


Tokenizing validation data:   0%|          | 0/4500 [00:00<?, ? examples/s]

Tokenizing test dataset...


Tokenizing test data:   0%|          | 0/4500 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
800,2.594,2.712763
900,2.5664,2.710805


TrainOutput(global_step=986, training_loss=0.7486033255865318, metrics={'train_runtime': 3968.2174, 'train_samples_per_second': 7.938, 'train_steps_per_second': 0.248, 'total_flos': 1.678611518717952e+16, 'train_loss': 0.7486033255865318, 'epoch': 1.5013333333333332})

In [None]:
# @title
import math

# Score the held-out test split with the trained model.
eval_results = trainer.evaluate(test_dataset_tokenized)

# Perplexity is reported as exp of the returned eval loss.
test_perplexity = math.exp(eval_results["eval_loss"])
eval_results["perplexity"] = test_perplexity

print("Evaluation results:", eval_results)

Evaluation results: {'eval_loss': 2.7217659950256348, 'eval_runtime': 485.1886, 'eval_samples_per_second': 9.275, 'eval_steps_per_second': 1.16, 'epoch': 1.5013333333333332, 'perplexity': 15.207154304194075}


In [None]:
# NOTE(review): this repeats the trainer.train(...) call that ends the training
# cell, restarting from the same checkpoint a second time — confirm the repeat
# is intentional before running the whole notebook.
trainer.train(resume_from_checkpoint=checkpoint_path)

Step,Training Loss,Validation Loss
800,2.594,2.712763
900,2.5664,2.710805


In [None]:
# Persist the trainer's current model to Drive, in a "model" sub-folder of the training output directory.
# NOTE(review): with a PEFT-wrapped model this presumably writes only the adapter weights — confirm.
trainer.save_model("/content/drive/MyDrive/llama1b_instruct/model")