In [1]:
!pip install -q transformers accelerate bitsandbytes torch peft datasets


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:

## Step 2: Load Base Model and Tokenizer

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

from transformers import BitsAndBytesConfig

MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse EOS as the padding token (presumably the tokenizer ships without a
# dedicated pad token - TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

# Load base model with 8-bit quantization.
# Quantization is requested through `quantization_config`: passing
# `load_in_8bit=True` straight to `from_pretrained` is deprecated and the
# library warns that it will be removed in a future version.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # NOTE(review): recent transformers releases warn that `torch_dtype` is
    # deprecated in favour of `dtype`; kept here so the cell still runs on
    # older releases. Switch once the minimum transformers version is pinned.
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config,
)

print("Base model loaded successfully!")
print(f"Vocab size: {len(tokenizer)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Base model loaded successfully!
Vocab size: 32000


In [3]:


## Step 3: Load LoRA Adapter

import os
import zipfile

import json

# Configuration - MODIFY THIS FOR RANK 1 vs RANK 8
LORA_RANK = 1
ADAPTER_PATH = f"lora_short_rank{LORA_RANK}"
adapter_zip_path = f"{ADAPTER_PATH}.zip"

# Resolve the adapter in order of preference:
#   1. an already-extracted directory,
#   2. a zip archive next to the notebook,
#   3. an interactive upload (Google Colab only).
if os.path.exists(ADAPTER_PATH):
    print(f"Found adapter directory: {ADAPTER_PATH}")
elif os.path.exists(adapter_zip_path):
    print("Found adapter zip file, extracting...")
    with zipfile.ZipFile(adapter_zip_path, 'r') as zip_ref:
        zip_ref.extractall(ADAPTER_PATH)
    print(f"Extracted to {ADAPTER_PATH}")
else:
    print(f"Adapter not found: {ADAPTER_PATH}")
    print("Attempting to upload...")

    # Only the Colab import is guarded: an ImportError raised by anything
    # else must not be misreported as "not running in Colab".
    try:
        from google.colab import files
    except ImportError:
        print("\nNot running in Google Colab.")
        print(f"Please ensure the adapter is in: {os.path.abspath(ADAPTER_PATH)}")
        raise FileNotFoundError(f"Please place LoRA adapter in '{ADAPTER_PATH}/' directory")

    uploaded = files.upload()
    if not uploaded:
        # A cancelled/empty upload previously fell through silently and only
        # failed later, when the adapter was loaded from a missing path.
        raise FileNotFoundError(
            f"No file was uploaded; LoRA adapter '{ADAPTER_PATH}' is still missing"
        )

    # Only the first uploaded file is considered.
    uploaded_name = next(iter(uploaded))
    if uploaded_name.endswith('.zip'):
        with zipfile.ZipFile(uploaded_name, 'r') as zip_ref:
            zip_ref.extractall(ADAPTER_PATH)
        print(f"Extracted to {ADAPTER_PATH}")
    else:
        print(f"Uploaded: {uploaded_name}")
        ADAPTER_PATH = uploaded_name

# Fail here, with a clear message, rather than deep inside the adapter loader.
if not os.path.exists(ADAPTER_PATH):
    raise FileNotFoundError(f"LoRA adapter path does not exist: {ADAPTER_PATH}")

# Load training config if available
config_path = os.path.join(ADAPTER_PATH, "training_config.json")
if os.path.exists(config_path):
    # Explicit encoding so the result does not depend on the platform locale.
    with open(config_path, 'r', encoding='utf-8') as f:
        training_config = json.load(f)
    print("\nTraining configuration:")
    for k, v in training_config.items():
        print(f"  {k}: {v}")

Adapter not found: lora_short_rank1
Attempting to upload...


Saving lora_short_rank1.zip to lora_short_rank1.zip
Extracted to lora_short_rank1

Training configuration:
  lora_rank: 1
  lora_alpha: 2
  target_modules: ['q_proj', 'v_proj']
  base_model: mistralai/Mistral-7B-Instruct-v0.2
  lr: 0.0001
  epochs: 5
  beta: 0.2
  training_examples: 1030


In [4]:


## Step 4: Apply LoRA Adapter to Base Model

# Wrap the base model with the LoRA weights stored at ADAPTER_PATH
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
print("\nLoRA adapter loaded successfully!")
model.print_trainable_parameters()

# Show the adapter configuration(s) attached to the wrapped model
adapter_configs = model.peft_config
print(f"\nAdapter modules: {adapter_configs}")


LoRA adapter loaded successfully!
trainable params: 0 || all params: 7,242,158,080 || trainable%: 0.0000

Adapter modules: {'default': LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, peft_version='0.18.0', base_model_name_or_path='mistralai/Mistral-7B-Instruct-v0.2', revision=None, inference_mode=True, r=1, target_modules={'q_proj', 'v_proj'}, exclude_modules=None, lora_alpha=2, lora_dropout=0.0, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, alora_invocation_tokens=None, use_qalora=False, qalora_group_size=16, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False, target_parameters=None, arrow_config=None, ensure_we

In [7]:

## Step 5: Load Test Prompts from LIMA

from huggingface_hub import login
from datasets import load_dataset

# Interactive Hugging Face login, performed before the dataset is requested
login()

lima_test_dataset = load_dataset("GAIR/lima", split="test", revision="refs/convert/parquet")
print(f"\nLoaded {len(lima_test_dataset)} test examples")

# Preview the first two examples
separator = "=" * 80
for i in range(2):
    conversations = lima_test_dataset[i]['conversations']
    has_answer = len(conversations) > 1

    print(f"\n{separator}")
    print(f"EXAMPLE {i+1}")
    print(separator)

    print("\n[QUESTION]")
    print(conversations[0])

    if has_answer:
        print("\n[ANSWER]")
        print(conversations[1])

    # Word count of the second turn; 0 when the example has no answer turn
    answer_word_count = len(conversations[1].split()) if has_answer else 0

    print("\n[METADATA]")
    print(f"Number of conversation turns: {len(conversations)}")
    print(f"Response word count: {answer_word_count}")


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

plain_text/train/0000.parquet:   0%|          | 0.00/1.68M [00:00<?, ?B/s]

plain_text/test/0000.parquet:   0%|          | 0.00/27.3k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]


Loaded 300 test examples

EXAMPLE 1

[QUESTION]
I'm writing a NeurIPS paper about a new model architecture for processing and generating long texts. Here are some facts about the paper:
* The main trick is to replace some of the attention heads with an exponential moving average, where the decay rate is learned for each head. We call this architecture ExeMA.
* On language modeling, the perplexity difference between our model and a vanilla transformer is negligible, but that's because next-token prediction is almost always a local task, so perplexity won't be sensitive enough to detect any improvements in long-range understanding.
* However, on the SCROLLS benchmark, our model improves by 10% over the baseline.
* We also have a new metric for measuring coherence in generated text (CoGnaTe), where our model generates text that is 43% more coherent than the baseline.
Help me write the paper's introduction.

[METADATA]
Number of conversation turns: 1
Response word count: 0

EXAMPLE 2

[QU

In [8]:


## Step 6: Run Inference - Sanity Check

model.eval()

# Note: NO ~short token - the model should produce short responses by default
test_prompts = [
    "What is machine learning?",
    "Explain the theory of relativity.",
    "What causes rain?",
    "How do computers work?",
    "What is the meaning of life?",
]

print("=" * 70)
print(f"SANITY CHECK: LoRA Fine-tuned Model (rank={LORA_RANK})")
print("=" * 70)

for p in test_prompts:
    inputs = tokenizer(p, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=2000,
            do_sample=True,
            temperature=0.3,
            pad_token_id=tokenizer.eos_token_id
        )
    # The returned sequence starts with the prompt tokens. Drop them by token
    # count instead of slicing the decoded text by len(p): the decoded prompt
    # is not guaranteed to match the original string character for character.
    prompt_token_count = inputs["input_ids"].shape[1]
    response = tokenizer.decode(out[0][prompt_token_count:], skip_special_tokens=True).strip()
    word_count = len(response.split())
    print(f"\nQ: {p}")
    print(f"A: {response}")
    print(f"[Word count: {word_count}]")

SANITY CHECK: LoRA Fine-tuned Model (rank=1)




KeyboardInterrupt: 

In [None]:


## Step 7: Compare with Base Model (Optional)

print("\n" + "=" * 70)
print("COMPARISON: Base Model (LoRA disabled)")
print("=" * 70)

# Disable LoRA
model.disable_adapter_layers()

try:
    for p in test_prompts[:2]:  # Just first 2 to save time
        inputs = tokenizer(p, return_tensors="pt").to(model.device)
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=2000,
                do_sample=True,
                temperature=0.3,
                pad_token_id=tokenizer.eos_token_id
            )
        # The returned sequence starts with the prompt tokens. Drop them by
        # token count instead of slicing the decoded text by len(p): the
        # decoded prompt is not guaranteed to match the original string.
        prompt_token_count = inputs["input_ids"].shape[1]
        response = tokenizer.decode(out[0][prompt_token_count:], skip_special_tokens=True).strip()
        word_count = len(response.split())
        print(f"\nQ: {p}")
        print(f"A (BASE): {response}")
        print(f"[Word count: {word_count}]")
finally:
    # Re-enable LoRA even when generation fails or the cell is interrupted;
    # otherwise later cells would silently run on the bare base model.
    model.enable_adapter_layers()

print("\n" + "=" * 70)
print("LoRA adapters re-enabled.")
print("=" * 70)

In [None]:


## Step 8: Run Inference on Full LIMA Test Set

import json
from tqdm import tqdm

# Output file path - includes rank for easy comparison
OUTPUT_PATH = f"lora_rank{LORA_RANK}_inference_results.jsonl"

# Generation below samples (do_sample=True); seed torch's RNG so that every
# run of this cell starts from the same random state.
SEED = 42
torch.manual_seed(SEED)

results = []

print(f"Processing {len(lima_test_dataset)} examples...")
print("=" * 60)

# Each result is written (and flushed) as soon as it is generated, so an
# interrupted or crashed run keeps everything produced up to that point.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    for idx, example in enumerate(tqdm(lima_test_dataset, desc="Generating responses")):
        conversations = example['conversations']

        # Use the first conversation turn as the question; fall back to the
        # string form of the field when it is not a non-empty list.
        if isinstance(conversations, list) and len(conversations) > 0:
            question = conversations[0]
        else:
            question = str(conversations)

        # Note: NO ~short suffix - model should be short by default
        prompt = question

        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=2000,
                do_sample=True,
                temperature=0.3,
                pad_token_id=tokenizer.eos_token_id
            )

        # The returned sequence starts with the (possibly truncated) prompt
        # tokens. Drop them by token count: slicing the decoded text by
        # len(prompt) cuts into the answer whenever the prompt was truncated
        # or does not decode back to exactly the same string.
        prompt_token_count = inputs["input_ids"].shape[1]
        response = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True).strip()

        result = {
            "prompt": prompt,
            "response": response
        }
        results.append(result)
        f.write(json.dumps(result, ensure_ascii=False) + '\n')
        f.flush()

        if (idx + 1) % 50 == 0:
            print(f"\nExample {idx + 1}:")
            print(f"  Q: {prompt[:80]}...")
            print(f"  A: {response[:100]}...")

print("=" * 60)
print(f"Saved {len(results)} results to {OUTPUT_PATH}")
print("=" * 60)

# Download in Colab
try:
    from google.colab import files
except ImportError:
    print(f"Not in Colab. File saved locally at: {OUTPUT_PATH}")
else:
    # Kept outside the `try` so an ImportError raised while downloading is
    # not misreported as "not in Colab".
    print(f"Downloading {OUTPUT_PATH}...")
    files.download(OUTPUT_PATH)