In [None]:
!pip install -q transformers accelerate bitsandbytes torch peft datasets


In [None]:

## Step 2: Load Base Model and Tokenizer

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# Load tokenizer. EOS doubles as the pad token; the generation cells below
# pass the same id as pad_token_id.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Request 8-bit quantization through an explicit BitsAndBytesConfig.
# Passing load_in_8bit=True directly to from_pretrained() is deprecated in
# favour of the quantization_config argument.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Load base model with 8-bit quantization; device_map="auto" lets accelerate
# decide where the weights are placed.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config,
)

print("Base model loaded successfully!")
print(f"Vocab size: {len(tokenizer)}")

In [None]:


## Step 3: Load LoRA Adapter

import os
import zipfile

# Configuration - MODIFY THIS FOR RANK 1 vs RANK 8
LORA_RANK = 1
ADAPTER_PATH = f"lora_short_rank{LORA_RANK}"
adapter_zip_path = f"{ADAPTER_PATH}.zip"

# Resolve the adapter location, trying in order:
#   1. an already extracted adapter directory,
#   2. a local "<ADAPTER_PATH>.zip" archive (extracted into ADAPTER_PATH),
#   3. an interactive upload, which is only available in Google Colab.
# Raises FileNotFoundError when none of these yields an adapter.
if os.path.isdir(ADAPTER_PATH):
    print(f"Found adapter directory: {ADAPTER_PATH}")
elif os.path.exists(adapter_zip_path):
    print("Found adapter zip file, extracting...")
    with zipfile.ZipFile(adapter_zip_path, 'r') as zip_ref:
        zip_ref.extractall(ADAPTER_PATH)
    print(f"Extracted to {ADAPTER_PATH}")
else:
    print(f"Adapter not found: {ADAPTER_PATH}")
    print("Attempting to upload...")

    # Only the import is guarded: an ImportError here means "not in Colab".
    try:
        from google.colab import files
    except ImportError:
        print("\nNot running in Google Colab.")
        print(f"Please ensure the adapter is in: {os.path.abspath(ADAPTER_PATH)}")
        raise FileNotFoundError(f"Please place LoRA adapter in '{ADAPTER_PATH}/' directory")

    uploaded = files.upload()
    if not uploaded:
        # A cancelled/empty upload used to fall through silently and only
        # failed later, when the adapter was loaded from a missing path.
        raise FileNotFoundError(
            f"No file was uploaded; LoRA adapter '{ADAPTER_PATH}' is still missing"
        )

    # Only the first uploaded file is considered.
    uploaded_name = next(iter(uploaded))
    if uploaded_name.endswith('.zip'):
        with zipfile.ZipFile(uploaded_name, 'r') as zip_ref:
            zip_ref.extractall(ADAPTER_PATH)
        print(f"Extracted to {ADAPTER_PATH}")
    else:
        print(f"Uploaded: {uploaded_name}")
        ADAPTER_PATH = uploaded_name

# Load training config if available (optional file, shown for reference only)
config_path = os.path.join(ADAPTER_PATH, "training_config.json")
if os.path.exists(config_path):
    import json
    with open(config_path, 'r', encoding='utf-8') as f:
        training_config = json.load(f)
    print("\nTraining configuration:")
    for k, v in training_config.items():
        print(f"  {k}: {v}")

In [None]:


## Step 4: Apply LoRA Adapter to Base Model

# Wrap the base model with the LoRA weights stored at ADAPTER_PATH
model = PeftModel.from_pretrained(base_model, model_id=ADAPTER_PATH)
print("\nLoRA adapter loaded successfully!")
model.print_trainable_parameters()

# Show the adapter configuration registered on the wrapped model
adapter_configs = model.peft_config
print(f"\nAdapter modules: {adapter_configs}")

In [None]:

## Step 5: Load Test Prompts from LIMA

import os

from datasets import load_dataset
from huggingface_hub import login

# Never paste the access token into the notebook (it ends up in version
# control and shared copies). Read it from the HF_TOKEN environment variable;
# when it is unset, login(token=None) falls back to the interactive prompt.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
login(token=HF_TOKEN or None)

lima_test_dataset = load_dataset("GAIR/lima", split="test", revision="refs/convert/parquet")
print(f"\nLoaded {len(lima_test_dataset)} test examples")

# Print first two examples (fewer if the split is smaller than that)
for i in range(min(2, len(lima_test_dataset))):
    example = lima_test_dataset[i]
    conversations = example['conversations']
    # Entry 0 is shown as the question; entry 1, when present, as the answer.
    has_answer = len(conversations) > 1

    print(f"\n{'='*80}")
    print(f"EXAMPLE {i+1}")
    print(f"{'='*80}")

    print("\n[QUESTION]")
    print(f"{conversations[0]}")

    if has_answer:
        print("\n[ANSWER]")
        print(f"{conversations[1]}")

    print("\n[METADATA]")
    print(f"Number of conversation turns: {len(conversations)}")
    print(f"Response word count: {len(conversations[1].split()) if has_answer else 0}")


In [None]:


## Step 6: Run Inference - Sanity Check

model.eval()

# Note: NO ~short token - the model should produce short responses by default
test_prompts = [
    "What is machine learning?",
    "Explain the theory of relativity.",
    "What causes rain?",
    "How do computers work?",
    "What is the meaning of life?",
]

print("=" * 70)
print(f"SANITY CHECK: LoRA Fine-tuned Model (rank={LORA_RANK})")
print("=" * 70)

for p in test_prompts:
    inputs = tokenizer(p, return_tensors="pt").to(model.device)
    # Number of prompt tokens; generate() returns prompt + continuation.
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=2000,
            do_sample=True,
            temperature=0.3,
            pad_token_id=tokenizer.eos_token_id
        )
    # Decode only the newly generated tokens. Slicing the decoded text by
    # len(p) is fragile because decoding is not guaranteed to reproduce the
    # prompt string character for character.
    response = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
    word_count = len(response.split())
    print(f"\nQ: {p}")
    print(f"A: {response}")
    print(f"[Word count: {word_count}]")

In [None]:


## Step 7: Compare with Base Model (Optional)

print("\n" + "=" * 70)
print("COMPARISON: Base Model (LoRA disabled)")
print("=" * 70)

# Disable LoRA only for the duration of this block. The context manager
# restores the adapter on exit even if generation raises or is interrupted,
# so later cells can never silently run against the bare base model.
with model.disable_adapter():
    for p in test_prompts[:2]:  # Just first 2 to save time
        inputs = tokenizer(p, return_tensors="pt").to(model.device)
        # Number of prompt tokens; generate() returns prompt + continuation.
        prompt_len = inputs["input_ids"].shape[1]
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=2000,
                do_sample=True,
                temperature=0.3,
                pad_token_id=tokenizer.eos_token_id
            )
        # Decode only the generated continuation instead of slicing the
        # decoded text by the prompt's character length.
        response = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
        word_count = len(response.split())
        print(f"\nQ: {p}")
        print(f"A (BASE): {response}")
        print(f"[Word count: {word_count}]")

# LoRA is active again once the `with` block has exited
print("\n" + "=" * 70)
print("LoRA adapters re-enabled.")
print("=" * 70)

In [None]:


## Step 8: Run Inference on Full LIMA Test Set

import json
from tqdm import tqdm

# Output file path - includes rank for easy comparison
OUTPUT_PATH = f"lora_rank{LORA_RANK}_inference_results.jsonl"

# Generation samples (do_sample=True); seed the sampler so reruns for
# different ranks start from the same random state.
GENERATION_SEED = 42
torch.manual_seed(GENERATION_SEED)

results = []

print(f"Processing {len(lima_test_dataset)} examples...")
print("=" * 60)

# Each result is written and flushed as soon as it is produced, so a crash or
# interruption part-way through this long run keeps everything generated so far.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    for idx, example in enumerate(tqdm(lima_test_dataset, desc="Generating responses")):
        conversations = example['conversations']

        # The first conversation entry is used as the question.
        if isinstance(conversations, list) and len(conversations) > 0:
            question = conversations[0]
        else:
            question = str(conversations)

        # Note: NO ~short suffix - model should be short by default
        prompt = question

        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)
        # Token length of the (possibly truncated) prompt actually fed to the model.
        prompt_len = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=2000,
                do_sample=True,
                temperature=0.3,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the generated continuation. Slicing the decoded text by
        # len(prompt) cuts into the response whenever the prompt was truncated
        # or does not round-trip exactly through the tokenizer.
        response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

        result = {
            "prompt": prompt,
            "response": response
        }
        results.append(result)
        f.write(json.dumps(result, ensure_ascii=False) + '\n')
        f.flush()

        # Periodic preview of progress
        if (idx + 1) % 50 == 0:
            print(f"\nExample {idx + 1}:")
            print(f"  Q: {prompt[:80]}...")
            print(f"  A: {response[:100]}...")

print("=" * 60)
print(f"Saved {len(results)} results to {OUTPUT_PATH}")
print("=" * 60)

# Download in Colab; outside Colab the file simply stays on local disk
try:
    from google.colab import files
except ImportError:
    print(f"Not in Colab. File saved locally at: {OUTPUT_PATH}")
else:
    print(f"Downloading {OUTPUT_PATH}...")
    files.download(OUTPUT_PATH)