# Prompting-Based Short Inference on Mistral-7B

This notebook performs inference using explicit prompting (no neologism/vocabulary changes) on the base Mistral-7B-Instruct-v0.2 model.

**Prompt template:** `{question} Answer the question concisely in under 50 words:`

**Output:** `prompting_short_inference.jsonl`


## Step 1: Install Dependencies


In [None]:
%pip install -q transformers accelerate bitsandbytes torch


## Step 2: Load Base Model and Tokenizer

Load the unmodified Mistral-7B-Instruct-v0.2 model and its tokenizer, without any vocabulary modifications. The model is loaded with 8-bit quantization to reduce memory use.


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# Load the tokenizer and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Load the model with 8-bit quantization for memory efficiency.
# The quantization settings are passed through `quantization_config`:
# giving `load_in_8bit=True` directly to `from_pretrained` is deprecated
# in favour of an explicit BitsAndBytesConfig.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # let accelerate decide where the weights are placed
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

print("Model loaded successfully!")
print(f"Vocab size: {len(tokenizer)}")


## Step 3: Load Test Dataset (LIMA)


In [None]:
import os

from huggingface_hub import login

# Read the token from the HF_TOKEN environment variable so that a real
# credential never has to be pasted into (and saved with) the notebook.
# If the variable is not set, HF_TOKEN stays an empty string.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# An empty string is not a usable token. Passing None instead lets
# `login` fall back to its interactive prompt.
login(token=HF_TOKEN or None)


In [None]:
from datasets import load_dataset

# Fetch the "test" split of GAIR/lima from the `refs/convert/parquet` revision.
lima_test_dataset = load_dataset(
    "GAIR/lima",
    revision="refs/convert/parquet",
    split="test",
)
print(f"Loaded {len(lima_test_dataset)} test examples")


## Step 4: Toy Examples (Sanity Check)

Run inference on a few examples first to verify the prompt works as expected.


In [None]:
import json

# Put the model in evaluation mode before generating.
model.eval()

# Define the prompt template
PROMPT_TEMPLATE = "{question} Answer the question concisely in under 50 words:"

# Run on first 3 examples as sanity check
toy_results = []

for example in lima_test_dataset.select(range(3)):
    # The first entry of `conversations` is used as the question.
    question = example['conversations'][0]
    prompt = PROMPT_TEMPLATE.format(question=question)

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)
    # Number of prompt tokens actually fed to the model (after any truncation).
    prompt_token_count = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=2000,
            do_sample=True,
            temperature=0.3,
            pad_token_id=tokenizer.eos_token_id
        )

    # `generate` returns the prompt tokens followed by the continuation.
    # Drop the prompt by token count rather than by len(prompt) characters:
    # the decoded prompt is not guaranteed to have the same length as the
    # original string (e.g. when the input was truncated), and a character
    # slice would then cut into the answer or leave prompt text behind.
    generated_tokens = outputs[0][prompt_token_count:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    toy_results.append({"prompt": prompt, "response": response})

    print(f"Q: {question[:100]}...")
    print(f"A: {response[:200]}...")
    print("-" * 80)

print(f"\nToy examples completed: {len(toy_results)}")


## Step 5: Full Inference (300 examples)

Run inference on all 300 LIMA test examples and save to `prompting_short_inference.jsonl`.


In [None]:
from tqdm import tqdm

# Output file path
OUTPUT_PATH = "prompting_short_inference.jsonl"

# List to store results (kept in memory in addition to the file on disk)
results = []

print(f"Processing {len(lima_test_dataset)} examples...")
print("=" * 60)

# The output file is opened before the loop and each record is written as
# soon as it is generated, so an interruption part-way through the run
# still leaves the completed examples on disk.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    for idx, example in enumerate(tqdm(lima_test_dataset, desc="Generating responses")):
        # Extract the question from conversations
        conversations = example['conversations']

        # Get the first message (the question); fall back to the string
        # form of the field if it is not a non-empty list.
        if isinstance(conversations, list) and len(conversations) > 0:
            question = conversations[0]
        else:
            question = str(conversations)

        # Create prompt using template
        prompt = PROMPT_TEMPLATE.format(question=question)

        # Tokenize and generate
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)
        # Number of prompt tokens actually fed to the model (after any truncation).
        prompt_token_count = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=2000,
                do_sample=True,
                temperature=0.3,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens. `generate` returns the
        # prompt tokens followed by the continuation; slicing by token count
        # is exact, whereas slicing the decoded text at len(prompt)
        # characters goes wrong whenever the decoded prompt differs in
        # length from the original string (e.g. after truncation).
        generated_tokens = outputs[0][prompt_token_count:]
        response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

        # Create result entry
        result = {
            "prompt": prompt,
            "response": response
        }
        results.append(result)

        # Persist this record immediately. Flushing per record is cheap
        # compared with one generation step.
        f.write(json.dumps(result, ensure_ascii=False) + '\n')
        f.flush()

        # Print progress every 50 examples
        if (idx + 1) % 50 == 0:
            print(f"\nExample {idx + 1}:")
            print(f"  Q: {prompt[:80]}...")
            print(f"  A: {response[:100]}...")

print("=" * 60)
print(f"Saved {len(results)} results to {OUTPUT_PATH}")
print("=" * 60)

# Download file in Colab; outside Colab the import fails and the file is
# simply left at OUTPUT_PATH.
try:
    from google.colab import files
    files.download(OUTPUT_PATH)
    print(f"Downloading {OUTPUT_PATH}...")
except ImportError:
    print(f"Not in Colab. File saved locally at: {OUTPUT_PATH}")
