In [1]:
import unsloth
from datasets import load_dataset
from transformers import AutoTokenizer, TextStreamer
from unsloth import FastLanguageModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
import os
from tqdm import tqdm

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Read the records from the local JSONL file. The "train" split is what gets
# requested here; the evaluation slice is carved out of it in a later cell.
dataset = load_dataset(
    "json",
    split="train",
    data_files="responses_10000.jsonl",
)

In [3]:
# Instruction / input / response template with two positional `{}` slots:
# the first is filled under "### Input:", the second under "### Response:".
prompt_style = (
    "Below is an instruction that describes a task, paired with an input that provides further context.\n"
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "You are a legal expert. Provide accurate, well-reasoned legal insights using proper legal terminology.\n"
    "Maintain a professional, objective tone. Be specific about which laws or legal principles apply.\n"
    "Explain the person's rights, cite the relevant statute(s), and give a clear legal opinion.\n"
    "When unsure, advise consulting a qualified attorney.\n"
    "\n"
    "### Input:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)

# Marker appended after every formatted example to signal the end of a sequence.
# NOTE(review): this is a hard-coded literal rather than a value read from the
# tokenizer — confirm it matches the marker the model was fine-tuned with.
EOS_TOKEN = '<end>'

In [4]:
def formatting_prompts_func(examples):
    """Build the ``text`` column by filling the prompt template pair by pair.

    ``examples['question']`` and ``examples['response']`` are walked in
    parallel with ``zip``; each pair is substituted into ``prompt_style`` and
    ``EOS_TOKEN`` is appended to the result.

    Because the two columns are zipped, they are expected to be sequences of
    examples (a batch). Passing two plain strings would pair them character
    by character.

    Returns:
        A dict with the single key ``'text'`` mapping to the list of
        formatted prompt strings, one per zipped pair.
    """
    questions = examples['question']
    responses = examples['response']

    formatted = []
    for question, response in zip(questions, responses):
        formatted.append(prompt_style.format(question, response) + EOS_TOKEN)

    return {'text': formatted}

# Number of trailing rows held out for evaluation.
TEST_SIZE = 1000

# Keep only the last TEST_SIZE rows. The start index is clamped at 0 so a file
# with fewer rows than TEST_SIZE yields the whole dataset instead of negative
# indices. Selecting before mapping avoids formatting rows that are discarded.
n_rows = len(dataset)
dataset_test = dataset.select(range(max(0, n_rows - TEST_SIZE), n_rows))

# formatting_prompts_func zips the 'question' and 'response' columns, so it
# must receive whole batches (lists of strings). Without batched=True each call
# gets a single row and zip() pairs the two strings character by character.
dataset_finetune = dataset_test.map(formatting_prompts_func, batched=True)

# Sanity check: every row carries exactly one formatted prompt string.
assert all(isinstance(text, str) for text in dataset_finetune["text"][:5])

In [None]:
import os

# Environment flag meant to keep Unsloth from using xformers.
# NOTE(review): the original note says this must be set before Unsloth is
# imported, yet `unsloth` is already imported in the notebook's first cell —
# confirm the flag still has any effect when set here.
os.environ["USE_XFORMERS"] = "False"

# Prefer the GPU whenever torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Per the original note: a folder containing config.json, pytorch_model.bin, etc.
model_path = "mistral_gguf"

# Load the model and its tokenizer in one call through Unsloth.
# NOTE(review): the original comment states that no load_adapter call is
# needed, but a later cell still calls model.load_adapter("mistral_model") —
# confirm which of the two is intended.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_path,
    device_map="auto",
    load_in_4bit=True,  # request 4-bit quantised loading
    dtype=torch.float16,
    max_seq_length=2048,
)

==((====))==  Unsloth 2025.3.19: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 4070 Laptop GPU. Num GPUs = 1. Max memory: 7.996 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Load the adapter saved under "mistral_model" onto the already-loaded model.
adapter_dir = "mistral_model"
model.load_adapter(adapter_dir)

In [None]:
import json

# Upper bound on the number of tokens generated per example.
MAX_NEW_TOKENS = 500

# Put the Unsloth-patched model into its generation mode before calling generate().
FastLanguageModel.for_inference(model)

results = []

# Run inference on the held-out examples
for example in tqdm(dataset_finetune, desc="Generating responses"):
    # Prompt with the question only and leave the response slot empty. The
    # "text" column already holds the reference answer followed by the end
    # marker, so using it as the prompt would hand the answer to the model.
    input_text = prompt_style.format(example["question"], "")

    # A single sequence needs no padding; truncation guards against over-long questions.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(model.device)
    prompt_length = inputs.input_ids.shape[1]

    with torch.no_grad():
        output = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=MAX_NEW_TOKENS,
        )

    # generate() returns the prompt tokens followed by the continuation.
    # "output" keeps the full decoded sequence (as before); "response" holds
    # only the newly generated tokens so it can be compared to the reference.
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
    generated_response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    # Store the prompt, the model's output, and the gold answer side by side
    results.append({
        "input": input_text,
        "output": decoded_output,
        "response": generated_response,
        "reference": example["response"],
    })

# Save to JSON file
with open("generated_outputs.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
