In [2]:
!pip install transformers torch datasets tqdm accelerate



In [3]:
import os
from google.colab import drive

# 1. Mount Drive
drive.mount('/content/drive')

# 2. Define Paths
# Note: Ensure the shortcut in Account B is named exactly as it was in Account A
# If you made a shortcut to the *folder*, the path remains the same.
zip_path = "/content/drive/MyDrive/qwen_1_5b_final_model.zip"
extract_path = "/content/final_model_extracted"

# 3. Unzip the model
if os.path.exists(zip_path):
    print(f"Found model at: {zip_path}")
    print("Unzipping... (This takes 1-2 minutes)")
    # -q: quiet, -o: overwrite, -d: destination
    !unzip -q -o "{zip_path}" -d "{extract_path}"
    print("✅ Model unzipped successfully!")
else:
    print("❌ Error: Zip file not found.")
    print("Did you add the shortcut to 'My Drive' in this account?")

Mounted at /content/drive
Found model at: /content/drive/MyDrive/qwen_1_5b_final_model.zip
Unzipping... (This takes 1-2 minutes)
✅ Model unzipped successfully!


In [4]:
import os
import shutil
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

# --- CONFIGURATION ---
model_dir = "/content/final_model_extracted"
base_model_name = "Qwen/Qwen2.5-1.5B-Instruct" # Used to download default config if missing

print(f"🔍 Diagnosing {model_dir}...")

# --- STEP 1: CHECK FOR NESTED FOLDERS ---
# If unzip created a folder inside a folder, move files up
if os.path.exists(model_dir):
    items = os.listdir(model_dir)
    # If there is only 1 item and it's a folder (e.g., "qwen_1_5b_final_model")
    if len(items) == 1 and os.path.isdir(os.path.join(model_dir, items[0])):
        nested_folder = os.path.join(model_dir, items[0])
        print(f"⚠️ Nested folder found: {nested_folder}. Moving files up...")
        for file in os.listdir(nested_folder):
            shutil.move(os.path.join(nested_folder, file), model_dir)
        os.rmdir(nested_folder)
        print("✅ Files moved to root.")

# --- STEP 2: CHECK CONFIG.JSON ---
config_path = os.path.join(model_dir, "config.json")

# If config is missing, check for checkpoints or download default
if not os.path.exists(config_path):
    print("⚠️ config.json not found in root.")

    # Check if it's hiding in a checkpoint folder
    checkpoint_dirs = [d for d in os.listdir(model_dir) if "checkpoint" in d]
    if checkpoint_dirs:
        ckpt_path = os.path.join(model_dir, checkpoint_dirs[0], "config.json")
        if os.path.exists(ckpt_path):
            print(f"✅ Found config in {checkpoint_dirs[0]}. Copying...")
            shutil.copy(ckpt_path, config_path)

    # If still missing, download from Hugging Face
    if not os.path.exists(config_path):
        print("⬇️ Downloading default config from Hugging Face...")
        try:
            config = AutoConfig.from_pretrained(base_model_name)
            config.save_pretrained(model_dir)
            print("✅ Default config saved.")
        except Exception as e:
            print(f"❌ Error downloading config: {e}")

# --- STEP 3: PATCH MODEL_TYPE ---
# Ensure the config explicitly says "qwen2"
if os.path.exists(config_path):
    with open(config_path, 'r') as f:
        data = json.load(f)

    if "model_type" not in data or data["model_type"] is None:
        print("🔧 Patching config.json with model_type='qwen2'...")
        data["model_type"] = "qwen2"
        with open(config_path, 'w') as f:
            json.dump(data, f, indent=4)
        print("✅ Config patched.")

# --- STEP 4: LOAD MODEL ---
print("\n🚀 Loading Model to GPU...")

try:
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForCausalLM.from_pretrained(
        model_dir,
        device_map="auto",          # Uses GPU automatically
        torch_dtype=torch.float16,  # Uses half-precision for speed/memory
        trust_remote_code=True
    )
    print("✅ SUCCESS! Model loaded on GPU.")
except Exception as e:
    print(f"\n❌ FATAL ERROR: {e}")

🔍 Diagnosing /content/final_model_extracted...
⚠️ config.json not found in root.
⬇️ Downloading default config from Hugging Face...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

✅ Default config saved.

🚀 Loading Model to GPU...


`torch_dtype` is deprecated! Use `dtype` instead!


✅ SUCCESS! Model loaded on GPU.


In [5]:
from google.colab import files

# Check if file exists, if not, ask for upload
test_file_path = "test_dataset_100.jsonl"

if not os.path.exists(test_file_path):
    print("📂 Please upload your 'test_dataset_100.jsonl' file now:")
    uploaded = files.upload()
    test_file_path = next(iter(uploaded))
    print(f"✅ File uploaded: {test_file_path}")
else:
    print(f"✅ Found test file: {test_file_path}")

📂 Please upload your 'test_dataset_100.jsonl' file now:


Saving test_dataset_100.jsonl to test_dataset_100.jsonl
✅ File uploaded: test_dataset_100.jsonl


In [6]:
from datasets import load_dataset
from tqdm import tqdm
import json

# Load Data
dataset = load_dataset("json", data_files=test_file_path, split="train")
results = []

print(f"🚀 Starting Inference on {len(dataset)} examples...")

for item in tqdm(dataset):
    messages = item["messages"]

    # 1. Prepare Input
    if messages[-1]["role"] == "assistant":
        input_messages = messages[:-1]
        raw_truth = messages[-1]["content"]
        # Clean ground truth if needed
        if "</thought>" in raw_truth:
            ground_truth = raw_truth.split("</thought>")[-1].strip()
        else:
            ground_truth = raw_truth
    else:
        input_messages = messages
        ground_truth = "N/A"

    # 2. Tokenize
    inputs = tokenizer.apply_chat_template(
        input_messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda") # Send to GPU

    # 3. Generate
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=1024, # High limit to allow thinking + answering
            use_cache=True,
            temperature=0.6,
            top_p=0.9
        )

    # 4. Decode & Clean (Hidden Thought Logic)
    decoded_output = tokenizer.batch_decode(outputs)[0]
    full_response = decoded_output.split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "")

    if "</thought>" in full_response:
        final_answer = full_response.split("</thought>")[-1].strip()
    else:
        final_answer = full_response.strip()

    # 5. Save
    results.append({
        "prompt": input_messages,
        "final_answer": final_answer,
        "ground_truth": ground_truth
    })

print("\n✅ Inference Complete!")

# Save to file
output_file = "final_results.jsonl"
with open(output_file, "w") as f:
    for res in results:
        f.write(json.dumps(res) + "\n")

print(f"Results saved to {output_file}")

Generating train split: 0 examples [00:00, ? examples/s]

🚀 Starting Inference on 100 examples...


  0%|          | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  1%|          | 1/100 [00:10<17:18, 10.49s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
  2%|▏         | 2/100 [00:13<09:46, 


✅ Inference Complete!
Results saved to final_results.jsonl





In [7]:
import json
import textwrap

# Configuration
results_file = "final_results.jsonl"
num_to_display = 5  # How many examples you want to see

print(f"📖 Reading top {num_to_display} examples from {results_file}...\n")

try:
    with open(results_file, 'r') as f:
        for i, line in enumerate(f):
            if i >= num_to_display:
                break

            data = json.loads(line)

            # 1. Extract the User's Prompt
            # The prompt is a list of messages. We want the last 'user' message.
            raw_prompt = data['prompt'][-1]['content']

            # Helper: Extract just the "QUESTION" part to keep it readable
            # (Your data format seems to be "CONTEXT: ... \n\nQUESTION: ...")
            if "QUESTION:" in raw_prompt:
                display_question = raw_prompt.split("QUESTION:")[-1].strip()
            else:
                # Fallback: Print the last 300 chars if specific format isn't found
                display_question = "..." + raw_prompt[-300:]

            # 2. Get Answers
            generated_answer = data['final_answer']
            correct_answer = data['ground_truth']

            # 3. Print neatly
            print(f"🔹 EXAMPLE {i+1}")
            print(f"❓ QUESTION:\n{display_question}")
            print("-" * 40)
            print(f"🤖 MODEL:\n{textwrap.fill(generated_answer, width=100)}")
            print("-" * 40)
            print(f"✅ TRUTH:\n{textwrap.fill(correct_answer, width=100)}")
            print("=" * 80 + "\n")

except FileNotFoundError:
    print(f"❌ Error: Could not find {results_file}. Did the previous cell finish saving?")

📖 Reading top 5 examples from final_results.jsonl...

🔹 EXAMPLE 1
❓ QUESTION:
What is the mechanism of action of Trimethoprim?
----------------------------------------
🤖 MODEL:
Trimethoprim works through the following mechanism: Trimethoprim is an antifolate antibacterial
agent that inhibits bacterial dihydrofolate reductase (DHFR), a critical enzyme that catalyzes the
formation of tetrahydrofolic acid (THF) - in doing so, it prevents the synthesis of bacterial DNA
and ultimately continued bacterial survival.
----------------------------------------
✅ TRUTH:
Trimethoprim is an antifolate antibacterial agent that inhibits bacterial dihydrofolate reductase
(DHFR), a critical enzyme that catalyzes the formation of tetrahydrofolic acid (THF) - in doing so,
it.

🔹 EXAMPLE 2
❓ QUESTION:
What are the physical properties of Methyl salicylate?
----------------------------------------
🤖 MODEL:
Methyl salicylate (oil of wintergreen or wintergreen oil) is an organic ester naturally produced by
man

In [10]:
!pip install evaluate rouge_score

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=18ceec823e9b9c90e7ae5e33d3b03ad2b98fa28a776ff30b4309308af353c990
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.6 rouge_score-0.1.2


In [11]:
import json
import evaluate
import pandas as pd

# --- CONFIGURATION ---
results_file = "final_results.jsonl"

# 1. Load Metric
print("Loading ROUGE metric...")
rouge = evaluate.load("rouge")

# 2. Load Data
predictions = []
references = []

print(f"Reading {results_file}...")
try:
    with open(results_file, "r") as f:
        for line in f:
            data = json.loads(line)

            pred = data["final_answer"]
            truth = data["ground_truth"]

            # Only evaluate if we have a valid ground truth (i.e., not "N/A")
            if truth and truth != "N/A":
                predictions.append(pred)
                references.append(truth)

    print(f"Found {len(predictions)} valid examples for evaluation.")

    if len(predictions) > 0:
        # 3. Compute Scores
        results = rouge.compute(predictions=predictions, references=references)

        # 4. Display Results
        print("\n====== 📊 RAG EVALUATION REPORT ======")
        print(f"✅ ROUGE-1 (Word Overlap):   {results['rouge1']*100:.2f}%")
        print(f"✅ ROUGE-2 (Phrase Overlap): {results['rouge2']*100:.2f}%")
        print(f"✅ ROUGE-L (Longest Match):  {results['rougeL']*100:.2f}%")
        print("======================================")

    else:
        print("⚠️ No valid ground truth found. Did the test set include assistant answers?")

except FileNotFoundError:
    print(f"❌ Error: File '{results_file}' not found. Make sure you ran the inference cell first.")

Loading ROUGE metric...


Downloading builder script: 0.00B [00:00, ?B/s]

Reading final_results.jsonl...
Found 100 valid examples for evaluation.

✅ ROUGE-1 (Word Overlap):   55.37%
✅ ROUGE-2 (Phrase Overlap): 51.55%
✅ ROUGE-L (Longest Match):  54.63%


-

-

- new code starts now


-

-

In [9]:
# Install RAG & LangChain libraries
!pip install langchain langchain-community langchain-huggingface langchain-chroma chromadb

# Install Model & Torch libraries
!pip install transformers accelerate bitsandbytes sentence-transformers torch



In [10]:
import os
import shutil
from google.colab import drive

# 1. Mount Google Drive
drive.mount('/content/drive')

# --- CONFIGURATION ---
# Path to your zip in Drive (Adjust if you saved it elsewhere)
zip_path_drive = "/content/drive/MyDrive/colab_model_exports/qwen_1_5b_final_model.zip"
extract_path = "/content/my_finetuned_model"

# 2. Check and Unzip
if os.path.exists(extract_path):
    print(f"✅ Model already exists at {extract_path}")
else:
    if os.path.exists(zip_path_drive):
        print(f"📂 Found zip at {zip_path_drive}")
        print("⏳ Unzipping model... (This takes 1-2 minutes)")
        !unzip -q "{zip_path_drive}" -d "{extract_path}"
        print("✅ Model extracted successfully!")
    else:
        print(f"❌ Error: Could not find file at {zip_path_drive}")
        print("Please check if the shortcut is in your 'My Drive'.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Model already exists at /content/my_finetuned_model


In [11]:
import json
import os
import shutil
from transformers import AutoConfig

model_dir = "/content/my_finetuned_model"
base_model_name = "Qwen/Qwen2.5-1.5B-Instruct"

print(f"🔍 Diagnosing model at: {model_dir}")

# 1. Check for nested folders (common unzip issue)
if os.path.exists(model_dir):
    items = os.listdir(model_dir)
    if len(items) == 1 and os.path.isdir(os.path.join(model_dir, items[0])):
        nested = os.path.join(model_dir, items[0])
        print(f"⚠️ Nested folder found. Moving files up from {nested}...")
        for f in os.listdir(nested):
            shutil.move(os.path.join(nested, f), model_dir)
        os.rmdir(nested)

# 2. Check for config.json
config_path = os.path.join(model_dir, "config.json")

# If missing, try to find it in checkpoints or download default
if not os.path.exists(config_path):
    print("⚠️ config.json missing. Checking subfolders...")
    # Look in subfolders
    for root, dirs, files in os.walk(model_dir):
        if "config.json" in files:
            print(f"✅ Found config in {root}. Copying...")
            shutil.copy(os.path.join(root, "config.json"), config_path)
            break

    # If still missing, download from Hugging Face
    if not os.path.exists(config_path):
        print("⬇️ Downloading default config from Hugging Face...")
        try:
            config = AutoConfig.from_pretrained(base_model_name)
            config.save_pretrained(model_dir)
        except Exception as e:
            print(f"❌ Could not download config: {e}")

# 3. Patch model_type (Fixes 'Unrecognized model' error)
if os.path.exists(config_path):
    with open(config_path, 'r') as f:
        data = json.load(f)

    # Force correct model type
    if data.get("model_type") is None:
        print("🔧 Patching config.json with model_type='qwen2'...")
        data["model_type"] = "qwen2"
        with open(config_path, 'w') as f:
            json.dump(data, f, indent=4)
        print("✅ Config patched.")
    else:
        print(f"✅ Config looks good. Model Type: {data['model_type']}")

print("🎉 Model is ready for loading.")

🔍 Diagnosing model at: /content/my_finetuned_model
✅ Config looks good. Model Type: qwen2
🎉 Model is ready for loading.


In [12]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFacePipeline

model_dir = "/content/my_finetuned_model"

print("⏳ Loading Model & Tokenizer...")

try:
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForCausalLM.from_pretrained(
        model_dir,
        device_map="auto",
        torch_dtype=torch.float16, # Use float16 for GPU
        trust_remote_code=True
    )

    # Create Text Generation Pipeline
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=1024,  # Allow long answers
        temperature=0.1,      # Keep it factual
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True
    )

    # Wrap for LangChain
    llm = HuggingFacePipeline(pipeline=pipe)

    print("✅ Success! Your Custom Model is loaded.")

except Exception as e:
    print(f"❌ Error loading model: {e}")

⏳ Loading Model & Tokenizer...


Device set to use cuda:0


✅ Success! Your Custom Model is loaded.


In [13]:
import zipfile
import os
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# 1. Unzip Database
zip_db_path = "/content/chroma_db_data.zip"
db_folder = "/content/chroma_db_data"

if os.path.exists(zip_db_path):
    if not os.path.exists(db_folder):
        print("📂 Unzipping Vector Database...")
        with zipfile.ZipFile(zip_db_path, 'r') as zip_ref:
            zip_ref.extractall("/content")
        print("✅ Database unzipped.")
else:
    if os.path.exists(db_folder):
        print("✅ Database folder found.")
    else:
        print("❌ Error: Please upload 'chroma_db_data.zip' to the Files tab!")

# 2. Load Database
print("🔌 Connecting to Vector DB...")
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = Chroma(persist_directory=db_folder, embedding_function=embedding_model)
print("✅ Database Connected.")

🔌 Connecting to Vector DB...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Database Connected.


In [15]:
import time
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# --- PROMPT CONFIGURATION ---
# This matches the format you trained on
rag_template = """<|im_start|>system
You are a Scientific Database Assistant. Your job is to summarize chemical facts.
STRICT RULES:
1. Use ONLY the context below. Do not use outside knowledge.
2. Merge the information into a single, smooth answer output to the user. Do NOT mention "Chunk 1" or "the text".
3. SAFETY & USAGE:
   - If the user asks if a drug treats a specific disease/condition, answer ONLY: "Sorry, I cannot provide medical advice."
   - For general questions, describe the drug's mechanism and chemical properties. Do NOT mention clinical indications ("used for...") or usage instructions.
4. Dont keep answers too long. If the answer is not in the context, say "Data not available."<|im_end|>
<|im_start|>user
CONTEXT:
{context}

QUESTION:
{question}<|im_end|>
<|im_start|>assistant
"""
rag_prompt = ChatPromptTemplate.from_template(rag_template)

print(f"\n💉 CUSTOM MODEL RAG SYSTEM READY")
print("Type 'q' to exit.")

while True:
    query = input("\nYou: ")
    if query.lower() in ['q', 'quit']: break

    print("Thinking...", end="", flush=True)

    # 1. Retrieve Context
    # We retrieve top 3 relevant chunks
    results = db.similarity_search(query, k=3)

    # 2. Format Context
    context_text = "\n\n".join([f"[Chunk]: {d.page_content}" for d in results])

    # 3. Generate Answer
    chain = rag_prompt | llm | StrOutputParser()

    try:
        response = chain.invoke({"context": context_text, "question": query})

        # 4. Clean Response (Hidden Thought Logic)
        # If your model generates <thought> tags, we strip them here
        if "</thought>" in response:
            final_answer = response.split("</thought>")[-1].strip()
        else:
            final_answer = response.strip()

        # Remove any trailing special tokens
        final_answer = final_answer.replace("<|im_end|>", "")

        print(f"\rAI: {final_answer}")

    except Exception as e:
        print(f"\rError: {e}")


💉 CUSTOM MODEL RAG SYSTEM READY
Type 'q' to exit.

You: q


In [16]:
!pip install evaluate rouge_score pandas

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=343023c84a920728fe73e1137c5c7803c198a6923f8c75f07d5c0f6b71640dc1
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.6 rouge_score-0.1.2


In [18]:
import os

print("Current Directory Contents:")
print(os.listdir("/content"))

# Check specific subfolders just in case
if os.path.exists("/content/cpu_inference_model"):
    print("\nInside cpu_inference_model:")
    print(os.listdir("/content/cpu_inference_model"))

Current Directory Contents:
['.config', 'chroma_db_data.zip', 'drive', '__MACOSX', 'my_finetuned_model', 'chroma_db_data', 'sample_data']


In [19]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm
import json
import os

# --- CONFIGURATION ---
local_model_dir = "cpu_inference_model"
test_data_path = "test_dataset_100.jsonl" # Ensure this file exists!

# 1. Load Model & Tokenizer (CPU Mode)
print("⏳ Loading model on CPU...")
try:
    tokenizer = AutoTokenizer.from_pretrained(local_model_dir)
    model = AutoModelForCausalLM.from_pretrained(
        local_model_dir,
        device_map="cpu",
        torch_dtype=torch.float32
    )
    print("✅ Model loaded.")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    print("Did you unzip the model? Check previous steps.")

# 2. Run Inference
print(f"🚀 Starting Inference...")
dataset = load_dataset("json", data_files=test_data_path, split="train")
results = []

for item in tqdm(dataset):
    messages = item["messages"]

    # Prepare Input
    if messages[-1]["role"] == "assistant":
        input_messages = messages[:-1]
        ground_truth = messages[-1]["content"]
        if "</thought>" in ground_truth:
            ground_truth = ground_truth.split("</thought>")[-1].strip()
    else:
        input_messages = messages
        ground_truth = "N/A"

    # Generate
    inputs = tokenizer.apply_chat_template(
        input_messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    )

    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=512,
            do_sample=False # Greedy decoding is faster/more deterministic for eval
        )

    # Clean Output
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Simple parsing to get the answer part
    if "assistant" in full_response:
        # Robust split in case of different template rendering
        full_response = full_response.split("assistant")[-1].strip()

    if "</thought>" in full_response:
        final_answer = full_response.split("</thought>")[-1].strip()
    else:
        final_answer = full_response.strip()

    results.append({
        "final_answer": final_answer,
        "ground_truth": ground_truth
    })

# 3. Save Results
output_file = "final_results.jsonl"
with open(output_file, "w") as f:
    for res in results:
        f.write(json.dumps(res) + "\n")

print(f"\n✅ Results saved to: {output_file}")

⏳ Loading model on CPU...
❌ Error loading model: cpu_inference_model is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`
Did you unzip the model? Check previous steps.
🚀 Starting Inference...


FileNotFoundError: Unable to find '/content/test_dataset_100.jsonl'