<a href="https://colab.research.google.com/github/vmanvs/VibeThinker_Eval/blob/main/VibeThinker_Eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import sys

#Check env
try:
  import google.colab
  IN_COLAB = True
  print("✓ Running in Google Colab")
except:
  IN_COLAB = False
  print("⚠ Not running in Colab - some features may not work")

#Check GPU
import torch
print(f"\nGPU Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
  print(f"GPU Name: {torch.cuda.get_device_name(0)}")
  print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3: .1f} GB")

else:
  print("No GPU Found! Change the Runtime")

print("\n" + "="*60)
print("Installing dependencies")
print("="*60)

!pip install -q transformers accelerate bitsandbytes sentencepiece termcolor huggingface_hub --upgrade bitsandbytes

print("✓ Dependencies installed")

✓ Running in Google Colab

GPU Available: True
GPU Name: Tesla T4
GPU Memory:  14.7 GB

Installing dependencies
✓ Dependencies installed


In [2]:
#Mount G-Drive for saving and fetching data
import os
from huggingface_hub import snapshot_download

if IN_COLAB:
  from google.colab import drive
  print("\nMounting G Drive...")
  print("Authorization Required")
  drive.mount('/content/drive')
  print("✓ Google Drive mounted at /content/drive")

  #Create an output Dir
  OUTPUT_DIR = "/content/drive/MyDrive/ToolBench_Results/vibethinker_predictions"
  os.makedirs(OUTPUT_DIR, exist_ok=True)
  print(f"✓ Results will be saved to: {OUTPUT_DIR}")

  CACHE_DIR = "/content/stable_toolbench_cache"
  os.makedirs(CACHE_DIR, exist_ok=True)

  print("Downloading API cache (~2-3GB, takes 5-10 min)...")
  print("⏳ Please wait...")

  cache_path = snapshot_download(
    repo_id="THUNLP-MT/StableToolBench",
    repo_type="dataset",
    local_dir=CACHE_DIR,
    allow_patterns=[
        "tool_response_cache/*",
        "tools/*"
    ]
  )

  print(f"✓ Cache downloaded to: {cache_path}")
  assert os.path.exists(f"{CACHE_DIR}/tool_response_cache"), "Cache missing!"
  assert os.path.exists(f"{CACHE_DIR}/tools"), "Tools missing!"
  print("✓ Cache structure verified")



Mounting G Drive...
Authorization Required
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Google Drive mounted at /content/drive
✓ Results will be saved to: /content/drive/MyDrive/ToolBench_Results/vibethinker_predictions
Downloading API cache (~2-3GB, takes 5-10 min)...
⏳ Please wait...


RepositoryNotFoundError: 404 Client Error. (Request ID: Root=1-69276468-669dca68235c251756da70bf;afddedc1-a6f9-4620-ae59-0f7ae43cca7a)

Repository Not Found for url: https://huggingface.co/api/datasets/THUNLP-MT/StableToolBench/revision/main.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated. For more details, see https://huggingface.co/docs/huggingface_hub/authentication

In [3]:
#Clone ToolBench Repo
print("\n"+"="*60)
print("Setting up ToolBench...")
print("="*60)

#Check if already present
if not os.path.exists("/content/ToolBench"):
  print("Cloning ToolBech...")
  !git clone https://github.com/vmanvs/Fixed_ToolBench /content/ToolBench
  print("✓ ToolBench cloned")
else:
  print("✓ ToolBench already exists")

#Add to path
sys.path.insert(0, "/content/ToolBench")


Setting up ToolBench...
Cloning ToolBech...
Cloning into '/content/ToolBench'...
remote: Enumerating objects: 1286, done.[K
remote: Total 1286 (delta 0), reused 0 (delta 0), pack-reused 1286 (from 1)[K
Receiving objects: 100% (1286/1286), 58.93 MiB | 30.23 MiB/s, done.
Resolving deltas: 100% (749/749), done.
✓ ToolBench cloned


In [4]:
#Download test data
print("\n"+"="*60)
print("Downloading test data...")
print("="*60)

#Recommended have the test data in G Drive
"""
The data shoulde be placed as such:
MyDrive/
   ├── ToolBench_Data/
   │   ├── test_instruction/
   │   │   ├── G1_instruction.json
   │   │   ├── G1_category.json
   │   │   ├── G1_tool.json
   │   │   ├── G2_category.json
   │   │   ├── G2_instruction.json
   │   │   └── G3_instruction.json
   │   └── toolenv/
   │       └── tools/
   │           ├── Advertising/
   │           │   └── tool1.json
   |           |   └──...
   │           ├── Artificial_Intelligence_Machine_Learning/
   │           |    └── tool2.json
   |           |   └──...
"""
TEST_DATA_DIR = "/content/drive/MyDrive/ToolBench_Data/test_instruction"

#Option B:
#!wget -O /content/test_data.zip "https://drive.google.com/drive/folders/1TysbSWYpP8EioFu9xPJtpbJZMLLmwAmL"
#!unzip /content/test_data.zip -d /content/

if not os.path.exists(TEST_DATA_DIR):
  print("⚠ Test data not found!")
else:
  print(f"✓ Test data found at: {TEST_DATA_DIR}")




Downloading test data...
✓ Test data found at: /content/drive/MyDrive/ToolBench_Data/test_instruction


If the following error persists even after re-running all cells:
```bash
     86             raise ImportError(
     87                 "Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`"
     88 )
```
Consider restarting the runtime...


In [None]:
# Create cache handler

cache_handler_code = """

import json
import os
from pathlib import Path

class StableToolBenchCache:

  #Handler for cached API responses

  def __init__(self, cache_dir):
    self.cache_dir = Path(cache_dir)
    self.tool_response_cache = self.cache_dir / "tool_response_cache"
    self.tools = self.cache_dir / "tools"

    if not self.tool_response_cache.exists():
      raise ValueError(f"Cache directory {self.tool_response_cache} does not exist")

    print(f"✓ Cache initialized at: {self.tool_response_cache}")
    self.load_cache_stats()

  def load_cache_stats(self):

    #Count cached responses


    cache_files = list(self.tool_response_cache.glob("*.json"))
    print(f"✓ Found {len(cache_files)} cache files")

    total_entries = 0
    for cache_file in cache_files[:5]: #Sample first 5
      try:
        with open(cache_file, "r") as f:
          data = json.load(f)
          total_entries += len(data) if isinstance(data, list) else 1
      except:
        pass

    print(f"   Estimated {total_entries * len(cache_files)//5} cached API responses")


  def get_cached_response(self, category, tool_name, api_name, params):

    #Get cached response for a given API call

    #Args:
      #category: API category (e.g., "Weather")
      #tool_name: Tool name (e.g., "OpenWeatherMap")
      #api_name: API endpoint name
      #params: Dictionary of parameters

    #Returns:
      #Cached response if found, otherwise None


    #Create cache key
    cache_key = self._create_cache_key(category, tool_name, api_name, params)

    #Check if cache file exists
    cache_file = self.tool_response_cache / f"{category}_{tool_name}.json"

    if not cache_file.exists():
      return None

    #Load cache
    try:
      with open(cache_file, "r") as f:
        cache_data = json.load(f)

      #Search for matching entry
      for entry in cache_data:
        if entry.get("cache_key") == cache_key:
          return entry.get("response")
    except Exception as e:
      print(f"Error reading cache: {e}")

    return None

  def _create_cache_key(self, category, tool_name, api_name, params):

    #Create cache key

    params_str = json.dumps(params, sort_keys=True)
    return f"{category}::{tool_name}::{api_name}::{params_str}"

#Save to file
cache_handler_path = "/content/stable_cache_handler.py"
with open(cache_handler_path, "w") as f:
  f.write(__doc__)
"""

with open("/content/stable_cache_handler.py", "w") as f:
  f.write(cache_handler_code)

print("✓ Cache handler created")


In [None]:
#Create Modified ToolBench runner with cache support

print("\n"+"="*60)
print("Creating modified ToolBench Runner...")
print("="*60)

runner_code = """

import sys
sys.path.insert(0, "/content/ToolBench")

from stable_cache_handler import StableToolBenchCache
from toolbench.inference.Downstream_tasks.rapidapi import pipeline_runner

class CachedPipelineRunner(pipeline_runner):

  #Extended pipeline runner with cache support

  def __init__(self, *args, cache_dir=None,**kwargs):
    super().__init__(*args, **kwargs)

    if cache_dir:
      self.cache = StableToolBenchCache(self.cache_dir)
      print("✓ Cache enabled for evaluation")
    else:
      self.cache = None

  def query_tool(self, category, tool_name, api_name, params):

    #Override to check cache first


    #Try cache first
    if self.cache:
      cached_response = self.cache.get_cached_response(category, tool_name, api_name, params)
      if cached_response:
        return cached_response

    #Fallback to original method
    return super().query_tool(category, tool_name, api_name, params)
"""

with open("/content/cached_runner.py", "w") as f:
  f.write(runner_code)

print("✓ Cache-enabled runner created")


In [11]:
#Load VibeThinker
print("\n"+"="*60)
print("Loading VibeThinker Model")
print("="*60)

from typing import Optional, List
import bitsandbytes
import re
import torch
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_NAME = "WeiboAI/VibeThinker-1.5B"

# UPDATED: Full ToolBench-compatible wrapper
class VibeThinkerWrapper:
  """
  Enhanced wrapper with agressive prompt enginnering for tool use
  """
  def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = model.device
        self.conversation_history = []
        self.time = None

  def prediction(self, prompt: str, stop: Optional[List[str]] = None) -> str:
      """
      Main inference method called by ToolBench
      """
      with torch.no_grad():
          inputs = self.tokenizer(
              prompt,
              return_tensors="pt",
              padding=True,
              truncation=True,
              max_length=2048
          ).to(self.device)

          outputs = self.model.generate(
              **inputs,
              max_new_tokens=256,
              do_sample=True,
              temperature=0.3,
              top_p=0.85,
              top_k=40,
              repetition_penalty=1.2,
              pad_token_id=self.tokenizer.eos_token_id,
              eos_token_id=self.tokenizer.eos_token_id
          )

          input_len = inputs['input_ids'].shape[1]
          generated_tokens = outputs[0, input_len:]
          response = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)

          return response.strip()

  def add_message(self, message):
      """
      Add a single message to conversation history
      Called by ToolBench to build conversation
      """
      self.conversation_history.append(message)

  def change_messages(self, messages):
      """
      Replace entire conversation history
      Called by ToolBench at the start of each chain step
      """
      self.conversation_history = messages

  def display_conversation(self, detailed=False):
      """
      Debug method to print conversation
      Optional but helpful for debugging
      """
      try:
        from termcolor import colored

        role_to_color = {
            "system": "red",
            "user": "green",
            "assistant": "blue",
            "function": "magenta",
        }

        print("Conversation " + "="*50)
        for message in self.conversation_history:
            role = message.get('role', 'unknown')
            content = message.get('content', '')

            print_obj = f"{role}: {content[:100]}..."

            if "function_call" in message:
                print_obj += f" | function_call: {message['function_call']}"

            color = role_to_color.get(role, "white")
            print(colored(print_obj, color))

        print("="*60)
      except:
        pass #fail silently if termcolor not found

  def parse(self, functions, process_id, **args):
      """
      Main parsing method called by ToolBench's agent loop

      This method:
      1. Builds prompt from conversation history
      2. Calls prediction()
      3. Enhanced Prompt Construction
      4. Parses the response for Action/Thought
      5. Returns in ToolBench's expected format
      """
      import time
      import json

      self.time = time.time()

      # Build prompt from conversation history
      prompt = self._build_enhanced_prompt(self.conversation_history, functions)

      # Get model prediction
      predictions = self.prediction(prompt)

      if process_id == 0:
          # Estimate token count (rough approximation)
          decoded_token_len = len(self.tokenizer.encode(predictions))
          print(f"[process({process_id})]total tokens: {decoded_token_len}")
      else:
          decoded_token_len = 0

      # Parse the prediction into ToolBench's format
      thought, action, action_input = self._enhanced_react_parser(predictions, functions)

      # Return in expected format: (message_dict, error_code, token_count)
      message = {
          "role": "assistant",
          "content": thought,
          "function_call": {
              "name": action,
              "arguments": action_input
          }
      }

      return message, 0, decoded_token_len

  def _build_enhanced_prompt(self, messages, functions):
    """
    Build prompt with strong formatting instructions and examples
    """

    prompt = ""
    #Track if we we've seen the system

    has_system = False
    user_query = ""

    for message in messages:
      role = message.get('role', '')
      content = message.get('content', '')

      if role == "system":
        has_system = True
        #Inject enhanced instructions
        enhanced_system = self._create_enhanced_system_prompt(content, functions)
        prompt += f"{enhanced_system}\n\n"

      elif role == "user":
        user_query = content
        prompt += f"USER QUESTION: {content}\n\n"

      elif role == 'assistant':
        prompt += f"ASSISTANT RESPONSE:\n{content}\n\n"

      elif role == 'function':
        prompt += f"TOOL RESULT: {content}\n\n"

    #Add strong formatting reminder before assistant response
    prompt += self._get_format_reminder(functions)

    return prompt

  def _create_enhanced_system_prompt(self, original_content, functions):
    """
    Create a explicit system prompt with examples
    """

    system_prompt = """
    You are a helpful AI assistant that uses tools to answer questions.

    ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
    CRITICAL: YOU MUST RESPOND IN THIS EXACT FORMAT - NO EXCEPTIONS:
    ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

    FORMAT TEMPLATE:
    Thought: [One sentence explaining what you need to do]
    Action: [exact_tool_name]
    Action Input: {"param1": "value1", "param2": "value2"}

    ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
    CONCRETE EXAMPLES:
    ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

    EXAMPLE 1 - Using a weather tool:
    USER: What's the weather in Paris?
    ASSISTANT:
    Thought: I need to check the weather for Paris using the weather tool.
    Action: get_weather
    Action Input: {"city": "Paris", "country": "France"}

    [After getting weather data...]
    Thought: I have received the weather information for Paris.
    Action: Finish
    Action Input: {"return_type": "give_answer", "final_answer": "The weather in Paris is sunny with a temperature of 22°C."}

    EXAMPLE 2 - Searching for flights:
    USER: Find flights from London to Dubai
    ASSISTANT:
    Thought: I need to search for available flights between London and Dubai.
    Action: search_flights
    Action Input: {"origin": "London", "destination": "Dubai", "date": "2024-01-15"}

    [After getting flight results...]
    Thought: I found flight options and can provide the answer.
    Action: Finish
    Action Input: {"return_type": "give_answer", "final_answer": "Found 3 flights from London to Dubai with prices ranging from $450 to $680."}

    EXAMPLE 3 - When you can't complete the task:
    USER: [Impossible request]
    ASSISTANT:
    Thought: I cannot complete this task with the available tools.
    Action: Finish
    Action Input: {"return_type": "give_up_and_restart"}

    ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
    IMPORTANT RULES:
    ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

    1. ALWAYS start with "Thought:"
    2. ALWAYS follow with "Action:" on a new line
    3. ALWAYS follow with "Action Input:" on a new line with valid JSON
    4. Keep Thought to ONE sentence
    5. Action must be EXACTLY one of the tool names listed below
    6. Action Input must be valid JSON with correct parameters
    7. When you have the final answer, use Action: Finish

    """
      #Add available tools
    if functions:
      system_prompt += "\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
      system_prompt += "AVAILABLE TOOLS:\n"
      system_prompt += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"


      for i, func in enumerate(functions, 1):
        name = func.get('name', 'unknown')
        desc = func.get('description', 'No description')
        params = func.get('parameters', {}).get('properties', {})
        required = func.get('parameters', {}).get('required', [])

        system_prompt += f"{i}. Tool Name: {name}\n"
        system_prompt += f"  Description: {desc}\n"

        if params:
          system_prompt += f"    Parameters:\n"
          for param_name, param_info in params.items():
            param_type = param_info.get('type', 'string')
            param_desc = param_info.get('description', '')
            is_required = "    (REQUIRED)" if param_name in required else "(optional)"
            system_prompt += f"      - {param_name}: {param_type}{is_required} - {param_desc}\n"

        system_prompt += "\n"

    return system_prompt

  def _get_format_reminder(self, functions) -> str:
    """
    Strong reminder right before the model generates
    """
    tool_names = [f['name'] for f in functions] if functions else []
    tool_list = ", ".join(tool_names[:5])

    if len(tool_names) > 5:
      tool_list += f", ...({len(tool_names)} total)"

    reminder = f"""
          ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
          REMINDER: You must respond in this EXACT format:

          Thought: [one sentence]
          Action: [choose from: {tool_list}]
          Action Input: {{"param": "value"}}

          Now respond:
          ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

          ASSISTANT RESPONSE:
      """

    return reminder #if remainder is not None else return ""


  def _inject_functions_into_system(self, system_content, functions):
      """
      Add function descriptions to system prompt
      """
      if not functions:
          return system_content

      func_desc = "\n\nAvailable functions:\n"
      for func in functions:
          name = func.get('name', 'unknown')
          desc = func.get('description', 'No description')
          params = func.get('parameters', {}).get('properties', {})

          func_desc += f"\n{name}: {desc}\n"
          if params:
              func_desc += f"  Parameters: {list(params.keys())}\n"

      return system_content + func_desc

  def _enhanced_react_parser(self, text, functions):
    """
      Enhanced parser with multiple fallback strategies
    """
    #1. Try standard parsing
    thought, action, action_input = self._standard_react_parser(text)

    if action and action!="Finish":
      #Validate action exists
      valid_actions = [f['name'] for f in functions] + ['Finish']
      if action not in valid_actions:
        #2. try fuzzy matching
        action = self._fuzzy_match_action(action, valid_actions)

    #Strategy 3: If still no valid action, extract intent
    if not action or (action not in [f['name'] for f in functions] and action != 'Finish'):
        thought, action, action_input = self._intent_extraction(text, functions)

    #Last option: Give up grace fully
    if not action:
      thought = text[:200] if text else "I cannot determine appropriate action."
      action = "Finish"
      action_input = json.dumps({
          "return_type": "give_up_and_restart"
      })

    return thought, action, action_input

  def _standard_react_parser(self, text):
      """
      Parse model output in ReAct format

      Expected format:
      Thought: <reasoning>
      Action: <function_name>
      Action Input: <json_input>

      Or:

      Final Answer: <answer>
      """
      thought = ""
      action = ""
      action_input = ""

      text = text.strip()

      # Try to extract Thought
      thought_patterns = [
        r'Thought:\s*(.+?)(?=Action:|$)',
        r'THOUGHT:\s*(.+?)(?=ACTION:|$)',
        r'Thought:\s*(.+?)(?=Action Input:|$)',
        r'THOUGHT:\s*(.+?)(?=ACTION INPUT:|$)',
        #Fallback match everything before Action
        r'^(.+?)(?=Action:|$)',
        r'^(.+?)(?=Action Input:|$)'
      ]

      for pattern in thought_patterns:
        match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
        if match:
          thought = match.group(1).strip()
          thought = thought.split('\n')[0][:200]
          break

      # Extract Action
      action_patterns = [
          r'Action:\s*([^\n]+)',
          r'ACTION:\s*([^\n]+)',
          r'Tool:\s*([^\n]+)',
      ]

      for pattern in action_patterns:
          match = re.search(pattern, text, re.IGNORECASE)
          if match:
              action = match.group(1).strip()
              # Remove any trailing punctuation or quotes
              action = re.sub(r'["\'\.,;:]+$', '', action)
              break

      # Extract Action Input
      input_patterns = [
          r'Action Input:\s*(\{.+?\})',
          r'ACTION INPUT:\s*(\{.+?\})',
          r'Input:\s*(\{.+?\})',
          r'(\{[^}]+\})',  # Any JSON object
      ]

      for pattern in input_patterns:
          match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
          if match:
              action_input = match.group(1).strip()
              # Validate JSON
              try:
                  json.loads(action_input)
                  break
              except:
                  continue

      # Check for final answer indicators
      if any(keyword in text.lower() for keyword in ['final answer', 'final_answer', 'give_answer', 'answer is']):
          action = "Finish"

          # Extract the answer
          answer_match = re.search(r'(?:final answer|answer is)[:\s]*(.+)', text, re.IGNORECASE | re.DOTALL)
          if answer_match:
              answer = answer_match.group(1).strip()[:500]
          else:
              answer = thought if thought else text[:500]

          action_input = json.dumps({
              "return_type": "give_answer",
              "final_answer": answer
          })

      # Check for give up indicators
      if any(keyword in text.lower() for keyword in ['cannot', 'unable', 'give up', 'don\'t know', 'not possible']):
          if not action or action == "Finish":
              action = "Finish"
              action_input = json.dumps({
                  "return_type": "give_up_and_restart"
              })

      return thought, action, action_input
      """
      if "Thought:" in text:
          thought_start = text.find("Thought:") + len("Thought:")
          thought_end = text.find("Action:", thought_start)
          if thought_end == -1:
              thought_end = text.find("Final Answer:", thought_start)
          if thought_end == -1:
              thought_end = len(text)
          thought = text[thought_start:thought_end].strip()

      # Try to extract Action
      if "Action:" in text:
          action_start = text.find("Action:") + len("Action:")
          action_end = text.find("Action Input:", action_start)
          if action_end == -1:
              action_end = text.find("\n", action_start)
          if action_end == -1:
              action_end = len(text)
          action = text[action_start:action_end].strip()

      # Try to extract Action Input
      if "Action Input:" in text:
          input_start = text.find("Action Input:") + len("Action Input:")
          action_input = text[input_start:].strip()

          # Try to parse as JSON
          try:
              # Find JSON object
              if "{" in action_input:
                  json_start = action_input.find("{")
                  json_end = action_input.rfind("}") + 1
                  action_input = action_input[json_start:json_end]
          except:
              pass

      # Check for Final Answer
      if "Final Answer:" in text or "final answer" in text.lower():
          action = "Finish"
          # Extract the answer
          if "Final Answer:" in text:
              answer_start = text.find("Final Answer:") + len("Final Answer:")
              answer = text[answer_start:].strip()
          else:
              answer = thought if thought else text

          action_input = json.dumps({
              "return_type": "give_answer",
              "final_answer": answer
          })

      # If no action found, assume the model wants to give up or is confused
      if not action and not thought:
          thought = text
          action = "Finish"
          action_input = json.dumps({
              "return_type": "give_up_and_restart"
          })

      return thought, action, action_input
      """
  def _fuzzy_match_action(self, action, valid_actions):
    """
    Try to match action to valid actions using fuzzy matching
    """
    action_lower = action.lower().replace('_','').replace('_','')

    for valid_action in valid_actions:
      valid_lower = valid_action.lower().replace('_','').replace('_','')

      #Check exact match after normalization
      if action_lower == valid_lower:
        return valid_action

      if action_lower in valid_lower or valid_lower in action_lower:
        return valid_action

      #No match
      return action

  def _intent_extraction(self, text, functions):
    """
    Try to extract intent from free-form text
    """
    text_lower = text.lower()

    #Look for keywords that might indicate tool usage
    for func in functions:
      func_name = func.get('name', '')
      func_desc = func.get('description', '').lower()

      #Check if function name or key parts of description appear in text
      if func_name.lower() in text_lower:
        return (
            f"I will use the {func_name} tool.",
            func_name,
            json.dumps({}) #Empty parameters as fallback
        )

      #Check for keywords from description
      keywords = re.findall(r'\b\w{4,}\b', func_desc) # Words with 4+ chars
      for keyword in keywords[:3]:
        if keyword in text_lower:
          return (
              f"I will use the {func_name} tool.",
              func_name,
              json.dumps({}) #Empty parameters as fallback
          )

      #No tool match give up
      return (
          text[:200] if text else "Unable to proceed",
          "Finish",
          json.dumps({
              "return_type": "give_up_and_restart"
          })
      )


  def get_model_name(self):
      return "VibeThinker-1.5B"


# Check dependencies first
try:
    import bitsandbytes as bnb
    print(f"✓ bitsandbytes {bnb.__version__} is installed")
except ImportError:
    print("⚠ bitsandbytes not found, installing now...")
    import subprocess
    subprocess.check_call(['pip', 'install', '-q', 'bitsandbytes'])
    import bitsandbytes as bnb
    print(f"✓ bitsandbytes {bnb.__version__} installed")

# Load tokenizer
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
print("✓ Tokenizer loaded")

# Load model with 4-bit quantization
print("Loading model with 4-bit quantization...")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quant_config,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True
)

print("✓ Model loaded successfully")
print(f"Model device: {model.device}")

# Initialize wrapper
llm_wrapper = VibeThinkerWrapper(model, tokenizer)
print("✓ Model wrapper ready with full ToolBench interface")


Loading VibeThinker Model
✓ bitsandbytes 0.48.2 is installed

Loading tokenizer...
✓ Tokenizer loaded
Loading model with 4-bit quantization...
✓ Model loaded successfully
Model device: cuda:0
✓ Model wrapper ready with full ToolBench interface


In [7]:
#Quick Sanity Check
print("\n" + "="*60)
print("Testing model generation...")
print("="*60)

test_prompt = "Hello! What is 2+2?"
print(f"Input: {test_prompt}")
response = llm_wrapper.prediction(test_prompt, stop=None)
print(f"Output: {response}")
print("\n✓ Model is working!")


Testing model generation...
Input: Hello! What is 2+2?
Output: Let me see... I think it's 4. But wait, maybe there are other ways to add numbers that also give 4. So the answer is not unique. Hmm, but how does this relate to the expression 2 + 2?

Wait a second, perhaps the question is trying to get me to consider different mathematical expressions that evaluate to the same number, even if they look different. For example, in some contexts, like modular arithmetic, 2 + 2 could be equivalent to another operation. Or maybe considering different representations of numbers.

Alternatively, maybe the problem is pointing out that addition isn't just commutative or associative, but there might be different operations with the same result. Wait, but here it's the same expression. Unless we're changing the operator. Like, adding 2 and 2 gives 4, but subtracting them would give 0, multiplying gives 4 as well (if 2*2=4). Oh! Wait a minute, multiplication: 2 * 2 is also 4. So actually, both addit

In [12]:
#Setup ToolBench Pipeline

print("\n" + "="*60)
print("Setting up ToolBench pipeline...")
print("="*60)

from argparse import Namespace

try:
  from toolbench.inference.Downstream_tasks.rapidapi import pipeline_runner
  print(f"✓ Imported Toolbench modules")
  TOOL_ROOT_DIR = "/content/drive/MyDrive/ToolBench_Data/toolenv/tools"
  print(f"✓ Toolbench Tool Root Directory exists")
except Exception as e:
  print(f"⚠ Error importing Toolbench: {e}")
  print("You may need to adjust sys.path or check ToolBench installation.")

args = Namespace(
    tool_root_dir=TOOL_ROOT_DIR,
    toolbench_key="",  # Empty for local simulation
    rapidapi_key="",
    use_rapidapi_key=False,
    api_customization=False,
    max_observation_length=1024,
    observ_compress_method="truncate",
    method="CoT@1",
    input_query_file="",  # Will be set per test set
    output_answer_file=OUTPUT_DIR,
    backbone_model=llm_wrapper,
    openai_key="",
    max_sequence_length=8192,
    max_source_sequence_length=2048,
    lora=False,
    model_path="",
    lora_path="",
    corpus_tsv_path="",
    retrieval_model_path=""
)

print(f"✓ Pipeline Configured")



Setting up ToolBench pipeline...
✓ Imported Toolbench modules
✓ Toolbench Tool Root Directory exists
✓ Pipeline Configured


In [13]:

# Diagnostic Cell -> Tool Presence

import os
import json

print("="*60)
print("DIAGNOSTIC: Checking Tool Setup")
print("="*60)

# Check the tool directory
TOOL_ROOT_DIR = "/content/drive/MyDrive/ToolBench_Data/toolenv/tools"
print(f"\nTool directory: {TOOL_ROOT_DIR}")
print(f"Exists: {os.path.exists(TOOL_ROOT_DIR)}")

if os.path.exists(TOOL_ROOT_DIR):
    # List categories
    categories = [d for d in os.listdir(TOOL_ROOT_DIR) if os.path.isdir(os.path.join(TOOL_ROOT_DIR, d))]
    print(f"\nFound {len(categories)} categories:")

    total_tools = 0
    for cat in categories[:5]:  # Show first 5
        tools = [f for f in os.listdir(os.path.join(TOOL_ROOT_DIR, cat)) if f.endswith('.json')]
        print(f"  - {cat}: {len(tools)} tools")
        total_tools += len(tools)

    if len(categories) > 5:
        print(f"  ... and {len(categories) - 5} more categories")

    print(f"\nTotal tools found: {total_tools}")

    # Show a sample tool
    if categories:
        sample_cat = categories[0]
        sample_tools = [f for f in os.listdir(os.path.join(TOOL_ROOT_DIR, sample_cat)) if f.endswith('.json')]
        if sample_tools:
            sample_file = os.path.join(TOOL_ROOT_DIR, sample_cat, sample_tools[0])
            print(f"\nSample tool file: {sample_file}")
            with open(sample_file) as f:
                tool_data = json.load(f)
                print(f"Tool name: {tool_data.get('tool_name', 'N/A')}")
                print(f"APIs: {len(tool_data.get('api_list', []))}")
else:
    print("\n❌ Tool directory does not exist!")
    print("\nYou need to upload your tool definitions.")

# Check a sample query to see what tools it expects
TEST_DATA_DIR = "/content/drive/MyDrive/ToolBench_Data/test_instruction"
sample_query_file = os.path.join(TEST_DATA_DIR, "G1_instruction.json")

print("\n" + "="*60)
print("Checking Query Expectations")
print("="*60)

if os.path.exists(sample_query_file):
    with open(sample_query_file) as f:
        queries = json.load(f)

    print(f"\nSample query from G1_instruction:")
    sample = queries[0]
    print(f"Query: {sample.get('query', 'N/A')[:100]}...")

    if 'api_list' in sample:
        print(f"\nExpected tools:")
        for api in sample['api_list'][:3]:
            print(f"  - Category: {api.get('category_name', 'N/A')}")
            print(f"    Tool: {api.get('tool_name', 'N/A')}")
            print(f"    API: {api.get('api_name', 'N/A')}")
            print()
else:
    print(f"\n❌ Query file not found: {sample_query_file}")

print("="*60)

DIAGNOSTIC: Checking Tool Setup

Tool directory: /content/drive/MyDrive/ToolBench_Data/toolenv/tools
Exists: True

Found 50 categories:
  - Business: 695 tools
  - Artificial_Intelligence_Machine_Learning: 390 tools
  - Advertising: 158 tools
  - Business_Software: 350 tools
  - Customized: 1 tools
  ... and 45 more categories

Total tools found: 1594

Sample tool file: /content/drive/MyDrive/ToolBench_Data/toolenv/tools/Business/israel_company_data.json
Tool name: Israel Company Data
APIs: 1

Checking Query Expectations

Sample query from G1_instruction:
Query: I am a fitness enthusiast and I want to buy a fitness tracker. Can you suggest some top-rated fitnes...

Expected tools:
  - Category: Data
    Tool: ASIN Data
    API: Category

  - Category: Data
    Tool: ASIN Data
    API: Offers

  - Category: Data
    Tool: ASIN Data
    API: Reviews



In [14]:
#Run evaluation:
print("\n" + "="*60)
print("Starting evaluation...")
print("="*60)

from tqdm import tqdm
import json

test_sets = ["G1_instruction", "G1_category", "G1_tool", "G2_category", "G2_instruction", "G3_instruction"]

results_summary = {}

for subset in test_sets:
  input_file = os.path.join(TEST_DATA_DIR, f'{subset}.json')

  if not os.path.exists(input_file):
    print(f"⚠ Skipping evaluation for subset: {subset}. Test data not found.")
    results_summary[subset] = "File not found"
    continue

  with open(input_file, "r", encoding="utf-8") as f:
    queries = json.load(f)
    print(f"Found {len(queries)} queries in {subset}")

  args.input_query_file =  input_file
  args.output_answer_file = os.path.join(OUTPUT_DIR, subset)

  try:
    runner = pipeline_runner(
        args=args,
        add_retrieval=False,
        process_id=0,
        server=False
    )

    runner.run()
    print(f"✓ Completed {subset}")
    results_summary[subset] = "Success"

  except Exception as e:
    print(f"✗ Error running evaluation for subset: {subset}. Error: {e}")
    results_summary[subset] = f"Error: {str(e)}"
    import traceback
    traceback.print_exc()



Starting evaluation...
Found 200 queries in G1_instruction


100%|██████████| 51/51 [01:15<00:00,  1.48s/it]


total tasks: 200
undo tasks: 178
process[0] doing task 0/178: real_task_id_17038
[process(0)]now playing Can you please provide a list of available telephone numbers for the country 'Canada' using the List available numbers for a country API? I also need to retrieve the audio file from a text-to-speech conversion with the transaction ID '13579' using the Retrieve audio file API., with 7 APIs
[single_chain]try for the 1 time
[process(0)]total tokens: 255
Thought: 1. Think: I am in the country for the country for the country for the country for the country for the country for the country for the country for the country for the country for the country for the co
Action: Finish
Action Input: {"return_type": "give_up_and_restart"}
Observation: {"response":"chose to give up and restart"}
[process(0)]valid=False
process[0] doing task 1/178: real_task_id_55489
[process(0)]now playing I need to gather information about the crime rates and accidents in Germany. Please fetch all the news articles

KeyboardInterrupt: 

In [None]:
#View Results Summary:

print("\n" + "="*60)
print("EVALUATION SUMMARY")
print("="*60)

for subset, status in results_summary.items():
    status_icon = "✓" if status == "Success" else "✗"
    print(f"{status_icon} {subset}: {status}")

print(f"\n✓ All results saved to: {OUTPUT_DIR}")
print("\nTo download results from Google Drive:")
print("1. Open Files panel (left sidebar)")
print("2. Navigate to drive/MyDrive/ToolBench_Results/")
print("3. Right-click folder > Download")