In [None]:
 ! git pull origin main

fatal: not a git repository (or any of the parent directories): .git


In [None]:
# Clone the project repository into the current working directory.
# NOTE(review): re-running this cell after the directory exists will make git refuse to clone again.
! git clone https://github.com/wangy8205165/ANLP-LLM-Routing-and-Cascading.git

Cloning into 'ANLP-LLM-Routing-and-Cascading'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 25 (delta 7), reused 20 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (25/25), 14.41 KiB | 3.60 MiB/s, done.
Resolving deltas: 100% (7/7), done.


In [None]:
# Print the current working directory to confirm where the clone landed.
!pwd

/content


In [None]:
# Make the cloned repository the working directory. Later cells use relative
# paths (dataset/, outputs/) and import prompt_template, which presumably
# lives in this repository - TODO confirm.
%cd ANLP-LLM-Routing-and-Cascading

/content/ANLP-LLM-Routing-and-Cascading


In [None]:
# https://drive.google.com/file/d/1Rc_vefQY5I_Ou4nxqv9S1tyshpjPPsBM/view?usp=sharing
!pip install -q gdown
!gdown --id 1Rc_vefQY5I_Ou4nxqv9S1tyshpjPPsBM # Download the dataset
!unzip dataset.zip -d ./dataset

In [None]:
import transformers
import torch
from huggingface_hub import notebook_login
import os

In [None]:
# Log in to Hugging Face using notebook_login (e.g. call notebook_login() in this cell).

In [None]:
# Show which Hugging Face account the CLI is currently authenticated as.
# NOTE(review): the output contains the account name - clear it before sharing the notebook.
!hf auth whoami

[1muser: [0m yixiangw


In [None]:
# Never paste the key into the notebook: reuse OPENAI_API_KEY when the
# environment already provides it, otherwise ask for it without echoing.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass  # local import keeps this cell self-contained
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# ===========================
# Import required libraries
# ===========================

import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from tqdm import tqdm
from huggingface_hub import notebook_login
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import os
import argparse
from prompt_template import dataset_prompts_and_instructions

In [None]:
# ===========================
# Configurations
# ===========================

# Hugging Face identifier of the small local model that is loaded below.
MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Use the GPU whenever torch reports one, otherwise stay on the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Generation budgets (token counts).
# NOTE(review): MAX_INPUT_TOKENS is not referenced by any cell visible here.
MAX_NEW_TOKENS = 300
MAX_INPUT_TOKENS = 3500


In [None]:
# ===========================
# Determine which dataset to test on
# ===========================
# Short name of the evaluation split; it is also reused in the output file name.
data = "quality_short"
# JSON-lines file path, relative to the current working directory.
dataset = f"dataset/{data}.jsonl"
print(f"We will be testing on dataset {dataset}\n")


We will be testing on dataset dataset/quality_short.jsonl



In [None]:
# ===========================
# Check the inputs are correct
# ===========================
# Load the JSON-lines evaluation set: one record per line.
inputs = pd.read_json(dataset, lines=True, orient="records")

# Fail fast (before the expensive model cells) if the file is not what we expect.
length = len(inputs)
assert length == 1000, f"Expected 1000 examples in {dataset}, found {length}"

# build_prompt reads these fields from every row.
missing_columns = {"dataset", "base_ctx", "question"} - set(inputs.columns)
assert not missing_columns, f"{dataset} is missing required columns: {sorted(missing_columns)}"

In [None]:
# ===========================
# Dataset name mapping
# ===========================

def normalize_dataset_name(name: str) -> str:
    """Map a raw dataset label onto the key used by the prompt table.

    The label is lower-cased and every character that is neither a letter
    nor an underscore is discarded. Known spellings are then translated to
    their canonical key; anything else is returned in its cleaned form.

    Args:
        name: raw dataset label.

    Returns:
        The canonical key when the cleaned label is known, otherwise the
        cleaned label itself.
    """
    canonical = {
        "cnli": "cnli",
        "coqa": "coqa",
        "narrativeqa": "narrative_qa",
        "narrative_qa": "narrative_qa",
        "qasper": "qasper",
        "quality": "quality",
    }
    cleaned = "".join(filter(lambda c: c == "_" or c.isalpha(), name.lower()))
    if cleaned in canonical:
        return canonical[cleaned]
    return cleaned

In [None]:
# ===========================
# Load the local llama 3 model
# ===========================

# Build the pipeline
def build_llm_pipeline(model_id=MODEL_ID, device=DEVICE):
    """Create a Hugging Face text-generation pipeline for ``model_id``.

    The weights are requested in bfloat16 and placement is delegated to
    ``device_map="auto"``.

    Args:
        model_id: model identifier handed to ``transformers.pipeline``.
        device: accepted for interface compatibility; it is currently not
            used, because placement is decided by ``device_map="auto"``.

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    generation_settings = dict(
        model=model_id,
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
    )
    return transformers.pipeline("text-generation", **generation_settings)


# Instantiate once at module level; the Llama-3 inference helper reuses this object.
llama3_pipeline = build_llm_pipeline(MODEL_ID, DEVICE)

In [None]:
# ===========================
# Construct prompt
# ===========================

def build_prompt(row) -> str:
    """Render the full prompt for one evaluation record.

    The record's ``dataset`` field selects an entry of
    ``dataset_prompts_and_instructions``; that entry's ``prompt`` template is
    filled with the record's context and question plus the entry's
    ``instruction``.

    Args:
        row: mapping-like record providing ``dataset``, ``base_ctx`` and
            ``question``.

    Returns:
        The formatted prompt string.

    Raises:
        KeyError: if the normalized dataset name has no entry in the prompt
            table, or a required field is missing.
    """
    template_cfg = dataset_prompts_and_instructions[normalize_dataset_name(row["dataset"])]
    template = template_cfg["prompt"]
    fields = {
        "context": row["base_ctx"],
        "instruction": template_cfg["instruction"],
        "question": row["question"],
    }
    return template.format(**fields)

In [None]:
# ===========================
# Llama-3 inference
# ===========================

def generate_with_llama3(prompt: str, max_new_tokens: int = MAX_NEW_TOKENS) -> str:
    """Answer ``prompt`` with the local Llama-3 pipeline.

    Args:
        prompt: fully rendered user prompt.
        max_new_tokens: generation budget forwarded to the pipeline.

    Returns:
        The content of the last message in the pipeline's ``generated_text``,
        with surrounding whitespace stripped.
    """
    chat = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    # No sampling parameters are passed, so the pipeline's own defaults apply.
    result = llama3_pipeline(chat, max_new_tokens=max_new_tokens)
    # Take the final message of the first (only) returned sequence.
    reply = result[0]["generated_text"][-1]
    return reply["content"].strip()


In [None]:
# ===========================
# ChatGPT inference
# ===========================

from openai import OpenAI
MAX_NEW_TOKENS = 300
# Read the key from the environment. It is deliberately never printed:
# anything echoed here would be stored in the notebook's saved output.
api_key = os.environ["OPENAI_API_KEY"]
print("OPENAI_API_KEY loaded." if api_key else "OPENAI_API_KEY is empty!")
client = OpenAI(api_key=api_key)

def generate_with_gpt(prompt: str, model_name: str = "gpt-5", max_tokens: int = MAX_NEW_TOKENS) -> str:
    """Answer ``prompt`` through the OpenAI Responses API.

    Args:
        prompt: fully rendered user prompt, sent as the request input.
        model_name: OpenAI model to query.
        max_tokens: accepted for interface compatibility; it is currently
            NOT forwarded to the API, so the output length is not capped here.

    Returns:
        The ``output_text`` of the API response.
    """
    request = {
        "model": model_name,
        "reasoning": {"effort": "low"},
        "instructions": "You are a helpful assistant.",
        "input": prompt,
        # "max_output_tokens": max_tokens,  # intentionally left disabled
    }
    reply = client.responses.create(**request)
    return reply.output_text

In [None]:
# ===========================
# Concurrent running
# ===========================

def run_solver_job(df, engine_func, max_workers: int = 4):
    """Run ``engine_func`` over the prompts of every row in ``df``.

    Prompts are built up front with ``build_prompt`` and then dispatched to a
    thread pool; results come back in the same order as the rows.

    Args:
        df: DataFrame whose rows are accepted by ``build_prompt``.
        engine_func: callable taking one prompt string and returning the answer.
        max_workers: number of worker threads.

    Returns:
        A list with one result per row, in row order.
    """
    prompts = []
    for _, record in df.iterrows():
        prompts.append(build_prompt(record))

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # map() yields results in submission order; tqdm only reports progress.
        answers = pool.map(engine_func, prompts)
        return list(tqdm(answers, total=len(prompts)))


In [None]:
# ===========================
# Run SLM and LLM baseline
# ===========================
# Predictions are collected per model, keyed by "<model>_pred".
outputs = {}

print("Running Llama-3 (small model)...")
# NOTE(review): both worker threads share the single llama3_pipeline object -
# confirm that concurrent calls into it are safe.
llama3_predictions = run_solver_job(inputs, generate_with_llama3, max_workers=2)
outputs["llama3_pred"] = llama3_predictions

In [None]:
print("Running ChatGPT (large model)...")
# Bind the model name once; every worker thread then only needs the prompt.
gpt_engine = partial(generate_with_gpt, model_name="gpt-5")
outputs["gpt_pred"] = run_solver_job(inputs, gpt_engine, max_workers=8)


In [None]:
# ===========================
# Save the results
# ===========================

import json
import os

# One JSON file per evaluated split, written under outputs/.
output_path = f"outputs/baseline_output_{data}.json"
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

# ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file.
serialized = json.dumps(outputs, ensure_ascii=False, indent=4)
with open(output_path, "w", encoding="utf-8") as out_file:
    out_file.write(serialized)