In [3]:

import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from tqdm import tqdm
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import transformers
import torch
import os
# import argparse
from prompt_template import dataset_prompts_and_instructions


In [5]:
# ===========================
# Configurations
# ===========================

MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MAX_INPUT_TOKENS = 3500   
MAX_NEW_TOKENS = 300 

In [6]:
# ===========================
# Determine which dataset to test on
# ===========================
data = "cnli_short"
dataset = f"dataset/{data}.jsonl"
print(f"We will be testing on dataset {dataset}\n")

We will be testing on dataset dataset/cnli_short.jsonl



In [7]:
# ===========================
# Check the inputs are correct
# ===========================
inputs = pd.read_json(dataset, lines=True, orient="records")

length = len(inputs)
assert length == 1000

In [8]:
# ===========================
# Dataset name mapping
# ===========================

def normalize_dataset_name(name: str) -> str:
    name_lower = name.lower()
    mapping = {
        "cnli": "cnli",
        "coqa": "coqa",
        "narrativeqa": "narrative_qa",
        "narrative_qa": "narrative_qa",
        "qasper": "qasper",
        "quality": "quality",
    }
    key = ''.join(ch for ch in name_lower if ch.isalpha() or ch == '_')
    return mapping.get(key, key)

In [9]:
# ===========================
# Construct prompt
# ===========================

def build_prompt(row) -> str:
    ds_key = normalize_dataset_name(row["dataset"])
    cfg = dataset_prompts_and_instructions[ds_key]

    full_prompt = cfg["prompt"].format(
              context=row["base_ctx"],
        instruction=cfg["instruction"],
        question=row["question"],
    )
    return full_prompt

In [10]:
# ===========================
# ChatGPT inference
# ===========================

from openai import OpenAI
MAX_NEW_TOKENS = 300
api_key=os.environ["OPENAI_API_KEY"]
print(api_key)
client = OpenAI(api_key=api_key)

def generate_with_gpt(prompt: str, model_name: str = "gpt-5",max_tokens: int = MAX_NEW_TOKENS) -> str:
    response = client.responses.create(
        model=model_name,
        reasoning={"effort": "low"},
        instructions="You are a helpful assistant.",
        input=prompt,
        # max_output_tokens = max_tokens
    )
    return response.output_text

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


In [11]:
# ===========================
# Concurrent running
# ===========================

def run_solver_job(df, engine_func, max_workers: int = 4):
    prompts = [build_prompt(row) for _, row in df.iterrows()]
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for res in tqdm(executor.map(engine_func, prompts), total=len(prompts)):
            results.append(res)
    return results


In [12]:
outputs = {}
print("Running ChatGPT (large model)...")
outputs["gpt_pred"] = run_solver_job(inputs,partial(generate_with_gpt, model_name="gpt-5"),max_workers=8)

Running ChatGPT (large model)...


100%|██████████| 1000/1000 [13:25<00:00,  1.24it/s]


In [14]:
# ===========================
# Save the results
# ===========================

import json
output_path = f"outputs/baseline_gpt_output_{data}.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(outputs, f, ensure_ascii=False, indent=4)

TypeError: Object of type Series is not JSON serializable