In [1]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from tqdm import tqdm
from huggingface_hub import notebook_login
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import os


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the evaluation set: a JSON-lines file with one record per line.
inputs = pd.read_json("dataset/coqa.jsonl", orient="records", lines=True)

In [3]:
# Count the rows per dataset label in the loaded frame.
inputs["dataset"].value_counts()

dataset
coqa    7849
Name: count, dtype: int64

In [4]:
# Evaluate on a small random subset. The fixed seed makes the draw repeatable
# across kernel restarts, and the size is capped at the number of available
# rows so that a short input file does not make `sample` raise.
inputs = inputs.sample(n=min(10, len(inputs)), random_state=42)
print(f"Number of inputs: {len(inputs)}")

Number of inputs: 10


In [5]:
# Count the rows per dataset label again, now on the subsampled frame.
inputs["dataset"].value_counts()

dataset
coqa    10
Name: count, dtype: int64

In [6]:
from prompt_template import dataset_prompts_and_instructions
# ===========================
# Dataset name mapping
# ===========================

def normalize_dataset_name(name: str) -> str:
    """Map a dataset label to its canonical key.

    The label is lower-cased and every character that is neither a letter nor
    an underscore is dropped before the lookup, so spellings such as
    ``"Narrative-QA"`` and ``"CoQA"`` resolve to ``"narrative_qa"`` and
    ``"coqa"``.

    Args:
        name: Dataset label as it appears in the data.

    Returns:
        The canonical key when the cleaned label is a known alias; otherwise
        the cleaned label itself.
    """
    canonical = {
        "cnli": "cnli",
        "coqa": "coqa",
        "narrativeqa": "narrative_qa",
        "narrative_qa": "narrative_qa",
        "qasper": "qasper",
        "quality": "quality",
    }
    # Keep only letters and underscores so separators such as "-" or spaces
    # cannot prevent a match.
    cleaned = "".join(filter(lambda c: c == "_" or c.isalpha(), name.lower()))
    if cleaned in canonical:
        return canonical[cleaned]
    # Not a known alias: hand back the cleaned label unchanged.
    return cleaned

In [7]:
def build_prompt(row) -> str:
    """Render the full prompt for one input row.

    The row's ``"dataset"`` value is normalised and used to look up that
    dataset's entry in ``dataset_prompts_and_instructions``; the entry's
    ``"prompt"`` template is then filled with the row's ``"base_ctx"`` and
    ``"question"`` values and the entry's ``"instruction"``.

    Args:
        row: Mapping-like record providing ``"dataset"``, ``"base_ctx"`` and
            ``"question"``.

    Returns:
        The formatted prompt string.

    Raises:
        KeyError: If the normalised dataset name has no entry in
            ``dataset_prompts_and_instructions``, or a required key is missing.
    """
    template_cfg = dataset_prompts_and_instructions[normalize_dataset_name(row["dataset"])]
    template = template_cfg["prompt"]
    fields = {
        "context": row["base_ctx"],
        "instruction": template_cfg["instruction"],
        "question": row["question"],
    }
    return template.format(**fields)

In [10]:
from openai import OpenAI

# Default value of the `max_tokens` parameter of generate_with_gpt.
MAX_NEW_TOKENS = 300

# Read the credential from the environment so that no secret is ever stored in
# (or committed with) the notebook. Fail early with an actionable message
# instead of sending requests with a missing key.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )
client = OpenAI(api_key=api_key)

def generate_with_gpt(prompt: str, model_name: str = "gpt-5", temperature: float = 0.0, max_tokens: int = MAX_NEW_TOKENS) -> str:
    """Send one prompt through the module-level ``client`` and return the reply text.

    Args:
        prompt: Text passed as the request ``input``.
        model_name: Model identifier passed as the request ``model``.
        temperature: Accepted for interface compatibility; not included in the
            request built below.
        max_tokens: Accepted for interface compatibility; not included in the
            request built below.

    Returns:
        The ``output_text`` attribute of the response.
    """
    # NOTE(review): `temperature` and `max_tokens` are never forwarded, so
    # changing them has no effect. Confirm which request fields the target
    # model accepts before wiring them in.
    request = {
        "model": model_name,
        "reasoning": {"effort": "low"},
        "instructions": "You are a helpful assistant.",
        "input": prompt,
    }
    return client.responses.create(**request).output_text


In [13]:
def run_solver_job(df, engine_func, max_workers: int = 4):
    """Build a prompt for every row of ``df`` and run ``engine_func`` on each.

    Prompts are dispatched to a thread pool and a progress bar tracks the
    results as they are collected.

    Args:
        df: DataFrame whose rows are passed to ``build_prompt``.
        engine_func: Callable taking a single prompt string.
        max_workers: Size of the thread pool.

    Returns:
        A list with one result per row, in row order.
    """
    prompts = list(map(build_prompt, (row for _, row in df.iterrows())))
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # `pool.map` yields results in the order of `prompts`, so the output
        # stays aligned with the rows of `df`.
        progress = tqdm(pool.map(engine_func, prompts), total=len(prompts))
        return list(progress)

In [14]:
# Predictions are collected in a dict keyed by output column name.
outputs = {}
gpt_engine = partial(generate_with_gpt, model_name="gpt-5")
print("Running ChatGPT (large model)...")
outputs["gpt_pred"] = run_solver_job(inputs, gpt_engine, max_workers=8)


Running ChatGPT (large model)...


100%|██████████| 10/10 [00:06<00:00,  1.66it/s]


In [15]:
# Print every prediction beside its reference answer for manual inspection.
for idx, pred in enumerate(outputs["gpt_pred"]):
    # Predictions are positional, so look the row up by position, not by label.
    gold, dataset = inputs["output"].iloc[idx], inputs["dataset"].iloc[idx]
    report_lines = (
        f"Sample {idx+1}",
        f"Dataset: {dataset}",
        f"GPT predicts: {pred}",
        f"standard answer: {gold}",
        "-" * 80,
    )
    print("\n".join(report_lines))

Sample 1
Dataset: coqa
GPT predicts: The answer is: Yes—coal and cotton.
standard answer: Yes.
--------------------------------------------------------------------------------
Sample 2
Dataset: coqa
GPT predicts: the rest of the day
standard answer: the rest of the day
--------------------------------------------------------------------------------
Sample 3
Dataset: coqa
GPT predicts: by catching her in her arms and covering her face with kisses
standard answer: considerable earnestness.
--------------------------------------------------------------------------------
Sample 4
Dataset: coqa
GPT predicts: Tom Rover
standard answer: Tom
--------------------------------------------------------------------------------
Sample 5
Dataset: coqa
GPT predicts: No.
standard answer: no
--------------------------------------------------------------------------------
Sample 6
Dataset: coqa
GPT predicts: The Meadows
standard answer: "The Meadows"
-------------------------------------------------------

In [12]:
# Show the reference answers of the rows currently held in `inputs`.
print(inputs.loc[:, "output"])

6165                         Yes.
5564          the rest of the day
5158    considerable earnestness.
5320                          Tom
371                            no
7837                "The Meadows"
1260        Physical and climatic
4433      his ally, Sen. Mike Lee
7548                    His uncle
7328                           No
Name: output, dtype: object


In [1]:
import json

output_path = "baseline_output_coqa.json"
# Serialise before opening the file: opening in "w" mode truncates it at once,
# so if `outputs` is missing or not serialisable the previous results file
# would otherwise be wiped (or an empty one created) before the error surfaces.
serialized = json.dumps(outputs, ensure_ascii=False, indent=4)
with open(output_path, "w", encoding="utf-8") as f:
    f.write(serialized)

NameError: name 'outputs' is not defined