In [1]:
import datasets
from datasets import load_dataset, load_from_disk
from in_context_ssl.reasoning.template import *
import os
import openai
from openai import OpenAI
from tqdm import tqdm
import numpy as np
import json
from in_context_ssl.reasoning.utils import *
from in_context_ssl.reasoning.dataset import *
import re
import pandas as pd
from in_context_ssl.grading.grader import grade_answer
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


## Preprocessing

In [2]:
def process_doc(doc: dict):
    out_doc = {
        "question": doc["turns"][0],
        "answer": doc["ground_truth"],
        "group": doc["task"]
    }
    return out_doc

In [3]:
def preprocess_data():
    ds = load_dataset("livebench/math")["test"]

    train_datasets = []
    test_datasets = []

    subtasks = ["math_comp", "AMPS_Hard", "olympiad"]
    remove_columns = [k for k in ds.features.keys() if k not in ["turns", "ground_truth"]]
    for subtask in subtasks:
        print(len(ds))
        ds_curr = ds.filter(lambda x: x["task"] == subtask)
        cutoff = int(len(ds_curr) * 0.75)
        print(cutoff)
        ds_curr = ds_curr.shuffle()
        ds_train = ds_curr.select(range(cutoff)).map(process_doc, remove_columns=remove_columns)
        ds_test = ds_curr.select(range(cutoff, len(ds_curr))).map(process_doc, remove_columns=remove_columns)
        train_datasets.append(ds_train)
        test_datasets.append(ds_test)

    ds_train = datasets.concatenate_datasets(train_datasets)
    ds_test = datasets.concatenate_datasets(test_datasets)

    ds_train.save_to_disk("in_context_ssl/reasoning/data/livebench_math_train.hf")
    ds_test.save_to_disk("in_context_ssl/reasoning/data/livebench_math_test.hf")


In [None]:
def add_embedding(doc: dict):
    out_doc = {
        "embedding": client.embeddings.create(
            input = [doc["question"]], model="text-embedding-3-large"
        ).data[0].embedding
    }
    return out_doc
ds = load_from_disk("in_context_ssl/reasoning/data/livebench_math_test.hf")
ds = ds.map(add_embedding)
ds.save_to_disk("in_context_ssl/reasoning/data/livebench_math_test_new.hf")

Map: 100%|██████████| 93/93 [00:35<00:00,  2.64 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 93/93 [00:00<00:00, 19148.31 examples/s]


In [None]:
#add API key
api_key = ""
os.environ["OPENAI_API_KEY"] = api_key
client = OpenAI()

## Inference

In [None]:
preds = []
gold = []
rationales = []
messages = []

ds = LiveBenchMathDataset()
print(ds.get_demonstrations(
    "in_context_ssl/reasoning/data/livebench_math_psl_sc.hf",
    k=3, k_gt=0, style="psl", answer=True, rationale=True, 
    quantile=0.9, seed=42
))

for inst in tqdm(ds):
    choice = query_openai(client, inst["query"], model="gpt-4o-mini", structured_output=False, confidence=False)[0]
    
    o = parse_output_livebench_math(choice.message.content)

    preds.append(o["answer"])
    gold.append(inst["answer"])
    rationales.append(o["rationale"])
    messages.append(choice.message.content)

correct = np.array([grade_answer(p, g) for p, g in zip(preds, gold)]).astype(float)
correct.mean()

Question: Positive real numbers $x$ and $y$ satisfy $y^3 = x^2$ and $(y-x)^2 = 4y^2$. What is $x+y$? $\textbf{(A)}\ 42 \qquad \textbf{(B)}\ 12 \qquad \textbf{(C)}\ 36 \qquad \textbf{(D)}\ 24 \qquad \textbf{(E)}\ 18$ If you cannot determine the correct multiple-choice answer, take your best guess. Once you have your answer, please duplicate that letter five times in a single string. For example, if the answer is F, then write FFFFF.
Answer: C

___
Question: Compute the geometric mean of ${1, 9}$. Please give an exact answer, and put your final answer in latex in a $\boxed{}$ (for example, $\boxed{5 \sqrt[6]{-3} \sqrt[3]{7} 5^{2/5}}$).
Answer: 3

___
Question: Factor the following quadratic: $-5 x^2-100 x$. Please put your final answer in a $\\boxed{}$.
Answer: -5x(x + 20)

___
Question: Positive real numbers $x$ and $y$ satisfy $y^3 = x^2$ and $(y-x)^2 = 4y^2$. What is $x+y$? $\textbf{(A)}\ 42 \qquad \textbf{(B)}\ 12 \qquad \textbf{(C)}\ 36 \qquad \textbf{(D)}\ 24 \qquad \textbf{(E)}\ 1

100%|██████████| 93/93 [21:41<00:00, 14.00s/it]


np.float64(0.3548387096774194)

## Naive-SemiICL

In [None]:
ds = LiveBenchMathDataset()

preds = []
gold = []
rationales = []
messages = []

for inst in tqdm(ds.dynamic_data_selection_iter(
    "in_context_ssl/reasoning/data/livebench_math_psl_sc_4o.hf",
    k=3, answer=True, rationale=False,
    quantile=None, seed=42
)):
    choice = query_openai(client, inst["query"], model="gpt-4o", structured_output=False, confidence=False, logprobs=False)[0]

    o = parse_output_livebench_math(choice.message.content)

    preds.append(o["answer"])
    gold.append(inst["answer"])
    rationales.append(o["rationale"])
    messages.append(choice.message.content)

In [4]:
ds = LiveBenchMathDataset()

preds = []
gold = []
rationales = []
messages = []

for inst in tqdm(ds.mot_iter(
    "in_context_ssl/reasoning/data/livebench_math_psl_verbalized.hf",
    k=4, answer=True, rationale=True,
    threshold=0.9, seed=42
)):
    choice = query_openai(client, inst["query"], model="gpt-4o-mini", structured_output=False, confidence=False, logprobs=False)[0]

    o = parse_output_livebench_math(choice.message.content)

    preds.append(o["answer"])
    gold.append(inst["answer"])
    rationales.append(o["rationale"])
    messages.append(choice.message.content)

0it [00:00, ?it/s]

[(30, 3072), (60, 3072), (46, 3072), (32, 3072)]
[30, 60, 46, 32]


3it [01:56, 42.24s/it]

parse error!


19it [14:46, 49.29s/it]

parse error!


29it [21:39, 47.41s/it]

parse error!


31it [23:01, 43.15s/it]

parse error!
parse error!


34it [24:49, 38.93s/it]

parse error!


35it [25:17, 35.61s/it]

parse error!


37it [26:55, 41.60s/it]

parse error!


40it [28:34, 36.65s/it]

parse error!


41it [29:36, 44.39s/it]

parse error!


42it [30:14, 42.21s/it]

parse error!


45it [32:21, 43.21s/it]

parse error!


49it [34:38, 35.98s/it]

parse error!


50it [35:06, 33.51s/it]

parse error!


52it [36:16, 34.61s/it]

parse error!
parse error!


54it [37:33, 36.75s/it]

parse error!


60it [40:49, 33.53s/it]

parse error!


63it [42:10, 28.81s/it]

parse error!
parse error!


68it [45:07, 33.32s/it]

parse error!


76it [49:10, 33.07s/it]

parse error!


81it [52:08, 35.26s/it]

parse error!


84it [53:56, 35.19s/it]

parse error!


91it [57:56, 34.29s/it]

parse error!
parse error!


93it [59:07, 38.15s/it]


In [7]:
preds = np.array(preds)
gold = np.array(gold)
(preds == gold).mean()

np.float64(0.25806451612903225)

In [3]:
ds = LiveBenchMathDataset()
preds = []
gold = []
confidences = []
rationales = []
messages = []

new_ds_verbalized = []
new_ds_entropy = []

for inst in tqdm(ds.train_iter(
    "in_context_ssl/reasoning/data/livebench_math_train.hf", 
    k=0, answer=True, rationale=False, seed=42
)):
    choices = query_openai(client, inst["query"], n=1, model="gpt-4o-mini", structured_output=False, confidence=False, logprobs=True)
    """
    o = parse_output_livebench_math(choices[0].message.content)
    preds.append(o["answer"])
    confidences.append(o["confidence"])
    """
    o_verbalized = aggregate(choices, parser=parse_output_livebench_math, rationale=True, confidence="verbalized")
    o_entropy = aggregate(choices, parser=parse_output_livebench_math, rationale=True, confidence="entropy")

    d_verbalized = {
        "question": inst["question"],
        "answer": o_verbalized["answer"],
        "group": inst["group"],
        "rationale": o_verbalized["rationale"],
        "confidence": o_verbalized["confidence"],
    }
    d_entropy = dict(d_verbalized)
    d_entropy["confidence"] = o_entropy["confidence"]
    new_ds_verbalized.append(d_verbalized)
    new_ds_entropy.append(d_entropy)

275it [57:34, 12.56s/it]


In [4]:
datasets.Dataset.from_pandas(pd.DataFrame(new_ds_verbalized)).save_to_disk("in_context_ssl/reasoning/data/livebench_math_psl_k=0_verbalized_4o.hf")
datasets.Dataset.from_pandas(pd.DataFrame(new_ds_entropy)).save_to_disk("in_context_ssl/reasoning/data/livebench_math_psl_k=0_entropy_4o.hf")

Saving the dataset (1/1 shards): 100%|██████████| 275/275 [00:00<00:00, 46872.30 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 275/275 [00:00<00:00, 65291.16 examples/s]
