In [None]:
from tqdm.notebook import tqdm
import pandas as pd
import openai
import json
import os

# The API key is read from the environment so that no credential is stored in the
# notebook; a missing OPENAI_API_KEY variable fails immediately with a KeyError.
# NOTE(security): a literal secret key used to be committed on this line. Treat that
# key as compromised and revoke it.
openai.api_key = os.environ["OPENAI_API_KEY"]
# Both CSV files are loaded with their first column as the row index.
train_df = pd.read_csv("../data/kaggle-llm-science-exam/train.csv", index_col=0)
wiki_df = pd.read_csv("../data/physics_pages_list/physics_pages_formatted.csv", index_col=0)

In [None]:
# Peek at the first rows of wiki_df. Columns read later in this notebook:
# "page", "word_count", "section_title" and "text".
wiki_df.head()

In [None]:
# Sum the word counts of all rows belonging to the same page, largest page first.
page_word_totals = wiki_df.groupby("page")[["word_count"]].sum()
wc_per_page = page_word_totals.sort_values(by="word_count", ascending=False)
# Token estimate derived from the word count (word_count / 0.75, truncated to int).
estimated_tokens = wc_per_page["word_count"] / 0.75
wc_per_page["token_count"] = estimated_tokens.astype(int)
wc_per_page.head()

In [None]:
# Pages excluded from question generation: pages estimated above 10000 tokens, plus
# pages whose title contains "list of equations" (case-insensitive).
too_long = wc_per_page["token_count"] > 10000
is_equation_list = wc_per_page.index.map(lambda title: "list of equations" in title.lower())
black_list = list(wc_per_page.loc[too_long | is_equation_list].index)
print(json.dumps(black_list, indent=4))

# Everything that is not black-listed, as an independent copy.
keep_mask = ~wc_per_page.index.isin(black_list)
filtered_wc_per_page = wc_per_page[keep_mask].copy()
filtered_wc_per_page.head()

In [None]:
# Ad-hoc check: ask the OpenAI client for its model list and display whatever it returns.
openai.Model.list()

In [None]:
import pydantic


# Display the installed pydantic version.
# NOTE(review): the schema cell of this notebook calls `model_json_schema()`, which
# looks like the pydantic 2.x API — confirm that a 2.x version is reported here.
pydantic.__version__

In [None]:
from typing import List, Literal
from pydantic import BaseModel
import json


# NOTE: the models below are documented with comments instead of docstrings on purpose.
# pydantic copies a model's docstring into the generated JSON schema as "description",
# so adding a docstring would change the schema that is produced at the end of this cell.

# A single generated question: the question text, five answer options (A-E) and the
# letter of the option given as the answer.
class MultipleChoiceQuestion(BaseModel):
    question: str
    A: str
    B: str
    C: str
    D: str
    E: str
    answer: Literal["A", "B", "C", "D", "E"]


# Top-level container: the model is asked to return all questions of one round in a
# single object under the "questions" key.
class MultipleChoiceQuestionList(BaseModel):
    questions: List[MultipleChoiceQuestion]


# JSON schema of the container; it is passed as the function "parameters" when the
# chat completion is requested further down in the notebook.
schema = MultipleChoiceQuestionList.model_json_schema()
print(json.dumps(schema, indent=4))

In [None]:
from pathlib import Path


# Name of the current generation run; it selects the dump folder used below.
# out_file_name = "raw_questions"
out_file_name = "raw_questions_2"


# Folder that receives one raw response file per page; created on demand,
# including any missing parent folders.
out_dir = Path("../data/data_dumps") / out_file_name
out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Dry run: print the cache file path that every remaining page maps to.
for page, row in tqdm(filtered_wc_per_page.iterrows(), total=len(filtered_wc_per_page)):
    # File stem = lower-cased title with each of these characters replaced by "_".
    stem = page.lower()
    for special_char in (" ", "/", "'", "(", ")", ":"):
        stem = stem.replace(special_char, "_")
    out_path = out_dir / f"{stem}.txt"
    print(out_path)

In [None]:
# Number of questions requested from the model per page.
num_questions_per_round = 20
# Loop-invariant request setting, hoisted out of the per-page loop.
model_name = "gpt-3.5-turbo-16k-0613"


# For every remaining page: build a prompt from the page's sections, request
# `num_questions_per_round` questions through a forced function call, and cache the raw
# JSON arguments in one text file per page. Pages that already have a cache file are
# skipped, so the cell can be re-run to fill in pages that failed earlier.
for page, row in tqdm(filtered_wc_per_page.iterrows(), total=len(filtered_wc_per_page)):
    try:
        # File stem = lower-cased title with each of these characters replaced by "_".
        stem = page.lower()
        for special_char in (" ", "/", "'", "(", ")", ":"):
            stem = stem.replace(special_char, "_")
        out_path = out_dir / (stem + ".txt")
        if out_path.is_file():
            print(f"{page} already done")
            continue

        # ==== prompt building ====
        expected_token_count = row["token_count"]
        info_df = wiki_df[wiki_df["page"] == page]
        print(page, expected_token_count, model_name)
        messages = [
            {
                "role": "system",
                "content": (
                    f"You are a physics professor and "
                    + f"you are about to design a multiple choice exam questions with 5 choices each. "
                    + f"Your exam questions will be derived from a wikipedia page: \"{page}\". "
                    + f"in the following system prompts, you'll receive a pair of section_title and text "
                    + f"extracted from the wikipedia page"
                )
            }
        ]
        # One system message per section of the page.
        for _, info_row in info_df.iterrows():
            messages.append(
                {
                    "role": "system",
                    "content": (
                        f"section_title: {info_row['section_title']}. text: {info_row['text']}"
                    )
                }
            )
        messages.append(
            {
                "role": "user",
                "content": f"generate {num_questions_per_round} multiple choice questions, each with 5 possible answers on the given topic."
            }
        )

        # ==== running model ====
        # `function_call` forces the model to answer through the declared function, so
        # the reply carries its payload in message["function_call"]["arguments"].
        response = openai.ChatCompletion.create(
            model=model_name,
            messages=messages,
            functions=[
                {
                    "name": "create_multiple_choice_question",
                    "description": "create a multiple choice question consisting of a question, and 5 choices: A, B, C, D, and E",
                    "parameters": schema,
                }
            ],
            function_call={"name": "create_multiple_choice_question"},
        )

        # ==== parsing answers ====
        assistant_msg = response["choices"][0]["message"]
        response_options = assistant_msg.to_dict()["function_call"]["arguments"]
        # The arguments string is model-generated and is not guaranteed to be valid JSON
        # (for example when the output is cut off). Only cache it when it parses;
        # otherwise the page would count as "already done" on the next run and its
        # questions would be lost when the cached files are parsed.
        try:
            json.loads(response_options)
        except json.JSONDecodeError as e:
            print(f"can't do {page}: response is not valid JSON ({e}), skipping for now")
            continue
        out_path.write_text(response_options)
    except (
        openai.error.Timeout,
        openai.error.APIError,
        openai.error.APIConnectionError,
        openai.error.InvalidRequestError,
        openai.error.AuthenticationError,
        openai.error.PermissionError,
        openai.error.RateLimitError,
        # An overloaded server should skip the page rather than abort the whole run.
        openai.error.ServiceUnavailableError,
    ) as e:
        # No cache file is written for a failed page, so a later run retries it.
        print(f"can't do {page}: {repr(e)}, skipping for now")

In [None]:
# Load every cached response file; files that are not valid JSON are reported and skipped.
formatted_answers = []


answer_files = sorted(out_dir.glob("*.txt"))
for answer_path in answer_files:
    try:
        payload = json.loads(answer_path.read_text())
    except json.JSONDecodeError as e:
        print(f"cant decode: {answer_path}, {e}")
        continue
    # Tag each question with the file stem it came from.
    for question in payload["questions"]:
        question["topic"] = answer_path.stem
    formatted_answers.append(payload)

print(f"parsed {len(formatted_answers)} / {len(answer_files)} files")

In [None]:
# Merge the per-file question lists into one flat list, keeping file order.
flattened_questions = []
for answer_set in formatted_answers:
    flattened_questions.extend(answer_set["questions"])

In [None]:
# One row per question; the "question" column is renamed to "prompt" and the index is
# labelled "id".
questions_frame = pd.DataFrame.from_records(flattened_questions)
data_df = questions_frame.rename(columns={"question": "prompt"})
data_df.index.name = "id"
print(data_df.shape)
data_df.head()

In [None]:
# Show the rows that have no "answer" value.
data_df[data_df["answer"].isna()]

In [None]:
choices = ["A", "B", "C", "D", "E"]

# Index labels of rows that are dropped, and how many of them were dropped because of
# a backslash in the prompt or in a choice.
bad_idxs = []
n_math_idxs = 0

# Every question must provide all fields; stop here if any value is missing.
assert not data_df["prompt"].isna().any()
for c in choices:
    assert not data_df[c].isna().any()
assert not data_df["answer"].isna().any()


for i, row in data_df.iterrows():
    # Drop rows in which any field is an empty string.
    if any(len(row[c]) == 0 for c in ["prompt", "answer"] + choices):
        bad_idxs.append(i)
        continue
    # Drop rows whose prompt or choices contain a backslash.
    if any("\\" in row[c] for c in ["prompt"] + choices):
        bad_idxs.append(i)
        n_math_idxs += 1
        continue

    if row["answer"] not in choices:
        # fixable as chatgpt puts answer instead of letters: map the answer text back
        # to the letter of the choice with the same (case-insensitive) text.
        rv_idx = {row[choice].lower(): choice for choice in choices}
        if row["answer"].lower() in rv_idx:
            data_df.loc[i, "answer"] = rv_idx[row["answer"].lower()]
        else:
            bad_idxs.append(i)


filtered_df = data_df.loc[~data_df.index.isin(bad_idxs), :]
# The last figure is the number of rows that were kept, out of all rows.
print(f"{len(bad_idxs)=}, {n_math_idxs=}, kept rows {len(filtered_df)} / {len(data_df)}")


In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt


# def count_words(text):
#     return sum([1 for i in text.split() if len(i) > 0])


# plt.figure(figsize=(10, 200))
# wc = filtered_df.copy()
# wc["propmt_wc"] = wc["prompt"].apply(count_words)
# wc["choice_wc"] = (
#     wc["A"].apply(count_words)
#     + wc["B"].apply(count_words)
#     + wc["C"].apply(count_words)
#     + wc["D"].apply(count_words)
#     + wc["E"].apply(count_words)
# )
# wc.groupby("topic")["choice_wc"].max().sort_values(ascending=False)
# sns.boxplot(
#     data=wc,
#     x="choice_wc",
#     y="topic"
# )

In [None]:
# Write the cleaned questions to a CSV file named after the current run.
out_path = f"../data/data_dumps/more_questions/more_questions_{out_file_name}.csv"
# DataFrame.to_csv does not create missing parent directories, and nothing earlier in
# the notebook creates this folder, so make sure it exists before writing.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
filtered_df.to_csv(out_path)

print(out_path)