In [1]:
from tqdm.notebook import tqdm
import pandas as pd
import openai
import json
import os

# The API key is read from the environment; a missing variable raises KeyError here.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Input tables. Both are read with their first CSV column as the row index.
train_csv_path = "/home/clay/research/kaggle/kaggle_llm/data/kaggle-llm-science-exam/train.csv"
wiki_csv_path = "/home/clay/research/kaggle/kaggle_llm/data/physics_pages_list/physics_pages_formatted.csv"
train_df = pd.read_csv(train_csv_path, index_col=0)
wiki_df = pd.read_csv(wiki_csv_path, index_col=0)

In [2]:
# Preview the first five section rows of the wiki table.
wiki_df.head(n=5)

Unnamed: 0,page,section_title,text,word_count
0,Electric field,Description,The electric field is defined at each point in...,441
1,Electric field,Mathematical formulation,Electric fields are caused by electric charges...,71
2,Electric field,Electrostatic fields,Electrostatic fields are electric fields that ...,37
3,Electric field,Electrodynamic fields,Electrodynamic fields are electric fields whic...,166
4,Electric field,Energy in the electric field,The total energy per unit volume stored by the...,119


In [3]:
# Sum the section word counts of every page and list the longest pages first.
# token_count is word_count / 0.75 truncated to an integer
# (presumably the usual ~0.75 words-per-token rule of thumb -- confirm).
wc_per_page = (
    wiki_df[["page", "word_count"]]
    .groupby("page")
    .sum()
    .sort_values(by="word_count", ascending=False)
    .assign(token_count=lambda totals: (totals["word_count"] / 0.75).astype(int))
)
wc_per_page.head()

Unnamed: 0_level_0,word_count,token_count
page,Unnamed: 1_level_1,Unnamed: 2_level_1
Galactocentric distance,16307,21742
Hohenberg-Kohn theorems,6063,8084
Star,5346,7128
Total internal reflection,5289,7052
Vasiliev equations,5268,7024


In [4]:
# A page is excluded when its estimated token count is above 10000 or when
# its title contains "list of equations" (compared case-insensitively).
over_token_limit = wc_per_page["token_count"] > 10000
is_equation_list = pd.Series(
    ["list of equations" in title.lower() for title in wc_per_page.index],
    index=wc_per_page.index,
    dtype=bool,
)
black_list = list(wc_per_page.loc[over_token_limit | is_equation_list].index)
print(json.dumps(black_list, indent=4))

# Independent copy of the remaining pages, still ordered by word count.
keep_mask = ~wc_per_page.index.isin(black_list)
filtered_wc_per_page = wc_per_page[keep_mask].copy()
filtered_wc_per_page.head()

[
    "Galactocentric distance",
    "List of equations in classical mechanics",
    "List of equations in quantum mechanics",
    "List of equations in wave theory",
    "List of equations in fluid mechanics",
    "List of equations in nuclear and particle physics",
    "List of equations in gravitation"
]


Unnamed: 0_level_0,word_count,token_count
page,Unnamed: 1_level_1,Unnamed: 2_level_1
Hohenberg-Kohn theorems,6063,8084
Star,5346,7128
Total internal reflection,5289,7052
Vasiliev equations,5268,7024
List of unsolved problems in physics,4582,6109


In [5]:
# List the models returned by the API for the configured key
# (presumably a sanity check of the credentials before the generation loop).
openai.Model.list()

<OpenAIObject list at 0x7fe77c466160> JSON: {
  "object": "list",
  "data": [
    {
      "id": "babbage",
      "object": "model",
      "created": 1649358449,
      "owned_by": "openai",
      "permission": [
        {
          "id": "modelperm-49FUp5v084tBB49tC4z8LPH5",
          "object": "model_permission",
          "created": 1669085501,
          "allow_create_engine": false,
          "allow_sampling": true,
          "allow_logprobs": true,
          "allow_search_indices": false,
          "allow_view": true,
          "allow_fine_tuning": false,
          "organization": "*",
          "group": null,
          "is_blocking": false
        }
      ],
      "root": "babbage",
      "parent": null
    },
    {
      "id": "davinci",
      "object": "model",
      "created": 1649359874,
      "owned_by": "openai",
      "permission": [
        {
          "id": "modelperm-U6ZwlyAd0LyMk4rcMdz33Yc3",
          "object": "model_permission",
          "created": 1669066355,
      

In [6]:
import pydantic


# Show the installed pydantic version; the schema cell below calls
# `model_json_schema()` on the models defined there.
pydantic.__version__

'2.1.1'

In [7]:
from typing import *
from pydantic import BaseModel
import json


# One generated exam question: the question text, five answer options and the
# answer field, which is restricted to one of the five option letters.
# NOTE(review): documented with `#` comments rather than docstrings on purpose;
# pydantic is understood to copy class docstrings into the generated JSON schema
# as `description`, which would change the schema sent to the model -- confirm.
class MultipleChoiceQuestion(BaseModel):
    question: str
    # The five answer options, keyed by their letter.
    A: str
    B: str
    C: str
    D: str
    E: str
    # Presumably the letter of the correct option; only "A".."E" validate.
    answer: Literal["A", "B", "C", "D", "E"]

        
# Wrapper model holding a list of questions under the "questions" key.
class MultipleChoiceQuestionList(BaseModel):
    questions: List[MultipleChoiceQuestion]

        
# JSON schema of the wrapper model; the generation loop passes it as the
# `parameters` of the function definition given to the chat completion call.
schema = MultipleChoiceQuestionList.model_json_schema()
print(json.dumps(schema, indent=4))

{
    "$defs": {
        "MultipleChoiceQuestion": {
            "properties": {
                "question": {
                    "title": "Question",
                    "type": "string"
                },
                "A": {
                    "title": "A",
                    "type": "string"
                },
                "B": {
                    "title": "B",
                    "type": "string"
                },
                "C": {
                    "title": "C",
                    "type": "string"
                },
                "D": {
                    "title": "D",
                    "type": "string"
                },
                "E": {
                    "title": "E",
                    "type": "string"
                },
                "answer": {
                    "enum": [
                        "A",
                        "B",
                        "C",
                        "D",
                        "E"
                    ],
  

In [23]:
from pathlib import Path


# Name of this generation run. It selects the dump directory below and is also
# embedded in the name of the final CSV. Alternative value: "raw_questions".
out_file_name = "raw_questions_2"

# Directory that receives one raw model response file per page; created
# together with any missing parent folders, and reused if it already exists.
out_dir = Path(f"/home/clay/research/kaggle/kaggle_llm/data/data_dumps/{out_file_name}")
out_dir.mkdir(parents=True, exist_ok=True)

In [24]:
# Dry run: print the dump file each page would be written to.
# The file stem is the lower-cased page title with every space, slash,
# apostrophe, parenthesis and colon turned into an underscore.
stem_table = str.maketrans({ch: "_" for ch in " /'():"})

for page in tqdm(filtered_wc_per_page.index, total=len(filtered_wc_per_page)):
    stem = page.lower().translate(stem_table)
    out_path = out_dir / f"{stem}.txt"
    print(out_path)

  0%|          | 0/889 [00:00<?, ?it/s]

/home/clay/research/kaggle/kaggle_llm/data/data_dumps/raw_questions_2/hohenberg-kohn_theorems.txt
/home/clay/research/kaggle/kaggle_llm/data/data_dumps/raw_questions_2/star.txt
/home/clay/research/kaggle/kaggle_llm/data/data_dumps/raw_questions_2/total_internal_reflection.txt
/home/clay/research/kaggle/kaggle_llm/data/data_dumps/raw_questions_2/vasiliev_equations.txt
/home/clay/research/kaggle/kaggle_llm/data/data_dumps/raw_questions_2/list_of_unsolved_problems_in_physics.txt
/home/clay/research/kaggle/kaggle_llm/data/data_dumps/raw_questions_2/zero-point_energy.txt
/home/clay/research/kaggle/kaggle_llm/data/data_dumps/raw_questions_2/wikipedia_talk_wikiproject_physics_archive_december_2020.txt
/home/clay/research/kaggle/kaggle_llm/data/data_dumps/raw_questions_2/planet.txt
/home/clay/research/kaggle/kaggle_llm/data/data_dumps/raw_questions_2/main_sequence.txt
/home/clay/research/kaggle/kaggle_llm/data/data_dumps/raw_questions_2/turbulence.txt
/home/clay/research/kaggle/kaggle_llm/data

In [25]:
# Generate multiple choice questions for every remaining page.
#
# For each page the section texts are sent to the chat model together with a
# function definition built from `schema`; the function-call arguments returned
# by the model are written to `out_dir/<stem>.txt`. Pages that already have a
# file are skipped, so the cell can be re-run to resume an interrupted run.

# Number of questions requested from the model for every page.
num_questions_per_round = 20

# Loop-invariant request settings, built once instead of once per page.
model_name = "gpt-3.5-turbo-16k-0613"
function_name = "create_multiple_choice_question"
functions = [
    {
        "name": function_name,
        "description": "create a multiple choice question consisting of a question, and 5 choices: A, B, C, D, and E",
        "parameters": schema,
    }
]
# File stem = lower-cased title with space / ' ( ) : replaced by "_".
stem_table = str.maketrans({ch: "_" for ch in " /'():"})


for page, row in tqdm(filtered_wc_per_page.iterrows(), total=len(filtered_wc_per_page)):
    stem = page.lower().translate(stem_table)
    out_path = out_dir / (stem + ".txt")
    if out_path.is_file():
        print(f"{page} already done")
        continue

    # ==== prompt building ====
    expected_token_count = row["token_count"]
    info_df = wiki_df[wiki_df["page"] == page]
    print(page, expected_token_count, model_name)
    messages = [
        {
            "role": "system",
            "content": (
                "You are a physics professor and "
                "you are about to design a multiple choice exam questions with 5 choices each. "
                f"Your exam questions will be derived from a wikipedia page: \"{page}\". "
                "in the following system prompts, you'll receive a pair of section_title and text "
                "extracted from the wikipedia page"
            ),
        }
    ]
    # One system message per section of the page.
    messages.extend(
        {
            "role": "system",
            "content": f"section_title: {info_row['section_title']}. text: {info_row['text']}",
        }
        for _, info_row in info_df.iterrows()
    )
    messages.append(
        {
            "role": "user",
            "content": f"generate {num_questions_per_round} multiple choice questions, each with 5 possible answers on the given topic.",
        }
    )

    try:
        # ==== running model ====
        response = openai.ChatCompletion.create(
            model=model_name,
            messages=messages,
            functions=functions,
            function_call={"name": function_name},
        )

        # ==== parsing answers ====
        assistant_msg = response["choices"][0]["message"]
        response_options = assistant_msg.to_dict()["function_call"]["arguments"]
        # Validate before caching: once the file exists the page is treated as
        # done, so a truncated or malformed reply must not be written to disk.
        json.loads(response_options)
    except openai.error.OpenAIError as e:
        # Base class of every openai.error exception, so errors missing from an
        # explicit list (e.g. ServiceUnavailableError) no longer abort the run.
        print(f"can't do {page}: {repr(e)}, skipping for now")
        continue
    except (KeyError, TypeError, json.JSONDecodeError) as e:
        # No function call in the reply, or its arguments are not valid JSON.
        print(f"can't do {page}: unusable response ({repr(e)}), skipping for now")
        continue

    out_path.write_text(response_options)

  0%|          | 0/889 [00:00<?, ?it/s]

Hohenberg-Kohn theorems already done
Star already done
Total internal reflection already done
Vasiliev equations already done
List of unsolved problems in physics already done
Zero-point energy already done
Wikipedia talk:WikiProject Physics/Archive December 2020 already done
Planet already done
Main sequence already done
Turbulence already done
Modified Newtonian dynamics already done
Newton's theorem of revolving orbits already done
First law of thermodynamics already done
Two-body Dirac equations already done
Point groups in three dimensions already done
Second law of thermodynamics already done
Gravitational wave already done
Renormalization already done
History of geology already done
Solubility already done
Navier–Stokes equations already done
Orbit already done
Theorem already done
Electric charge already done
Magnetic monopole already done
Dwarf planet already done
Vacuum already done
Casimir effect already done
Quasar already done
Negative resistance already done
Gravitational

In [27]:
# Load every dumped response, tag each of its questions with the file stem as
# "topic", and collect the parsed payloads. Files that are not valid JSON are
# reported and left out.
answer_files = sorted(out_dir.glob("*.txt"))
formatted_answers = []

for answer_path in tqdm(answer_files):
    try:
        parsed_file = json.loads(answer_path.read_text())
    except json.JSONDecodeError as err:
        print(f"cant decode: {answer_path}, {err}")
        continue

    for question in parsed_file["questions"]:
        question["topic"] = answer_path.stem
    formatted_answers.append(parsed_file)

print(f"parsed {len(formatted_answers)} / {len(answer_files)} files")

  0%|          | 0/888 [00:00<?, ?it/s]

parsed 888 / 888 files


In [28]:
# Concatenate the "questions" lists of all parsed files into one flat list,
# keeping file order and question order.
flattened_questions = []
for page_payload in formatted_answers:
    flattened_questions.extend(page_payload["questions"])

In [29]:
# One row per question; the "question" column is renamed to "prompt" and the
# row index is labelled "id".
data_df = (
    pd.DataFrame.from_records(flattened_questions)
    .rename(columns={"question": "prompt"})
    .rename_axis("id")
)
print(data_df.shape)
data_df.head()

(18140, 8)


Unnamed: 0_level_0,prompt,A,B,C,D,E,answer,topic
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Who proposed 3D mirror symmetry?,Edward Witten,Kenneth Intriligator,Amihay Hanany,Nathan Seiberg,None of the above,D,3d_mirror_symmetry
1,What is 3D mirror symmetry?,A type of symmetry in 3-dimensional gauge theo...,A 3-dimensional version of mirror symmetry in ...,A relation between pairs of 3-dimensional gaug...,A consequence of S-duality in type IIB string ...,All of the above,E,3d_mirror_symmetry
2,What is the Coulomb branch in 3D mirror symmetry?,The regime where massless vortices condense,The regime where the scalar in the vector mult...,The branch of the moduli space of one theory,The branch of the moduli space of the dual theory,None of the above,B,3d_mirror_symmetry
3,What is the Higgs branch in 3D mirror symmetry?,The regime where massless vortices condense,The regime where the scalar in the vector mult...,The branch of the moduli space of one theory,The branch of the moduli space of the dual theory,None of the above,A,3d_mirror_symmetry
4,What do BPS vortices preserve in 3D mirror sym...,Mass,Supersymmetry,Momentum,Electric charge,None of the above,B,3d_mirror_symmetry


In [30]:
# Show the rows whose answer is missing (an empty frame means there are none).
data_df[data_df["answer"].isna()]

Unnamed: 0_level_0,prompt,A,B,C,D,E,answer,topic
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


In [31]:
# Validate the generated questions and repair answers where possible.
#
# A row is "bad" when any of its text cells is empty, or when its answer is
# neither an option letter nor (case-insensitively) the text of one of its own
# options. Answers that match an option text are rewritten in place in
# `data_df` to that option's letter. `filtered_df` is `data_df` without the
# bad rows.
choices = ["A", "B", "C", "D", "E"]
text_columns = ["prompt", "answer"] + choices

# No cell may be missing, otherwise the string checks below would fail.
for column in ["prompt"] + choices + ["answer"]:
    assert not data_df[column].isna().any()

bad_idxs = []
for i, row in data_df.iterrows():
    # Rule 1: every text cell must be non-empty.
    if any(len(row[column]) == 0 for column in text_columns):
        bad_idxs.append(i)
        continue

    answer = row["answer"]
    if answer in choices:
        continue

    # Rule 2: the model sometimes returns the option text instead of its
    # letter; map the text back to the letter when it matches an option.
    letter_by_text = {row[letter].lower(): letter for letter in choices}
    repaired = letter_by_text.get(answer.lower())
    if repaired is None:
        bad_idxs.append(i)
    else:
        data_df.loc[i, "answer"] = repaired

# Evaluated but not displayed: only the last expression of a cell is rendered.
data_df.loc[bad_idxs, :]
filtered_df = data_df[~data_df.index.isin(bad_idxs)]
print(len(bad_idxs), "bad rows", len(filtered_df), "/", len(data_df))


83 bad rows 18057 / 18140


In [37]:
# import seaborn as sns
# import matplotlib.pyplot as plt


# def count_words(text):
#     return sum([1 for i in text.split() if len(i) > 0])


# plt.figure(figsize=(10, 200))
# wc = filtered_df.copy()
# wc["wc"] = wc["prompt"].apply(count_words)
# sns.boxplot(
#     data=wc,
#     x="wc",
#     y="topic"
# )

In [38]:
# Write the cleaned questions to a CSV named after the current run.
out_path = f"/home/clay/research/kaggle/kaggle_llm/data/data_dumps/more_questions/more_questions_{out_file_name}.csv"
# to_csv does not create missing folders, so make sure the target folder
# exists first; without this the export fails on a fresh data directory.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
filtered_df.to_csv(out_path)

print(out_path)

/home/clay/research/kaggle/kaggle_llm/data/data_dumps/more_questions/more_questions_raw_questions_2.csv
