In [1]:
import json
import os

import numpy as np
import openai
import pandas as pd
from tqdm.notebook import tqdm

# Take the API key from the OPENAI_API_KEY environment variable so the secret is
# never stored in the notebook; a missing variable raises KeyError here.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
# Load the article table and renumber its rows from 0.
wiki_sci_df = pd.read_parquet("top_20k_wiki_sci_embd_clusters.parquet").reset_index(drop=True)

# Add the derived columns in one step: a word count obtained by splitting `text`
# on single spaces, plus `section_title` and `page`, which both mirror `title`.
wiki_sci_df = wiki_sci_df.assign(
    word_count=wiki_sci_df["text"].map(lambda raw: len(str(raw).split(" "))),
    section_title=wiki_sci_df["title"],
    page=wiki_sci_df["title"],
)

wiki_sci_df.head()

Unnamed: 0,text,url,title,embd_text,embd_title,text_length,cluster_text,cluster_title,similarity,word_count,section_title,page
0,The Ulakhan Fault is a left-lateral moving tra...,https://en.wikipedia.org/wiki/Ulakhan%20Fault,Ulakhan Fault,"[-0.010125404, 0.045413326, -0.0055074026, 0.0...","[-0.027042245, 0.02827163, -0.007891712, -0.01...",948,789,144,0.93396,137,Ulakhan Fault,Ulakhan Fault
1,Troparil (also known as (–)-2β-Carbomethoxy-3β...,https://en.wikipedia.org/wiki/Troparil,Troparil,"[-0.008297031, -0.0015585935, 0.026860343, 0.0...","[-0.012329322, 0.011085487, -0.0023844037, 0.0...",2931,821,276,0.93396,430,Troparil,Troparil
2,"2,alpha-DMT, or 2,α-dimethyltryptamine, is a t...",https://en.wikipedia.org/wiki/2%2Calpha-DMT,"2,alpha-DMT","[0.0023725505, 0.021396462, 0.012232099, 0.029...","[-0.0013916385, 0.023376053, 0.02326751, 0.031...",609,242,1,0.93396,87,"2,alpha-DMT","2,alpha-DMT"
3,"Trimethylolpropane phosphite, C2H5C(CH2O)3P, i...",https://en.wikipedia.org/wiki/Trimethylolpropa...,Trimethylolpropane phosphite,"[-0.0106637655, 0.0014461132, 0.017337032, 0.0...","[-0.008970182, 0.012140841, 0.031812854, 0.036...",1212,990,271,0.93396,153,Trimethylolpropane phosphite,Trimethylolpropane phosphite
4,HLA-DR14(DR14) is a HLA-DR serotype that recog...,https://en.wikipedia.org/wiki/HLA-DR14,HLA-DR14,"[0.010430677, 0.04290895, -0.0056105047, 0.010...","[-0.0075440854, 0.0041776244, -0.010809391, 0....",634,945,130,0.93396,76,HLA-DR14,HLA-DR14


In [3]:
# Words per page, largest pages first; tokens are estimated as words / 0.75.
wc_per_page = (
    wiki_sci_df
    .groupby("page")[["word_count"]]
    .sum()
    .sort_values("word_count", ascending=False)
    .assign(token_count=lambda totals: (totals["word_count"] / 0.75).astype(int))
)
wc_per_page.head(20)

Unnamed: 0_level_0,word_count,token_count
page,Unnamed: 1_level_1,Unnamed: 2_level_1
List of unnumbered minor planets: 2002 U–V,20381,27174
List of unnumbered minor planets: 2002 P,19323,25764
List of unnumbered minor planets: 2003 M–R,16760,22346
List of unnumbered minor planets: 2003 A–E,16687,22249
List of unnumbered minor planets: 2004 A–B,16673,22230
List of unnumbered minor planets: 2004 U–V,16617,22156
Face masks during the COVID-19 pandemic,16478,21970
List of unnumbered minor planets: 2004 D–E,16420,21893
List of unnumbered minor planets: 2004 P–Q,16323,21764
Timeline of the COVID-19 pandemic in April 2020,15741,20988


In [4]:
# Estimated token total across all pages, expressed in units of 1e8 tokens
wc_per_page["token_count"].sum() / 1e8

0.15080133

In [5]:
# Keep only the rows whose page title does not contain "List of"
wiki_sci_df = wiki_sci_df.loc[~wiki_sci_df["page"].str.contains("List of")]

In [6]:
# Recompute the per-page word and estimated token totals on the filtered frame.
wc_per_page = (
    wiki_sci_df
    .groupby("page")[["word_count"]]
    .sum()
    .sort_values("word_count", ascending=False)
    .assign(token_count=lambda totals: (totals["word_count"] / 0.75).astype(int))
)
wc_per_page.head(20)

Unnamed: 0_level_0,word_count,token_count
page,Unnamed: 1_level_1,Unnamed: 2_level_1
Face masks during the COVID-19 pandemic,16478,21970
Timeline of the COVID-19 pandemic in April 2020,15741,20988
Horse colic,13226,17634
Tuberculosis management,13098,17464
2018 in paleontology,12670,16893
Vacuum tube,12510,16680
Ubuntu version history,12309,16412
Field electron emission,12187,16249
Film capacitor,11377,15169
Ising model,11333,15110


In [7]:
# Estimated token total after filtering, expressed in units of 1e8 tokens
wc_per_page["token_count"].sum() / 1e8

0.1411549

# Expected cost: 	


GPT3.5 cost for 16k context is $0.004 / 1K tokens.

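The cell above reports about 1.4e7 tokens in the dataset (0.14 × 1e8), which would cost about 1.4e7 * 0.004 / 1e3 ≈ $56. Rounding up to 1e8 tokens gives an upper bound of 1e8 * 0.004 / 1e3 = $400 for the whole wiki sci dataset.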

There are about 130 text examples per cluster, so the cost per cluster is $400 / 130 = $3.08

In [8]:
# Fraction of pages whose estimated token count exceeds 16e3
wc_per_page["token_count"].gt(16e3).sum() / len(wc_per_page)

0.0004105933073290905

In [9]:
# openai.Model.list()

In [10]:
import pydantic


# Display the installed pydantic version. The schema cell further down calls
# `model_json_schema()`, which is the pydantic v2 spelling.
pydantic.__version__

'2.1.1'

In [11]:
from typing import *
from pydantic import BaseModel
import json


# One generated exam item: the question text, five answer options keyed A-E,
# and the letter of the correct option.
# NOTE: documented with comments rather than a docstring on purpose. pydantic
# copies a model's docstring into the generated JSON schema as "description",
# which would change the schema handed to the API.
class MultipleChoiceQuestion(BaseModel):
    question: str  # the question text
    A: str  # answer option A
    B: str  # answer option B
    C: str  # answer option C
    D: str  # answer option D
    E: str  # answer option E
    answer: Literal["A", "B", "C", "D", "E"]  # letter of the correct option

        
# Wrapper holding a list of questions; this is the model whose JSON schema is
# generated in this cell. Documented with comments only, because a docstring
# would appear in that schema as "description".
class MultipleChoiceQuestionList(BaseModel):
    questions: List[MultipleChoiceQuestion]  # the generated questions

        
# JSON schema of the question-list model; the generation function passes it as
# the `parameters` of the function definition in the chat completion request.
schema = MultipleChoiceQuestionList.model_json_schema()
# print(json.dumps(schema, indent=4))

In [12]:
from pathlib import Path


# Folder name for this generation run (the commented-out names are presumably
# from earlier runs).
# out_file_name = "raw_questions"
# out_file_name = "raw_questions_2"
out_file_name = "raw_questions_wiki_sci_4"


# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
out_dir = Path(f"/home/viktor/Documents/kaggle/kaggle_llm/data/data_dumps/{out_file_name}")

# Create the directory together with any missing parents. Because of
# exist_ok=True an already existing directory is reused and no error is raised.
out_dir.mkdir(exist_ok=True, parents=True)

In [13]:
# Inspect the frame after the "List of" pages were dropped (the display is truncated).
wiki_sci_df

Unnamed: 0,text,url,title,embd_text,embd_title,text_length,cluster_text,cluster_title,similarity,word_count,section_title,page
0,The Ulakhan Fault is a left-lateral moving tra...,https://en.wikipedia.org/wiki/Ulakhan%20Fault,Ulakhan Fault,"[-0.010125404, 0.045413326, -0.0055074026, 0.0...","[-0.027042245, 0.02827163, -0.007891712, -0.01...",948,789,144,0.93396,137,Ulakhan Fault,Ulakhan Fault
1,Troparil (also known as (–)-2β-Carbomethoxy-3β...,https://en.wikipedia.org/wiki/Troparil,Troparil,"[-0.008297031, -0.0015585935, 0.026860343, 0.0...","[-0.012329322, 0.011085487, -0.0023844037, 0.0...",2931,821,276,0.93396,430,Troparil,Troparil
2,"2,alpha-DMT, or 2,α-dimethyltryptamine, is a t...",https://en.wikipedia.org/wiki/2%2Calpha-DMT,"2,alpha-DMT","[0.0023725505, 0.021396462, 0.012232099, 0.029...","[-0.0013916385, 0.023376053, 0.02326751, 0.031...",609,242,1,0.93396,87,"2,alpha-DMT","2,alpha-DMT"
3,"Trimethylolpropane phosphite, C2H5C(CH2O)3P, i...",https://en.wikipedia.org/wiki/Trimethylolpropa...,Trimethylolpropane phosphite,"[-0.0106637655, 0.0014461132, 0.017337032, 0.0...","[-0.008970182, 0.012140841, 0.031812854, 0.036...",1212,990,271,0.93396,153,Trimethylolpropane phosphite,Trimethylolpropane phosphite
4,HLA-DR14(DR14) is a HLA-DR serotype that recog...,https://en.wikipedia.org/wiki/HLA-DR14,HLA-DR14,"[0.010430677, 0.04290895, -0.0056105047, 0.010...","[-0.0075440854, 0.0041776244, -0.010809391, 0....",634,945,130,0.93396,76,HLA-DR14,HLA-DR14
...,...,...,...,...,...,...,...,...,...,...,...,...
19995,The trigone (a.k.a. vesical trigone) is a smo...,https://en.wikipedia.org/wiki/Trigone%20of%20u...,Trigone of urinary bladder,"[-0.023009904, 0.012714574, 0.02325639, 0.0068...","[-0.0046343203, 0.025658295, 0.013630594, -0.0...",852,772,42,0.93396,125,Trigone of urinary bladder,Trigone of urinary bladder
19996,"Vibrio parahaemolyticus is a curved, rod-shape...",https://en.wikipedia.org/wiki/Vibrio%20parahae...,Vibrio parahaemolyticus,"[-0.018457264, 0.022856213, -0.00020449034, 0....","[-0.0073644533, 0.017991826, -0.010438995, -0....",3548,90,213,0.93396,507,Vibrio parahaemolyticus,Vibrio parahaemolyticus
19997,"Mercury sulfide, or mercury(II) sulfide is a ...",https://en.wikipedia.org/wiki/Mercury%20sulfide,Mercury sulfide,"[-0.019561192, 0.014413084, 0.02396997, 0.0337...","[-0.013139198, 0.021910762, -0.0022164315, 0.0...",1822,685,307,0.93396,275,Mercury sulfide,Mercury sulfide
19998,Estrogen receptors (ERs) are a group of protei...,https://en.wikipedia.org/wiki/Estrogen%20receptor,Estrogen receptor,"[-0.03545495, -0.008087604, 0.0023061149, -0.0...","[0.0068337345, 0.0011378104, -0.0064769355, 0....",12014,199,56,0.93396,1733,Estrogen receptor,Estrogen receptor


In [26]:



    
def _build_question_messages(page, section_title, text):
    """Build the system and user chat messages asking for questions about one text."""
    system_prompt = (
        f"You are a GOD-like AGI that is a top level expert in all of the fields in the world. "
        + f"Your task is to design a multiple choice exam questions with 5 choices each for human experts in the field. "
        + f"Of these 5 possible answers, one is correct and the other 4 are wrong. "
        + f"Your exam questions will be derived from a wikipedia page: \"{page}\". "
        + f"You'll receive a pair of section_title and text extracted from the wikipedia page. "
        + f"Even though the answers are generated from the wikipedia page, your students are super smart, so don't go easy on them. "
        + f"It is OK if the questions relates some concept from the given wikipedia page to another concept from another wikipedia page if you want to ask a question that is more difficult and that questions the student's understanding of both concepts. \n"
        # Trailing space added: the next sentence used to be glued on as "text.The possible".
        + f"Make sure that the exam questions (both the correct answer and the wrong answers) you design are relevant to the given section_title and text. "
        + f"The possible answers should be very similar to the exact answer. "
        + f"Think step-by-step, and make sure that the answer is not too obvious. "
        + f"Don't include possible answers that are like 'any of the above' or 'none of the above'. "
        + f"Here are the section title and text: \n"
        + f"Section title: {section_title}. Text: {text}"
    )

    user_prompt = (
        f"Generate 5 multiple choice questions, each with 5 possible answers on the given topic. You will generate it in the following way, step-by-step:\n"
        + f"Step 1: you will generate a question and a correct answer based on the provided text. The answer is 1 or 2 sentences long, no longer than that. Write that down as the token outputs. Question should start like this: \n"

        + f"option: What is the significance of ... \n"
        + f"option: Which of the following statements ... \n"
        + f"option: What is ... \n"
        + f"option: What is the difference between ... \n"
        + f"option: What is the reason for ... \n"
        + f"option: What is the role of ... \n"
        + f"option: What is the definition of ... \n"
        + f"option: What is the purpose of ... \n"
        + f"option: What is the reason behind ... \n"
        + f"option: What is the term used ... \n"
        + f"option: What is the origin of ... \n"
        + f"option: What are the two main ... \n"
        + f"option: What is the reason that ... \n"
        + f"option: What is the main focus ... \n"
        + f"option: What is the function of ... \n"
        + f"option: What is the interpretation of ... \n"
        + f"option: What is the formalism that ... \n"
        + f"option: What is the main advantage ... \n"

        + f"Step 2: you will generate 4 similar but wrong answers based on the correct answer that you provided in the step 1. Start the sentence the same as the answer, but only change wording in few places to make sure the answer is actually wrong. Make sure that the wrong answer is not too obviously wrong, that is as close to the correct answer as possible, not similar to other wrong answers. The number of sentences should be the same as the correct answer.\n"

        + f"Examples of question-correct answer pairs:\n"
        # The example pairs now share one delimiter style and the third pair has
        # its missing closing quote, so the few-shot block is well formed.
        + "['What is the Kutta condition?','The Kutta condition is a physical requirement that the fluid moving along the lower and upper surfaces of an airfoil meet smoothly, with no fluid moving around the trailing edge of the airfoil.'],"
        + f"['What is the purpose of obtaining surgical resection specimens?','To remove an entire diseased area or organ for definitive surgical treatment of a disease, with pathological analysis of the specimen used to confirm the diagnosis.'],"
        + f"['Who published the first theory that was able to encompass previously separate field theories to provide a unifying theory of electromagnetism?','Maxwell'],"
        + f"['What is the significance of Baryon Acoustic Oscillations (BAOs) in the study of the universe?','BAOs establish a preferred length scale for baryons, which can be used to detect a subtle preference for pairs of galaxies to be separated by 147 Mpc, compared to those separated by 130-160 Mpc.']"
    )

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]


def generate_questions(
    wiki_sci_df_cluster,
    num_questions_per_cluster=10000,
    model_name="gpt-3.5-turbo-16k-0613",
):
    """Generate multiple-choice questions from randomly sampled rows of a frame.

    Each round samples one row of ``wiki_sci_df_cluster``, asks the chat model for
    questions about that row's text through a forced function call, and writes the
    raw JSON arguments of that call to a new text file under the notebook-level
    ``out_dir``. Rounds that fail with an API error, or whose reply carries no
    function call, are reported with ``print`` and skipped.

    Args:
        wiki_sci_df_cluster: DataFrame with ``page``, ``text`` and
            ``section_title`` columns to sample from.
        num_questions_per_cluster: Number of sampling rounds (API requests) to run.
        model_name: Chat model name passed to the completion call.

    Returns:
        None. The results are the files written to ``out_dir``.
    """
    # `.sample(1)` raises ValueError on an empty frame, so an empty chunk (or a
    # non-positive round count) is treated as "nothing to do".
    if len(wiki_sci_df_cluster) == 0 or num_questions_per_cluster <= 0:
        return None

    # The function definition is the same for every request, so build it once.
    functions = [
        {
            "name": "create_multiple_choice_question",
            "description": "create a multiple choice question consisting of a question, and 5 choices: A, B, C, D, and E",
            "parameters": schema,
        }
    ]

    for cluster_question_idx in range(num_questions_per_cluster):
        # Every caller counts rounds from 0, so the random suffix is what keeps
        # file names from colliding when several workers run this function.
        randint = np.random.randint(0, 1000000)
        out_path = Path(f"{out_dir}/cluster-round-{cluster_question_idx}-{randint}.txt")

        # take a random page and text from wiki_sci_df_cluster
        random_row = wiki_sci_df_cluster.sample(1)
        page = random_row.page.values[0]
        text = random_row.text.values[0]
        section_title = random_row.section_title.values[0]

        try:
            # ==== running model ====
            response = openai.ChatCompletion.create(
                model=model_name,
                messages=_build_question_messages(page, section_title, text),
                functions=functions,
                function_call={"name": "create_multiple_choice_question"},
            )

            # ==== parsing answers ====
            assistant_msg = response["choices"][0]["message"]
            response_options = assistant_msg.to_dict()["function_call"]["arguments"]

            # Explicit encoding: the generated text can contain non-ASCII characters.
            out_path.write_text(response_options, encoding="utf-8")

        except (
            openai.error.Timeout,
            openai.error.APIError,
            openai.error.APIConnectionError,
            openai.error.InvalidRequestError,
            openai.error.AuthenticationError,
            openai.error.PermissionError,
            openai.error.RateLimitError,
            openai.error.ServiceUnavailableError,
            KeyError,  # reply without a "function_call" entry
        ) as e:
            print(f"can't do {page}: {repr(e)}, skipping for now")
            continue
    

In [27]:

from joblib import Parallel, delayed
from tqdm import tqdm

n_jobs = 8

# Split the rows into n_jobs contiguous, near-equal chunks, one per worker.
# Despite the name, these are positional chunks of the frame, not groups taken
# from the cluster_text / cluster_title columns.
# The split is done on row positions rather than by passing the DataFrame to
# np.array_split, because that route goes through the deprecated
# DataFrame.swapaxes.
row_chunks = np.array_split(np.arange(len(wiki_sci_df)), n_jobs)
clusters = [wiki_sci_df.iloc[rows] for rows in row_chunks]
clusters[0]

Unnamed: 0,text,url,title,embd_text,embd_title,text_length,cluster_text,cluster_title,similarity,word_count,section_title,page
0,The Ulakhan Fault is a left-lateral moving tra...,https://en.wikipedia.org/wiki/Ulakhan%20Fault,Ulakhan Fault,"[-0.010125404, 0.045413326, -0.0055074026, 0.0...","[-0.027042245, 0.02827163, -0.007891712, -0.01...",948,789,144,0.93396,137,Ulakhan Fault,Ulakhan Fault
1,Troparil (also known as (–)-2β-Carbomethoxy-3β...,https://en.wikipedia.org/wiki/Troparil,Troparil,"[-0.008297031, -0.0015585935, 0.026860343, 0.0...","[-0.012329322, 0.011085487, -0.0023844037, 0.0...",2931,821,276,0.93396,430,Troparil,Troparil


In [28]:

# Run one generate_questions call per chunk across n_jobs workers.
# NOTE(review): the progress bar wraps the task generator, so it presumably
# tracks task submission rather than completed API work.
results = Parallel(n_jobs=n_jobs)(
    delayed(generate_questions)(cluster)
    for cluster in tqdm(clusters)
)


100%|██████████| 8/8 [00:00<00:00, 144.03it/s]
