In [46]:
import os, json, re, random, time
from dotenv import load_dotenv
from google import genai
from google.genai import types

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Shared Gemini client for every request in this notebook; the key is read
# from the environment rather than being hardcoded.
gemini_client = genai.Client(
    api_key=os.getenv("GEMINI_API_KEY"),
)

In [2]:
# Prompt for generating out-of-scope questions (sent to gemini-2.5-pro-preview-03-25):
# "Give me 100 questions about events after the fall of the roman empire in a random, non-chronological order."

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (the other JSON files in this notebook are opened the same way).
with open('questions/out-of-scope.json', 'r', encoding='utf-8') as f:
    out_of_scope_qs = json.load(f)

In [43]:
# System instruction used for question generation: asks the model for two
# questions per chapter, returned as newline-separated plain text.
# This is a plain (non-f) string, so `{title}` and `{chapter}` below are literal
# text describing the expected message layout, not substituted placeholders.
# convert_chapter_to_questions passes it unchanged as `system_instruction`.
question_gen_system_prompt = """
You will be given a chapter from the *Naturalis Historia*, a Roman encyclopedia containing a wealth of information known to them at the time.
From the given chapter, please give two questions that could be asked about the information given in the chapter.
Do not mention the author.
The chapter information will be given in the following format:

**Title**: {title}
**Chapter**: {chapter}

Give your questions as plain text, separated by new lines.
"""

In [37]:
# Matches inline footnote markers such as "<@12>" embedded in chapter paragraphs.
footnote_code_re = r'<@[0-9]+>'

def convert_chapter_to_questions(chapter_data):
    """Ask Gemini for questions about one chapter and return them as strings.

    Args:
        chapter_data: Mapping with a 'chapter_name' string and a 'paragraphs'
            list of strings. Footnote markers (see ``footnote_code_re``) are
            removed from each paragraph before the text is sent.

    Returns:
        A list with one entry per non-blank line of the model's reply, each
        stripped of surrounding whitespace. Empty when the reply has no text.
    """
    cleaned_paragraphs = [re.sub(footnote_code_re, '', p) for p in chapter_data['paragraphs']]
    chapter_text = ' '.join(cleaned_paragraphs)

    # Follow the "**Title**: ... / **Chapter**: ..." layout that the system
    # prompt tells the model to expect. The previous message used "\C" (an
    # invalid escape, i.e. a literal backslash) where a newline was intended.
    user_message = f"**Title**: {chapter_data['chapter_name']}\n**Chapter**: {chapter_text}"

    completion = gemini_client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[user_message],
        config=types.GenerateContentConfig(
            system_instruction=question_gen_system_prompt, # Pliny's prompt
            max_output_tokens=200,
            temperature=0
        ),
    )

    # `.text` may be None when the response carries no text part, so fall back
    # to an empty reply instead of failing on `.strip()`.
    reply_text = completion.text or ''

    # Skip blank lines so stray spacing in the reply never becomes an empty question.
    return [line.strip() for line in reply_text.splitlines() if line.strip()]

In [None]:
# Read the chapter records (JSON) that later cells sample from when prompting
# Gemini to generate questions from a chapter's contents.
with open('data-collection/data/chapter-data.json', mode='r', encoding='utf-8') as f:
    all_chapters = json.loads(f.read())

In [None]:
n_chapters = 100
# One request every 4 seconds keeps the loop under the 15 requests/minute rate limit.
request_interval_s = 4

# random.sample raises ValueError when asked for more items than the population
# holds, so cap the sample size at the number of chapters actually loaded.
sampled_chapters = random.sample(all_chapters, min(n_chapters, len(all_chapters)))
n_sampled = len(sampled_chapters)

# Each entry pairs a generated question with the name of the chapter it came from.
all_questions = []
for i, chapter in enumerate(sampled_chapters):
    print(f'Generating questions for chapter {i+1}/{n_sampled}...', end='\r')
    chapter_questions = convert_chapter_to_questions(chapter)
    chapter_questions = [{ 'question': q, 'chapter': chapter['chapter_name'] } for q in chapter_questions]
    all_questions.extend(chapter_questions)

    # Pause only between requests; nothing follows the final one.
    if i + 1 < n_sampled:
        time.sleep(request_interval_s)

all_questions

In [50]:
# Persist the generated question/chapter pairs as pretty-printed JSON, keeping
# non-ASCII characters unescaped in the output file.
with open('questions/chapter-questions.json', mode='w', encoding='utf-8') as f:
    json.dump(obj=all_questions, fp=f, ensure_ascii=False, indent=2)