In [3]:
# ==============================================================================
# Step 1: Import Libraries and Configure Settings
# ==============================================================================
import json
import os
import time
from google import genai
from IPython.display import display, Markdown

# --- Configuration ---
# Input and output file names
INPUT_FILE = 'unikatni_otazky.json'
OUTPUT_FILE = 'unikatni_otazky_obohatene.json'

# Model used for the enrichment requests; swap in the alternative below if needed.
MODEL_NAME = 'gemini-2.5-flash-lite'
# MODEL_NAME = 'gemini-2.5-flash'


def _load_api_key():
    """Return the Gemini API key taken from the environment.

    Looks at ``GEMINI_API_KEY`` first and ``GOOGLE_API_KEY`` second.

    Raises:
        RuntimeError: if neither variable is set (or both are empty).
    """
    key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not key:
        raise RuntimeError(
            "No Gemini API key found. Set the GEMINI_API_KEY (or GOOGLE_API_KEY) "
            "environment variable before running this notebook."
        )
    return key


# SECURITY: the key must never be written into the notebook source, because the
# notebook (and its saved outputs) gets shared and committed. Any key that was
# previously hardcoded here should be treated as leaked and revoked.
client = genai.Client(api_key=_load_api_key())
display(Markdown("✅ **Gemini client initialized successfully.**"))


# --- Load the source questions ---
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    questions_data = json.load(f)

# --- Load Already Processed Questions to Allow Resuming ---
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
        try:
            enriched_questions = json.load(f)
        except json.JSONDecodeError:
            enriched_questions = []  # Handle case where file is empty or corrupt
else:
    enriched_questions = []

# Question texts act as the identity used to decide what can be skipped later.
processed_texts = {q['text_otazky'] for q in enriched_questions}
question_count = len(questions_data)

display(Markdown(f"Found **{question_count}** questions in total. **{len(processed_texts)}** questions have already been processed."))



✅ **Gemini client initialized successfully.**

Found **664** questions in total. **664** questions have already been processed.

In [4]:

# ==============================================================================
# Step 2: Enrich every not-yet-processed question with a category and explanation
# ==============================================================================
# Fallback values stored when the model's reply cannot be parsed.
DEFAULT_CATEGORY = 'Nezařazeno'
DEFAULT_EXPLANATION = 'Žádné vysvětlení poskytnuto.'

# Pause between API calls; 8 s per call is roughly 7.5 requests per minute.
REQUEST_DELAY_SECONDS = 8


def _parse_enrichment(raw_text):
    """Extract ``(category, explanation)`` from the model's raw reply.

    Markdown code fences are stripped before parsing, and a missing reply
    (``None``) is treated as empty text.

    Raises:
        json.JSONDecodeError: if the cleaned text is not valid JSON.
        TypeError: if the JSON is valid but is not an object.
    """
    cleaned = (raw_text or "").replace("```json", "").replace("```", "").strip()
    payload = json.loads(cleaned)
    if not isinstance(payload, dict):
        raise TypeError(f"expected a JSON object, got {type(payload).__name__}")
    return (
        payload.get('kategorie', DEFAULT_CATEGORY),
        payload.get('vysvetleni', DEFAULT_EXPLANATION),
    )


# Reload the saved results so this cell can be re-run on its own after an interruption.
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
        try:
            enriched_questions = json.load(f)
        except json.JSONDecodeError:
            enriched_questions = []  # Handle case where file is empty or corrupt
else:
    enriched_questions = []

# Rebuild the skip set from what was just loaded; a stale set from an earlier
# cell run would make already-saved questions be sent (and stored) a second time.
processed_texts = {q['text_otazky'] for q in enriched_questions}


for index, question_obj in enumerate(questions_data):
    # Skip questions that are already in the output file
    if question_obj['text_otazky'] in processed_texts:
        continue

    print(f"Processing question {index + 1}/{question_count}: \"{question_obj['text_otazky'][:60]}...\"")

    # Get category and explanation from the Gemini API

    prompt = f"""Analyzuj následující testovou otázku z oblasti letectví. Tvým úkolem je:
    1. Zařadit otázku do jedné z následujících kategorií: Letecké předpisy, Lidská výkonnost, Meteorologie, Navigace, Provozní postupy, Letové výkony a plánování, Znalosti letadel, Principy letu, Radiokomunikace.
    2. Poskytnout stručné a jasné vysvětlení, proč je označená odpověď správná.

    Otázka: "{question_obj["text_otazky"]}"
    Možnosti:
    A: "{question_obj["moznosti"].get('A', '')}"
    B: "{question_obj["moznosti"].get('B', '')}"
    C: "{question_obj["moznosti"].get('C', '')}"

    Správná odpověď je: {question_obj["spravna_odpoved"]}

    Odpověz ve formátu JSON s klíči "kategorie" a "vysvetleni". Nepřidávej žádný další text mimo tento JSON včetně ````json` bloků.

    """

    response = client.models.generate_content(
        model=MODEL_NAME,
        contents=prompt,
        config={
            "response_mime_type": "application/json",
        },
    )

    print(response.text)

    try:
        category, explanation = _parse_enrichment(response.text)
    except (json.JSONDecodeError, TypeError) as e:
        # Best effort: keep the question, but mark it with the fallback values.
        print(f"Error parsing response for question {index + 1}: {e}")
        category = DEFAULT_CATEGORY
        explanation = DEFAULT_EXPLANATION

    enriched_questions.append({
        **question_obj,
        'kategorie': category,
        'vysvetleni': explanation
    })
    # Remember the text so a repeated question later in the input is not sent again.
    processed_texts.add(question_obj['text_otazky'])

    # Save progress after each new question is processed. The data goes to a
    # temporary file first and is then swapped in, so an interruption in the
    # middle of the write cannot leave a truncated OUTPUT_FILE behind.
    tmp_output_file = OUTPUT_FILE + '.tmp'
    with open(tmp_output_file, 'w', encoding='utf-8') as f:
        json.dump(enriched_questions, f, ensure_ascii=False, indent=4)
    os.replace(tmp_output_file, OUTPUT_FILE)

    # Pause to avoid hitting API rate limits
    time.sleep(REQUEST_DELAY_SECONDS)

display(Markdown("---"))
display(Markdown(f"✅ **Processing complete.** All enriched questions have been saved to **'{OUTPUT_FILE}'**."))


---

✅ **Processing complete.** All enriched questions have been saved to **'unikatni_otazky_obohatene.json'**.

In [5]:
import hashlib

# Give every enriched question a 'hashid': the hexadecimal MD5 digest of its
# question text. The output file is read, updated in memory and written back.
with open(OUTPUT_FILE, 'r', encoding='utf-8') as src:
    questions = json.load(src)

for question in questions:
    text_bytes = question['text_otazky'].encode()
    question['hashid'] = hashlib.md5(text_bytes).hexdigest()

with open(OUTPUT_FILE, 'w', encoding='utf-8') as dst:
    json.dump(questions, dst, ensure_ascii=False, indent=4)
