In [1]:
import json
import os
import signal

import ollama
from ollama import Options


def _parse_question_json(raw_text):
    """Locate, decode and validate the JSON object embedded in ``raw_text``.

    The span from the first ``{`` to the last ``}`` is decoded, so any prose or
    Markdown fences the model adds around the object are ignored.

    Raises:
        ValueError: if no ``{...}`` span exists, if the span is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass), or if the
            decoded object lacks a string ``question`` or a list ``options``.
    """
    start_idx = raw_text.find('{')
    end_idx = raw_text.rfind('}') + 1
    if start_idx < 0 or end_idx <= start_idx:
        raise ValueError("Could not extract valid question and options from response")

    parsed_json = json.loads(raw_text[start_idx:end_idx])

    # Reject malformed payloads here so callers never store a half-formed
    # record or trip over a KeyError when reading the two expected fields.
    has_expected_shape = (
        isinstance(parsed_json, dict)
        and isinstance(parsed_json.get('question'), str)
        and isinstance(parsed_json.get('options'), list)
    )
    if not has_expected_shape:
        raise ValueError("Could not extract valid question and options from response")
    return parsed_json


def extract_question_from_html(html_content):
    """Ask the ``gemma3:27b`` model to split an HTML exam question into text and options.

    Args:
        html_content: HTML markup interpolated verbatim into the prompt.

    Returns:
        dict: the decoded model reply, guaranteed to contain ``"question"``
        (a string) and ``"options"`` (a list). Either may be empty, since the
        prompt tells the model to return empty values when nothing is found.

    Raises:
        ValueError: if the reply contains no JSON object, the JSON is invalid,
            or the object does not have the expected ``question``/``options``
            shape.
    """
    response = ollama.generate(
        model='gemma3:27b',
        prompt=f'''
             You are an expert in extracting exam questions from HTML content.
            
             I will provide you with HTML content containing an exam question and its multiple-choice options.
             Your task is to extract the question text and the options as separate items.
            
             HTML content:
             ```
             {html_content}
             ```
            
             Extract and return ONLY in the following JSON format:
             {{
                 "question": "The full question text here without any HTML tags",
                 "options": ["Option 1", "Option 2", "Option 3", "Option 4", "Option 5"]
             }}
            
             Guidelines:
             - Extract the complete question text, including any context or paragraphs preceding the actual question
             - Extract all options exactly as they appear
             - Return valid JSON only, nothing else
             - Remove any HTML tags from the question text and options
             - If you cannot find a question or options, return an empty question or options array
             - Do not include option letters (A, B, C, etc.) in the option text unless they are part of the actual content
             ''',
        # Fixed seed so repeated runs over the same HTML use the same sampling seed.
        options=Options(seed=42)
    )
    return _parse_question_json(response['response'])



In [2]:
# Question dump files to process, described as filename prefix -> numeric
# suffixes present for that prefix. Each pair expands to
# '<prefix>_<number>_questions.json'; dict insertion order fixes the order
# of the resulting list.
_NUMBERS_BY_PREFIX = {
    'fl-matematik': (9,),
    'ingilizce': (6, 7, 8, 9),
    'inkilap-tarihi': (9,),
    'kimya': (6, 7, 8, 9),
    'matematik': (6, 7, 8, 9),
    'tarih': (6, 7, 8),
    'tde': (6, 7, 8, 9),
}

filePaths = [
    f'{prefix}_{number}_questions.json'
    for prefix, numbers in _NUMBERS_BY_PREFIX.items()
    for number in numbers
]

In [6]:

# Tunables for the extraction loop.
QUESTIONS_DIR = "questions"
MAX_RETRIES = 3
TIMEOUT_SECONDS = 60


def _raise_timeout(signum, frame):
    """SIGALRM handler: abort whatever is running with a TimeoutError."""
    raise TimeoutError("Processing took too long")


def _extract_with_timeout(html, seconds=TIMEOUT_SECONDS):
    """Call ``extract_question_from_html(html)``, aborting it after ``seconds``.

    The alarm is always cancelled and the previous SIGALRM handler restored,
    whether the call returns, fails, times out or is interrupted, so a pending
    alarm can never fire later in unrelated code.

    On platforms without ``signal.SIGALRM`` the call runs with no time limit.

    Raises:
        TimeoutError: if the extraction is still running after ``seconds``.
    """
    if not hasattr(signal, "SIGALRM"):
        return extract_question_from_html(html)

    previous_handler = signal.signal(signal.SIGALRM, _raise_timeout)
    signal.alarm(seconds)
    try:
        return extract_question_from_html(html)
    finally:
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous_handler)


def _save_questions(path, records):
    """Write ``records`` as JSON to ``path`` via a temp file and ``os.replace``.

    Writing to a sibling file first means an interrupt or crash in the middle
    of the dump leaves the existing file untouched instead of truncated.
    """
    tmp_path = path + ".tmp"
    with open(tmp_path, 'w', encoding='utf-8') as out:
        json.dump(records, out, ensure_ascii=False, indent=2)
    os.replace(tmp_path, path)


# Per-file image statistics, keyed by filename; only files that contain at
# least one image question get an entry.
stats = {}

for filename in filePaths:
    path = os.path.join(QUESTIONS_DIR, filename)

    # Read everything up front and close the handle before any rewrite of the
    # same path happens below.
    with open(path, 'r', encoding='utf-8') as f:
        questions = json.load(f)

    # Stats are taken over the raw file contents. The work list below already
    # excludes image questions, so counting images there would always give 0.
    image_count = sum(1 for q in questions if 'img' in q.get('questionAsHtml', '').lower())
    if image_count > 0:
        stats[filename] = {
            'total_questions': len(questions),
            'image_count': image_count,
            'no_image_count': len(questions) - image_count
        }

    # Work list: questions that have an answer below 6 and no image markup.
    processedQuestions = [q for q in questions if 'answer' in q and q['answer'] < 6 and 'img' not in q['questionAsHtml'].lower()]
    total_questions = len(processedQuestions)

    for position, question in enumerate(processedQuestions):
        print(f"processing question: {position}/{total_questions} id: {question['id']}")

        # Skip if question already has options (lets an interrupted run resume).
        if 'options' in question:
            continue

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                questionResponse = _extract_with_timeout(question['questionAsHtml'])
                print(f"questionResponse: {questionResponse}")
                question['question'] = questionResponse['question']
                question['options'] = questionResponse['options']
            except TimeoutError:
                print(f"Question processing timed out after {TIMEOUT_SECONDS} seconds")
            except Exception as e:
                print(f"Error on attempt {attempt}: {str(e)}")
            else:
                # Save after each successful question so progress survives an
                # interrupted run. A failure to save is deliberately not
                # retried: re-querying the model cannot fix a disk problem.
                # NOTE(review): this persists only the filtered work list, so
                # questions excluded by the filter above are dropped from the
                # file on disk, as in the original code - confirm this is intended.
                _save_questions(path, processedQuestions)
                break
        else:
            # Reached only when every attempt failed or timed out.
            print(f"Failed to process question after {MAX_RETRIES} attempts")

processing question: 0/158 id: 16267
processing question: 1/158 id: 17971
processing question: 2/158 id: 22279
processing question: 3/158 id: 22280
processing question: 4/158 id: 22281
processing question: 5/158 id: 16268
processing question: 6/158 id: 17960
processing question: 7/158 id: 17962
processing question: 8/158 id: 17963
processing question: 9/158 id: 17964
processing question: 10/158 id: 17968
processing question: 11/158 id: 22282
processing question: 12/158 id: 16270
processing question: 13/158 id: 16272
processing question: 14/158 id: 16274
processing question: 15/158 id: 16277
processing question: 16/158 id: 17957
processing question: 17/158 id: 17958
processing question: 18/158 id: 17969
processing question: 19/158 id: 17970
processing question: 20/158 id: 22282
processing question: 21/158 id: 16291
processing question: 22/158 id: 22282
processing question: 23/158 id: 16282
processing question: 24/158 id: 16279
processing question: 25/158 id: 16287
processing question: 2

KeyboardInterrupt: 