In [1]:
import json
import ollama
from ollama import Options


def extract_question_from_html(html_content):
    response = ollama.generate(
        model='gemma3:27b',
        prompt=f'''
             You are an expert in extracting exam questions from HTML content.
            
             I will provide you with HTML content containing an exam question and its multiple-choice options.
             Your task is to extract the question text and the options as separate items.
            
             HTML content:
             ```
             {html_content}
             ```
            
             Extract and return ONLY in the following JSON format:
             {{
                 "question": "The full question text here without any HTML tags",
                 "options": ["Option 1", "Option 2", "Option 3", "Option 4", "Option 5"]
             }}
            
             Guidelines:
             - Extract the complete question text, including any context or paragraphs preceding the actual question
             - Extract all options exactly as they appear
             - Return valid JSON only, nothing else
             - Remove any HTML tags from the question text and options
             - If you cannot find a question or options, return an empty question or options array
             - Do not include option letters (A, B, C, etc.) in the option text unless they are part of the actual content
             ''',
        options=Options(seed=42)
    )
    result = response['response']
    # Find the pattern that looks like valid JSON using string manipulation
    start_idx = result.find('{')
    end_idx = result.rfind('}') + 1
    if start_idx >= 0 and end_idx > start_idx:
        json_str = result[start_idx:end_idx]
        parsed_json = json.loads(json_str)
        return parsed_json
    raise ValueError("Could not extract valid question and options from response")



In [2]:
filePaths = [
    'tde_6_questions.json',
    'tde_7_questions.json',
    'tde_8_questions.json',
    'tde_9_questions.json',
]

In [3]:

stats = {}

for filename in filePaths:
    with open("questions/" + filename, 'r', encoding='utf-8') as f:
        questions = json.load(f)

        processedQuestions = [q for q in questions if 'answer' in q and q['answer'] < 6 and 'img' not in q['questionAsHtml'].lower()]
        
        # Initialize counters
        image_count = 0
        no_image_count = 0
        total_questions = len(processedQuestions)
        
        for question in processedQuestions:
            print(f"processing question: {processedQuestions.index(question)}/{len(processedQuestions)} id: {question['id']}")
            
            # Skip if question already has options
            if 'options' in question:
                continue
                
            has_image = 'questionAsHtml' in question and 'img' in question['questionAsHtml'].lower()
            hasMoreOptionThan5 = 'answer' in question and question['answer'] > 5
            
            if has_image:
                image_count += 1
            else:
                max_retries = 3
                retry_count = 0
                while retry_count < max_retries:
                    try:
                        # Add timeout of 60 seconds
                        import signal
                        def timeout_handler(signum, frame):
                            raise TimeoutError("Processing took too long")
                            
                        signal.signal(signal.SIGALRM, timeout_handler)
                        signal.alarm(60)  # Set 60 second timeout
                        
                        try:
                            questionResponse = extract_question_from_html(question['questionAsHtml'])
                            signal.alarm(0)  # Disable alarm
                            
                            if questionResponse is not None:
                                print(f"questionResponse: {questionResponse}")
                                question['question'] = questionResponse['question']
                                question['options'] = questionResponse['options']
                                
                                # Save after each successful question processing
                                with open("questions/" + filename, 'w', encoding='utf-8') as f:
                                    json.dump(processedQuestions, f, ensure_ascii=False, indent=2)
                                    
                                break
                            else:
                                print(f"Failed attempt {retry_count + 1} of {max_retries}")
                                retry_count += 1
                                
                        except TimeoutError:
                            print("Question processing timed out after 60 seconds")
                            retry_count += 1
                            signal.alarm(0)  # Disable alarm
                            
                    except Exception as e:
                        print(f"Error on attempt {retry_count + 1}: {str(e)}")
                        retry_count += 1
                        if retry_count == max_retries: 
                            print(f"Failed to process question after {max_retries} attempts")
                 
        # Add stats to dict if there are any href tags
        if image_count > 0:
            stats[filename] = { 
                'total_questions': total_questions,
                'image_count': image_count,
                'no_image_count': no_image_count
            }

processing question: 0/399 id: 8274
processing question: 1/399 id: 8276
processing question: 2/399 id: 8278
processing question: 3/399 id: 8280
processing question: 4/399 id: 8281
processing question: 5/399 id: 8283
processing question: 6/399 id: 8286
processing question: 7/399 id: 8287
processing question: 8/399 id: 8288
processing question: 9/399 id: 8289
processing question: 10/399 id: 8290
processing question: 11/399 id: 8291
processing question: 12/399 id: 8294
processing question: 13/399 id: 8295
processing question: 14/399 id: 8296
processing question: 15/399 id: 8297
processing question: 16/399 id: 8298
processing question: 17/399 id: 8299
processing question: 18/399 id: 8300
processing question: 19/399 id: 8301
processing question: 20/399 id: 8302
processing question: 21/399 id: 8303
Error on attempt 1: Processing took too long
Error on attempt 2: Processing took too long
Error on attempt 3: Processing took too long
Failed to process question after 3 attempts
processing questi