In [1]:
import json
import os
import signal

import requests


def extract_question_from_html(html_content, timeout=None):
    """Ask the local Ollama server to extract a question and its options from HTML.

    The HTML is embedded in a fixed prompt and sent to the ``gemma3:27b`` model
    at ``http://localhost:11434/api/generate``. The reply is read as
    newline-delimited JSON chunks; their ``response`` fragments are concatenated
    and the outermost ``{...}`` span of that text is parsed as JSON.

    Args:
        html_content: HTML snippet holding the question and its choices.
        timeout: Passed straight to ``requests.post``. The default ``None``
            keeps the previous behaviour of waiting without a limit.

    Returns:
        The JSON object parsed from the model's reply (the prompt asks for the
        keys ``"question"`` and ``"options"``, but this function does not
        verify that they are present).

    Raises:
        ValueError: If the server reports an error, if the reply contains no
            ``{...}`` span, or if that span is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    response = requests.post(
        'http://localhost:11434/api/generate',
        json={
            'model': 'gemma3:27b',
            'prompt': f'''
             You are an expert in extracting exam questions from HTML content.
            
             I will provide you with HTML content containing an exam question and its multiple-choice options.
             Your task is to extract the question text and the options as separate items.
            
             HTML content:
             ```
             {html_content}
             ```
            
             Extract and return ONLY in the following JSON format:
             {{
                 "question": "The full question text here without any HTML tags",
                 "options": ["Option 1", "Option 2", "Option 3", "Option 4", "Option 5"]
             }}
            
             Guidelines:
             - Extract the complete question text, including any context or paragraphs preceding the actual question
             - Extract all options exactly as they appear
             - Return valid JSON only, nothing else
             - Remove any HTML tags from the question text and options
             - If you cannot find a question or options, return an empty question or options array
             - Do not include option letters (A, B, C, etc.) in the option text unless they are part of the actual content
             ''',
        },
        timeout=timeout,
    )

    fragments = []
    for line in response.iter_lines():
        # iter_lines() can yield empty keep-alive lines; json.loads('') would raise.
        if not line:
            continue
        response_data = json.loads(line.decode())
        # Report a server-side failure with its own message instead of letting it
        # degrade into the generic "could not extract" error below.
        if 'error' in response_data:
            raise ValueError(f"Model server returned an error: {response_data['error']}")
        if 'response' in response_data:
            fragments.append(response_data['response'])
    result = "".join(fragments)

    # The model may wrap the JSON in prose or code fences, so keep only the
    # span from the first '{' to the last '}'.
    start_idx = result.find('{')
    end_idx = result.rfind('}') + 1
    if start_idx >= 0 and end_idx > start_idx:
        return json.loads(result[start_idx:end_idx])
    raise ValueError("Could not extract valid question and options from response")




In [2]:
# Question files to process, named "<subject>_<n>_questions.json".
# The numeric suffix is presumably the grade level (TODO confirm); not every
# subject has a file for every suffix, so each subject lists its own.
filePaths = [
    f'{subject}_{level}_questions.json'
    for subject, levels in (
        ('cografya', (6, 7, 8, 9)),
        ('dikab', (6, 7, 8, 9)),
        ('felsefe', (7, 8)),
        ('fizik', (6, 7, 8, 9)),
        ('fl-biyoloji', (6, 7, 8, 9)),
        ('fl-fizik', (6, 7, 8, 9)),
        ('fl-kimya', (6, 7, 8, 9)),
        ('fl-matematik', (6, 7, 8, 9)),
        ('ingilizce', (6, 7, 8, 9)),
        ('inkilap-tarihi', (9,)),
        ('kimya', (6, 7, 8, 9)),
        ('matematik', (6, 7, 8, 9)),
        ('tarih', (6, 7, 8)),
        ('tde', (6, 7, 8, 9)),
    )
    for level in levels
]

In [5]:

# Sequentially fill in 'question' / 'options' for every eligible question in
# each file of filePaths, saving the file after every successful extraction.

# Wall-clock guard for a single model call. SIGALRM is only available on Unix
# and a signal handler can only be installed from the main thread.
def timeout_handler(signum, frame):
    raise TimeoutError("Processing took too long")


signal.signal(signal.SIGALRM, timeout_handler)

stats = {}
max_retries = 3

for filename in filePaths:
    file_path = "questions/" + filename
    # Close the read handle before the file is ever rewritten.
    with open(file_path, 'r', encoding='utf-8') as f:
        questions = json.load(f)

    # NOTE(review): only this filtered list is written back below, so entries
    # without an 'answer', with answer >= 6, or containing 'img' are removed
    # from the file on the first successful save. Confirm this is intended.
    processedQuestions = [q for q in questions if 'answer' in q and q['answer'] < 6 and 'img' not in q['questionAsHtml'].lower()]

    # Initialize counters
    image_count = 0
    no_image_count = 0
    total_questions = len(processedQuestions)

    # enumerate() replaces list.index(), which rescanned the list on every
    # iteration and reported the first match for duplicate entries.
    for position, question in enumerate(processedQuestions):
        print(f"processing question: {position}/{total_questions} id: {question['id']}")

        # Skip if question already has options
        if 'options' in question:
            continue

        has_image = 'questionAsHtml' in question and 'img' in question['questionAsHtml'].lower()
        if has_image:
            image_count += 1
            continue
        no_image_count += 1

        for attempt in range(1, max_retries + 1):
            try:
                signal.alarm(60)  # Abort the model call after 60 seconds
                try:
                    questionResponse = extract_question_from_html(question['questionAsHtml'])
                finally:
                    # Always cancel the alarm, including on non-timeout errors,
                    # so it cannot fire later inside unrelated code.
                    signal.alarm(0)

                if questionResponse is None:
                    print(f"Failed attempt {attempt} of {max_retries}")
                    continue

                print(f"questionResponse: {questionResponse}")
                # Read both fields before mutating, so an incomplete reply does
                # not leave 'question' set without 'options'.
                question_text = questionResponse['question']
                question_options = questionResponse['options']
                question['question'] = question_text
                question['options'] = question_options

                # Save after each successful question. Write to a temp file and
                # swap it in so an interrupt mid-write cannot truncate the data.
                tmp_path = file_path + ".tmp"
                with open(tmp_path, 'w', encoding='utf-8') as out:
                    json.dump(processedQuestions, out, ensure_ascii=False, indent=2)
                os.replace(tmp_path, file_path)
                break
            except TimeoutError:
                print("Question processing timed out after 60 seconds")
            except Exception as e:
                print(f"Error on attempt {attempt}: {str(e)}")
        else:
            # Reached only when no attempt ended with `break`.
            print(f"Failed to process question after {max_retries} attempts")

    # Record stats only for files where image questions were seen.
    # NOTE(review): the filter above already excludes every question containing
    # 'img', so image_count stays 0 and this branch is not reached as written.
    if image_count > 0:
        stats[filename] = {
            'total_questions': total_questions,
            'image_count': image_count,
            'no_image_count': no_image_count
        }

print("File statistics:")
for file, file_stats in stats.items():
    print(f"\n{file}:")
    print(f"  Total questions: {file_stats['total_questions']}")
    print(f"  Questions with images: {file_stats['image_count']}")
    print(f"  Questions without images: {file_stats['no_image_count']}")

# Calculate total questions with no images across all files
total_no_image = sum(file_stats['no_image_count'] for file_stats in stats.values())
print(f"\nTotal questions without images across all files: {total_no_image}")

processing question: 0/386 id: 3799
processing question: 1/386 id: 20378
processing question: 2/386 id: 20379
processing question: 3/386 id: 20380
processing question: 4/386 id: 20381
processing question: 5/386 id: 20382
processing question: 6/386 id: 24975
processing question: 7/386 id: 24978
processing question: 8/386 id: 25020
processing question: 9/386 id: 3753
processing question: 10/386 id: 3754
processing question: 11/386 id: 3755
processing question: 12/386 id: 3756
processing question: 13/386 id: 3757
processing question: 14/386 id: 3758
processing question: 15/386 id: 3762
processing question: 16/386 id: 3771
processing question: 17/386 id: 3776
processing question: 18/386 id: 3785
processing question: 19/386 id: 3787
processing question: 20/386 id: 3789
processing question: 21/386 id: 3795
processing question: 22/386 id: 3802
processing question: 23/386 id: 3824
processing question: 24/386 id: 3866
processing question: 25/386 id: 3894
processing question: 26/386 id: 3896
pro

KeyboardInterrupt: 

In [None]:
import concurrent.futures


# Per-file statistics keyed by filename; populated from the worker results below.
stats = dict()

def process_file(filename):
    """Fill in 'question' and 'options' for the eligible questions of one file.

    Loads ``questions/<filename>``, keeps the entries that have an ``answer``
    below 6 and no ``img`` in their ``questionAsHtml``, and calls
    ``extract_question_from_html`` (up to three attempts) for each kept entry
    that has no ``options`` yet. After every successful extraction the kept
    list is written back to the same path.

    NOTE(review): only the filtered list is written back, so entries that fail
    the filter are removed from the file on the first successful save. Confirm
    this is intended.

    Args:
        filename: File name inside the ``questions/`` directory.

    Returns:
        ``(filename, stats_dict)`` when at least one image question was counted,
        otherwise ``(filename, None)``. ``stats_dict`` has the keys
        ``total_questions``, ``image_count`` and ``no_image_count``.
    """
    file_path = "questions/" + filename
    # Close the read handle before the file is ever rewritten.
    with open(file_path, 'r', encoding='utf-8') as f:
        questions = json.load(f)

    processedQuestions = [q for q in questions if 'answer' in q and q['answer'] < 6 and 'img' not in q['questionAsHtml'].lower()]

    # Initialize counters
    image_count = 0
    no_image_count = 0
    total_questions = len(processedQuestions)
    max_retries = 3

    # enumerate() replaces list.index(), which rescanned the list on every
    # iteration and reported the first match for duplicate entries.
    for position, question in enumerate(processedQuestions):
        print(f"Processing {filename} - question: {position}/{total_questions}")

        # Skip if question already has options
        if 'options' in question:
            continue

        has_image = 'questionAsHtml' in question and 'img' in question['questionAsHtml'].lower()
        if has_image:
            image_count += 1
            continue
        no_image_count += 1

        for attempt in range(1, max_retries + 1):
            try:
                questionResponse = extract_question_from_html(question['questionAsHtml'])
                if questionResponse is None:
                    print(f"Failed attempt {attempt} of {max_retries}")
                    continue

                print(f"questionResponse: {questionResponse}")
                # Read both fields before mutating, so an incomplete reply does
                # not leave 'question' set without 'options'.
                question_text = questionResponse['question']
                question_options = questionResponse['options']
                question['question'] = question_text
                question['options'] = question_options

                # Save after each successful question. Write to a temp file and
                # swap it in so an interrupt mid-write cannot truncate the data.
                tmp_path = file_path + ".tmp"
                with open(tmp_path, 'w', encoding='utf-8') as out:
                    json.dump(processedQuestions, out, ensure_ascii=False, indent=2)
                os.replace(tmp_path, file_path)
                break
            except Exception as e:
                print(f"Error on attempt {attempt}: {str(e)}")
        else:
            # Reached only when no attempt ended with `break`.
            print(f"Failed to process question after {max_retries} attempts")

    # Return stats only when image questions were seen.
    # NOTE(review): the filter above already excludes every question containing
    # 'img', so image_count stays 0 and (filename, None) is returned as written.
    if image_count > 0:
        return filename, {
            'total_questions': total_questions,
            'image_count': image_count,
            'no_image_count': no_image_count
        }
    return filename, None

# Run process_file over every path on a pool of four worker threads and
# materialise the (filename, stats-or-None) results into a list.
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    results = [*executor.map(process_file, filePaths)]

# Keep only the files that actually reported statistics.
for filename, file_stats in results:
    if not file_stats:
        continue
    stats[filename] = file_stats

# Per-file report.
print("File statistics:")
for file in stats:
    file_stats = stats[file]
    print(f"\n{file}:")
    print(f"  Total questions: {file_stats['total_questions']}")
    print(f"  Questions with images: {file_stats['image_count']}")
    print(f"  Questions without images: {file_stats['no_image_count']}")

# Grand total of image-free questions over all reported files.
total_no_image = sum(entry['no_image_count'] for entry in stats.values())
print(f"\nTotal questions without images across all files: {total_no_image}")


Processing biyoloji_7_questions.json - question: 0/201
Processing biyoloji_7_questions.json - question: 1/201
Processing biyoloji_7_questions.json - question: 2/201
Processing biyoloji_9_questions.json - question: 0/386
Processing biyoloji_8_questions.json - question: 0/417
Processing cografya_6_questions.json - question: 0/344
ChatCompletion(id=None, choices=None, created=None, model=None, object=None, service_tier=None, system_fingerprint=None, usage=None, error={'message': 'Provider returned error', 'code': 429, 'metadata': {'raw': '{\n  "error": {\n    "code": 429,\n    "message": "You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.",\n    "status": "RESOURCE_EXHAUSTED",\n    "details": [\n      {\n        "@type": "type.googleapis.com/google.rpc.QuotaFailure",\n        "violations": [\n          {\n            "quotaMetric": "generativelanguage.googleapis.com/ge