In [1]:
from bs4 import BeautifulSoup
import json
import re
import html as html_module
import os
from pathlib import Path
from datetime import datetime


def extract_from_view_source(filepath):
    """Extract HTML from a view-source saved file"""
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    if "line-content" in content:
        soup = BeautifulSoup(content, "html.parser")
        lines = soup.find_all("td", class_="line-content")
        html_parts = [line.get_text() for line in lines]
        raw_html = "\n".join(html_parts)
        decoded_html = html_module.unescape(raw_html)
        return decoded_html
    else:
        return content


# ============================================================================
# ANSWER PARSING
# ============================================================================

def parse_answer_options(soup):
    """Extract answer options with sophisticated parsing"""
    answers = []
    answer_list = soup.find("ul", class_="global-list")

    if not answer_list:
        return answers

    for li in answer_list.find_all("li", recursive=False):
        # Check if correct (green background)
        is_correct = False
        span = li.find("span", style=True)
        if span and "#CFFCE4" in span.get("style", ""):
            is_correct = True

        # Get the label containing all text
        label = li.find("span", class_="mdl-radio__label")
        if not label:
            continue

        full_text = label.get_text(strip=True)

        # Try to extract letter (a), b), c), etc.)
        letter_match = re.match(r"^([a-e])\)\s*", full_text)
        if letter_match:
            option_letter = letter_match.group(1) + "."
            text_after_letter = full_text[len(letter_match.group(0)) :].strip()
        else:
            option_letter = f"{chr(97 + len(answers))}."
            text_after_letter = full_text

        # Split on (correcta)/(incorrecta) marker followed by ":"
        pattern = r"^(.*?)\s*\((correcta|incorrecta)\):\s*(.*)$"
        match = re.match(pattern, text_after_letter, re.DOTALL | re.IGNORECASE)

        if match:
            short_text = match.group(1).strip()
            detailed_explanation = match.group(3).strip()
        else:
            # Fallback: try splitting on first ":"
            if ":" in text_after_letter:
                parts = text_after_letter.split(":", 1)
                short_text = re.sub(r"\s*\((correcta|incorrecta)\)", "", parts[0], flags=re.IGNORECASE).strip()
                detailed_explanation = parts[1].strip()
            else:
                short_text = re.sub(r"\s*\((correcta|incorrecta)\)", "", text_after_letter, flags=re.IGNORECASE).strip()
                detailed_explanation = ""

        # Remove any remaining (correcta)/(incorrecta) markers
        short_text = re.sub(r"\s*\((correcta|incorrecta)\)", "", short_text, flags=re.IGNORECASE).strip()

        # Remove duplicate text patterns like "DisenterÃ­aDisenterÃ­a"
        if len(short_text) % 2 == 0:
            midpoint = len(short_text) // 2
            first_half = short_text[:midpoint]
            second_half = short_text[midpoint:]
            if first_half == second_half and len(first_half) > 0:
                short_text = first_half

        answers.append(
            {
                "letter": option_letter,
                "text": short_text,
                "explanation": detailed_explanation,
                "is_correct": is_correct,
            }
        )

    return answers


# ============================================================================
# QUESTION EXTRACTION
# ============================================================================

def extract_question(item_html):
    """Extract a single question from HTML"""
    soup = BeautifulSoup(item_html, "html.parser")

    # Extract question ID
    question_id = None
    button = soup.find("button", {"data-bs-target": re.compile("question_")})
    if button:
        target = button.get("data-bs-target", "")
        match = re.search(r"question_(\d+)", target)
        if match:
            question_id = match.group(1)

    # Check if user got it correct
    user_result = "Correct" if soup.find("span", class_="success-badge") else "Incorrect"

    # Extract question text
    question_text = ""
    if button:
        bold = button.find("b")
        if bold:
            question_text = bold.get_text(strip=True)

    # Extract topic
    topic = ""
    modal_title = soup.find("h6", class_="modal-title")
    if modal_title:
        topic = modal_title.get_text(strip=True)

    # Extract general explanation
    explanation = ""
    modal_body = soup.find("div", class_="modal-body")
    if modal_body:
        p_tag = modal_body.find("p")
        if p_tag:
            explanation = p_tag.get_text(strip=True).replace("&quot;", "").strip('"')

    # Extract answer options
    answer_options = parse_answer_options(soup)

    # Find correct answer
    correct_answer = ""
    for opt in answer_options:
        if opt["is_correct"]:
            correct_answer = f"{opt['letter']} {opt['text']}"
            break

    return {
        "question_id": question_id,
        "question_number": question_id,
        "user_result": user_result,
        "topic": topic,
        "question_text": question_text,
        "answer_options": answer_options,
        "correct_answer": correct_answer,
        "explanation": explanation,
        "source_type": "eunacom",
    }


def extract_questions_from_file(filepath):
    """Extract all questions from a single file"""
    print(f"   ðŸ“„ {os.path.basename(filepath)}", end=" ")

    try:
        html = extract_from_view_source(filepath)
        soup = BeautifulSoup(html, "html.parser")

        accordion_items = soup.find_all("div", class_="gray-card accordion-item")
        if not accordion_items:
            accordion_items = soup.find_all("div", class_="accordion-item")

        questions = []
        for item in accordion_items:
            question = extract_question(str(item))
            if question["question_id"]:
                questions.append(question)

        print(f"â†’ {len(questions)} questions")
        return questions

    except Exception as e:
        print(f"â†’ Error: {str(e)}")
        return []


def extract_from_module_folder(module_folder):
    """Extract questions from a single module folder"""
    module_path = Path(module_folder)
    module_name = module_path.name
    
    html_files = list(module_path.glob("*.html")) + list(module_path.glob("*.htm"))
    
    if not html_files:
        print(f"\nâš  {module_name}: No HTML files found")
        return []
    
    print(f"\nðŸ“š MODULE: {module_name}")
    print(f"   Files: {len(html_files)}")
    
    all_questions = []
    for html_file in html_files:
        questions = extract_questions_from_file(html_file)
        
        # Add module and source info
        for q in questions:
            q["source_file"] = html_file.name
            q["module"] = module_name
        
        all_questions.extend(questions)
    
    print(f"   âœ“ Total: {len(all_questions)} questions")
    return all_questions


def extract_all_modules(base_folder):
    """Extract from all module subfolders"""
    base_path = Path(base_folder)
    
    # Find all subdirectories
    module_folders = [d for d in base_path.iterdir() if d.is_dir()]
    
    print("=" * 60)
    print("EXTRACTING EUNACOM QUESTIONS BY MODULE")
    print("=" * 60)
    print(f"Base folder: {base_path}")
    print(f"Modules found: {len(module_folders)}")
    
    all_questions_by_module = {}
    
    for module_folder in sorted(module_folders):
        questions = extract_from_module_folder(module_folder)
        if questions:
            all_questions_by_module[module_folder.name] = questions
    
    return all_questions_by_module


def save_questions_by_module(questions_by_module, output_dir="MIEUNACOM"):
    """Save questions to separate JSON files by module"""
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)
    
    print(f"\n{'='*60}")
    print(f"SAVING BY MODULE TO: {output_dir}/")
    print(f"{'='*60}")
    
    for module_name, questions in questions_by_module.items():
        # Create module subfolder
        module_path = output_path / module_name
        module_path.mkdir(exist_ok=True)
        
        # Save questions.json
        json_file = module_path / "questions.json"
        with open(json_file, "w", encoding="utf-8") as f:
            json.dump(questions, f, ensure_ascii=False, indent=2)
        
        print(f"   âœ“ {module_name}/questions.json ({len(questions)} questions)")


def print_summary(questions_by_module):
    """Print extraction summary"""
    if not questions_by_module:
        print("\nâš  No questions extracted!")
        return
    
    total_all = sum(len(qs) for qs in questions_by_module.values())
    
    print(f"\n{'='*60}")
    print("ðŸ“Š SUMMARY")
    print(f"{'='*60}")
    print(f"Total modules: {len(questions_by_module)}")
    print(f"Total questions: {total_all}")
    print()
    
    # Per-module stats
    for module_name, questions in sorted(questions_by_module.items()):
        total = len(questions)
        correct = sum(1 for q in questions if q["user_result"] == "Correct")
        incorrect = total - correct
        
        pct = (correct / total * 100) if total > 0 else 0
        print(f"{module_name:20s} {total:4d} questions  âœ“ {correct:3d} ({pct:5.1f}%)  âœ— {incorrect:3d}")


# ============================================================================
# MAIN EXECUTION
# ============================================================================

base_folder = r"C:\Users\vales\Downloads\EUNACOM_TOPICO"

# Extract all questions organized by module
questions_by_module = extract_all_modules(base_folder)

# Save to MIEUNACOM folder structure
save_questions_by_module(questions_by_module, "MIEUNACOM")

# Print summaryq
print_summary(questions_by_module)

print(f"\nâœ… Done! Ready to generate PDFs with generate_pdf.py")

EXTRACTING EUNACOM QUESTIONS BY MODULE
Base folder: C:\Users\vales\Downloads\EUNACOM_TOPICO
Modules found: 5

ðŸ“š MODULE: GINECOLOGIA
   Files: 2
   ðŸ“„ 1.html â†’ 20 questions
   ðŸ“„ 2.html â†’ 20 questions
   âœ“ Total: 40 questions

ðŸ“š MODULE: INFECTOLOGIA
   Files: 15
   ðŸ“„ 1.html â†’ 20 questions
   ðŸ“„ 10.html â†’ 20 questions
   ðŸ“„ 11.html â†’ 20 questions
   ðŸ“„ 12.html â†’ 20 questions
   ðŸ“„ 13.html â†’ 20 questions
   ðŸ“„ 14.html â†’ 20 questions
   ðŸ“„ 15.html â†’ 20 questions
   ðŸ“„ 2.html â†’ 20 questions
   ðŸ“„ 3.html â†’ 20 questions
   ðŸ“„ 4.html â†’ 20 questions
   ðŸ“„ 5.html â†’ 20 questions
   ðŸ“„ 6.html â†’ 20 questions
   ðŸ“„ 7.html â†’ 20 questions
   ðŸ“„ 8.html â†’ 20 questions
   ðŸ“„ 9.html â†’ 20 questions
   âœ“ Total: 300 questions

ðŸ“š MODULE: NEURO
   Files: 15
   ðŸ“„ 1.html â†’ 20 questions
   ðŸ“„ 10.html â†’ 20 questions
   ðŸ“„ 11.html â†’ 20 questions
   ðŸ“„ 12.html â†’ 20 questions
   ðŸ“„ 13.html â†’ 20 questions
   ðŸ“„ 14.