In [12]:
from bs4 import BeautifulSoup
import pandas as pd
import json
import re
import os
from pathlib import Path
from datetime import datetime
import requests
from urllib.parse import urljoin, urlparse
import hashlib

# Configuration
BASE_DIR = r"C:\Users\vales\DataspellProjects\keuna\EUNACOM\ENSAYOS\GUEVARA"
OUTPUT_DIR = r"C:\Users\vales\DataspellProjects\keuna\EUNACOM\OUTPUTS"
IMAGES_DIR = os.path.join(OUTPUT_DIR, "images")

# Create directories if they don't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(IMAGES_DIR, exist_ok=True)


def get_browser_cookies():
    """
    Try to get cookies from browser for authentication
    """
    try:
        import browser_cookie3

        # Try Chrome first, then Firefox
        try:
            cookies = browser_cookie3.chrome(domain_name="doctorguevara.cl")
            print("  ✓ Using Chrome cookies")
            return cookies
        except:
            try:
                cookies = browser_cookie3.firefox(domain_name="doctorguevara.cl")
                print("  ✓ Using Firefox cookies")
                return cookies
            except:
                print("  ⚠ Could not load browser cookies")
                return None
    except ImportError:
        print("  ⚠ browser_cookie3 not installed. Run: pip install browser-cookie3")
        return None


def download_image(img_url, cookies=None):
    """
    Download image and save it locally
    Returns the local path
    """
    try:
        # Create a unique filename based on URL hash
        url_hash = hashlib.md5(img_url.encode()).hexdigest()

        # Get file extension from URL
        parsed_url = urlparse(img_url)
        ext = os.path.splitext(parsed_url.path)[1] or ".png"

        local_filename = f"{url_hash}{ext}"
        local_path = os.path.join(IMAGES_DIR, local_filename)

        # Check if already downloaded
        if os.path.exists(local_path):
            return local_path

        # Download the image
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

        response = requests.get(img_url, headers=headers, cookies=cookies, timeout=10)

        if response.status_code == 200:
            with open(local_path, "wb") as f:
                f.write(response.content)
            return local_path
        else:
            print(f"    ⚠ Failed to download image (status {response.status_code}): {img_url}")
            return None

    except Exception as e:
        print(f"    ⚠ Error downloading image: {e}")
        return None


def extract_images_from_element(element, base_url="https://cursosonline.doctorguevara.cl", cookies=None):
    """
    Extract and download all images from an HTML element
    Returns list of image info dictionaries
    """
    images = []

    if element:
        img_tags = element.find_all("img")

        for img in img_tags:
            img_src = img.get("src", "")
            img_alt = img.get("alt", "")

            if img_src:
                # Make absolute URL
                if img_src.startswith("http"):
                    full_url = img_src
                else:
                    full_url = urljoin(base_url, img_src)

                # Download image
                local_path = download_image(full_url, cookies)

                images.append(
                    {
                        "original_url": full_url,
                        "local_path": local_path,
                        "alt_text": img_alt,
                        "width": img.get("width", ""),
                        "height": img.get("height", ""),
                    }
                )

    return images


def extract_questions_from_html(html_content, source_filename, cookies=None):
    """
    Extract questions from HTML content with images
    Returns list of question dictionaries
    """
    # Parse the view-source page first
    soup_viewsource = BeautifulSoup(html_content, "html.parser")

    # Check if it's a view-source format
    line_contents = soup_viewsource.find_all("td", class_="line-content")

    if line_contents:
        # It's a view-source format, reconstruct HTML
        actual_html_lines = [line_td.get_text() for line_td in line_contents]
        actual_html = "\n".join(actual_html_lines)
        soup = BeautifulSoup(actual_html, "html.parser")
    else:
        # It's regular HTML
        soup = soup_viewsource

    # Find questions
    questions = soup.find_all("div", id=re.compile(r"question-\d+-\d+"))

    if not questions:
        # Try alternative pattern
        questions = soup.find_all("div", class_=re.compile(r"que.*multichoice"))

    extracted_data = []

    for idx, question in enumerate(questions, 1):
        try:
            # Extract question ID
            question_id = question.get("id", f"q_{idx}")

            # Question number
            qno_span = question.find("span", class_="qno")
            q_number = qno_span.get_text(strip=True) if qno_span else str(idx)

            # Question text - FULL TEXT with images
            qtext_div = question.find("div", class_="qtext")
            q_text = ""
            q_images = []

            if qtext_div:
                # Extract images first
                q_images = extract_images_from_element(qtext_div, cookies=cookies)

                # Get text without images
                qtext_copy = qtext_div.__copy__()
                for table in qtext_copy.find_all("table"):
                    table.decompose()
                for img in qtext_copy.find_all("img"):
                    img.decompose()
                q_text = qtext_copy.get_text(strip=True, separator=" ")

            # Find ALL answer options
            answer_divs = question.find_all("div", class_=re.compile(r"^r[0-1]$"))

            correct_answer = None
            all_options = []

            for ans_div in answer_divs:
                label = ans_div.find("div", class_="d-flex")
                if label:
                    letter_span = label.find("span", class_="answernumber")
                    text_div = label.find("div", class_="flex-fill")

                    if letter_span and text_div:
                        letter = letter_span.get_text(strip=True)
                        text = text_div.get_text(strip=True)

                        is_correct = "correct" in ans_div.get("class", [])

                        option_dict = {"letter": letter, "text": text, "is_correct": is_correct}

                        all_options.append(option_dict)

                        if is_correct:
                            correct_answer = f"{letter}{text}"

            # Get feedback/explanation - FULL TEXT with images
            feedback_div = question.find("div", class_="generalfeedback")
            feedback = ""
            feedback_images = []

            if feedback_div:
                # Extract images from feedback
                feedback_images = extract_images_from_element(feedback_div, cookies=cookies)

                # Get text
                feedback = feedback_div.get_text(strip=True, separator=" ")

            # Extract topic (first sentence or up to 150 chars)
            if q_text:
                sentences = q_text.split(".")
                topic = sentences[0][:150] if sentences else q_text[:150]
            else:
                topic = "Sin descripción"

            # Structure the data
            question_data = {
                "question_id": question_id,
                "question_number": q_number,
                "source_file": source_filename,
                "topic": topic,
                "question_text": q_text,
                "question_images": q_images,
                "answer_options": all_options,
                "correct_answer": correct_answer,
                "explanation": feedback,
                "explanation_images": feedback_images,
            }

            extracted_data.append(question_data)

        except Exception as e:
            print(f"    ✗ Error processing question {idx}: {e}")

    return extracted_data


def main():
    print("=" * 80)
    print("QUIZ DATA EXTRACTOR - BATCH PROCESSOR WITH IMAGE DOWNLOAD")
    print("=" * 80)
    print(f"Source directory: {BASE_DIR}")
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"Images directory: {IMAGES_DIR}\n")

    # Try to get browser cookies for authenticated downloads
    print("Attempting to load browser cookies...")
    cookies = get_browser_cookies()
    print()

    # Find all HTML files
    html_files = list(Path(BASE_DIR).glob("*.html"))

    if not html_files:
        print("✗ No HTML files found in the directory!")
        return

    print(f"Found {len(html_files)} HTML files\n")
    print("=" * 80)

    # Process each file
    all_questions = []

    for file_idx, html_file in enumerate(html_files, 1):
        filename = html_file.name
        print(f"\n[{file_idx}/{len(html_files)}] {filename}")

        try:
            # Read file
            with open(html_file, "r", encoding="utf-8") as f:
                html_content = f.read()

            # Extract questions
            questions = extract_questions_from_html(html_content, filename, cookies)

            if questions:
                all_questions.extend(questions)

                # Count images
                total_images = sum(len(q["question_images"]) + len(q["explanation_images"]) for q in questions)

                print(f"  ✓ {len(questions)} questions, {total_images} images")
            else:
                print(f"  ⚠ No questions found")

        except Exception as e:
            print(f"  ✗ Error: {e}")

    # Generate output files
    print("\n" + "=" * 80)
    print("GENERATING OUTPUT FILES")
    print("=" * 80 + "\n")

    if not all_questions:
        print("✗ No questions were extracted!")
        return

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # 1. Save complete JSON with all data
    output_json = os.path.join(OUTPUT_DIR, f"questions_complete_{timestamp}.json")
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(all_questions, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved JSON: {os.path.basename(output_json)}")

    # 2. Create simplified DataFrame for Excel/CSV
    simplified_data = []
    for q in all_questions:
        row = {
            "question_id": q["question_id"],
            "question_number": q["question_number"],
            "source_file": q["source_file"],
            "topic": q["topic"],
            "question_text": q["question_text"],
            "has_question_images": len(q["question_images"]) > 0,
            "question_image_count": len(q["question_images"]),
            "question_image_paths": " | ".join(
                [img["local_path"] or img["original_url"] for img in q["question_images"]]
            ),
            "correct_answer": q["correct_answer"],
            "explanation": q["explanation"],
            "has_explanation_images": len(q["explanation_images"]) > 0,
            "explanation_image_count": len(q["explanation_images"]),
            "explanation_image_paths": " | ".join(
                [img["local_path"] or img["original_url"] for img in q["explanation_images"]]
            ),
        }

        # Add answer options
        for i, opt in enumerate(q["answer_options"], 1):
            row[f"option_{i}"] = f"{opt['letter']}{opt['text']}"
            row[f"option_{i}_correct"] = opt["is_correct"]

        simplified_data.append(row)

    df = pd.DataFrame(simplified_data)

    # Save Excel
    output_excel = os.path.join(OUTPUT_DIR, f"questions_database_{timestamp}.xlsx")
    df.to_excel(output_excel, index=False, engine="openpyxl")
    print(f"✓ Saved Excel: {os.path.basename(output_excel)}")

    # Save CSV
    output_csv = os.path.join(OUTPUT_DIR, f"questions_database_{timestamp}.csv")
    df.to_csv(output_csv, index=False, encoding="utf-8-sig")
    print(f"✓ Saved CSV: {os.path.basename(output_csv)}")

    # Print statistics
    print("\n" + "=" * 80)
    print("STATISTICS")
    print("=" * 80)
    print(f"Files processed: {len(html_files)}")
    print(f"Total questions: {len(all_questions)}")

    total_q_images = sum(len(q["question_images"]) for q in all_questions)
    total_e_images = sum(len(q["explanation_images"]) for q in all_questions)
    total_images = total_q_images + total_e_images

    print(f"Total images: {total_images}")
    print(f"  - In questions: {total_q_images}")
    print(f"  - In explanations: {total_e_images}")

    downloaded_images = sum(
        1 for q in all_questions for img in q["question_images"] + q["explanation_images"] if img["local_path"]
    )
    print(f"Successfully downloaded: {downloaded_images}/{total_images}")

    print("\n" + "=" * 80)
    print("COMPLETE!")
    print("=" * 80)


if __name__ == "__main__":
    main()

QUIZ DATA EXTRACTOR - BATCH PROCESSOR WITH IMAGE DOWNLOAD
Source directory: C:\Users\vales\DataspellProjects\keuna\EUNACOM\ENSAYOS\GUEVARA
Output directory: C:\Users\vales\DataspellProjects\keuna\EUNACOM\OUTPUTS
Images directory: C:\Users\vales\DataspellProjects\keuna\EUNACOM\OUTPUTS\images

Attempting to load browser cookies...
  ⚠ browser_cookie3 not installed. Run: pip install browser-cookie3

Found 70 HTML files


[1/70] 1.1 Cuestionario Diabetes.html
  ✓ 15 questions, 0 images

[2/70] 1.2. Cuestionario Diabetes.html
  ✓ 15 questions, 0 images

[3/70] 1.3 Cuestionario Diabetes.html
  ✓ 14 questions, 0 images

[4/70] 10.2 Cuestionario Neurología.html
  ✓ 14 questions, 0 images

[5/70] 10.3 Cuestionario Neurología.html
  ✓ 20 questions, 0 images

[6/70] 10.4 Cuestionario Neurología.html
  ✓ 14 questions, 0 images

[7/70] 10.5 Cuestionario Neurología.html
  ✓ 20 questions, 0 images

[8/70] 10.6 Cuestionario Neurología.html
  ✓ 7 questions, 1 images

[9/70] 2.1 Cuestionario Endocrinolo

In [None]:
questions_database_20251019_185913.xlsx

In [1]:
import polars as pl

In [4]:
pl.read_csv(r"C:\Users\vales\DataspellProjects\keuna\EUNACOM\OUTPUTS\questions_database_20251019_185913.csv")[
    "source_file"
].unique().to_list()

['7.2 Cuestionario Infectología.html',
 '1.2. Cuestionario Diabetes.html',
 'Prueba repaso 4.html',
 '2.2 Cuestionario Endocrinología.html',
 '3.6 Cuestionario Cardiología.html',
 '7.1 Cuestionario Infectología.html',
 '5.1 Cuestionario Reumatología.html',
 '8.4 Cuestionario Respiratorio.html',
 '2.3 Cuestionario Endocrinología.html',
 '4.5 Cuestionario Nefrología.html',
 'Reconstrucción Eunacom julio 2024 P2.html',
 '10.3 Cuestionario Neurología.html',
 '4.2 Cuestionario Nefrología.html',
 '9.1 Cuestionario Gastroenterología.html',
 '5.4 Cuestionario Reumatología.html',
 '2.1 Cuestionario Endocrinología.html',
 '6.1 Cuestionario Hematología.html',
 '6.6 Cuestionario Hematología.html',
 '8.7 Cuestionario Respiratorio.html',
 '1.3 Cuestionario Diabetes.html',
 '3.2 Cuestionario Cardiología.html',
 '2.4 Cuestionario Endocrinología.html',
 '9.4 Cuestionario Gastroenterología.html',
 '6.5 Cuestionario Hematología.html',
 '10.4 Cuestionario Neurología.html',
 '6.3 Cuestionario Hematología.h

In [6]:
from bs4 import BeautifulSoup
import pandas as pd
import json
import re
import os
from pathlib import Path
from datetime import datetime

# Configuration
BASE_DIR = r"C:\Users\vales\DataspellProjects\keuna\EUNACOM\ENSAYOS\GUEVARA"
OUTPUT_DIR = BASE_DIR  # Save outputs in the same directory


def extract_questions_from_html(html_content, source_filename):
    """
    Extract questions from HTML content
    Returns list of question dictionaries
    """
    # Parse the view-source page first
    soup_viewsource = BeautifulSoup(html_content, "html.parser")

    # Check if it's a view-source format
    line_contents = soup_viewsource.find_all("td", class_="line-content")

    if line_contents:
        # It's a view-source format, reconstruct HTML
        actual_html_lines = [line_td.get_text() for line_td in line_contents]
        actual_html = "\n".join(actual_html_lines)
        soup = BeautifulSoup(actual_html, "html.parser")
    else:
        # It's regular HTML
        soup = soup_viewsource

    # Find questions
    questions = soup.find_all("div", id=re.compile(r"question-\d+-\d+"))

    if not questions:
        # Try alternative pattern
        questions = soup.find_all("div", class_=re.compile(r"que.*multichoice"))

    extracted_data = []

    for idx, question in enumerate(questions, 1):
        try:
            # Extract question ID from the div id attribute
            question_id = question.get("id", f"q_{idx}")

            # Question number
            qno_span = question.find("span", class_="qno")
            q_number = qno_span.get_text(strip=True) if qno_span else str(idx)

            # Question text - FULL TEXT
            qtext_div = question.find("div", class_="qtext")
            if qtext_div:
                qtext_copy = qtext_div.__copy__()
                # Remove tables but keep the rest
                for table in qtext_copy.find_all("table"):
                    table.decompose()
                q_text = qtext_copy.get_text(strip=True, separator=" ")
            else:
                q_text = ""

            # Find ALL answer options
            answer_divs = question.find_all("div", class_=re.compile(r"^r[0-1]$"))

            correct_answer = None
            all_options = []

            for ans_div in answer_divs:
                label = ans_div.find("div", class_="d-flex")
                if label:
                    letter_span = label.find("span", class_="answernumber")
                    text_div = label.find("div", class_="flex-fill")

                    if letter_span and text_div:
                        letter = letter_span.get_text(strip=True)
                        text = text_div.get_text(strip=True)

                        is_correct = "correct" in ans_div.get("class", [])

                        option_dict = {"letter": letter, "text": text, "is_correct": is_correct}

                        all_options.append(option_dict)

                        if is_correct:
                            correct_answer = f"{letter} {text}"

            # Get feedback/explanation - FULL TEXT
            feedback_div = question.find("div", class_="generalfeedback")
            feedback = feedback_div.get_text(strip=True, separator=" ") if feedback_div else ""

            # Get the "correct answer" text
            rightanswer_div = question.find("div", class_="rightanswer")
            right_answer_text = rightanswer_div.get_text(strip=True) if rightanswer_div else ""

            # Extract topic (first 100 chars of question or first sentence)
            topic = q_text.split(".")[0][:100] if q_text else "Tema no especificado"

            # Structure the data
            question_data = {
                "question_id": question_id,
                "question_number": q_number,
                "topic": topic,
                "question_text": q_text,
                "answer_options": all_options,
                "correct_answer": correct_answer,
                "explanation": feedback,
                "right_answer_text": right_answer_text,
                "source_file": source_filename,
            }

            extracted_data.append(question_data)

        except Exception as e:
            print(f"    ✗ Error processing question {idx} in {source_filename}: {e}")

    return extracted_data


def main():
    print("=" * 80)
    print("QUIZ DATA EXTRACTOR - BATCH PROCESSOR")
    print("=" * 80)
    print(f"Source directory: {BASE_DIR}\n")

    # Find all HTML files
    html_files = list(Path(BASE_DIR).glob("*.html"))

    if not html_files:
        print("✗ No HTML files found in the directory!")
        return

    print(f"Found {len(html_files)} HTML files\n")
    print("=" * 80)

    # Process each file
    all_questions = []
    file_stats = []

    for file_idx, html_file in enumerate(html_files, 1):
        filename = html_file.name
        print(f"\n[{file_idx}/{len(html_files)}] Processing: {filename}")
        print("-" * 60)

        try:
            # Read file
            with open(html_file, "r", encoding="utf-8") as f:
                html_content = f.read()

            print(f"  File size: {len(html_content):,} characters")

            # Extract questions
            questions = extract_questions_from_html(html_content, filename)

            if questions:
                all_questions.extend(questions)
                print(f"  ✓ Extracted {len(questions)} questions")

                file_stats.append({"filename": filename, "questions_count": len(questions), "status": "Success"})
            else:
                print(f"  ⚠ No questions found")
                file_stats.append({"filename": filename, "questions_count": 0, "status": "No questions found"})

        except Exception as e:
            print(f"  ✗ Error processing file: {e}")
            file_stats.append({"filename": filename, "questions_count": 0, "status": f"Error: {str(e)[:50]}"})

    # Generate output files
    print("\n" + "=" * 80)
    print("GENERATING OUTPUT FILES")
    print("=" * 80 + "\n")

    if not all_questions:
        print("✗ No questions were extracted from any file!")
        return

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # 1. Save as JSON
    output_json = os.path.join(OUTPUT_DIR, f"quiz_data_consolidated_{timestamp}.json")
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(all_questions, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved JSON: {output_json}")

    # 2. Create flattened DataFrame for Excel/CSV
    flattened_data = []
    for q in all_questions:
        row = {
            "question_id": q["question_id"],
            "question_number": q["question_number"],
            "topic": q["topic"],
            "question_text": q["question_text"],
            "correct_answer": q["correct_answer"],
            "explanation": q["explanation"],
            "source_file": q["source_file"],
        }

        # Add each answer option as columns
        for i, opt in enumerate(q["answer_options"], 1):
            row[f"option_{i}"] = f"{opt['letter']} {opt['text']}"
            row[f"option_{i}_correct"] = "SÍ" if opt["is_correct"] else "NO"

        flattened_data.append(row)

    df = pd.DataFrame(flattened_data)

    # Save Excel
    output_excel = os.path.join(OUTPUT_DIR, f"quiz_data_consolidated_{timestamp}.xlsx")
    df.to_excel(output_excel, index=False, engine="openpyxl")
    print(f"✓ Saved Excel: {output_excel}")

    # Save CSV
    output_csv = os.path.join(OUTPUT_DIR, f"quiz_data_consolidated_{timestamp}.csv")
    df.to_csv(output_csv, index=False, encoding="utf-8-sig")
    print(f"✓ Saved CSV: {output_csv}")

    # 3. Save processing summary
    summary_df = pd.DataFrame(file_stats)
    summary_file = os.path.join(OUTPUT_DIR, f"processing_summary_{timestamp}.csv")
    summary_df.to_csv(summary_file, index=False, encoding="utf-8-sig")
    print(f"✓ Saved processing summary: {summary_file}")

    # Print statistics
    print("\n" + "=" * 80)
    print("FINAL STATISTICS")
    print("=" * 80)
    print(f"\nFiles processed: {len(html_files)}")
    print(f"Total questions extracted: {len(all_questions)}")
    print(f"Successful files: {len([s for s in file_stats if s['status'] == 'Success'])}")
    print(f"Failed files: {len([s for s in file_stats if s['status'] != 'Success'])}")

    print("\n" + "Per-file breakdown:")
    print(summary_df.to_string(index=False))

    if all_questions:
        avg_options = sum(len(q["answer_options"]) for q in all_questions) / len(all_questions)
        print(f"\nAverage options per question: {avg_options:.1f}")

        # Count questions by source file
        print("\nQuestions per file:")
        source_counts = {}
        for q in all_questions:
            source = q["source_file"]
            source_counts[source] = source_counts.get(source, 0) + 1

        for source, count in sorted(source_counts.items()):
            print(f"  {source}: {count} questions")

    print("\n" + "=" * 80)
    print("PROCESSING COMPLETE!")
    print("=" * 80)


if __name__ == "__main__":
    main()

QUIZ DATA EXTRACTOR - BATCH PROCESSOR
Source directory: C:\Users\vales\DataspellProjects\keuna\EUNACOM\ENSAYOS\GUEVARA

Found 70 HTML files


[1/70] Processing: 1.1 Cuestionario Diabetes.html
------------------------------------------------------------
  File size: 1,074,933 characters
  ✓ Extracted 15 questions

[2/70] Processing: 1.2. Cuestionario Diabetes.html
------------------------------------------------------------
  File size: 1,056,766 characters
  ✓ Extracted 15 questions

[3/70] Processing: 1.3 Cuestionario Diabetes.html
------------------------------------------------------------
  File size: 1,026,345 characters
  ✓ Extracted 14 questions

[4/70] Processing: 10.2 Cuestionario Neurología.html
------------------------------------------------------------
  File size: 1,025,094 characters
  ✓ Extracted 14 questions

[5/70] Processing: 10.3 Cuestionario Neurología.html
------------------------------------------------------------
  File size: 1,145,674 characters
  ✓ Extracted 

In [10]:
all_questions

NameError: name 'all_questions' is not defined

In [9]:
df.head().to_dicts()

AttributeError: 'DataFrame' object has no attribute 'to_dicts'

In [1]:
path = r"C:\Users\vales\DataspellProjects\keuna\EUNACOM\ENSAYOS\GUEVARA\view-source_https___cursosonline.doctorguevara.cl_mod_quiz_review.php_attempt=1263771&cmid=125317.html"

In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import re

path = r"C:\Users\vales\DataspellProjects\keuna\EUNACOM\ENSAYOS\GUEVARA\view-source_https___cursosonline.doctorguevara.cl_mod_quiz_review.php_attempt=1263771&cmid=125317.html"

print("Reading view-source HTML file...")
with open(path, "r", encoding="utf-8") as f:
    view_source_html = f.read()

print(f"File size: {len(view_source_html)} characters\n")

# Parse the view-source page
print("Extracting actual HTML from view-source format...")
soup_viewsource = BeautifulSoup(view_source_html, "html.parser")

# Find all line-content cells
line_contents = soup_viewsource.find_all("td", class_="line-content")
print(f"Found {len(line_contents)} lines of code\n")

if not line_contents:
    print("✗ Could not find line-content elements. This might not be a view-source file.")
    exit()

# Extract and reconstruct the actual HTML
actual_html_lines = []
for line_td in line_contents:
    # Get all text from this line, which will include the HTML
    line_text = line_td.get_text()
    actual_html_lines.append(line_text)

# Join all lines to get the complete HTML
actual_html = "\n".join(actual_html_lines)

print(f"Reconstructed HTML size: {len(actual_html)} characters")
print("=" * 80)

# Now parse the ACTUAL HTML
soup = BeautifulSoup(actual_html, "html.parser")

print("SEARCHING FOR QUESTIONS IN RECONSTRUCTED HTML")
print("=" * 80)

# Search for questions
questions = soup.find_all("div", id=re.compile(r"question-\d+-\d+"))
print(f"Questions found: {len(questions)}\n")

if not questions:
    print("Still no questions found. Trying alternative patterns...")
    questions = soup.find_all("div", class_=re.compile(r"que.*multichoice"))
    print(f"Questions with 'que multichoice' class: {len(questions)}\n")

if not questions:
    print("✗ No questions found even after reconstruction")
    print("\nShowing sample of reconstructed HTML (chars 50000-50500):")
    print(actual_html[50000:50500])
    exit()

print(f"✓ Successfully found {len(questions)} questions!")
print("\n" + "=" * 80)
print("EXTRACTING QUESTION DATA")
print("=" * 80 + "\n")

# Extract data
data = []

for question in questions:
    try:
        # Question number
        qno_span = question.find("span", class_="qno")
        q_number = qno_span.get_text(strip=True) if qno_span else "?"

        # State
        state_div = question.find("div", class_="state")
        q_state = state_div.get_text(strip=True) if state_div else "N/A"

        # Question text
        qtext_div = question.find("div", class_="qtext")
        if qtext_div:
            # Remove tables to get clean question text
            for table in qtext_div.find_all("table"):
                table.decompose()
            q_text = qtext_div.get_text(strip=True, separator=" ")
        else:
            q_text = "N/A"

        # Find answer options
        answer_divs = question.find_all("div", class_=re.compile(r"^r[0-1]$"))

        correct_answer = None
        user_answer = None
        all_answers = []

        for ans_div in answer_divs:
            # Find the label with answer text
            label = ans_div.find("div", class_="d-flex")
            if label:
                letter_span = label.find("span", class_="answernumber")
                text_div = label.find("div", class_="flex-fill")

                if letter_span and text_div:
                    letter = letter_span.get_text(strip=True)
                    text = text_div.get_text(strip=True)
                    answer_full = f"{letter} {text}"

                    all_answers.append(answer_full)

                    # Check if this is correct
                    if "correct" in ans_div.get("class", []):
                        correct_answer = answer_full

                    # Check if this was selected by user
                    radio_input = ans_div.find("input", type="radio")
                    if radio_input and radio_input.get("checked"):
                        user_answer = answer_full

        # Get feedback/explanation
        feedback_div = question.find("div", class_="generalfeedback")
        feedback = feedback_div.get_text(strip=True, separator=" ") if feedback_div else ""

        # Determine if answer was correct
        was_correct = q_state == "Correcta"

        data.append(
            {
                "Q#": q_number,
                "Status": "✓" if was_correct else "✗",
                "Question": q_text[:200] + "..." if len(q_text) > 200 else q_text,
                "Your_Answer": user_answer if user_answer else correct_answer,
                "Correct_Answer": correct_answer or "N/A",
                "Explanation": feedback[:350] + "..." if len(feedback) > 350 else feedback,
            }
        )

        status_icon = "✓" if was_correct else "✗"
        print(f"{status_icon} Q{q_number}: {q_state} - {correct_answer[:60] if correct_answer else 'N/A'}...")

    except Exception as e:
        print(f"✗ Error processing question: {e}")
        import traceback

        traceback.print_exc()

# Create DataFrame
if data:
    df = pd.DataFrame(data)

    print("\n" + "=" * 80)
    print("EXTRACTION COMPLETE - SUMMARY TABLE")
    print("=" * 80 + "\n")

    # Display summary
    summary_df = df[["Q#", "Status", "Correct_Answer"]].copy()
    summary_df["Correct_Answer"] = summary_df["Correct_Answer"].str[:80]
    print(summary_df.to_string(index=False))

    # Save files
    output_excel = "quiz_results.xlsx"
    output_csv = "quiz_results.csv"

    df.to_excel(output_excel, index=False, engine="openpyxl")
    df.to_csv(output_csv, index=False, encoding="utf-8-sig")

    print(f"\n✓ Saved to: {output_excel}")
    print(f"✓ Saved to: {output_csv}")

    # Statistics
    print("\n" + "=" * 80)
    print("QUIZ STATISTICS")
    print("=" * 80)
    total = len(df)
    correct = len(df[df["Status"] == "✓"])
    incorrect = total - correct
    percentage = (correct / total * 100) if total > 0 else 0

    print(f"Total Questions: {total}")
    print(f"Correct: {correct}")
    print(f"Incorrect: {incorrect}")
    print(f"Score: {percentage:.2f}%")

else:
    print("\n✗ No data was extracted")

Reading view-source HTML file...
File size: 1074933 characters

Extracting actual HTML from view-source format...
Found 2815 lines of code

Reconstructed HTML size: 244337 characters
SEARCHING FOR QUESTIONS IN RECONSTRUCTED HTML
Questions found: 15

✓ Successfully found 15 questions!

EXTRACTING QUESTION DATA

✓ Q1: Correcta - c. Aumentar la insulina cristalina del almuerzo y cena...
✓ Q2: Correcta - d. Iniciar un fibrato...
✓ Q3: Correcta - a. Test de tolerancia a la glucosa...
✓ Q4: Correcta - e. Niveles plasmáticos de péptido C...
✓ Q5: Correcta - d. iECAs...
✓ Q6: Correcta - c. Iniciar insulina...
✓ Q7: Correcta - e. Hospitalizar, iniciar insulinoterapia y venlafaxina oral...
✓ Q8: Correcta - e. Iniciar insulina...
✓ Q9: Correcta - c. Indicar dieta e iniciar una estatina...
✓ Q10: Correcta - c. Hipoglicemia facticia por glibenclamida...
✓ Q11: Correcta - c. Iniciar insulina es esquema intensificado, con una dosis ...
✓ Q12: Correcta - c. Iniciar dieta y un fibrato...
✓ Q13: Correct

In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import json
import re

path = r"C:\Users\vales\DataspellProjects\keuna\EUNACOM\ENSAYOS\GUEVARA\view-source_https___cursosonline.doctorguevara.cl_mod_quiz_review.php_attempt=1263771&cmid=125317.html"

print("Reading view-source HTML file...")
with open(path, "r", encoding="utf-8") as f:
    view_source_html = f.read()

print(f"File size: {len(view_source_html)} characters\n")

# Parse the view-source page
print("Extracting actual HTML from view-source format...")
soup_viewsource = BeautifulSoup(view_source_html, "html.parser")

# Find all line-content cells
line_contents = soup_viewsource.find_all("td", class_="line-content")
print(f"Found {len(line_contents)} lines of code\n")

if not line_contents:
    print("✗ Could not find line-content elements.")
    exit()

# Reconstruct the actual HTML
actual_html_lines = []
for line_td in line_contents:
    line_text = line_td.get_text()
    actual_html_lines.append(line_text)

actual_html = "\n".join(actual_html_lines)
print(f"Reconstructed HTML size: {len(actual_html)} characters")
print("=" * 80)

# Parse the ACTUAL HTML
soup = BeautifulSoup(actual_html, "html.parser")

print("SEARCHING FOR QUESTIONS IN RECONSTRUCTED HTML")
print("=" * 80)

# Search for questions
questions = soup.find_all("div", id=re.compile(r"question-\d+-\d+"))
print(f"Questions found: {len(questions)}\n")

if not questions:
    print("✗ No questions found")
    exit()

print(f"✓ Successfully found {len(questions)} questions!")
print("\n" + "=" * 80)
print("EXTRACTING QUESTION DATA (FULL TEXT, NO TRUNCATION)")
print("=" * 80 + "\n")

# Extract data
quiz_data = []

for idx, question in enumerate(questions, 1):
    try:
        # Extract question ID from the div id attribute
        question_id = question.get("id", "")

        # Question number
        qno_span = question.find("span", class_="qno")
        q_number = qno_span.get_text(strip=True) if qno_span else str(idx)

        # Question text - FULL TEXT, NO TRUNCATION
        qtext_div = question.find("div", class_="qtext")
        if qtext_div:
            # Remove tables but keep the rest
            qtext_copy = qtext_div.__copy__()
            for table in qtext_copy.find_all("table"):
                table.decompose()
            q_text = qtext_copy.get_text(strip=True, separator=" ")
        else:
            q_text = ""

        # Find ALL answer options
        answer_divs = question.find_all("div", class_=re.compile(r"^r[0-1]$"))

        correct_answer = None
        correct_explanation = None
        all_options = []

        for ans_div in answer_divs:
            # Find the label with answer text
            label = ans_div.find("div", class_="d-flex")
            if label:
                letter_span = label.find("span", class_="answernumber")
                text_div = label.find("div", class_="flex-fill")

                if letter_span and text_div:
                    letter = letter_span.get_text(strip=True)
                    text = text_div.get_text(strip=True)

                    # Check if this is correct
                    is_correct = "correct" in ans_div.get("class", [])

                    option_dict = {"letter": letter, "text": text, "is_correct": is_correct}

                    all_options.append(option_dict)

                    if is_correct:
                        correct_answer = f"{letter} {text}"

        # Get feedback/explanation - FULL TEXT
        feedback_div = question.find("div", class_="generalfeedback")
        feedback = feedback_div.get_text(strip=True, separator=" ") if feedback_div else ""

        # Get the "correct answer" text at the end
        rightanswer_div = question.find("div", class_="rightanswer")
        right_answer_text = rightanswer_div.get_text(strip=True) if rightanswer_div else ""

        # Try to extract topic from question text or feedback (first sentence or similar)
        # This is a simple heuristic - you may want to adjust
        topic = q_text.split(".")[0][:100] if q_text else "Tema no especificado"

        # Structure the data as requested
        question_data = {
            "question_id": question_id.split("-")[-1] if question_id else str(idx),
            "question_number": q_number,
            "topic": topic,
            "question_text": q_text,
            "answer_options": all_options,  # List of all options
            "correct_answer": correct_answer,
            "explanation": feedback,
            "right_answer_text": right_answer_text,
            "source_exam": "GUEVARA - Diabetes Quiz",  # You can modify this
        }

        quiz_data.append(question_data)

        print(
            f"✓ Q{q_number} extracted: {len(all_options)} options, {len(q_text)} chars question, {len(feedback)} chars explanation"
        )

    except Exception as e:
        print(f"✗ Error processing question {idx}: {e}")
        import traceback

        traceback.print_exc()

# Save as JSON
print("\n" + "=" * 80)
print("SAVING DATA")
print("=" * 80)

output_json = "quiz_data_complete.json"
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(quiz_data, f, ensure_ascii=False, indent=2)

print(f"✓ Saved complete data to: {output_json}")

# Also create a flattened version for Excel/CSV
flattened_data = []
for q in quiz_data:
    # Create a row with all answer options as separate columns
    row = {
        "question_id": q["question_id"],
        "question_number": q["question_number"],
        "topic": q["topic"],
        "question_text": q["question_text"],
        "correct_answer": q["correct_answer"],
        "explanation": q["explanation"],
        "source_exam": q["source_exam"],
    }

    # Add each answer option as a column
    for i, opt in enumerate(q["answer_options"], 1):
        row[f"option_{i}"] = f"{opt['letter']} {opt['text']}"
        row[f"option_{i}_correct"] = "SÍ" if opt["is_correct"] else "NO"

    flattened_data.append(row)

df = pd.DataFrame(flattened_data)

output_excel = "quiz_data_complete.xlsx"
output_csv = "quiz_data_complete.csv"

df.to_excel(output_excel, index=False, engine="openpyxl")
df.to_csv(output_csv, index=False, encoding="utf-8-sig")

print(f"✓ Saved flattened data to: {output_excel}")
print(f"✓ Saved flattened data to: {output_csv}")

# Print sample
print("\n" + "=" * 80)
print("SAMPLE OUTPUT (First Question)")
print("=" * 80)
if quiz_data:
    print(json.dumps(quiz_data[0], ensure_ascii=False, indent=2))

print("\n" + "=" * 80)
print("STATISTICS")
print("=" * 80)
print(f"Total questions extracted: {len(quiz_data)}")
if quiz_data:
    avg_options = sum(len(q["answer_options"]) for q in quiz_data) / len(quiz_data)
    avg_question_length = sum(len(q["question_text"]) for q in quiz_data) / len(quiz_data)
    avg_explanation_length = sum(len(q["explanation"]) for q in quiz_data) / len(quiz_data)

    print(f"Average options per question: {avg_options:.1f}")
    print(f"Average question length: {avg_question_length:.0f} characters")
    print(f"Average explanation length: {avg_explanation_length:.0f} characters")

Reading view-source HTML file...
File size: 1074933 characters

Extracting actual HTML from view-source format...
Found 2815 lines of code

Reconstructed HTML size: 244337 characters
SEARCHING FOR QUESTIONS IN RECONSTRUCTED HTML
Questions found: 15

✓ Successfully found 15 questions!

EXTRACTING QUESTION DATA (FULL TEXT, NO TRUNCATION)

✓ Q1 extracted: 5 options, 311 chars question, 829 chars explanation
✓ Q2 extracted: 5 options, 193 chars question, 446 chars explanation
✓ Q3 extracted: 5 options, 207 chars question, 283 chars explanation
✓ Q4 extracted: 5 options, 411 chars question, 776 chars explanation
✓ Q5 extracted: 5 options, 125 chars question, 662 chars explanation
✓ Q6 extracted: 5 options, 232 chars question, 675 chars explanation
✓ Q7 extracted: 5 options, 408 chars question, 610 chars explanation
✓ Q8 extracted: 5 options, 321 chars question, 505 chars explanation
✓ Q9 extracted: 5 options, 433 chars question, 450 chars explanation
✓ Q10 extracted: 5 options, 365 chars qu

In [4]:
quiz_data

[{'question_id': '1',
  'question_number': '1',
  'topic': 'Un paciente de 20 años, diabético tipo 1, en tratamiento con una dosis de Lantus (insulina glargina)',
  'question_text': 'Un paciente de 20 años, diabético tipo 1, en tratamiento con una dosis de Lantus (insulina glargina) en la noche y tres dosis de insulina cristalina, previas al desayuno, almuerzo y cena, presenta la siguiente tabla de glicemias promedio: Además su hemoglobina glicosilada es de 8%. La conducta más adecuada es:',
  'answer_options': [{'letter': 'a.',
    'text': 'Aumentar la dosis de Lantus',
    'is_correct': False},
   {'letter': 'b.',
    'text': 'Aumentar la insulina cristalina del desayuno y del almuerzo',
    'is_correct': False},
   {'letter': 'c.',
    'text': 'Aumentar la insulina cristalina del almuerzo y cena',
    'is_correct': True},
   {'letter': 'd.',
    'text': 'Aumentar la dosis de Lantus y de las 3 insulinas cristalinas',
    'is_correct': False},
   {'letter': 'e.',
    'text': 'Agregar 

In [3]:
from bs4 import BeautifulSoup
import pandas as pd

path = r"C:\Users\vales\DataspellProjects\keuna\EUNACOM\ENSAYOS\GUEVARA\view-source_https___cursosonline.doctorguevara.cl_mod_quiz_review.php_attempt=1263771&cmid=125317.html"

# Read the HTML
print("Reading HTML file...")
with open(path, "r", encoding="utf-8") as f:
    html_content = f.read()

print(f"File size: {len(html_content)} characters\n")

# Parse with BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")

# Debug: Let's see what classes are available
print("=" * 80)
print("DEBUGGING: Finding all div classes")
print("=" * 80)
all_divs = soup.find_all("div")
classes_found = set()
for div in all_divs:
    if div.get("class"):
        classes_found.update(div.get("class"))

print(f"Total divs found: {len(all_divs)}")
print(f"Unique classes found: {len(classes_found)}")
print("\nClasses containing 'que' or 'question':")
for cls in sorted(classes_found):
    if "que" in cls.lower() or "question" in cls.lower():
        print(f"  - {cls}")

# Try to find questions with different selectors
print("\n" + "=" * 80)
print("TRYING DIFFERENT SELECTORS")
print("=" * 80)

# Method 1: Look for divs with id starting with "question"
questions_by_id = soup.find_all("div", id=lambda x: x and x.startswith("question"))
print(f"\nMethod 1 - Divs with id starting with 'question': {len(questions_by_id)}")

# Method 2: Look for class 'que'
questions_by_class = soup.find_all("div", class_="que")
print(f"Method 2 - Divs with class 'que': {len(questions_by_class)}")

# Method 3: Look for specific class combinations
questions_multichoice = soup.find_all("div", class_="que multichoice deferredfeedback")
print(f"Method 3 - Divs with 'que multichoice deferredfeedback': {len(questions_multichoice)}")

# Method 4: Look for question tags
h3_tags = soup.find_all("h3", class_="no")
print(f"Method 4 - H3 tags with class 'no': {len(h3_tags)}")

# Let's examine the first question in detail
print("\n" + "=" * 80)
print("EXAMINING FIRST QUESTION STRUCTURE")
print("=" * 80)

if questions_by_id:
    first_q = questions_by_id[0]
    print(f"\nFirst question ID: {first_q.get('id')}")
    print(f"First question classes: {first_q.get('class')}")

    # Check for question text
    qtext = first_q.find("div", class_="qtext")
    if qtext:
        print(f"\nQuestion text found (first 200 chars):")
        print(qtext.get_text(strip=True)[:200])

    # Check for answers
    answers = first_q.find_all("div", class_="r0") + first_q.find_all("div", class_="r1")
    print(f"\nAnswer divs found (r0/r1): {len(answers)}")

    # Check for correct answer
    correct = first_q.find("div", class_="correct")
    print(f"Correct answer div found: {correct is not None}")

    # Print partial structure
    print("\n--- Partial HTML structure of first question ---")
    print(str(first_q)[:1000])

print("\n" + "=" * 80)
print("NOW ATTEMPTING FULL EXTRACTION")
print("=" * 80)

Reading HTML file...
File size: 1074933 characters

DEBUGGING: Finding all div classes
Total divs found: 1
Unique classes found: 1

Classes containing 'que' or 'question':

TRYING DIFFERENT SELECTORS

Method 1 - Divs with id starting with 'question': 0
Method 2 - Divs with class 'que': 0
Method 3 - Divs with 'que multichoice deferredfeedback': 0
Method 4 - H3 tags with class 'no': 0

EXAMINING FIRST QUESTION STRUCTURE

NOW ATTEMPTING FULL EXTRACTION


In [4]:
from bs4 import BeautifulSoup
import pandas as pd

path = r"C:\Users\vales\DataspellProjects\keuna\EUNACOM\ENSAYOS\GUEVARA\view-source_https___cursosonline.doctorguevara.cl_mod_quiz_review.php_attempt=1263771&cmid=125317.html"

with open(path, "r", encoding="utf-8") as f:
    html_content = f.read()

soup = BeautifulSoup(html_content, "html.parser")

# Find questions (adjust based on debugging output)
questions = soup.find_all("div", id=lambda x: x and x.startswith("question"))

print(f"Found {len(questions)} questions\n")

# Extract data
data = []

for question in questions:
    try:
        # Question number
        qno = question.find("span", class_="qno")
        q_number = qno.get_text(strip=True) if qno else "N/A"

        # Question text
        qtext = question.find("div", class_="qtext")
        q_text = qtext.get_text(strip=True, separator=" ") if qtext else "N/A"

        # State (correct/incorrect)
        state = question.find("div", class_="state")
        q_state = state.get_text(strip=True) if state else "N/A"

        # Find all answer options
        answer_containers = question.find_all("div", class_=["r0", "r1"])

        correct_answer = None
        all_answers = []

        for container in answer_containers:
            # Get answer letter and text
            answer_label = container.find("div", class_="d-flex")
            if answer_label:
                letter_span = answer_label.find("span", class_="answernumber")
                text_div = answer_label.find("div", class_="flex-fill")

                if letter_span and text_div:
                    letter = letter_span.get_text(strip=True)
                    text = text_div.get_text(strip=True)

                    all_answers.append(f"{letter} {text}")

                    # Check if this is correct
                    if "correct" in container.get("class", []):
                        correct_answer = f"{letter} {text}"

        # Feedback
        feedback = question.find("div", class_="generalfeedback")
        feedback_text = feedback.get_text(strip=True) if feedback else ""

        data.append(
            {
                "Question_Number": q_number,
                "Status": q_state,
                "Question": q_text,
                "Correct_Answer": correct_answer,
                "All_Answers": " | ".join(all_answers),
                "Feedback": feedback_text,
            }
        )

        print(f"✓ Extracted Q{q_number}: {q_state}")

    except Exception as e:
        print(f"✗ Error processing question: {e}")

# Create DataFrame
df = pd.DataFrame(data)

# Display
print("\n" + "=" * 80)
print("EXTRACTED DATA")
print("=" * 80)
print(df[["Question_Number", "Status", "Correct_Answer"]].to_string())

# Save to Excel
output_excel = "quiz_results.xlsx"
df.to_excel(output_excel, index=False, engine="openpyxl")
print(f"\n✓ Saved to {output_excel}")

# Save to CSV
output_csv = "quiz_results.csv"
df.to_csv(output_csv, index=False, encoding="utf-8-sig")
print(f"✓ Saved to {output_csv}")

# Print summary
print(f"\n" + "=" * 80)
print(f"Total questions: {len(df)}")
print(f"Correct: {len(df[df['Status'] == 'Correcta'])}")
print(f"Incorrect: {len(df[df['Status'] == 'Incorrecta'])}")

Found 0 questions


EXTRACTED DATA


KeyError: "None of [Index(['Question_Number', 'Status', 'Correct_Answer'], dtype='object')] are in the [columns]"