In [3]:
!pip install google-generativeai PyMuPDF pandas tqdm


Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m77.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3


In [13]:
# NCERT Class 8 Science Structured Data Extractor using Google Gemini API

import os
import json
import fitz  # PyMuPDF
import pandas as pd
from tqdm import tqdm
import google.generativeai as genai

# -------- CONFIGURATION --------
API_KEY = "Add Gemini Key"
PDF_PATHS = [
    "./hesc106.pdf",
    "./hesc107.pdf",

]
OUTPUT_DIR = "output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# -------- PROMPT FOR STRUCTURED EXTRACTION --------
STRUCTURED_PROMPT = """
You are an expert educational data extractor.

Given the content of a chapter from the Class 8 NCERT Science textbook, extract the following elements in a structured format:

- Chapter Name
- Topic Name(s)
- Sub-topic Headers
- Paragraphs (labelled under their corresponding sub-topic)
- Tables (if any, include captions and descriptions)
- Figures or Images (mention their captions or figure numbers if present)
- Examples and Activities (with proper labeling and association with sub-topics)
- Questions and Exercises (grouped appropriately)
- Boxed Facts or External References (label as such)

Return the result in clean, structured JSON with this hierarchy:

{
  "chapter_name": "",
  "topics": [
    {
      "topic_name": "",
      "subtopics": [
        {
          "subtopic_name": "",
          "content": {
            "paragraphs": [],
            "tables": [],
            "figures": [],
            "examples": [],
            "activities": [],
            "questions": [],
            "boxed_facts": []
          }
        }
      ]
    }
  ]
}

Do not interpret or summarize. Only extract exactly what appears in the chapter.
"""

# -------- LOAD CHAPTER TEXT FROM PDF --------
def extract_text_from_pdf(path):
    doc = fitz.open(path)
    full_text = ""
    for page in doc:
        full_text += page.get_text()
    return full_text

# -------- CALL GOOGLE GEMINI API --------
def extract_structured_json(text):
    import re
    genai.configure(api_key=API_KEY)
    model = genai.GenerativeModel("models/gemini-1.5-flash-latest")

    try:
        print(f"📄 Sending input with {len(text)} characters...")
        response = model.generate_content([STRUCTURED_PROMPT, text])
        print("✅ Raw response received")

        content = response.text.strip()
        print("🔍 First 300 characters of response:\n" + content[:300])

        if content.startswith("```json"):
            content = content.replace("```json", "").strip()
        if content.endswith("```"):
            content = content[:-3].strip()

        try:
            return json.loads(content)
        except json.JSONDecodeError:
            print("⚠️ Attempting to repair malformed JSON...")
            json_match = re.search(r'\{[\s\S]*\}', content)
            if json_match:
                repaired = json_match.group()
                return json.loads(repaired)
            else:
                raise
    except json.JSONDecodeError:
        print("❌ Response was not valid JSON even after repair. Saving raw response.")
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        with open(os.path.join(OUTPUT_DIR, "last_raw_response.txt"), "w") as f:
            f.write(response.text)
        return {}
    except Exception as e:
        print("💥 API Error:", e)
        return {}


# -------- SAVE OUTPUT --------
def save_json(obj, filename):
    with open(filename, 'w') as f:
        json.dump(obj, f, indent=2)

def json_to_excel(json_data, excel_path):
    rows = []
    for topic in json_data.get('topics', []):
        topic_name = topic.get('topic_name', '')
        for sub in topic.get('subtopics', []):
            subtopic_name = sub.get('subtopic_name', '')
            content = sub.get('content', {})

            def safe_join(items):
                if not isinstance(items, list):
                    return ""
                return " | ".join(str(i) if isinstance(i, str) else json.dumps(i) for i in items)

            row = {
                'Topic': topic_name,
                'Sub-topic': subtopic_name,
                'Paragraphs': safe_join(content.get('paragraphs', [])),
                'Tables': safe_join(content.get('tables', [])),
                'Figures': safe_join(content.get('figures', [])),
                'Examples': safe_join(content.get('examples', [])),
                'Activities': safe_join(content.get('activities', [])),
                'Questions': safe_join(content.get('questions', [])),
                'Boxed Facts': safe_join(content.get('boxed_facts', []))
            }
            rows.append(row)
    df = pd.DataFrame(rows)
    df.to_excel(excel_path, index=False)

# -------- RUN EXTRACTION --------
all_json_data = []
for path in tqdm(PDF_PATHS):
    chapter_text = extract_text_from_pdf(path)
    structured = extract_structured_json(chapter_text)

    # Save
    base_name = os.path.basename(path).replace(".pdf", "")
    json_path = os.path.join(OUTPUT_DIR, f"{base_name}.json")
    save_json(structured, json_path)

    if isinstance(structured, dict) and structured:
        excel_path = os.path.join(OUTPUT_DIR, f"{base_name}.xlsx")
        json_to_excel(structured, excel_path)
        all_json_data.append(structured)

# -------- SAVE FINAL CONSOLIDATED JSON --------
final_json_path = os.path.join(OUTPUT_DIR, "chapter-extract.json")
with open(final_json_path, "w") as f:
    json.dump(all_json_data, f, indent=2)

# -------- STUDY PLANNER GENERATOR --------
def generate_study_plan(json_data, total_days):
    topics = []
    for chapter in json_data:
        chapter_name = chapter.get("chapter_name", "")
        for topic in chapter.get("topics", []):
            topic_name = topic.get("topic_name", "")
            topic_count = sum(len(sub.get("content", {}).get("paragraphs", [])) for sub in topic.get("subtopics", []))
            topics.append((chapter_name, topic_name, topic_count))

    total_weight = sum(weight for _, _, weight in topics)
    days_allocated = []
    for chapter, topic, weight in topics:
        days = max(1, round((weight / total_weight) * total_days))
        days_allocated.append((chapter, topic, days))

    # Flatten into day-wise schedule
    schedule = []
    day = 1
    for chapter, topic, days in days_allocated:
        for d in range(days):
            schedule.append({
                "Day": day,
                "Chapter": chapter,
                "Topic": topic
            })
            day += 1

    planner_path = os.path.join(OUTPUT_DIR, "study_planner_part1.xlsx")
    pd.DataFrame(schedule).to_excel(planner_path, index=False)
    print(f"Study planner saved to {planner_path}")

# -------- INVOKE STUDY PLANNER --------
try:
    days = int(input("Enter total number of days for the study planner (e.g., 10): "))
    with open(final_json_path) as f:
        consolidated_data = json.load(f)
    generate_study_plan(consolidated_data, days)
except Exception as e:
    print("Error generating study planner:", e)


  0%|          | 0/2 [00:00<?, ?it/s]

📄 Sending input with 21620 characters...


 50%|█████     | 1/2 [00:46<00:46, 46.55s/it]

✅ Raw response received
🔍 First 300 characters of response:
```json
{
  "chapter_name": "Reproduction in Animals",
  "topics": [
    {
      "topic_name": "Modes of Reproduction",
      "subtopics": [
        {
          "subtopic_name": "Modes of Reproduction",
          "content": {
            "paragraphs": [
              "Have you seen the young ones of
📄 Sending input with 25772 characters...


100%|██████████| 2/2 [01:35<00:00, 47.85s/it]


✅ Raw response received
🔍 First 300 characters of response:
```json
{
  "chapter_name": "Reaching the Age of Adolescence",
  "topics": [
    {
      "topic_name": "Adolescence and Puberty",
      "subtopics": [
        {
          "subtopic_name": "Adolescence and Puberty",
          "content": {
            "paragraphs": [
              "Boojho was celebrat
Enter total number of days for the study planner (e.g., 10): 80
Study planner saved to output/study_planner_part1.xlsx


In [14]:
import os
import json
import re
import fitz  # PyMuPDF
import pandas as pd
from tqdm import tqdm
import google.generativeai as genai
from concurrent.futures import ThreadPoolExecutor, as_completed

# -------- CONFIGURATION --------
API_KEY = "AIzaSyCVH7_QDZaEudYjp9Vioj1N6bEfPDFu_JY"  # Replace with your actual API key
PDF_PATHS = [

    "./hesc108.pdf",
    "./hesc113.pdf"
]
OUTPUT_DIR = "output_part2"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# -------- ENHANCED PROMPT --------
STRUCTURED_PROMPT = """
You are an expert educational content parser. Extract NCERT Class 8 Science content in this EXACT JSON format:

{
  "chapter_name": "string",
  "topics": [
    {
      "topic_name": "string",
      "subtopics": [
        {
          "subtopic_name": "string",
          "content": {
            "paragraphs": ["string"],
            "tables": [{"caption": "string", "content": "string"}],
            "figures": [{"caption": "string", "description": "string"}],
            "examples": ["string"],
            "activities": ["string"],
            "questions": ["string"],
            "boxed_facts": ["string"]
          }
        }
      ]
    }
  ]
}

STRICT RULES:
1. Only return pure JSON with no surrounding text
2. Escape all special characters in strings
3. No trailing commas
4. All brackets must be balanced
5. Use empty arrays for missing data
6. Maintain exactly this structure
7. Preserve original text without interpretation
"""

# -------- PDF PROCESSING --------
def extract_text_from_pdf(path, max_pages=None):
    """Extract text from PDF with page limit for large files"""
    doc = fitz.open(path)
    full_text = ""
    for i, page in enumerate(doc):
        if max_pages and i >= max_pages:
            break
        full_text += page.get_text()
    return full_text

# -------- JSON VALIDATION & REPAIR --------
def repair_json(json_str):
    """Comprehensive JSON repair with multiple strategies"""
    try:
        # Remove common non-JSON artifacts
        json_str = re.sub(r'^[^{]*', '', json_str)  # Remove text before first {
        json_str = re.sub(r'[^}]*$', '', json_str)  # Remove text after last }

        # Fix common JSON issues
        json_str = re.sub(r',\s*([}\]])', r'\1', json_str)  # Remove trailing commas
        json_str = re.sub(r'\\\'', "'", json_str)  # Fix escaped quotes
        json_str = re.sub(r'([{\[,])\s*([}\]])', r'\1\2', json_str)  # Fix empty elements

        # Balance brackets if needed
        open_braces = json_str.count('{')
        close_braces = json_str.count('}')
        if open_braces > close_braces:
            json_str += '}' * (open_braces - close_braces)
        elif close_braces > open_braces:
            json_str = '{' * (close_braces - open_braces) + json_str

        return json_str.strip()
    except Exception as e:
        print(f"Repair error: {str(e)}")
        return json_str

def validate_json_structure(data):
    """Thorough validation of extracted structure"""
    if not isinstance(data, dict):
        return False
    if 'chapter_name' not in data or not isinstance(data['chapter_name'], str):
        return False
    if 'topics' not in data or not isinstance(data['topics'], list):
        return False

    for topic in data['topics']:
        if not isinstance(topic, dict) or 'topic_name' not in topic:
            return False
        if 'subtopics' not in topic or not isinstance(topic['subtopics'], list):
            return False

        for subtopic in topic['subtopics']:
            if not isinstance(subtopic, dict) or 'subtopic_name' not in subtopic:
                return False
            if 'content' not in subtopic or not isinstance(subtopic['content'], dict):
                return False

    return True

# -------- API COMMUNICATION --------
def process_with_gemini(text, max_retries=3):
    """Handle API communication with retries"""
    genai.configure(api_key=API_KEY)
    model = genai.GenerativeModel("models/gemini-1.5-flash-latest")

    for attempt in range(max_retries):
        try:
            response = model.generate_content([STRUCTURED_PROMPT, text])
            content = response.text.strip()

            # Initial parse attempt
            try:
                data = json.loads(content)
                if validate_json_structure(data):
                    return data
            except json.JSONDecodeError:
                pass

            # Repair attempt
            repaired = repair_json(content)
            try:
                data = json.loads(repaired)
                if validate_json_structure(data):
                    return data
            except json.JSONDecodeError:
                continue

        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {str(e)}")
            continue

    return None

# -------- DATA PROCESSING --------
def process_pdf(path):
    """Process a single PDF file"""
    try:
        print(f"\nProcessing {os.path.basename(path)}...")
        text = extract_text_from_pdf(path, max_pages=20)  # Limit pages for large files

        if not text:
            print("⚠️ No text extracted")
            return None

        # Process in chunks if needed
        max_chars = 30000
        if len(text) > max_chars:
            print(f"Trimming from {len(text)} to {max_chars} chars")
            text = text[:max_chars]

        data = process_with_gemini(text)
        if not data:
            print("❌ Failed to extract structured data")
            return None

        return data

    except Exception as e:
        print(f"💥 Processing error: {str(e)}")
        return None

# -------- OUTPUT GENERATION --------
def save_results(data, base_name):
    """Save results in multiple formats"""
    if not data:
        return False

    try:
        # Save JSON
        json_path = os.path.join(OUTPUT_DIR, f"{base_name}.json")
        with open(json_path, 'w') as f:
            json.dump(data, f, indent=2)
        print(f"💾 Saved JSON to {json_path}")

        # Save Excel
        excel_path = os.path.join(OUTPUT_DIR, f"{base_name}.xlsx")
        rows = []

        for topic in data.get('topics', []):
            for subtopic in topic.get('subtopics', []):
                content = subtopic.get('content', {})
                row = {
                    'Chapter': data.get('chapter_name', ''),
                    'Topic': topic.get('topic_name', ''),
                    'Subtopic': subtopic.get('subtopic_name', ''),
                    'Paragraphs': '\n'.join(content.get('paragraphs', [])),
                    'Tables': len(content.get('tables', [])),
                    'Figures': len(content.get('figures', [])),
                    'Examples': len(content.get('examples', [])),
                    'Activities': len(content.get('activities', [])),
                    'Questions': len(content.get('questions', [])),
                    'Boxed Facts': len(content.get('boxed_facts', []))
                }
                rows.append(row)

        pd.DataFrame(rows).to_excel(excel_path, index=False)
        print(f"💾 Saved Excel to {excel_path}")
        return True

    except Exception as e:
        print(f"⚠️ Output saving failed: {str(e)}")
        return False

# -------- MAIN EXECUTION --------
def main():
    all_data = []

    # Process files in parallel
    with ThreadPoolExecutor(max_workers=2) as executor:
        futures = {executor.submit(process_pdf, path): path for path in PDF_PATHS}

        for future in tqdm(as_completed(futures), total=len(PDF_PATHS)):
            path = futures[future]
            data = future.result()

            if data:
                base_name = os.path.basename(path).replace('.pdf', '')
                if save_results(data, base_name):
                    all_data.append(data)

    # Save consolidated results
    if all_data:
        consolidated_path = os.path.join(OUTPUT_DIR, "consolidated.json")
        with open(consolidated_path, 'w') as f:
            json.dump(all_data, f, indent=2)
        print(f"\n✅ Consolidated data saved to {consolidated_path}")

        # Generate study plan
        generate_study_plan(all_data)
    else:
        print("\n❌ No successful extractions")

# -------- STUDY PLAN GENERATOR --------
def generate_study_plan(data, default_days=30):
    if not data:
        return

    try:
        days = int(input(f"\nEnter study days (default {default_days}): ") or default_days)
    except:
        days = default_days

    plan = []
    day_counter = 1

    for chapter in data:
        chapter_name = chapter.get('chapter_name', 'Unknown')
        topics = chapter.get('topics', [])

        for topic in topics:
            topic_name = topic.get('topic_name', 'Unknown')
            subtopics = topic.get('subtopics', [])

            # Calculate weight based on content
            weight = sum(
                len(subtopic.get('content', {}).get('paragraphs', []))
                for subtopic in subtopics
            )

            # Allocate days proportionally
            allocated_days = max(1, round(weight / 10))  # 1 day per ~10 paragraphs
            plan.append({
                'Day Range': f"{day_counter}-{day_counter + allocated_days - 1}",
                'Days': allocated_days,
                'Chapter': chapter_name,
                'Topic': topic_name,
                'Subtopics': ', '.join(s.get('subtopic_name', '') for s in subtopics)
            })
            day_counter += allocated_days

    # Save plan
    plan_path = os.path.join(OUTPUT_DIR, "study_plan_part2.xlsx")
    pd.DataFrame(plan).to_excel(plan_path, index=False)
    print(f"📅 Study plan saved to {plan_path}")
    print(f"\nTotal study days: {day_counter - 1}")

if __name__ == "__main__":
    main()


Processing hesc108.pdf...

Processing hesc113.pdf...


  0%|          | 0/2 [00:00<?, ?it/s]

Trimming from 33161 to 30000 chars


 50%|█████     | 1/2 [00:51<00:51, 51.22s/it]

Attempt 1 failed: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
💾 Saved JSON to output_part2/hesc108.json
💾 Saved Excel to output_part2/hesc108.xlsx


100%|██████████| 2/2 [01:34<00:00, 47.28s/it]


💾 Saved JSON to output_part2/hesc113.json
💾 Saved Excel to output_part2/hesc113.xlsx

✅ Consolidated data saved to output_part2/consolidated.json

Enter study days (default 30): 80
📅 Study plan saved to output_part2/study_plan_part2.xlsx

Total study days: 21
