# Hierarchical Section Planning Notebook

This notebook contains code to:
1. Parse a raw index string into a hierarchical JSON-like structure.
2. Chunk top-level sections into groups (e.g., five at a time).
3. Build prompts for an LLM to generate a plan for each section.
4. Call the LLM to fill in plans and return a complete JSON structure.

Replace the placeholder `llm_client` and `AZURE_OPENAI_MODEL_NAME` with your actual client and model details.


In [3]:
from typing import List, Optional
from pydantic import BaseModel
import re
import json
from typing import List, Optional
from pydantic import BaseModel
import re
import json
from openai import AzureOpenAI


In [None]:

# ----------------------------
# 1. Define the Pydantic model for one section node
# ----------------------------
class SectionNode(BaseModel):
    title: str
    section: str
    level: int
    children: List["SectionNode"] = []

# Define a response model that wraps the list of sections
class SectionHierarchy(BaseModel):
    sections: List[SectionNode]

# ----------------------------
# 2. Function to call the LLM and get a hierarchical JSON from a raw index string
# ----------------------------
def build_hierarchy_from_index(
    raw_index: str, llm_client, model_name: str
) -> List[SectionNode]:
    """
    Given a raw index string (e.g., "1. Introduction\n1.1 Background\n1.2 Problem Statement\n2. Methodology\n..."),
    call the LLM to parse it into a hierarchical JSON structure matching SectionNode.

    Returns a list of SectionNode objects (top‐level sections with nested children).
    """
    prompt = f"""Parse the following raw index into a hierarchical structure.  
                Each node should have:
                - title: the section title
                - section: the section number (e.g., "1.1", "2.3.4") if it's already there use as it's if not then create it. based on the context.
                - level: integer where 0 is top-level, 1 is subsection, etc.
                - children: array of nested sections with the same structure

                For example:
                "1. Introduction" -> level 0
                "1.1 Background" -> level 1 (child of Introduction)
                "1.1.1 Historical Context" -> level 2 (child of Background)
                "1.2 Problem Statement" -> level 1 (child of Introduction)
                "2. Literature Review" -> level 0

                Raw index:
                {raw_index.strip()}
                """

    try:
        response = llm_client.beta.chat.completions.parse(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            response_format=SectionHierarchy,
            temperature=0.0,
            max_tokens=4096
        )
        
        # Extract the parsed data
        parsed_data = response.choices[0].message.parsed
        
        # Debug: Print the structured response
        print("Structured LLM Response:")
        print(parsed_data)
        print("\n" + "="*50 + "\n")
        
        # Return the list of sections
        return parsed_data.sections
        
    except Exception as e:
        print(f"Error calling LLM or processing response: {e}")
        raise


# ----------------------------
# 3. Usage example
# ----------------------------
if __name__ == "__main__":
    # Example raw index string (passed separately from context)
    raw_index_text = """
    
1  Introduction ........................................................................................................ 1  
  1.1 Motivation and Objectives ........................................................................... 2  
  1.2 Thesis Scope and Contributions ................................................................. 4  
  1.3 Thesis Organization ...................................................................................... 6  

2  Background and Foundations ................................................................. 9  
  2.1 Domain Background: Conversational Agents in Higher Education ............ 9  
    2.1.1 Evolution and Definitions of Conversational Agents .............................. 10  
    2.1.2 Use Cases and Impact in University Administration .............................. 11  
  2.2 Technical Foundations: LLM Architectures and Methods .................... 12  
    2.2.1 Transformer Architecture Overview ................................................ 13  
    2.2.2 Pretraining Paradigms and Domain Adaptation ................................. 14  
    2.2.3 Parameter-Efficient Fine-Tuning (LoRA) ........................................ 16  
      2.2.3.1 LoRA Methodology and Mechanisms ................................................ 17  
      2.2.3.2 Comparison with Other PEFT Techniques ....................................... 18  
    2.2.4 Supervised Fine-Tuning vs. Direct Preference Optimization............ 19  
      2.2.4.1 Supervised Fine-Tuning (SFT) Techniques ......................................... 19  
      2.2.4.2 Direct Preference Optimization (DPO) Principles ............................. 20  
      2.2.4.3 Hybrid SFT + DPO Approaches .......................................................... 20  
    2.2.5 Topic Modeling and Content Filtering Techniques ................................ 21  
      2.2.5.1 Unsupervised Topic Modeling ............................................ 22  
      2.2.5.2 LLM-Based Theme Classification ...................................................... 23  

3  Related Work .................................................................................................... 25  
  3.1 Domain-Specific LLMs for Administrative Support .................................. 25  
  3.2 Prompt Engineering for User-Aware Generation ....................................... 29  
  3.3 Evaluation Metrics for Conversational Agents ............................................ 32  
  3.4 Research Gaps in University Management Chatbots ................................. 35  

4  Data Acquisition and Preprocessing ........................................................ 39  
  4.1 FAU Administrative Data Harvesting ......................................................... 39  
  4.2 Text Segmentation into 4 000-Token Chunks ................................................ 42  
  4.3 Initial QA-Pair Generation with Mistral .................................................... 45  
  4.4 Two-Stage Filtering Pipeline  
    4.4.1 Unsupervised Topic Modeling (BERTopic) ......................................... 48  
    4.4.2 LLM-Based Theme Classification and Lecture Removal ......................... 51  
  4.5 Construction of Chosen vs. Rejected Response Sets .................................. 54  

5  Model Fine-Tuning and Prompt Engineering ............................................. 57  
  5.1 Model Selection: LLaMA 3 and Falcon 7B .................................................... 57  
    5.1.1 RAG vs. Role-Aware Fine-Tuning: A Design Justification  
  5.2 Supervised Fine-Tuning on Management QA Corpus .................................. 60  
  5.3 Direct Preference Optimization (DPO) with QLoRA .................................... 63  
  5.4 Custom Prompt Templates and Persona Embedding .................................... 67  
    5.4.1 Static Persona Injection (User Background & Expert Role)  
    5.4.2 Role-Based Persona Modeling (Context-Adaptive Generation)  
    5.4.3 Dynamic Context Windows and Memory Traces  
  5.5 Implementation Details and Training Infrastructure ................................ 70  
6  Experimental Design and Evaluation ...................................................... 73  
  6.1 Evaluation Metrics: Precision, Recall, F1-Score ............................................ 73  
  6.2 Baseline vs. Fine-Tuned Model Comparisons ............................................ 76  
  6.3 User-Role Simulation and Contextual Tests .............................................. 80  
  6.4 Ablation Study on Prompt Components ..................................................... 83  
  6.5 Statistical Significance and Error Analysis .............................................. 86  

7  Results and Discussion .................................................................................. 89  
  7.1 Quantitative Performance Gains (+10 % Precision, +12 % F1) .................... 89  
  7.2 Qualitative Case Studies and Exemplars .................................................... 93  
  7.3 Limitations of the Current Approach ......................................................... 97  
  7.4 Implications for University Administrative Workflows ........................... 100  

8  Conclusion and Future Work ...................................................................... 103  
  8.1 Summary of Contributions ............................................................................. 103  
  8.2 Recommendations for Deployment .............................................................. 105  
  8.3 Directions for Further Research ................................................................. 107  

List of Abbreviations ........................................................................................ 110  
List of Figures ...................................................................................................... 112  
List of Tables ....................................................................................................... 114  
References ............................................................................................................ 115  




    """
    
    # Initialize your LLM client (replace with your actual Azure OpenAI client)
    llm_client = AzureOpenAI(
    )

    # Call the function to get a hierarchical structure
    hierarchy = build_hierarchy_from_index(raw_index_text, llm_client, model_name)

    # Print out the result as JSON
    # print(json.dumps([node.model_dump() for node in hierarchy], indent=2, ensure_ascii=False))
    
    # Save the result to a file
    with open("section_hierarchy.json", "w", encoding="utf-8") as f:
        json.dump([node.model_dump() for node in hierarchy], f, indent=2, ensure_ascii=False)








Structured LLM Response:
sections=[SectionNode(title='Introduction', section='1', level=0, children=[SectionNode(title='Motivation and Objectives', section='1.1', level=1, children=[]), SectionNode(title='Thesis Scope and Contributions', section='1.2', level=1, children=[]), SectionNode(title='Thesis Organization', section='1.3', level=1, children=[])]), SectionNode(title='Background and Foundations', section='2', level=0, children=[SectionNode(title='Domain Background: Conversational Agents in Higher Education', section='2.1', level=1, children=[SectionNode(title='Evolution and Definitions of Conversational Agents', section='2.1.1', level=2, children=[]), SectionNode(title='Use Cases and Impact in University Administration', section='2.1.2', level=2, children=[])]), SectionNode(title='Technical Foundations: LLM Architectures and Methods', section='2.2', level=1, children=[SectionNode(title='Transformer Architecture Overview', section='2.2.1', level=2, children=[]), SectionNode(title='

In [3]:
# Now we read the content from the above json file
# Read the complete content from the pdf file
# and pass the compelte text to the LLM with the first 5 sections and ask for the plan what shall come inside it 

# Read the section hierarchy from the JSON file
def read_section_hierarchy(file_path: str) -> List[SectionNode]:
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return [SectionNode(**section) for section in data]

# read the pdf  
def read_pdf_content(file_path: str) -> str:
    from PyPDF2 import PdfReader
    reader = PdfReader(file_path)
    content = []
    for page in reader.pages:
        content.append(page.extract_text())
    return "\n".join(content)

# Updated models for better section and subsection planning
class SectionPlan(BaseModel):
    section_number: str
    title: str
    level: int
    plan: str

class BatchSectionPlans(BaseModel):
    plans: List[SectionPlan]

# Function to flatten all sections and subsections into a single list
def flatten_sections(sections: List[SectionNode]) -> List[SectionNode]:
    """
    Flatten the hierarchical structure into a single list containing all sections and subsections.
    """
    flattened = []
    
    def flatten_recursive(section_list: List[SectionNode]):
        for section in section_list:
            flattened.append(section)
            if section.children:
                flatten_recursive(section.children)
    
    flatten_recursive(sections)
    return flattened

# Function to group sections into batches of 3 main sections with all their subsections
def group_sections_for_batch_processing(sections: List[SectionNode], batch_size: int = 3) -> List[List[SectionNode]]:
    """
    Group top-level sections (level 0) into batches, including all their children.
    Each batch contains a specified number of main sections and all their subsections.
    """
    top_level_sections = [s for s in sections if s.level == 0]
    batches = []
    
    for i in range(0, len(top_level_sections), batch_size):
        batch = top_level_sections[i:i + batch_size]
        # For each batch, include the main sections and all their nested children
        batch_with_children = []
        for main_section in batch:
            batch_with_children.extend(flatten_sections([main_section]))
        batches.append(batch_with_children)
    
    return batches

# Function to generate plans for a batch of sections
def generate_batch_section_plans(
    section_batch: List[SectionNode], llm_client, model_name: str, pdf_content: str = ""
) -> List[SectionPlan]:
    """
    Generate plans for a batch of sections (including main sections and subsections).
    """
    # Create a structured prompt with all sections in the batch
    sections_info = []
    for section in section_batch:
        sections_info.append(f"- Section {section.section}: {section.title} (Level {section.level})")
    
    sections_text = "\n".join(sections_info)
    
    # Use more PDF content (up to 8000 characters to stay within token limits)
    pdf_context = pdf_content[:] if pdf_content else ""
    
    prompt = f"""You are an expert academic writer helping to create a detailed thesis plan. 
    Create UNIQUE and SPECIFIC plans for EACH of the following sections and subsections. 
    
    CRITICAL REQUIREMENTS:
    1. Each plan must be DIFFERENT and UNIQUE - NO repetition between sections
    2. Main sections (level 0) should have broader, strategic plans
    3. Subsections (level 1, 2, etc.) should have very specific, focused plans that dive deep into particular aspects
    4. Each subsection plan should complement but NOT repeat its parent section
    5. Use the PDF context below to make plans more specific and relevant
    
    Context: This is for an academic thesis about conversational agents in higher education.
    
    PDF Context (use this to make plans more specific):
    {pdf_context}
    
    Sections to plan (create a UNIQUE plan for EACH one):
    {sections_text}
    
    For EACH section/subsection, provide a comprehensive and UNIQUE plan that includes:
    
    FOR MAIN SECTIONS (Level 0):
    - Broad objectives and strategic goals
    - Overall approach and methodology
    - Key research questions to address
    - Expected major outcomes
    - How it fits in the overall thesis narrative
    
    FOR SUBSECTIONS (Level 1, 2, etc.):
    - Very specific objectives focused on one aspect
    - Detailed methodologies or approaches specific to this subsection
    - Specific data, examples, or case studies to include
    - Particular research questions or hypotheses
    - Specific deliverables (tables, figures, algorithms, etc.)
    - Exact connection to parent section without repeating content
    
    IMPORTANT: Make each plan distinct and avoid generic language. Be specific about what content, methods, and outcomes are expected for each individual section.
    """

    try:
        response = llm_client.beta.chat.completions.parse(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            response_format=BatchSectionPlans,
            temperature=0.3,
            max_tokens=8192  # Increased token limit for more detailed plans
        )
        
        return response.choices[0].message.parsed.plans
        
    except Exception as e:
        print(f"Error generating batch plans: {e}")
        return []

# Main function to generate plans for all sections in batches
def generate_all_section_plans(
    sections: List[SectionNode], llm_client, model_name: str, pdf_content: str = "", batch_size: int = 3
) -> List[SectionPlan]:
    """
    Generate plans for all sections by processing them in batches.
    """
    all_plans = []
    
    # Group sections into batches
    section_batches = group_sections_for_batch_processing(sections, batch_size)
    
    print(f"Processing {len(section_batches)} batches...")
    print(f"PDF content length: {len(pdf_content)} characters")
    
    for i, batch in enumerate(section_batches):
        print(f"Processing batch {i+1}/{len(section_batches)} with {len(batch)} sections...")
        
        # Show which sections are in this batch
        batch_sections = [f"{s.section}: {s.title} (L{s.level})" for s in batch]
        print(f"  Sections in batch: {', '.join(batch_sections)}")
        
        batch_plans = generate_batch_section_plans(batch, llm_client, model_name, pdf_content)
        all_plans.extend(batch_plans)
        
        print(f"Generated {len(batch_plans)} plans for batch {i+1}")
        print("-" * 50)
    
    return all_plans

In [18]:
# call the function to generate plans
if __name__ == "__main__":
    # Read the section hierarchy from the JSON file
    sections = read_section_hierarchy("section_hierarchy.json")
    
    # Read the PDF content (if needed)
    try:
        pdf_content = read_pdf_content("inital_info.pdf")  # Replace with your actual PDF file path
        print(f"Successfully loaded PDF with {len(pdf_content)} characters")
        print(f"First 500 characters of PDF: {pdf_content[:500]}...")
    except Exception as e:
        print(f"Could not read PDF: {e}")
        pdf_content = ""
    

    # Generate plans for all sections in batches of 3 main sections
    all_section_plans = generate_all_section_plans(
        sections, llm_client, model_name, pdf_content, batch_size=3
    )

    # Print out the plans as JSON
    print(f"Generated {len(all_section_plans)} total plans")
    # print(json.dumps([plan.model_dump() for plan in all_section_plans], indent=2, ensure_ascii=False))
    
    # Print a summary of the plans generated
    for plan in all_section_plans:
        print(f"Section {plan.section_number}: {plan.title} (Level {plan.level})")
        print(f"  Plan length: {len(plan.plan)} characters")
        print(f"  Plan preview: {plan.plan[:100]}...")
        print()

Successfully loaded PDF with 13752 characters
First 500 characters of PDF: 1.1 Motivation and Objectives  
Over the last decade, higher -education institutions have adopted a variety of digital 
tools—ranging from static FAQ pages to rudimentary rule -based chatbots —to 
streamline administrative processes and enhance service delivery. However, at 
Friedrich -Alexander -Universität Erlangen -Nürnberg (FAU), students and staff still 
encounter persistent obstacles when seeki ng information about enrollment 
procedures, course registration deadlines, examination regulati...
Processing 4 batches...
PDF content length: 13752 characters
Processing batch 1/4 with 26 sections...
  Sections in batch: 1: Introduction (L0), 1.1: Motivation and Objectives (L1), 1.2: Thesis Scope and Contributions (L1), 1.3: Thesis Organization (L1), 2: Background and Foundations (L0), 2.1: Domain Background: Conversational Agents in Higher Education (L1), 2.1.1: Evolution and Definitions of Conversational Agents 

In [6]:
# save the plans to a file
with open("section_plans.json", "w", encoding="utf-8") as f:
    json.dump([plan.model_dump() for plan in all_section_plans], f, indent=2, ensure_ascii=False)

In [8]:
import json
from typing import List, Optional
from pydantic import BaseModel
from PyPDF2 import PdfReader
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

class SectionNode(BaseModel):
    section: str
    title: str
    level: int
    children: List["SectionNode"] = []
    bullet_points: Optional[List[str]] = None
    plan: Optional[str] = None

SectionNode.update_forward_refs()

class SectionPlan(BaseModel):
    section_number: str
    title: str
    level: int
    plan: str

class BulletPointsResponse(BaseModel):
    bullet_points: List[str]

def read_pdf_pages(path: str) -> List[str]:
    reader = PdfReader(path)
    return [page.extract_text() or "" for page in reader.pages]

def load_section_hierarchy(path: str) -> List[SectionNode]:
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return [SectionNode(**s) for s in data]

def load_section_plans(path: str) -> List[SectionPlan]:
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return [SectionPlan(**s) for s in data]

def plans_map_by_number(plans: List[SectionPlan]):
    return {p.section_number: p for p in plans}

def flatten_sections(sections: List[SectionNode]) -> List[SectionNode]:
    result = []
    def rec(nodes):
        for n in nodes:
            result.append(n)
            if n.children:
                rec(n.children)
    rec(sections)
    return result

def extract_bullet_points_for_section(
    section: SectionNode,
    plan: Optional[SectionPlan],
    pages: List[str],
    llm_client,
    model_name: str,
    bullets_per_section: int = 8
) -> List[str]:
    all_bullets = []
    for page_num, page_text in enumerate(pages, start=1):
        prompt = f"""
You are given a section context and a single page of a PDF.

Section: {section.section} - {section.title} (Level {section.level})

Section Plan (use this as focus guide):
{plan.plan if plan else ''}

Page Number: {page_num}

Page Text:
{page_text}

Extract up to {bullets_per_section} concise, actionable bullet points strictly relevant to this section and plan.
Return only a list of bullet points.
"""
        try:
            response = llm_client.beta.chat.completions.parse(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
                response_format=BulletPointsResponse,
                temperature=0.3,
                max_tokens=1024
            )
            for bp in response.choices[0].message.parsed.bullet_points:
                if bp not in all_bullets:
                    all_bullets.append(bp)
        except Exception as e:
            print(f"Error on section {section.section}, page {page_num}: {e}")
    return all_bullets

def main():
    # --- Load inputs ---
    hierarchy = load_section_hierarchy("section_hierarchy.json")
    plans = load_section_plans("section_plans.json")
    plans_map = plans_map_by_number(plans)
    pages = read_pdf_pages("initial_info.pdf")
    print(f"Loaded {len(hierarchy)} root sections; {len(plans)} plans; {len(pages)} PDF pages.")

    # --- Flatten all sections (including children) ---
    flat_sections = flatten_sections(hierarchy)

    # --- Parallel bullet extraction ---
    results_map = {}  # section_number: (plan, bullets)

    print("Extracting bullet points for all sections in parallel...")
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = []
        for section in flat_sections:
            plan = plans_map.get(section.section, None)
            futures.append(
                executor.submit(
                    extract_bullet_points_for_section,
                    section, plan, pages, llm_client, model_name
                )
            )
        for section, future in tqdm(zip(flat_sections, futures), total=len(flat_sections)):
            try:
                bullets = future.result()
                results_map[section.section] = {
                    "plan": plans_map[section.section].plan if section.section in plans_map else None,
                    "bullet_points": bullets,
                }
                print(f"[DONE] Section {section.section} - {section.title}: {len(bullets)} bullets")
            except Exception as e:
                print(f"[ERROR] Section {section.section} - {section.title}: {e}")

    # --- Attach results back into the hierarchy tree ---
    def attach_results(nodes):
        for node in nodes:
            res = results_map.get(node.section, None)
            if res:
                node.plan = res["plan"]
                node.bullet_points = res["bullet_points"]
            if node.children:
                attach_results(node.children)
    attach_results(hierarchy)

    # --- Export enriched tree ---
    with open("section_hierarchy_with_bullets.json", "w", encoding="utf-8") as f:
        json.dump([s.model_dump() for s in hierarchy], f, indent=2, ensure_ascii=False)
    print("Exported: section_hierarchy_with_bullets.json")

if __name__ == "__main__":
    main()


C:\Users\YAYT\AppData\Local\Temp\ipykernel_12708\3926822292.py:16: PydanticDeprecatedSince20: The `update_forward_refs` method is deprecated; use `model_rebuild` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  SectionNode.update_forward_refs()


Loaded 12 root sections; 63 plans; 7 PDF pages.
Extracting bullet points for all sections in parallel...


  2%|▏         | 1/63 [00:12<13:07, 12.71s/it]

[DONE] Section 1 - Introduction: 56 bullets


  3%|▎         | 2/63 [00:18<08:53,  8.74s/it]

[DONE] Section 1.1 - Motivation and Objectives: 53 bullets
[DONE] Section 1.2 - Thesis Scope and Contributions: 56 bullets
[DONE] Section 1.3 - Thesis Organization: 52 bullets
[DONE] Section 2 - Background and Foundations: 56 bullets


 10%|▉         | 6/63 [00:28<03:41,  3.88s/it]

[DONE] Section 2.1 - Domain Background: Conversational Agents in Higher Education: 56 bullets
[DONE] Section 2.1.1 - Evolution and Definitions of Conversational Agents: 50 bullets


 13%|█▎        | 8/63 [00:31<02:50,  3.10s/it]

[DONE] Section 2.1.2 - Use Cases and Impact in University Administration: 56 bullets


 14%|█▍        | 9/63 [00:34<02:44,  3.05s/it]

[DONE] Section 2.2 - Technical Foundations: LLM Architectures and Methods: 55 bullets
[DONE] Section 2.2.1 - Transformer Architecture Overview: 55 bullets


 17%|█▋        | 11/63 [00:40<02:35,  3.00s/it]

[DONE] Section 2.2.2 - Pretraining Paradigms and Domain Adaptation: 55 bullets


 19%|█▉        | 12/63 [00:45<02:54,  3.42s/it]

[DONE] Section 2.2.3 - Parameter-Efficient Fine-Tuning (LoRA): 56 bullets


 21%|██        | 13/63 [00:47<02:37,  3.16s/it]

[DONE] Section 2.2.3.1 - LoRA Methodology and Mechanisms: 56 bullets


 22%|██▏       | 14/63 [00:48<02:00,  2.46s/it]

[DONE] Section 2.2.3.2 - Comparison with Other PEFT Techniques: 54 bullets


 24%|██▍       | 15/63 [00:53<02:30,  3.14s/it]

[DONE] Section 2.2.4 - Supervised Fine-Tuning vs. Direct Preference Optimization: 56 bullets


 25%|██▌       | 16/63 [00:57<02:38,  3.38s/it]

[DONE] Section 2.2.4.1 - Supervised Fine-Tuning (SFT) Techniques: 56 bullets


 27%|██▋       | 17/63 [00:59<02:24,  3.14s/it]

[DONE] Section 2.2.4.2 - Direct Preference Optimization (DPO) Principles: 56 bullets


 29%|██▊       | 18/63 [01:02<02:20,  3.13s/it]

[DONE] Section 2.2.4.3 - Hybrid SFT + DPO Approaches: 55 bullets
[DONE] Section 2.2.5 - Topic Modeling and Content Filtering Techniques: 55 bullets


 32%|███▏      | 20/63 [01:11<02:41,  3.75s/it]

[DONE] Section 2.2.5.1 - Unsupervised Topic Modeling: 55 bullets


 33%|███▎      | 21/63 [01:12<02:10,  3.10s/it]

[DONE] Section 2.2.5.2 - LLM-Based Theme Classification: 56 bullets


 35%|███▍      | 22/63 [01:15<02:01,  2.97s/it]

[DONE] Section 3 - Related Work: 56 bullets


 37%|███▋      | 23/63 [01:16<01:41,  2.53s/it]

[DONE] Section 3.1 - Domain-Specific LLMs for Administrative Support: 56 bullets


 38%|███▊      | 24/63 [01:18<01:30,  2.31s/it]

[DONE] Section 3.2 - Prompt Engineering for User-Aware Generation: 56 bullets


 40%|███▉      | 25/63 [01:27<02:35,  4.10s/it]

[DONE] Section 3.3 - Evaluation Metrics for Conversational Agents: 56 bullets


 41%|████▏     | 26/63 [01:28<02:02,  3.32s/it]

[DONE] Section 3.4 - Research Gaps in University Management Chatbots: 54 bullets


 43%|████▎     | 27/63 [01:30<01:47,  2.99s/it]

[DONE] Section 4 - Data Acquisition and Preprocessing: 49 bullets
[DONE] Section 4.1 - FAU Administrative Data Harvesting: 48 bullets


 46%|████▌     | 29/63 [01:31<01:01,  1.81s/it]

[DONE] Section 4.2 - Text Segmentation into 4 000-Token Chunks: 50 bullets


 48%|████▊     | 30/63 [01:42<02:10,  3.95s/it]

[DONE] Section 4.3 - Initial QA-Pair Generation with Mistral: 51 bullets


 49%|████▉     | 31/63 [01:43<01:47,  3.36s/it]

[DONE] Section 4.4 - Two-Stage Filtering Pipeline: 44 bullets
[DONE] Section 4.4.1 - Unsupervised Topic Modeling (BERTopic): 55 bullets


 52%|█████▏    | 33/63 [01:45<01:10,  2.34s/it]

[DONE] Section 4.4.2 - LLM-Based Theme Classification and Lecture Removal: 53 bullets


 54%|█████▍    | 34/63 [01:47<01:06,  2.28s/it]

[DONE] Section 4.5 - Construction of Chosen vs. Rejected Response Sets: 47 bullets


 56%|█████▌    | 35/63 [01:55<01:39,  3.57s/it]

[DONE] Section 5 - Model Fine-Tuning and Prompt Engineering: 55 bullets


 57%|█████▋    | 36/63 [01:59<01:40,  3.74s/it]

[DONE] Section 5.1 - Model Selection: LLaMA 3 and Falcon 7B: 56 bullets
[DONE] Section 5.1.1 - RAG vs. Role-Aware Fine-Tuning: A Design Justification: 55 bullets


 60%|██████    | 38/63 [01:59<00:54,  2.20s/it]

[DONE] Section 5.2 - Supervised Fine-Tuning on Management QA Corpus: 54 bullets


 62%|██████▏   | 39/63 [02:04<01:07,  2.82s/it]

[DONE] Section 5.3 - Direct Preference Optimization (DPO) with QLoRA: 51 bullets


 63%|██████▎   | 40/63 [02:08<01:13,  3.21s/it]

[DONE] Section 5.4 - Custom Prompt Templates and Persona Embedding: 51 bullets


 65%|██████▌   | 41/63 [02:12<01:14,  3.40s/it]

[DONE] Section 5.4.1 - Static Persona Injection (User Background & Expert Role): 55 bullets


 67%|██████▋   | 42/63 [02:13<00:53,  2.57s/it]

[DONE] Section 5.4.2 - Role-Based Persona Modeling (Context-Adaptive Generation): 54 bullets
[DONE] Section 5.4.3 - Dynamic Context Windows and Memory Traces: 48 bullets


 70%|██████▉   | 44/63 [02:19<00:54,  2.86s/it]

[DONE] Section 5.5 - Implementation Details and Training Infrastructure: 56 bullets


 71%|███████▏  | 45/63 [02:24<00:58,  3.23s/it]

[DONE] Section 6 - Experimental Design and Evaluation: 54 bullets


 73%|███████▎  | 46/63 [02:27<00:54,  3.19s/it]

[DONE] Section 6.1 - Evaluation Metrics: Precision, Recall, F1-Score: 54 bullets
[DONE] Section 6.2 - Baseline vs. Fine-Tuned Model Comparisons: 54 bullets


 76%|███████▌  | 48/63 [02:27<00:28,  1.91s/it]

[DONE] Section 6.3 - User-Role Simulation and Contextual Tests: 55 bullets


 78%|███████▊  | 49/63 [02:33<00:38,  2.77s/it]

[DONE] Section 6.4 - Ablation Study on Prompt Components: 56 bullets


 79%|███████▉  | 50/63 [02:38<00:43,  3.36s/it]

[DONE] Section 6.5 - Statistical Significance and Error Analysis: 55 bullets


 81%|████████  | 51/63 [02:40<00:36,  3.01s/it]

[DONE] Section 7 - Results and Discussion: 56 bullets
[DONE] Section 7.1 - Quantitative Performance Gains (+10 % Precision, +12 % F1): 41 bullets


 84%|████████▍ | 53/63 [02:41<00:18,  1.87s/it]

[DONE] Section 7.2 - Qualitative Case Studies and Exemplars: 55 bullets


 86%|████████▌ | 54/63 [02:45<00:21,  2.42s/it]

[DONE] Section 7.3 - Limitations of the Current Approach: 56 bullets


 87%|████████▋ | 55/63 [02:52<00:29,  3.64s/it]

[DONE] Section 7.4 - Implications for University Administrative Workflows: 55 bullets


 89%|████████▉ | 56/63 [02:55<00:24,  3.54s/it]

[DONE] Section 8 - Conclusion and Future Work: 56 bullets
[DONE] Section 8.1 - Summary of Contributions: 56 bullets


 92%|█████████▏| 58/63 [02:57<00:12,  2.46s/it]

[DONE] Section 8.2 - Recommendations for Deployment: 56 bullets


 94%|█████████▎| 59/63 [02:59<00:09,  2.37s/it]

[DONE] Section 8.3 - Directions for Further Research: 56 bullets


 95%|█████████▌| 60/63 [03:05<00:09,  3.08s/it]

[DONE] Section  - List of Abbreviations: 55 bullets


 97%|█████████▋| 61/63 [03:08<00:06,  3.13s/it]

[DONE] Section  - List of Figures: 56 bullets


 98%|█████████▊| 62/63 [03:09<00:02,  2.69s/it]

[DONE] Section  - List of Tables: 56 bullets


100%|██████████| 63/63 [03:12<00:00,  3.05s/it]

[DONE] Section  - References: 56 bullets
Exported: section_hierarchy_with_bullets.json





In [25]:
# ===============================
# Section Drafting Pipeline (Hierarchy + Plans -> Drafts with Child Context)
# ===============================
import json
from typing import List, Optional
from pydantic import BaseModel

# ===============================
# 1. MODELS (recursive hierarchy)
# ===============================
class SectionNode(BaseModel):
    section: str
    title: str
    level: int
    children: List["SectionNode"] = []
    draft_content: Optional[str] = None

SectionNode.update_forward_refs()

class SectionPlan(BaseModel):
    section_number: str
    plan: str

class DraftResponse(BaseModel):
    draft: str

# ===============================
# 2. HELPERS (load/save)
# ===============================

def load_hierarchy(path: str) -> List[SectionNode]:
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return [SectionNode(**s) for s in data]


def load_plans(path: str) -> dict:
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)
    return {p["section_number"]: p["plan"] for p in raw}


def save_hierarchy(path: str, hierarchy: List[SectionNode]):
    with open(path, "w", encoding="utf-8") as f:
        json.dump([s.model_dump() for s in hierarchy], f, indent=2, ensure_ascii=False)

# ===============================
# 3. Draft Generation for Each Section
# ===============================

def generate_draft_for_section(
    section: SectionNode,
    plans: dict,
    llm_client,
    draft_model_name: str
) -> str:
    # Main section plan
    main_plan = plans.get(section.section, "")
    # Gather child headings and their plans
    child_lines = []
    for child in section.children:
        cp = plans.get(child.section, "")
        child_lines.append(f"Subsection {child.section}: {child.title}\nPlan: {cp}")
    child_context = "\n\n".join(child_lines) if child_lines else ""

    prompt = f"""
You are a meticulous academic writer drafting a thesis section. Write in a clear, human-like tone, ensure originality (no plagiarism), and maintain academic rigor.

SECTION CONTEXT
---------------
Section {section.section}: {section.title}
Plan: {main_plan}

"""
    if child_context:
        prompt += f"""
SUBSECTION CONTEXT
------------------
{child_context}

"""
    prompt += f"""
INSTRUCTIONS
------------
Based on the section and its subsections' plans above, write coherent, concise draft paragraph(s) covering the main plan and reflecting the structure implied by the subsections. Use academic style, human tone, and avoid copying verbatim from sources.

Return only the draft text.
Remeber based on the section and the content you are writing adjust the size of the draft and the tone, sna dmek it perfect liek a final version.
"""

    response = llm_client.beta.chat.completions.parse(
        model=draft_model_name,
        messages=[{"role": "user", "content": prompt}],
        response_format=DraftResponse,
        temperature=0.3,
        max_tokens=4096
    )
    return response.choices[0].message.parsed.draft

# ===============================
# 4. Recursive Draft Enrichment
# ===============================

def enrich_with_drafts(
    nodes: List[SectionNode],
    plans: dict,
    llm_client,
    draft_model_name: str
):
    for node in nodes:
        if node.section in plans:
            print(f"Drafting section {node.section} - {node.title}")
            node.draft_content = generate_draft_for_section(
                node, plans, llm_client, draft_model_name
            )
        else:
            print(f"Warning: No plan found for section {node.section}")
        if node.children:
            enrich_with_drafts(node.children, plans, llm_client, draft_model_name)

# ===============================
# 5. Main Execution
# ===============================

def main():
    hierarchy = load_hierarchy("section_hierarchy.json")
    plans = load_plans("section_plans.json")

    enrich_with_drafts(hierarchy, plans, llm_client, model_name)

    save_hierarchy("section_hierarchy_with_drafts.json", hierarchy)
    print("Drafts generated and saved to section_hierarchy_with_drafts.json")

if __name__ == "__main__":
    main()


C:\Users\YAYT\AppData\Local\Temp\ipykernel_12708\854873559.py:18: PydanticDeprecatedSince20: The `update_forward_refs` method is deprecated; use `model_rebuild` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  SectionNode.update_forward_refs()


Drafting section 1 - Introduction
Drafting section 1.1 - Motivation and Objectives
Drafting section 1.2 - Thesis Scope and Contributions
Drafting section 1.3 - Thesis Organization
Drafting section 2 - Background and Foundations
Drafting section 2.1 - Domain Background: Conversational Agents in Higher Education
Drafting section 2.1.1 - Evolution and Definitions of Conversational Agents
Drafting section 2.1.2 - Use Cases and Impact in University Administration
Drafting section 2.2 - Technical Foundations: LLM Architectures and Methods
Drafting section 2.2.1 - Transformer Architecture Overview
Drafting section 2.2.2 - Pretraining Paradigms and Domain Adaptation
Drafting section 2.2.3 - Parameter-Efficient Fine-Tuning (LoRA)
Drafting section 2.2.3.1 - LoRA Methodology and Mechanisms
Drafting section 2.2.3.2 - Comparison with Other PEFT Techniques
Drafting section 2.2.4 - Supervised Fine-Tuning vs. Direct Preference Optimization
Drafting section 2.2.4.1 - Supervised Fine-Tuning (SFT) Techni

In [28]:
# ===============================
# Enhanced Section Pipeline with Web Search & Content Integration
# ===============================
import json
import re
from typing import List, Optional, Dict, Any
from pydantic import BaseModel
import requests
from urllib.parse import urlparse, urljoin
from urllib.robotparser import RobotFileParser
import trafilatura
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# ===============================
# 1. MODELS
# ===============================
class SectionNode(BaseModel):
    section: str
    title: str
    level: int
    children: List["SectionNode"] = []
    draft_content: Optional[str] = None
    needs_search: Optional[bool] = None
    search_queries: Optional[List[str]] = None
    search_results: Optional[List[Dict[str, Any]]] = None
    enhanced_content: Optional[str] = None  # Final enhanced content with citations

SectionNode.update_forward_refs()

class SearchDecision(BaseModel):
    needs_search: bool
    queries: List[str]
    reasoning: str

class ContentEnhancement(BaseModel):
    enhanced_content: str
    citations_added: List[str]
    improvements_made: List[str]

# ===============================
# 2. HELPER FUNCTIONS
# ===============================
def load_hierarchy(path: str) -> List[SectionNode]:
    """Load section hierarchy from JSON file"""
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return [SectionNode(**s) for s in data]

def save_hierarchy(path: str, hierarchy: List[SectionNode]):
    """Save section hierarchy to JSON file"""
    with open(path, "w", encoding="utf-8") as f:
        json.dump([s.model_dump() for s in hierarchy], f, indent=2, ensure_ascii=False)

# ===============================
# 3. SEARCH DECISION LOGIC
# ===============================
def decide_search_for_section(
    section: SectionNode,
    llm_client,
    model_name: str
) -> SearchDecision:
    """Analyze section draft and decide if web search is needed"""
    
    draft = section.draft_content or ""
    
    prompt = f"""
You are an academic research assistant for a Masters thesis in Data Science about "Conversational Agents in Higher Education".

SECTION: {section.section} - {section.title} (Level {section.level})

DRAFT CONTENT:
{draft}

STRICT SEARCH CRITERIA - Only search if the section SPECIFICALLY needs:

1. TECHNICAL CITATIONS: Does this section mention specific technical methods, algorithms, or frameworks that need academic references?
2. EMPIRICAL EVIDENCE: Does this section make claims about performance, effectiveness, or results that need supporting studies?
3. LITERATURE FOUNDATION: Is this a literature review, related work, or background section that requires comprehensive citations?
4. STATISTICAL DATA: Does this section mention statistics or metrics that need source attribution?

DO NOT SEARCH FOR:
- Organizational/structural sections (thesis organization, conclusion summaries)
- General introductory statements without specific claims
- Methodology descriptions that are self-contained
- Sections that are primarily descriptive of your own work

SEARCH DECISION:
- Introduction sections: ONLY if they make specific technical or empirical claims
- Background/Literature sections: YES - these need comprehensive citations
- Technical foundation sections: YES - need authoritative sources for methods
- Methodology sections: ONLY if referencing established methods/frameworks
- Results/Discussion: ONLY if comparing to other studies or citing benchmarks
- Conclusion/Organization: NO - these don't need external citations

If search is needed, create 1-2 HIGHLY SPECIFIC search queries using exact technical terms from the draft.
Format queries as: "exact technical term" + "research papers" or "scholarly articles"

RESPONSE FORMAT:
- needs_search: true/false
- queries: ["specific technical query"] (empty if no search)
- reasoning: One sentence explaining your decision
"""

    try:
        response = llm_client.beta.chat.completions.parse(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            response_format=SearchDecision,
            temperature=0.2,
            max_tokens=800
        )
        return response.choices[0].message.parsed
    except Exception as e:
        print(f"Error in search decision for section {section.section}: {e}")
        return SearchDecision(needs_search=False, queries=[], reasoning="Error in analysis")

# ===============================
# 4. WEB SEARCH & SCRAPING
# ===============================
def bing_search(
    query: str,
    subscription_key: str,
    endpoint: str,
    count: int = 10
) -> List[Dict[str, Any]]:
    """Perform Bing web search"""
    headers = {"Ocp-Apim-Subscription-Key": subscription_key}
    
    # Enhanced academic search query
    academic_query = f'"{query}" filetype:pdf OR site:arxiv.org OR site:ieee.org OR site:acm.org OR site:springer.com OR site:sciencedirect.com OR site:researchgate.net'
    
    params = {
        "q": academic_query,
        "count": count,
        "responseFilter": "webPages",
        "safeSearch": "Strict"
    }
    
    results = []
    try:
        response = requests.get(endpoint, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        
        for i, item in enumerate(data.get("webPages", {}).get("value", [])):
            url = item.get("url", "")
            title = item.get("name", "")
            snippet = item.get("snippet", "")
            
            # Prioritize academic sources
            is_academic = any(domain in url.lower() for domain in [
                'arxiv.org', 'ieee.org', 'acm.org', 'springer.com', 
                'elsevier.com', 'sciencedirect.com', 'researchgate.net',
                'scholar.google.com', 'doi.org', 'pubmed.ncbi.nlm.nih.gov',
                'jstor.org', 'tandfonline.com', 'wiley.com'
            ])
            
            # Also check for PDF files (often research papers)
            is_pdf = url.lower().endswith('.pdf') or 'filetype:pdf' in title.lower()
            
            # Check for academic keywords in title/snippet
            academic_keywords = ['research', 'study', 'analysis', 'evaluation', 'framework', 'model', 'algorithm', 'methodology']
            has_academic_keywords = any(keyword in (title + snippet).lower() for keyword in academic_keywords)
            
            if is_academic or is_pdf or has_academic_keywords:
                results.append({
                    "url": url,
                    "title": title,
                    "snippet": snippet,
                    "rank": i,
                    "is_academic": is_academic,
                    "is_pdf": is_pdf,
                    "academic_score": sum([is_academic, is_pdf, has_academic_keywords])
                })
                
    except Exception as e:
        print(f"Bing search failed for query '{query}': {e}")
    
    # Sort by academic score (highest first), then by rank
    results.sort(key=lambda x: (-x.get("academic_score", 0), x.get("rank", 999)))
    
    return results

def is_allowed_by_robots(url: str, user_agent: str = "*") -> bool:
    """Check if URL is allowed by robots.txt"""
    try:
        parsed = urlparse(url)
        robots_url = urljoin(f"{parsed.scheme}://{parsed.netloc}", "/robots.txt")
        rp = RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        return rp.can_fetch(user_agent, url)
    except:
        return True  # If can't check, assume it's allowed

def crawl_url(url: str) -> str:
    """Extract content from URL using trafilatura"""
    try:
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            content = trafilatura.extract(downloaded, include_comments=False, include_tables=True)
            return content[:5000] if content else ""  # Limit content length
    except Exception as e:
        print(f"Failed to crawl {url}: {e}")
    return ""

def process_search_results(search_results: List[Dict[str, Any]], max_crawl: int = 5) -> List[Dict[str, Any]]:
    """Process and crawl search results"""
    # Sort by academic sources first, then by rank
    sorted_results = sorted(search_results, key=lambda x: (not x.get("is_academic", False), x.get("rank", 999)))
    
    # Check robots.txt and crawl allowed URLs
    crawlable_results = []
    for result in sorted_results[:max_crawl * 2]:  # Check more than we need
        if is_allowed_by_robots(result["url"]):
            result["can_crawl"] = True
            crawlable_results.append(result)
        if len(crawlable_results) >= max_crawl:
            break
    
    # Crawl URLs in parallel
    with ThreadPoolExecutor(max_workers=3) as executor:
        future_to_result = {
            executor.submit(crawl_url, result["url"]): result 
            for result in crawlable_results
        }
        
        crawled_results = []
        for future in as_completed(future_to_result):
            result = future_to_result[future]
            try:
                content = future.result()
                if content and len(content.strip()) > 100:  # Only keep substantial content
                    result["crawled_content"] = content
                    crawled_results.append(result)
            except Exception as e:
                print(f"Error crawling {result['url']}: {e}")
    
    return crawled_results

# ===============================
# 5. CONTENT ENHANCEMENT
# ===============================
def enhance_section_with_research(
    section: SectionNode,
    llm_client,
    model_name: str
) -> ContentEnhancement:
    """Enhance section content with research findings and proper citations"""
    
    draft = section.draft_content or ""
    search_results = section.search_results or []
    
    # Prepare research context
    research_context = []
    for i, result in enumerate(search_results, 1):
        research_context.append(f"""
SOURCE {i}:
Title: {result.get('title', 'Unknown')}
URL: {result.get('url', '')}
Snippet: {result.get('snippet', '')}
Content: {result.get('crawled_content', '')[:1500]}...
Academic Score: {result.get('academic_score', 0)}
""")
    
    research_text = "\n".join(research_context)
    
    prompt = f"""
You are an expert academic writer enhancing a Masters thesis section with research citations.

THESIS CONTEXT: "Conversational Agents in Higher Education" - Data Science Masters at Friedrich Alexander University

SECTION: {section.section} - {section.title} (Level {section.level})

ORIGINAL DRAFT:
{draft}

RESEARCH SOURCES:
{research_text}

ENHANCEMENT GUIDELINES:
1. SELECTIVE INTEGRATION: Only integrate research that directly supports or extends the draft content
2. ACADEMIC CITATIONS: Use proper format (Author, Year) or [Reference Number] 
3. STRENGTHEN CLAIMS: Back factual statements with evidence from credible sources
4. MAINTAIN FLOW: Keep the original structure and writing style
5. ADD VALUE: Include specific statistics, methodologies, or findings that enhance understanding
6. INDICATE IMAGES: Where technical concepts need visualization, add "[Image needed: description]"

QUALITY STANDARDS:
- Prioritize sources with higher academic scores
- Only cite sources that are genuinely relevant to the section content
- Don't force citations where they don't naturally fit
- Maintain academic rigor while keeping readability
- Add specific technical details from the research where appropriate

SPECIAL INSTRUCTIONS:
- If discussing technical methods, include implementation details from sources
- For background sections, focus on recent developments and key studies
- For methodology sections, cite authoritative sources for techniques used
- Add "[Image needed: X]" where diagrams, architectures, or charts would help explain concepts

OUTPUT REQUIREMENTS:
- enhanced_content: Improved section with integrated research and proper citations
- citations_added: List of specific citations you added with brief relevance explanation
- improvements_made: List of key enhancements made to strengthen the academic quality
"""

    try:
        response = llm_client.beta.chat.completions.parse(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            response_format=ContentEnhancement,
            temperature=0.3,
            max_tokens=4096
        )
        return response.choices[0].message.parsed
    except Exception as e:
        print(f"Error enhancing section {section.section}: {e}")
        return ContentEnhancement(
            enhanced_content=draft,
            citations_added=[],
            improvements_made=["Error occurred during enhancement"]
        )

# ===============================
# 6. MAIN PROCESSING PIPELINE
# ===============================
def process_section_with_research(
    section: SectionNode,
    llm_client,
    model_name: str,
    bing_key: str,
    bing_endpoint: str
) -> None:
    """Process a single section: decide search -> search -> enhance"""
    
    print(f"\n{'='*60}")
    print(f"Processing: {section.section} - {section.title}")
    print(f"Level: {section.level}")
    
    # Step 1: Decide if search is needed
    print("Step 1: Analyzing search requirements...")
    decision = decide_search_for_section(section, llm_client, model_name)
    section.needs_search = decision.needs_search
    section.search_queries = decision.queries
    
    print(f"Search needed: {decision.needs_search}")
    print(f"Reasoning: {decision.reasoning}")
    
    if decision.needs_search and decision.queries:
        print(f"Search queries: {decision.queries}")
        
        # Step 2: Perform web search
        print("Step 2: Performing web searches...")
        all_results = []
        for query in decision.queries:
            print(f"  Searching: {query}")
            results = bing_search(query, bing_key, bing_endpoint)
            all_results.extend(results)
            time.sleep(1)  # Rate limiting
        
        # Step 3: Process and crawl results
        print("Step 3: Processing search results...")
        section.search_results = process_search_results(all_results)
        print(f"  Crawled {len(section.search_results)} sources")
        
        # Step 4: Enhance content
        if section.search_results:
            print("Step 4: Enhancing content with research...")
            enhancement = enhance_section_with_research(section, llm_client, model_name)
            section.enhanced_content = enhancement.enhanced_content
            print(f"  Citations added: {len(enhancement.citations_added)}")
            print(f"  Improvements: {len(enhancement.improvements_made)}")
        else:
            print("Step 4: No search results to integrate")
            section.enhanced_content = section.draft_content
    else:
        print("No search needed - keeping original draft")
        section.enhanced_content = section.draft_content
        section.search_results = []

def process_hierarchy_with_research(
    nodes: List[SectionNode],
    llm_client,
    model_name: str,
    bing_key: str,
    bing_endpoint: str
) -> None:
    """Recursively process all sections in the hierarchy"""
    
    for node in nodes:
        if node.draft_content:  # Only process sections with drafts
            process_section_with_research(node, llm_client, model_name, bing_key, bing_endpoint)
        
        # Process children
        if node.children:
            process_hierarchy_with_research(node.children, llm_client, model_name, bing_key, bing_endpoint)

# ===============================
# 7. MAIN EXECUTION
# ===============================
def main():
    """Main execution function"""
    
    # Configuration
    BING_KEY = "a7079abbee3b4a12a6db317f03bf13ca"
    BING_ENDPOINT = "https://api.bing.microsoft.com/v7.0/search"
    
    print("Loading section hierarchy with drafts...")
    hierarchy = load_hierarchy("section_hierarchy_with_drafts.json")
    
    print(f"Loaded {len(hierarchy)} root sections")
    
    # Process all sections
    print("\nStarting research enhancement pipeline...")
    process_hierarchy_with_research(
        hierarchy,
        llm_client,
        model_name,
        BING_KEY,
        BING_ENDPOINT
    )
    
    # Save enhanced hierarchy
    print("\nSaving enhanced hierarchy...")
    save_hierarchy("section_hierarchy_enhanced.json", hierarchy)
    
    # Generate summary report
    print("\n" + "="*60)
    print("ENHANCEMENT SUMMARY")
    print("="*60)
    
    def print_summary(nodes, indent=0):
        for node in nodes:
            if node.draft_content:
                prefix = "  " * indent
                search_status = "✓ ENHANCED" if node.needs_search else "○ ORIGINAL"
                search_count = len(node.search_results) if node.search_results else 0
                print(f"{prefix}{node.section} - {node.title} [{search_status}] ({search_count} sources)")
            
            if node.children:
                print_summary(node.children, indent + 1)
    
    print_summary(hierarchy)
    print(f"\nEnhanced hierarchy saved to: section_hierarchy_enhanced.json")

if __name__ == "__main__":
    main()

C:\Users\YAYT\AppData\Local\Temp\ipykernel_12708\1315602683.py:29: PydanticDeprecatedSince20: The `update_forward_refs` method is deprecated; use `model_rebuild` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  SectionNode.update_forward_refs()


Loading section hierarchy with drafts...
Loaded 12 root sections

Starting research enhancement pipeline...

Processing: 1 - Introduction
Level: 0
Step 1: Analyzing search requirements...
Search needed: True
Reasoning: The introduction makes claims about the effectiveness of conversational agents and language models in administrative processes, which require empirical evidence and technical citations.
Search queries: ['conversational agents in higher education research papers', 'language models administrative efficiency empirical studies']
Step 2: Performing web searches...
  Searching: conversational agents in higher education research papers
Search needed: True
Reasoning: The introduction makes claims about the effectiveness of conversational agents and language models in administrative processes, which require empirical evidence and technical citations.
Search queries: ['conversational agents in higher education research papers', 'language models administrative efficiency empirical 

In [30]:
# ===============================
# MARKDOWN THESIS GENERATOR (FROM ENHANCED JSON)
# ===============================
import json
from typing import List

def generate_markdown_from_enhanced_json(
    input_file: str = "section_hierarchy_enhanced.json",
    output_file: str = "Complete_Thesis.md"
):
    """
    Convert enhanced section hierarchy JSON to complete thesis markdown
    """
    
    # Load the enhanced hierarchy
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    sections = [SectionNode(**s) for s in data]
    
    # Start building markdown content
    markdown_content = []
    
    # Add thesis header
    markdown_content.append("""# Conversational Agents in Higher Education
## A Data Science Approach to University Administrative Support

**Masters Thesis in Data Science**  
Friedrich Alexander University, Erlangen, Germany

---

""")
    
    def process_node(node: SectionNode, depth: int = 0):
        """Recursively process each section node"""
        
        # Determine markdown heading level (# ## ### etc.)
        heading_level = "#" * (depth + 1)
        
        # Add section heading
        markdown_content.append(f"{heading_level} {node.section} {node.title}\n")
        
        # Add the enhanced content (or draft if no enhancement)
        content = node.enhanced_content or node.draft_content or ""
        if content.strip():
            markdown_content.append(f"{content}\n")
        
        # Add some spacing
        markdown_content.append("\n")
        
        # Process children recursively
        for child in node.children:
            process_node(child, depth + 1)
    
    # Process all root sections
    for section in sections:
        process_node(section)
    
    # Add bibliography placeholder
    markdown_content.append("""---

## References

[References will be populated based on the citations added during enhancement]

---

## List of Figures

[Figure references will be populated based on "[Image needed: X]" placeholders in the content]

---

## List of Tables

[Table references will be populated based on content analysis]

---

## Appendices

[Additional materials and supplementary content]

""")
    
    # Join all content and save
    final_markdown = "\n".join(markdown_content)
    
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(final_markdown)
    
    print(f"✅ Complete thesis saved as: {output_file}")
    print(f"📄 Total length: {len(final_markdown)} characters")
    
    # Print summary of sections processed
    def count_sections(nodes, level=0):
        count = 0
        for node in nodes:
            if node.enhanced_content or node.draft_content:
                count += 1
            count += count_sections(node.children, level + 1)
        return count
    
    total_sections = count_sections(sections)
    print(f"📚 Sections processed: {total_sections}")
    
    return final_markdown

# ===============================
# ENHANCED MARKDOWN GENERATOR (WITH METADATA)
# ===============================
def generate_detailed_markdown_with_stats(
    input_file: str = "section_hierarchy_enhanced.json",
    output_file: str = "Complete_Thesis_Detailed.md"
):
    """
    Generate markdown with additional metadata and statistics
    """
    
    # Load the enhanced hierarchy
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    sections = [SectionNode(**s) for s in data]
    
    # Collect statistics
    stats = {
        "total_sections": 0,
        "enhanced_sections": 0,
        "citations_count": 0,
        "image_placeholders": 0,
        "total_words": 0
    }
    
    markdown_content = []
    
    # Add comprehensive header
    markdown_content.append(f"""# Conversational Agents in Higher Education
## A Data Science Approach to University Administrative Support

**Masters Thesis in Data Science**  
Friedrich Alexander University, Erlangen, Germany

**Author:** [Your Name]  
**Supervisor:** [Supervisor Name]  
**Date:** {__import__('datetime').datetime.now().strftime('%B %Y')}

---

## Abstract

This thesis explores the development and implementation of conversational agents specifically designed for higher education administrative support. Through advanced data science methodologies and large language model fine-tuning, we present a comprehensive framework for enhancing university administrative efficiency while maintaining academic rigor and user satisfaction.

---

## Table of Contents

""")
    
    # Generate table of contents
    def generate_toc(nodes, depth=0):
        toc_lines = []
        for node in nodes:
            indent = "  " * depth
            toc_lines.append(f"{indent}- [{node.section} {node.title}](#{node.section.lower().replace('.', '')}-{node.title.lower().replace(' ', '-').replace(':', '').replace('(', '').replace(')', '')})")
            if node.children:
                toc_lines.extend(generate_toc(node.children, depth + 1))
        return toc_lines
    
    toc = generate_toc(sections)
    markdown_content.extend(toc)
    markdown_content.append("\n---\n")
    
    def process_node_detailed(node: SectionNode, depth: int = 0):
        """Process node with detailed statistics tracking"""
        
        # Update statistics
        stats["total_sections"] += 1
        
        # Determine markdown heading level
        heading_level = "#" * (depth + 1)
        
        # Create anchor-friendly section ID
        section_id = f"{node.section.lower().replace('.', '')}-{node.title.lower().replace(' ', '-').replace(':', '').replace('(', '').replace(')', '')}"
        
        # Add section heading with anchor
        markdown_content.append(f'{heading_level} {node.section} {node.title} {{#{section_id}}}\n')
        
        # Get content
        content = node.enhanced_content or node.draft_content or ""
        
        if content.strip():
            # Track statistics
            if node.enhanced_content:
                stats["enhanced_sections"] += 1
            
            # Count citations (look for patterns like (Author, Year) or [Reference])
            citation_patterns = len(__import__('re').findall(r'\([A-Za-z]+,?\s+\d{4}\)|\[\d+\]|\[Reference\s+\d+\]', content))
            stats["citations_count"] += citation_patterns
            
            # Count image placeholders
            image_patterns = len(__import__('re').findall(r'\[Image needed:.*?\]', content))
            stats["image_placeholders"] += image_patterns
            
            # Count words
            word_count = len(content.split())
            stats["total_words"] += word_count
            
            # Add content with metadata comment
            markdown_content.append(f"<!-- Section {node.section}: {word_count} words, {citation_patterns} citations, {image_patterns} images -->\n")
            markdown_content.append(f"{content}\n")
        
        # Add spacing
        markdown_content.append("\n")
        
        # Process children
        for child in node.children:
            process_node_detailed(child, depth + 1)
    
    # Process all sections
    for section in sections:
        process_node_detailed(section)
    
    # Add comprehensive footer
    markdown_content.append(f"""---

## Document Statistics

- **Total Sections:** {stats['total_sections']}
- **Enhanced Sections:** {stats['enhanced_sections']} ({stats['enhanced_sections']/stats['total_sections']*100:.1f}%)
- **Total Word Count:** {stats['total_words']:,} words
- **Citations Added:** {stats['citations_count']}
- **Image Placeholders:** {stats['image_placeholders']}

---

## References

*Note: This section should be populated with the actual references based on citations used throughout the thesis.*

---

## List of Figures

*Note: Figures should be created based on the {stats['image_placeholders']} "[Image needed: X]" placeholders identified in the content.*

---

## Appendices

### Appendix A: Technical Implementation Details
### Appendix B: Data Processing Workflows  
### Appendix C: Evaluation Metrics and Results
### Appendix D: Source Code and Documentation

---

*Generated on {__import__('datetime').datetime.now().strftime('%B %d, %Y at %H:%M')}*
""")
    
    # Save the file
    final_markdown = "\n".join(markdown_content)
    
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(final_markdown)
    
    print(f"✅ Detailed thesis saved as: {output_file}")
    print(f"📊 Statistics:")
    print(f"   📄 Total sections: {stats['total_sections']}")
    print(f"   ✨ Enhanced sections: {stats['enhanced_sections']} ({stats['enhanced_sections']/stats['total_sections']*100:.1f}%)")
    print(f"   📝 Total words: {stats['total_words']:,}")
    print(f"   📚 Citations: {stats['citations_count']}")
    print(f"   🖼️ Image placeholders: {stats['image_placeholders']}")
    
    return final_markdown

# ===============================
# EXECUTION FUNCTIONS
# ===============================
def create_thesis_markdown():
    """Create both simple and detailed markdown versions"""
    
    print("🚀 Starting thesis markdown generation...")
    
    # Check if enhanced file exists
    import os
    if not os.path.exists("section_hierarchy_enhanced.json"):
        print("❌ Error: section_hierarchy_enhanced.json not found!")
        print("   Please run the enhancement pipeline first.")
        return
    
    # Generate simple version
    print("\n📝 Generating simple thesis markdown...")
    generate_markdown_from_enhanced_json()
    
    # Generate detailed version
    print("\n📊 Generating detailed thesis markdown with statistics...")
    generate_detailed_markdown_with_stats()
    
    print("\n✅ Thesis markdown generation complete!")
    print("📁 Files created:")
    print("   • Complete_Thesis.md (simple version)")
    print("   • Complete_Thesis_Detailed.md (with statistics)")

if __name__ == "__main__":
    create_thesis_markdown()

🚀 Starting thesis markdown generation...

📝 Generating simple thesis markdown...
✅ Complete thesis saved as: Complete_Thesis.md
📄 Total length: 197120 characters
📚 Sections processed: 59

📊 Generating detailed thesis markdown with statistics...
✅ Detailed thesis saved as: Complete_Thesis_Detailed.md
📊 Statistics:
   📄 Total sections: 63
   ✨ Enhanced sections: 59 (93.7%)
   📝 Total words: 25,919
   📚 Citations: 65
   🖼️ Image placeholders: 51

✅ Thesis markdown generation complete!
📁 Files created:
   • Complete_Thesis.md (simple version)
   • Complete_Thesis_Detailed.md (with statistics)
