# Making the model

- Logic/changes
    - using large models
    - First generate a detailed page summary (from the page text and the cumulative summary), then use this page summary together with the cumulative summary to generate the questions (along with each answer and answer explanation)
    - Removing validation layer
    - Condense the detailed page summary into a concise page summary
    - Then also add this page summary to the cumulative summary

### Basic imports and PDF splitting

In [13]:
import os
import json
from pathlib import Path
from dotenv import load_dotenv
from PyPDF2 import PdfReader, PdfWriter
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.schema.runnable import RunnableParallel
# from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PDFMinerLoader
from langchain_core.runnables.passthrough import RunnableAssign

# Load settings from a local .env file; override=True lets values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# load_dotenv() has already placed the .env values in os.environ, so there is
# nothing to re-assign. The former `os.environ[k] = os.getenv(k)` lines only
# made a difference when a variable was missing, where they crashed with an
# opaque "TypeError: str expected, not NoneType". Fail early with a message
# that names the missing variables instead.
# (Switching to the Google Gemini model would additionally need GOOGLE_API_KEY.)
_required_env_vars = (
    "OPENAI_API_KEY",
    "LANGCHAIN_API_KEY",
    "LANGCHAIN_PROJECT",
    "LANGCHAIN_TRACING_V2",
)
_missing_env_vars = [name for name in _required_env_vars if os.getenv(name) is None]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Define them in the environment or in the .env file."
    )

input_pdf_path = Path(r"C:\Users\USER\Desktop\QA-Model\data\movie1\avengersEndgameScript.pdf")  # Replace with your PDF path
pdf_output_folder = Path(r"C:\Users\USER\Desktop\QA-Model\data\movie1\split_pdf")  # Replace with your output folder path
output_folder = Path(r"C:\Users\USER\Desktop\QA-Model\data\movie1\output")

# Running text files that accumulate the per-page summaries.
cumulative_detailed_summary_file = output_folder / "cumulative_detailed_summary.txt"
cumulative_concise_summary_file = output_folder / "cumulative_concise_summary.txt"

# Ensure both output folders exist before anything is written to them.
pdf_output_folder.mkdir(parents=True, exist_ok=True)
output_folder.mkdir(parents=True, exist_ok=True)

### Splitting the PDF

In [14]:
def split_pdf(input_pdf_path, output_folder):
    """Write every page of a PDF to its own single-page PDF file.

    The files are named ``page_1.pdf``, ``page_2.pdf``, ... (1-based) and are
    placed inside ``output_folder``.

    Args:
        input_pdf_path: Location of the PDF to split.
        output_folder: Directory that receives the per-page files, given as a
            ``str`` or ``Path``. It is created if it does not exist yet.

    Returns:
        The number of pages in the input PDF, i.e. the number of files written.
    """
    # Coerce to Path so a plain string also works with the "/" operator below,
    # and make sure the target directory exists before writing into it.
    output_folder = Path(output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)

    reader = PdfReader(input_pdf_path)
    total_pages = len(reader.pages)

    for page_number, page in enumerate(reader.pages, start=1):
        # A fresh writer per page keeps each output file to exactly one page.
        writer = PdfWriter()
        writer.add_page(page)

        output_file_path = output_folder / f"page_{page_number}.pdf"
        with output_file_path.open("wb") as output_file:
            writer.write(output_file)
    return total_pages


total_pdfs = split_pdf(input_pdf_path, pdf_output_folder)

### Getting the model

In [15]:
# Two chat models: one writes the page summaries, the other writes the questions.
# Author's note: no temperature is passed because the models were found to
# accept only the default value of 1.
# NOTE(review): confirm that this restriction really applies to both models.
summary_generation_model = ChatOpenAI(model="gpt-4.1-mini-2025-04-14")
question_generation_model = ChatOpenAI(model="o4-mini-2025-04-16")

### Getting the output parser

In [16]:
# Rubric text sent to the model as the description of the page rating field.
_page_rating_description = """
RUTHLESSLY rate 1-10 CURRENT PAGE engagement (MUST BE HYPER-CRITICAL):

10-Point Reality Check:
10 = Story-defining climax (ONLY 1-2x per entire story)
9 = Genre-changing twist (alters fundamental plot trajectory)
8 = Protagonist life-threatening crisis (permanent consequences)
7 = Major character betrayal/irreversible decision
6 = Standard action scene (no lasting story impact)
5 = Routine character development (backstory reveals)
4 = Transitional dialogue (moving between locations)
3 = Atmospheric descriptions (weather, environments)
2 = Filler content (characters eating/sleeping)
1 = Pure mechanical transition ("They left the building")

Rating Restrictions:
- 9-10: Reserved for pages that would make viewers gasp aloud
- 7-8: Requires permanent plot/relationship changes
- 5-6: Default for competent but unremarkable pages
- 1-4: For pages authors would consider deleting

Anti-Bias Rules:
❌ Never rate based on previous/future pages
❌ No extra points for "setup" or "foreshadowing"
❌ Dialogue-heavy ≠ automatically interesting
⛔ If unsure between two numbers, PICK THE LOWER ONE

Distribution Guidance:
• 9-10: Extreamly rare
• 7-8: Very rare
• 1-6: Common
"""

# Shape of one generated entry: a Yes/No question, its answer and an explanation.
_question_item_schema = {
    "type": "object",
    "properties": {
        "question": {
            "type": "string",
            "description": "Compose a question that can be answered with a simple Yes or No",
        },
        "correct_answer": {
            "type": "string",
            "enum": ["Yes", "No"],
            "description": "Return the correct answer to the question",
        },
        "answer_explanation": {
            "type": "string",
            "description": "Write the explanation for the correct answer",
        },
    },
    "required": ["question", "correct_answer", "answer_explanation"],
}

# JSON schema for the question model's structured output: the list of
# question entries plus a 1-10 rating (encoded as a string) for the page.
question_json_schema = {
    "title": "question_answers_and_explanations",
    "type": "object",
    "properties": {
        "all_questions": {
            "type": "array",
            "items": _question_item_schema,
            "description": "This array contains all the questions, correct answers and explanations",
        },
        "Page_interesting_rating": {
            "type": "string",
            "enum": [str(score) for score in range(1, 11)],
            "description": _page_rating_description,
        },
    },
    "required": ["all_questions", "Page_interesting_rating"],
}

In [17]:
# Description attached to each string field the summary model must return.
# NOTE(review): the summary prompt asks for 5-7 sentences in the concise
# summary while this description says 3-5 — decide which one is intended.
_summary_field_descriptions = {
    "detail_page_summary": "Comprehensive summary including characters, dialogue, actions, and plot details",
    "concise_page_summary": "Condensed summary focusing only on major plot developments and key actions (3-5 sentences max)",
}

# JSON schema for the summary model's structured output; both fields are
# required strings.
summary_json_schema = {
    "title": "detailed_summary_and_concise_summary",
    "type": "object",
    "properties": {
        field_name: {"type": "string", "description": field_description}
        for field_name, field_description in _summary_field_descriptions.items()
    },
    "required": list(_summary_field_descriptions),
}

In [18]:
# Wrap each chat model so that its reply is returned in the shape described by
# the matching JSON schema.
structured_summary_generation_model = summary_generation_model.with_structured_output(
    summary_json_schema
)
structured_question_generation_model = question_generation_model.with_structured_output(
    question_json_schema
)

In [19]:
def _flatten_summaries(state):
    """Lift the two summaries out of ``state["both_summary"]`` to top-level keys.

    Only the page text, the cumulative concise summary and the two page
    summaries are kept; the nested ``both_summary`` entry itself is dropped.
    """
    both_summary = state["both_summary"]
    return {
        "page_text": state["page_text"],
        "cumulative_concise_summary": state["cumulative_concise_summary"],
        "detail_page_summary": both_summary["detail_page_summary"],
        "concise_page_summary": both_summary["concise_page_summary"],
    }


# Chain step placed between summary generation and question generation.
extract = RunnableLambda(_flatten_summaries)

### Making the prompt

In [20]:
# Instructions for the question model. The two placeholders are filled with the
# story so far (concise) and the detailed summary of the current page.
_question_template = """Generate 10 diverse questions, answer and answer explanation by following these STRICT RULES:
   
### VALIDATION RULES

   1. Questions must be answerable with Yes/No.
   2. Base questions only on explicit information presnt in current page summary.
   3. If insufficient information for 10 questions, generate fewer questions.
   4. Ensure answers are unambiguous.
   5. Avoid similar questions about the same fact
   6. Include varied question types (actions, dialogue, presence, etc.)
   7. For answer explanations:
      - MUST use: "In the scene...", "As shown...", "Through dialogue...", "Visually we see..."
      - BANNED PHRASES: "summary", "script", "page", "document", "text"
   8. If answer cannot be DIRECTLY VERIFIED from given content:
      - DO NOT create the question
   9. Absolute prohibitions:
      - No assumptions beyond given context


### CONTEXT 

   Previous Story Context (for reference only):
   \n{cumulative_concise_summary}\n

   Current Page Summary:
   \n{detail_page_summary}\n

"""

question_prompt = PromptTemplate(
    template=_question_template,
    input_variables=["detail_page_summary", "cumulative_concise_summary"],
)

In [21]:
# Prompt for the summary model. It asks for two outputs per script page: a
# detailed, strictly literal summary and a short summary that links the page to
# the story so far. Placeholders: the current page text and the cumulative
# concise summary of the previous pages.
#
# The second section used to be headed "CONSISTENT SUMMARY", which matched
# neither the "CONCISE SUMMARY" named in the task sentence nor the
# `concise_page_summary` output field; the header now uses the same name so the
# model can map the section to the field it has to fill.
# NOTE(review): this prompt asks for 5-7 sentences while the output schema
# describes the concise summary as "3-5 sentences max" — pick one.
summary_prompt = PromptTemplate(
    input_variables=["page_text", "cumulative_concise_summary"],
    template="""Analyze this movie script page with STRICT ADHERENCE TO VISIBLE CONTENT ONLY to generate a DETAILED SUMMARY and CONCISE SUMMARY.

# DETAILED SUMMARY
- Include:
    1. Characters:
        - ONLY newly introduced characters with DESCRIPTIONS EXPLICITLY STATED
        - If none: "No new characters introduced"
    2. Dialogue
        - EXACT quotes from 3-5 CRUCIAL exchanges
        - If none: "No key dialogue exchanges"
    3. Actions:
        - Specific actions taken by characters
        - If none: "No significant physical actions"
    4. Plot:
        - Important plot developments
        - If none: "No plot advancements"
    5. Locations:
        - Scene locations and time references
        - If vague: "Location and time unspecified"
    6. Background:
        - Background information
        - If none: "No background information"
- Absolute Prohibitions:
    1. Using Context from previous page to generate DeTAILED SUMMARY
    2. NO character psychology interpretation
    3. NO dialogue paraphrasing
    4. NO location extrapolation
    5. NO time assumptions
- Format with markdown headers:
  ### Characters
  ### Dialogue
  ### Actions
  ### Plot
  ### Locations
  ### Background

# CONCISE SUMMARY
- Construction Rules:
    1. MUST connect to previous context using TEMPORAL MARKERS:
        - "Following [previous event]..."
        - "While [ongoing action]..."
        - "[Time] later..."
    2. If no connection possible: Start with "Meanwhile..." using current facts
    3. Maintain consistent character voices
    4. Highlight plot consequences
    5. ONLY use CURRENT PAGE FACTS + EXPLICIT PREVIOUS CONTEXT
- Output Requirements:
    - Novel-style prose
    - 5-7 sentences 
    - Focus on overall storyline
    - Cause-effect chain emphasis

# Fallback Protocol
    - For ANY ambiguous/missing information:
        1. Acknowledge uncertainty explicitly
        2. DO NOT attempt to "fill in gaps"
        3. Use phrase: "Not explicitly stated in scene"
    - If page contains:
        1. Blank page -> Return empty summaries with "Empty page" note
        2. Transition scenes with no developments -> Return empty summaries with "Scene transition" note
        3. Copyright notices -> Return empty summaries with "Copyright notice" note

# CONTEXT
- Context from previous pages (for reference only):
\n{cumulative_concise_summary}\n

- Current Page Content:
\n{page_text}\n

"""
)

### Making the model in a loop with chain

In [22]:
# The chains do not depend on the page being processed, so build them once
# instead of on every iteration.
summary_chain = summary_prompt | structured_summary_generation_model
question_chain = question_prompt | structured_question_generation_model

# chain1 stores the model's two summaries under "both_summary", `extract`
# lifts them to top-level keys, and chain2 stores the generated questions
# under "question".
chain1 = RunnableAssign(RunnableParallel({"both_summary": summary_chain}))
chain2 = RunnableAssign(RunnableParallel({"question": question_chain}))
final_chain = chain1 | extract | chain2

cumulative_concise_summary = ""
cumulative_detailed_summary = ""
for current_page_number in range(1, total_pdfs + 1):
    pdf_name = pdf_output_folder / f"page_{current_page_number}.pdf"
    docs = PDFMinerLoader(pdf_name).load()
    # Guard against a loader result with no documents (e.g. a page without
    # extractable text): pass empty text so the prompt's blank-page fallback
    # applies instead of failing with IndexError.
    page_text = docs[0].page_content if docs else ""

    # Only the concise running summary is fed back to the models as context.
    result = final_chain.invoke(
        {
            "page_text": page_text,
            "cumulative_concise_summary": cumulative_concise_summary,
        }
    )

    question = result["question"]
    concise_page_summary = result["concise_page_summary"]
    detailed_page_summary = result["detail_page_summary"]

    # Save this page's questions to output/page_<n>/question.json.
    page_folder = output_folder / f"page_{current_page_number}"
    page_folder.mkdir(parents=True, exist_ok=True)
    question_json = page_folder / "question.json"

    # Explicit UTF-8: without it the platform's locale encoding is used, which
    # on Windows can raise UnicodeEncodeError for characters in model output.
    with question_json.open("w", encoding="utf-8") as f:
        json.dump(question, f, indent=4)

    cumulative_concise_summary += f"\n\nPage {current_page_number} Summary:\n{concise_page_summary}\n"
    cumulative_detailed_summary += f"\n\nPage {current_page_number} Summary:\n{detailed_page_summary}\n"

    # Both files are rewritten in full after every page, so what is on disk
    # always covers the pages processed so far.
    with cumulative_detailed_summary_file.open("w", encoding="utf-8") as f:
        f.write(cumulative_detailed_summary)

    with cumulative_concise_summary_file.open("w", encoding="utf-8") as f:
        f.write(cumulative_concise_summary)

### Testing token size for each summary

In [22]:
# Token counts of the two accumulated summaries, as measured by the summary
# model's tokenizer: `a` is the detailed text, `b` the concise one.
b = summary_generation_model.get_num_tokens(cumulative_concise_summary)
a = summary_generation_model.get_num_tokens(cumulative_detailed_summary)

In [23]:
# Show the detailed count first, then the concise count, one per line.
print(a, b, sep="\n")

46012
14434
