## Imports

In [1]:
import os
import json
import whisper
from fpdf import FPDF
from pathlib import Path
from dotenv import load_dotenv
from moviepy import VideoFileClip
from PyPDF2 import PdfReader, PdfWriter
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.schema.runnable import RunnableParallel
from langchain_google_genai import ChatGoogleGenerativeAI
# from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PDFMinerLoader
from langchain_core.runnables.passthrough import RunnableAssign

# Read a local .env file into the process environment. override=True lets the
# values from the file replace variables that are already set.
load_dotenv(override=True)

# os.environ only accepts strings, so the previous `os.environ[k] = os.getenv(k)`
# pattern was a no-op when the variable existed and an opaque TypeError when it
# did not. Check for the variables explicitly instead.
# GOOGLE_API_KEY is treated as mandatory (presumably read by the Gemini chat
# models created further down - confirm).
_missing_required = [name for name in ("GOOGLE_API_KEY",) if not os.getenv(name)]
if _missing_required:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_required)
        + ". Add them to your .env file or shell environment."
    )
# os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

# LangChain tracing settings: a missing value is reported but is not fatal.
for _name in ("LANGCHAIN_API_KEY", "LANGCHAIN_PROJECT", "LANGCHAIN_TRACING_V2"):
    if os.getenv(_name) is None:
        print(f"Note: {_name} is not set; continuing without it.")

# Root of the project's data folder. The default is the original author's
# location; set the QA_DATA_DIR environment variable to run the notebook from
# another machine or checkout without editing this cell.
data_dir = Path(os.getenv("QA_DATA_DIR", r"C:\Users\USER\Desktop\QA-Model-Lacture\data"))

# Input folders.
input_video_dir = data_dir / "VIDEO"
input_audio_dir = data_dir / "audio"
input_pdf_dir = data_dir / "pdfs"

# Output folders. NOTE: the "ouput" spelling is kept on purpose so results that
# already exist on disk are still found.
output_dir = data_dir / "ouput"
split_pdf_dir = output_dir / "split_pdfs"

# Files handled in this run.
input_video_path = input_video_dir / "1.mp4"
input_audio_path = input_audio_dir / "1.mp3"
input_pdf_path = input_pdf_dir / "1.pdf"

cumulative_concise_summary_file = output_dir / "cumulative_concise_summary.txt"
cumulative_detailed_summary_file = output_dir / "cumulative_detailed_summary.txt"

# Ensure every folder the notebook writes into exists.
for _folder in (output_dir, split_pdf_dir, input_audio_dir, input_pdf_dir):
    _folder.mkdir(parents=True, exist_ok=True)

## Converting video to audio

In [2]:
def video_to_audio(video_path, output_path=None):
    """Extract the audio track of a video file and save it as an MP3.

    Args:
        video_path: Path (str or Path) of the source video.
        output_path: Where to write the MP3. When omitted, the file is written
            next to the video with the same name and an ``.mp3`` suffix.

    Returns:
        The path the audio was written to: ``output_path`` exactly as passed
        in, or the derived ``Path`` when it was omitted.

    Raises:
        FileNotFoundError: ``video_path`` does not exist.
        RuntimeError: Anything went wrong during conversion (including a video
            without an audio track); the original error is chained as the cause.
    """
    # Validate input path
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")

    # The signature advertises output_path as optional, so give it a real default
    # instead of passing None down to the writer and the cleanup code.
    if output_path is None:
        output_path = Path(video_path).with_suffix(".mp3")

    # Make sure the destination folder exists before anything is written.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    try:
        with VideoFileClip(video_path) as video:
            audio = video.audio
            if audio is None:
                # Without this guard the failure surfaces as an AttributeError on None.
                raise ValueError(f"Video has no audio track: {video_path}")
            audio.write_audiofile(
                output_path,
                codec='mp3',
                bitrate='192k',
                logger=None  # Disable progress bar for cleaner output
            )
        return output_path
    except Exception as e:
        # Clean up partial files on error
        if os.path.exists(output_path):
            os.remove(output_path)
        # Chain the original exception so the full traceback stays available.
        raise RuntimeError(f"Conversion failed: {str(e)}") from e

In [3]:
# Extract the audio track of the input video and write it as MP3 to input_audio_path.
video_to_audio(input_video_path, output_path=input_audio_path)

WindowsPath('C:/Users/USER/Desktop/QA-Model-Lacture/data/audio/1.mp3')

## Converting audio to text using openai-whisper, PyTorch, and ffmpeg

- With the turbo model, a 22.06-minute audio file (.mp3) takes about 12 minutes to transcribe. We would need to deploy this on a t3.xlarge EC2 instance, which costs around 63 dollars per month, i.e. about 0.0875 dollars per hour (63 / 720 hours). Converting a 1-hour audio file would take roughly 30 minutes and therefore cost about 0.04375 dollars.

In [4]:
# Load the Whisper model named "turbo"; the transcription cell calls model.transcribe().
model = whisper.load_model("turbo")

In [5]:
# Transcribe the extracted audio file. The path is converted to str before being
# handed to Whisper; the transcript is later read from result["text"].
result = model.transcribe(str(input_audio_path))
# print(result["text"])



# Saving the text as a PDF

In [6]:
def save_text_to_pdf(text, output_path):
    """Render plain text into a simple, automatically paginated PDF.

    Args:
        text: The text to write. ``None`` is treated as an empty string.
        output_path: Destination file (str or Path).

    Notes:
        The built-in "Arial" font used here only covers Latin-1. Common
        typographic punctuation is mapped to ASCII look-alikes and any other
        character outside Latin-1 is replaced by "?" so that the PDF can always
        be written instead of failing with an encoding error.
    """
    text = "" if text is None else str(text)

    # Map frequent typographic punctuation to plain equivalents first, so the
    # generic "?" replacement below only hits characters with no close match.
    punctuation = str.maketrans({
        "\u2018": "'", "\u2019": "'",
        "\u201c": '"', "\u201d": '"',
        "\u2013": "-", "\u2014": "-",
        "\u2026": "...",
    })
    safe_text = text.translate(punctuation)
    safe_text = safe_text.encode("latin-1", errors="replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, safe_text)
    # str() keeps this working with FPDF versions that expect a plain file name.
    pdf.output(str(output_path))

In [7]:
# Write the transcript text to input_pdf_path so the later PDF-based steps can read it.
save_text_to_pdf(result["text"], output_path=input_pdf_path)

# Now making the model

### PDF splitter

In [8]:
def split_pdf(input_pdf_path, output_folder):
    """Write every page of a PDF into its own single-page file.

    The files are named ``page_1.pdf``, ``page_2.pdf``, ... inside
    ``output_folder`` (a ``Path``).

    Returns:
        The number of pages found in the input PDF.
    """
    source = PdfReader(input_pdf_path)
    page_count = len(source.pages)

    for page_number in range(1, page_count + 1):
        # One fresh writer per page so each output holds exactly one page.
        single_page = PdfWriter()
        single_page.add_page(source.pages[page_number - 1])

        target = output_folder / f"page_{page_number}.pdf"
        with target.open("wb") as handle:
            single_page.write(handle)

    return page_count

# Split the transcript PDF into one file per page; total_pdfs holds the page count
# returned by split_pdf and drives the per-page loop.
total_pdfs=split_pdf(input_pdf_path, split_pdf_dir)

### Getting the model

In [9]:
# Two Gemini chat models: one for question generation, one for page summaries.
# NOTE (original author): neither model reportedly accepts a custom temperature;
# only the default value of 1 is allowed - TODO confirm.
question_generation_model = ChatGoogleGenerativeAI(model="gemini-2.5-flash-preview-05-20")
summary_generation_model = ChatGoogleGenerativeAI(model="gemini-2.0-flash")

### Output parser

In [10]:
def _mcq_array_schema(question_style, level):
    """Build the schema fragment for one difficulty bucket of MCQs.

    question_style is the phrase placed in the question description
    (e.g. "very difficult"); level is the word used in the array
    description (e.g. "hard"). A new dict is returned on every call.
    """
    return {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "question": {
                    "type": "string",
                    "description": f"Compose a {question_style} multiple choice question with 1 correct answer and 3 incorrect answers",
                },
                "options": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Compose 4 different answer options for the question with 1 correct answer and 3 incorrect answers",
                },
                "correct_answer": {
                    "type": "string",
                    "description": "Return the correct answer to the question",
                },
                "answer_explanation": {
                    "type": "string",
                    "description": "Write the detailed explanation for the correct answer",
                },
            },
            "required": ["question", "options", "correct_answer", "answer_explanation"],
        },
        "description": f"This array contains all the {level} difficulty questions, correct answers and answer explanations",
    }


# Rating rubric sent to the model as the description of "Topic_importance_rating".
_IMPORTANCE_RATING_GUIDE = """
RUTHLESSLY rate 1-10 CURRENT PAGE IMORTANCE for understanding the complete topic (MUST BE HYPER-CRITICAL):

**10-Point Importance Scale**:
- **10**: The page is absolutely essential; it introduces the main topic or provides a comprehensive overview. Without this page, understanding the topic would be severely compromised.
- **9**: The page introduces a major subtopic or key concept that is critical for understanding the topic.
- **8**: The page provides significant information or explanations that are necessary for a full understanding.
- **7**: The page offers useful details or examples that enhance understanding but are not absolutely necessary.
- **6**: The page contains information that is relevant but could be omitted without greatly affecting understanding.
- **5**: The page has content that is part of the topic but not particularly crucial.
- **4**: The page includes some relevant information but is mostly peripheral.
- **3**: The page has minimal importance; it is largely tangential or a digression.
- **2**: The page has very low importance; it contains little to no relevant information.
- **1**: The page is irrelevant; it does not contribute to understanding the topic at all.

**Anti-Bias Rules**:
- ❌ Never rate based on previous/future pages; consider only the current page's content.
- ❌ No extra points for "setup" or "foreshadowing"; rate based on the actual content provided.
- ❌ Content-heavy ≠ automatically important; focus on the significance of the information, not just the quantity.
- ⛔ If unsure between two numbers, PICK THE LOWER ONE; be conservative in assigning higher ratings.

**Distribution Guidance**:
- **9-10**: Extremely rare, only for the very best pages.
- **7-8**: Very rare, for pages clearly above average.
- **1-6**: Common, with most pages likely in the 4-6 range.
"""

# (array-description word, question-description phrase), in output order.
_DIFFICULTY_BUCKETS = (
    ("hard", "very difficult"),
    ("medium", "medium difficulty"),
    ("easy", "easy difficulty"),
)

# Structured-output schema for the question model: three arrays of MCQs
# (hard / medium / easy) plus a 1-10 importance rating for the page.
question_json_schema = {
    "title": "question_answers_and_explanations",
    "type": "object",
    "properties": {
        **{
            f"{level}_difficult_questions": _mcq_array_schema(style, level)
            for level, style in _DIFFICULTY_BUCKETS
        },
        "Topic_importance_rating": {
            "type": "string",
            "enum": [str(score) for score in range(1, 11)],
            "description": _IMPORTANCE_RATING_GUIDE,
        },
    },
    "required": [
        "hard_difficult_questions",
        "medium_difficult_questions",
        "easy_difficult_questions",
        "Topic_importance_rating",
    ],
}

In [11]:
# Field name -> description for the two summaries requested from the model.
_summary_fields = {
    "detail_page_summary": "Comprehensive summary including ..",
    "concise_page_summary": "Condensed summary focusing ... (3-5 sentences max)",
}

# Structured-output schema for the summary model: two required string fields.
summary_json_schema = {
    "title": "detailed_page_summary_and_concise_page_summary",
    "type": "object",
    "properties": {
        field: {"type": "string", "description": text}
        for field, text in _summary_fields.items()
    },
    "required": list(_summary_fields),
}

In [12]:
# Bind each chat model to its JSON schema via with_structured_output, so the
# question model is paired with question_json_schema and the summary model with
# summary_json_schema.
structured_question_generation_model = question_generation_model.with_structured_output(question_json_schema)
structured_summary_generation_model = summary_generation_model.with_structured_output(summary_json_schema)

In [13]:
# Flatten the state between the summary step and the question step: the two
# pass-through inputs are copied as-is and both summaries are lifted out of the
# nested "both_summary" entry to the top level.
extract = RunnableLambda(
    lambda state: {
        **{key: state[key] for key in ("page_text", "cumulative_concise_summary")},
        **{
            key: state["both_summary"][key]
            for key in ("detail_page_summary", "concise_page_summary")
        },
    }
)

### Prompt

In [14]:
# Prompt for the question-generation step. It is filled with two variables: the
# detailed summary of the current page (the only permitted basis for questions)
# and the running concise summary of earlier pages (context only). The template
# asks for up to 30 MCQs split evenly across Easy / Medium / Hard.
# NOTE(review): the template asks for the correct answer as an option letter
# ("eg. A"); confirm this matches how `correct_answer` is consumed downstream.
question_prompt = PromptTemplate(
    input_variables=["detail_page_summary", "cumulative_concise_summary"],
    template="""
Generate up to 30 multiple-choice questions (10 Easy, 10 Medium, 10 Hard) with **four answer options each** (A, B, C, D), correct answer option (eg. A), and a brief answer explanation based on the **detailed page summary**. Use the cumulative concise summary from previous pages only for context, not for generating the questions themselves. Follow these STRICT RULES:

### VALIDATION RULES

1. **Question Basis**  
   - All questions must be based solely on the information provided in the `detail_page_summary`. Do not use any information from the `cumulative_concise_summary` to formulate the questions or their answers, except for context.
   - If there is not enough material to generate all 30 questions, produce as many as possible, but preserve the ratio of Easy:Medium:Hard as closely as possible.

2. **Diversity**
   - Cover a range of topics from the **detailed page summary**, including main topics, key points, explanations, examples, interactive elements, etc.

3. **Difficulty Levels**  
   - **Easy** (10): Direct recall or very simple recognition questions (e.g., definitions, straightforward facts).  
   - **Medium** (10): Require applying a concept to a slightly modified scenario or combining two ideas from the summary.  
   - **Hard** (10): Involve Analysis, synthesis, evaluation, complex inferences, multi‐step reasoning, or distinguishing subtle nuances (e.g., “Which of the following best explains why… based on the lecturer’s reasoning?”).

4. **Answer Explanation**  
   - For each question, provide a clear explanation of why the correct answer is right and why the others are wrong.
   - Each answer explanation, begin with either “In the lecture…” or “As defined…” or “According to the instructor…”  or "From the example given...", or "In the discussion, it is stated that..."
   - DO NOT use banned phrases: “in the transcript,” “in the page,” “in the document,” “in the script,” or “according to the text.”

5. **Stylistic Constraints**  
   - Use clear, academic language.  
   - Avoid phrasing that implies inference beyond what is written (e.g., “It can be assumed that…”).  
   - Do NOT reference “previous page” or “next page”—only refer to “the summary” or “the lecture.”

### CONTEXT
- **Concise Summary of Previous Pages (for reference only):**  
{cumulative_concise_summary}

- **Detailed Summary of Current Page:**  
{detail_page_summary}
""")

In [15]:
# Prompt for the summary step. It is filled with the raw text of the current
# page and the running concise summary of earlier pages, and asks for two
# outputs: a sectioned detailed summary and a 3-5 sentence concise summary,
# with fallback rules for empty or non-substantive pages.
summary_prompt = PromptTemplate(
    input_variables=["page_text", "cumulative_concise_summary"],
    template="""
Analyze this educational lecture transcript page with STRICT ADHERENCE TO VISIBLE CONTENT ONLY to generate a DETAILED PAGE SUMMARY and a CONCISE PAGE SUMMARY.

# DETAILED PAGE SUMMARY
- Structure your summary under these Markdown headers:
  ### Key Topics
    - List the main subjects or themes introduced on this page.
    - If none: “No new topics introduced.”
  ### Definitions & Terminology
    - Capture any formal definitions, technical terms, or jargon explicitly defined.
    - Formal notations if "definitions & terminology" are mathematically presented
    - If none: “No definitions or technical terms introduced.”
  ### Explanations & Concepts
    - Summarize each concept or idea as explained, using the lecturer’s own wording as closely as possible.
    - Include any step‐by‐step reasoning, derivations, or pedagogical points.
    - If none: “No detailed explanations on this page.”
  ### Examples & Illustrations
    - Describe any example problems, case studies, analogies, or illustrative anecdotes given.
    - If none: “No examples or illustrations provided.”
  ### Important Details & Emphases
    - Note any crucial facts, caveats, or emphases (e.g., “Remember that…”, “It’s important not to…”).
    - If none: “No critical details emphasized.”
  ### Visual Aids
    - Descriptions of figures/diagrams using ONLY captions/text references.
    - If none: "No visual aids referenced"

- Absolute Prohibitions:
  1. Do NOT introduce any information not explicitly in the CURRENT PAGE CONTENT.
  2. Do NOT infer the lecturer’s intent or add personal commentary.
  3. Do NOT paraphrase examples beyond what is directly stated.
  4. Do NOT reference anything beyond this page’s content.
- Use plain Markdown bullet points within each section.

# CONCISE PAGE SUMMARY
- Construction Rules:
  1. MUST connect to previous context using logical connectors or temporal markers:
     - “Building on [previous concept]…”
     - “Next, the lecturer explains…”
     - “Consequently, the focus shifts to…”
     - "Following the discussion on [previous topic]..."
  2. If no clear connection to earlier pages: begin with "In this section..." or similar.
  3. Maintain neutral academic tone (no storytelling voice).
  4. Emphasize the flow of ideas (cause→effect or definition→application).
  5. ONLY use CURRENT PAGE FACTS + the explicit previous summary provided.
  6. Maintain consistency with the overall lecture theme.

- Output Requirements:
  - Novel‐style academic prose (no lists, no bullet points).
  - 3–5 sentences total.
  - Focus on how this page advances the overall lecture narrative.
  - Highlight one or two pivotal points from this page.
  - Emphasize logical flow between topics

# FALLBACK PROTOCOL
- For ANY ambiguous or missing information:
  1. Acknowledge uncertainty by stating: “Not explicitly stated in transcript.”
  2. Do NOT attempt to invent details.
- If the page contains:
  1. Blank or scanned page with no text → Return both summaries as empty with “Empty page” note.
  2. Only title or transition remarks (no substantive content) → Return both summaries with “Transition/overview only” note.
  3. Copyright/legal boilerplate → Return both summaries with “Copyright notice” note.
  4. Transition or filler content with no new information -> Return empty summaries with "No new information" note

# CONTEXT
- Concise summary of all previous pages (for reference only):  
{cumulative_concise_summary}

# CURRENT PAGE CONTENT
{page_text}
"""
)

### Model in loop

In [None]:
# The pipeline is the same for every page, so it is assembled once up front:
#   1. summarise the page (result stored under "both_summary"),
#   2. flatten the summaries into top-level keys (`extract`),
#   3. generate questions from the detailed summary (stored under "question").
summary_chain = summary_prompt | structured_summary_generation_model
question_chain = question_prompt | structured_question_generation_model

chain1 = RunnableAssign(RunnableParallel({"both_summary": summary_chain}))
chain2 = RunnableAssign(RunnableParallel({"question": question_chain}))

final_chain = chain1 | extract | chain2

# Running summaries; the concise one is fed back into the prompts as context for
# each following page.
cumulative_concise_summary = ""
cumulative_detailed_summary = ""
for current_page_number in range(1, total_pdfs + 1):
    pdf_name = split_pdf_dir / f"page_{current_page_number}.pdf"
    loader = PDFMinerLoader(str(pdf_name))
    docs = loader.load()
    # Join whatever the loader returns: identical to docs[0] for the usual
    # single-document case, and an empty string (not an IndexError) for none.
    page_text = "\n".join(doc.page_content for doc in docs)

    # Named page_result (not `result`) so the Whisper transcript held in
    # `result` is not overwritten and the earlier cells stay re-runnable.
    page_result = final_chain.invoke({"page_text":page_text,"cumulative_concise_summary":cumulative_concise_summary})

    question = page_result["question"]
    concise_page_summary = page_result["concise_page_summary"]
    detailed_page_summary = page_result["detail_page_summary"]

    # Save results
    page_folder = output_dir / f"page_{current_page_number}"
    page_folder.mkdir(parents=True, exist_ok=True)
    question_json = page_folder / "question.json"

    # Explicit UTF-8: the platform default (e.g. cp1252 on Windows) cannot encode
    # every character the models may produce.
    with question_json.open("w", encoding="utf-8") as f:
        json.dump(question, f, indent=4)

    cumulative_concise_summary += f"\n\nPage {current_page_number} Summary:\n{concise_page_summary}\n"
    cumulative_detailed_summary += f"\n\nPage {current_page_number} Summary:\n{detailed_page_summary}\n"

    # Rewritten after every page so progress survives an interrupted run.
    with cumulative_detailed_summary_file.open("w", encoding="utf-8") as f:
        f.write(cumulative_detailed_summary)

    with cumulative_concise_summary_file.open("w", encoding="utf-8") as f:
        f.write(cumulative_concise_summary)

Key 'parameters' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised InternalServerError: 500 An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting.
Key 'parameters' is not supported in schema, ignoring
Key 'parameters' is not supported in s