In [190]:
from pydantic import BaseModel, Field
from openai import OpenAI
from dotenv import load_dotenv
from typing import List

In [191]:
# Load variables from a local .env file (if one exists) before building the client.
load_dotenv()
# Shared OpenAI client used by every generation function in this notebook.
# NOTE(review): presumably reads OPENAI_API_KEY from the environment — confirm.
client = OpenAI()

In [192]:
class SentenceLine(BaseModel):
    """Schema for a transcript line"""
    # NOTE(review): pydantic is understood to copy the class docstring and the
    # Field descriptions into the generated JSON schema, so their wording is
    # likely part of what the model sees — edit them deliberately.

    # Position of the sentence within the transcript.
    line_number: int = Field(
        ...,
        description="Line number in the transcript",
    )
    # The sentence text for that position.
    text: str = Field(
        ...,
        description="The sentence on this line",
    )

class Section(BaseModel):
    """Schema for a transcript section"""
    # NOTE(review): pydantic is understood to copy the class docstring and the
    # Field descriptions into the generated JSON schema, so their wording is
    # likely part of what the model sees — edit them deliberately.

    # Title and short abstract of the section.
    header: str = Field(
        ...,
        description="Header/title of the section",
    )
    summary: str = Field(
        ...,
        description="Brief summary of the section",
    )
    # Inclusive line range covered by the section.
    start_line: int = Field(
        ...,
        description="Start line number of the section (inclusive)",
    )
    end_line: int = Field(
        ...,
        description="End line number of the section (inclusive)",
    )
    # The section's sentences, requested verbatim from the model.
    lines: List[SentenceLine] = Field(
        ...,
        description="List of sentences (word-for-word) in this section",
    )

class AnnotatedTranscript(BaseModel):
    """Schema for a transcript"""
    # NOTE(review): pydantic is understood to copy the class docstring and the
    # Field descriptions into the generated JSON schema, so their wording is
    # likely part of what the model sees — edit them deliberately.

    # Overall summary of the whole transcript.
    summary: str = Field(
        ...,
        description="Detailed summary of the transcript",
    )
    # Ordered sections the transcript was split into.
    sections: List[Section] = Field(
        ...,
        description="List of structured transcript sections",
    )

In [194]:
def annotate_transcript(transcript: str) -> List[dict]:
    """Create an annotated transcript, broken up into sections by line numbers.

    Sends the transcript to the chat-completions ``parse`` helper with
    ``AnnotatedTranscript`` as the response format and returns only the
    ``sections`` part of the parsed result.

    Args:
        transcript: Full transcript text; sent unchanged as the user message.

    Returns:
        One dict per section, as produced by ``model_dump()`` on the parsed
        ``AnnotatedTranscript`` (keys follow the ``Section`` schema).

    Raises:
        RuntimeError: If the response carries no parsed payload (for example
            because the model refused the request).
    """
    res = client.beta.chat.completions.parse(
        model="gpt-4.1",
        messages=[
            {
                "role": "system",
                "content": "Break the following transcript into 5 or 6 sections"
            },
            {
                "role": "user",
                "content": transcript
            }
        ],
        response_format=AnnotatedTranscript
    )
    message = res.choices[0].message
    if message.parsed is None:
        # Without this guard a refusal surfaces as an opaque
        # "'NoneType' object has no attribute 'model_dump'".
        raise RuntimeError(
            "Transcript annotation returned no structured output "
            f"(refusal: {getattr(message, 'refusal', None)!r})"
        )
    return message.parsed.model_dump()["sections"]

In [195]:
def normalize_section(section: dict) -> dict:
    """Flatten a transcript section into header, summary and one text blob.

    Every entry in ``section["lines"]`` contributes its ``"text"`` followed by
    a newline, so a non-empty transcript ends with a trailing newline.
    """
    joined_text = "".join(entry["text"] + "\n" for entry in section["lines"])
    normalized = {
        "header": section["header"],
        "summary": section["summary"],
        "transcript": joined_text,
    }
    return normalized

In [245]:
class NoteSection(BaseModel):
    """Schema for a note section"""
    # Markdown '##' header for the section, with an emoji.
    header: str = Field(..., description="Short, catchy header/title of the section formatted in markdown using '##'. Include a relevant emoji.")
    # The inner double quotes are escaped. Unescaped, they ended the literal and
    # started a new one, so implicit concatenation silently dropped them and the
    # description read "Put the one-liner in . Use > ...".
    one_liner: str = Field(..., description="Short, interesting, one-liner that encapsulates section. Put the one-liner in \"\". Use > 'quote' markdown")
    # Bullet-point body of the section.
    content: str = Field(..., description="Detailed, bullet-points illustrating the section")

In [264]:
def generate_note_section(transcript_section: dict) -> dict:
    """Combine transcript section metadata and content to create a note section.

    The section is first flattened with ``normalize_section``; its header,
    summary and transcript text are then sent to the chat-completions
    ``parse`` helper with ``NoteSection`` as the response format.

    Args:
        transcript_section: A section dict with ``header``, ``summary`` and
            ``lines`` keys (the shape ``normalize_section`` reads).

    Returns:
        The parsed ``NoteSection`` as a dict (``model_dump()`` output).

    Raises:
        RuntimeError: If the response carries no parsed payload (for example
            because the model refused the request).
    """
    normalized_section = normalize_section(transcript_section)
    res = client.beta.chat.completions.parse(
        model="gpt-4.1",
        messages=[
            {
                "role": "system",
                "content":
                    """
                    You are a course creator for a college class. 
                    Use the following section header, summary, and transcript to create a detailed note.
                    Indicate subheaders using '##', bold key terms and concepts for emphasis, use bullet points to represent lists, and indentation to indicate dependencies.
                    Format headers, one-liners, and text in markdown. Format math equations and symbols in katex.
                    """
            },
            {
                "role": "user",
                "content": f"header: {normalized_section['header']} \n summary: {normalized_section['summary']} \n transcript: {normalized_section['transcript']}"
            }
        ],
        response_format=NoteSection
    )
    message = res.choices[0].message
    if message.parsed is None:
        # Without this guard a refusal surfaces as an opaque
        # "'NoneType' object has no attribute 'model_dump'".
        raise RuntimeError(
            "Note section generation returned no structured output "
            f"(refusal: {getattr(message, 'refusal', None)!r})"
        )
    return message.parsed.model_dump()

In [265]:
def compile_note_section(note_section: dict) -> str:
    """Join a note section's header, one-liner and content, one per line."""
    parts = (
        note_section["header"],
        note_section["one_liner"],
        note_section["content"],
    )
    return "\n".join(parts)

In [282]:
class Extras(BaseModel):
    """Schema for additional content"""
    # NOTE(review): pydantic is understood to copy the class docstring and the
    # Field descriptions into the generated JSON schema, so their wording (and
    # the whitespace inside the triple-quoted description) is likely part of
    # what the model sees — edit them deliberately.

    # Lecture-level title and outline.
    headline: str = Field(
        ...,
        description="Short, catchy title that encapsulates this lecture. Formatted in markdown using '##'.",
    )
    summary: str = Field(
        ...,
        description="Short, engaging outline of the lecture. Begin with 'This lecture covers...'. Italicize in markdown using '*'.",
    )
    # Checkbox list of next steps mentioned in the lecture.
    todo_list: str = Field(
        ...,
        description="""
                                        Short list of the most important todos and next steps mentioned by the professor in the lecture. 
                                        Prioritize readings, exams, and assignments. 
                                        Include deadlines. 
                                        Format the title 'Todo List' in markdown using '###'.
                                        Use markdown format to **bold** parts of each todo. Keep each todo to 15 words or less. 
                                        Include markdown checkboxes '- [ ]' before each todo.
                                    """,
    )
    # Term/definition table.
    glossary: str = Field(
        ...,
        description="Glossary of key terms and definitions used throughout the lecture. Format the title 'Glossary' in markdown using '###'. Format the glossary as a markdown table with a divider '---' at the end.",
    )
    # Quiz and its answer key are separate fields.
    quiz: str = Field(
        ...,
        description="Written quiz that tests a student's understanding of the lecture. Format the title 'quiz' in markdown using '###'.",
    )
    answers: str = Field(
        ...,
        description="Answers to the written quiz. Format the title 'answers' in markdown using '###'.",
    )

In [283]:
def generate_extras(transcript: str) -> dict:
    """Generate additional content for a whole transcript.

    Sends the transcript to the chat-completions ``parse`` helper with
    ``Extras`` as the response format.

    Args:
        transcript: Full transcript text; sent unchanged as the user message.

    Returns:
        The parsed ``Extras`` as a dict (``model_dump()`` output) with the keys
        ``headline``, ``summary``, ``todo_list``, ``glossary``, ``quiz`` and
        ``answers``.

    Raises:
        RuntimeError: If the response carries no parsed payload (for example
            because the model refused the request).
    """
    res = client.beta.chat.completions.parse(
        model="gpt-4.1",
        messages=[
            {
                "role": "system",
                "content":
                    """
                    You are a course creator for a college class. 
                    Generate a headline, summary, todo list, glossary, and quiz that reflects the following transcript.
                    Indicate subheaders using '##', bold key terms and concepts for emphasis, use bullet points to represent lists, and indentation to indicate dependencies.
                    Format the response in markdown. Format math equations and symbols in katex.
                    """
            },
            {
                "role": "user",
                "content": transcript
            }
        ],
        response_format=Extras
    )
    message = res.choices[0].message
    if message.parsed is None:
        # Without this guard a refusal surfaces as an opaque
        # "'NoneType' object has no attribute 'model_dump'".
        raise RuntimeError(
            "Extras generation returned no structured output "
            f"(refusal: {getattr(message, 'refusal', None)!r})"
        )
    return message.parsed.model_dump()

In [284]:
def compile_note(transcript: str) -> str:
    """
    Turn a transcript into one markdown note.

    The transcript is split into sections, a note is generated and compiled
    for each section, and the extras (headline, summary, todo list, glossary,
    quiz, answers) are generated for the whole transcript. The result is laid
    out as: headline, summary, todo list, section notes, glossary, quiz, answers.
    """
    # Section notes are produced first, in transcript order, before the extras.
    section_notes = [
        compile_note_section(generate_note_section(section))
        for section in annotate_transcript(transcript)
    ]

    extras = generate_extras(transcript)

    # Each section note is terminated by a newline, so the glossary follows the
    # last section directly without an extra separator.
    sections_block = "".join(section_note + "\n" for section_note in section_notes)

    pieces = [
        extras["headline"],
        extras["summary"],
        extras["todo_list"],
        sections_block + extras["glossary"],
        extras["quiz"],
        extras["answers"],
    ]
    return "\n".join(pieces)

In [212]:
# Read the lecture transcript from disk; "utf-8-sig" strips a leading BOM if the file has one.
# NOTE(review): absolute, machine-specific path — this cell fails on any other
# machine; consider a configurable path relative to the notebook.
with open("C:/Users/Kaden/OneDrive/Professional/Athena/Note Creation/Semantic Chunking/transcripts/ww1.txt", encoding = "utf-8-sig") as f:
    transcript = f.read()

In [285]:
# Run the full pipeline: sectioning, one note per section, then the extras.
# Each of those steps issues a request through the OpenAI client.
note = compile_note(transcript)

In [286]:
# Show the compiled note as raw markdown text.
print(note)

## Was World War I Inevitable? Bargaining, Rationality, and the Limits of International Institutions
*This lecture covers the complexities behind the onset and aftermath of World War I, exploring whether the conflict was a deliberate act or a tragic miscalculation. We examine key IR theories—like bargaining theory, rationalist explanations for war, commitment and information problems, issue indivisibility, and the roles of offensive and defensive realism. The discussion expands to the legacy of WWI, the Paris Peace Conference, reparations, and the League of Nations, using theorists like Lieber, Cohen, Martin, and Span. The limits and biases of international institutions are debated, especially the dynamics between powerful and weaker states.*
### Todo List
- [ ] **Read Lieber's paper on rationalist explanations for WWI** (by next seminar).
- [ ] **Review readings by Cohen, Martin, and Span on international institutions**.
- [ ] **Check email for released essay questions; begin outlinin

In [None]:
# TODO: improve KaTeX formatting and add a hallucination check for each section.
# Long term: build a knowledge base for each subject to improve accuracy and to serve as the reference for hallucination checks.