In [16]:
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain_core.prompts import SystemMessagePromptTemplate,HumanMessagePromptTemplate,ChatPromptTemplate
from langchain.schema import Document
from langchain_core.output_parsers import StrOutputParser

In [2]:
def extract_ppt_data(file_path: str, mode: str = 'elements', verbose: bool = False) -> dict:
    """
    Extracts content from a PowerPoint file and organizes it by page number.

    Args:
        file_path (str): Path to the PowerPoint file.
        mode (str): Mode for loading the PowerPoint file. Default is 'elements'.
        verbose (bool): If True, displays progress messages. Default is False.

    Returns:
        dict: A dictionary where keys are page numbers and values are the concatenated content of each page.

    Raises:
        FileNotFoundError: If the file path is invalid.
        ValueError: If the loader fails to process the file.
    """
    if verbose:
        print(f"Initializing UnstructuredPowerPointLoader with file: {file_path} and mode: {mode}...")

    try:
        loader = UnstructuredPowerPointLoader(file_path=file_path, mode=mode)
        docs = loader.load()
        if verbose:
            print(f"Successfully loaded {len(docs)} documents.")
    except FileNotFoundError:
        raise FileNotFoundError(f"The file '{file_path}' does not exist.")
    except Exception as e:
        raise ValueError(f"Failed to load PowerPoint file. Error: {str(e)}")

    ppt_data = {}

    for idx, doc in enumerate(docs, start=1):
        if isinstance(doc, Document):
            page_number = doc.metadata.get('page_number', None)
            if page_number:
                ppt_data[page_number] = ppt_data.get(page_number, '') + '\n' + doc.page_content
            if verbose:
                print(f"Processed document {idx}/{len(docs)}: Page {page_number if page_number else 'break'}")

    if verbose:
        print("Extraction complete.")

    return ppt_data

In [3]:
ppt_data = extract_ppt_data(file_path="./ml_course.pptx",verbose=True)

Initializing UnstructuredPowerPointLoader with file: ./ml_course.pptx and mode: elements...
Successfully loaded 47 documents.
Processed document 1/47: Page 1
Processed document 2/47: Page 1
Processed document 3/47: Page 1
Processed document 4/47: Page 1
Processed document 5/47: Page 2
Processed document 6/47: Page 2
Processed document 7/47: Page 3
Processed document 8/47: Page 3
Processed document 9/47: Page 3
Processed document 10/47: Page 3
Processed document 11/47: Page 3
Processed document 12/47: Page 3
Processed document 13/47: Page 4
Processed document 14/47: Page 4
Processed document 15/47: Page 4
Processed document 16/47: Page 4
Processed document 17/47: Page 5
Processed document 18/47: Page 5
Processed document 19/47: Page 5
Processed document 20/47: Page 6
Processed document 21/47: Page 6
Processed document 22/47: Page 6
Processed document 23/47: Page 6
Processed document 24/47: Page 6
Processed document 25/47: Page 6
Processed document 26/47: Page 6
Processed document 27/47:

In [4]:
ppt_data

{1: '\nMachine Learning Model Deployment\nIntroduction to ML Pipeline\nhttps://bit.ly/bert_nlp\n',
 2: '\nWhat is Machine Learning Pipeline?\n',
 3: '\nType of ML Deployment\nBatch: In batch deployment, ML models process large volumes of data at scheduled intervals, ideal for tasks like end-of-day reporting or monthly analytics.\nStream: Stream deployment enables ML models to process and analyze data in real-time as it flows in, suitable for applications like fraud detection or live social media analysis.\nRealtime: Realtime deployment allows ML models to provide instant predictions or decisions in response to incoming data, essential for use cases like recommendation systems or autonomous driving.\nEdge: Edge deployment involves running ML models on local devices close to the data source, reducing latency and bandwidth usage, which is crucial for IoT applications and smart devices.\n',
 4: '\nInfrastructure and Integration\nHardware and Software: Setting up the right environment for m

In [5]:
def build_context_from_ppt_data(ppt_data: dict, verbose: bool = False) -> str:
    """
    Builds a formatted context string from PowerPoint data organized by page number.

    Args:
        ppt_data (dict): A dictionary where keys are page numbers (int) and 
                         values are the corresponding page content (str).
        verbose (bool): If True, displays progress messages. Default is False.

    Returns:
        str: A formatted context string with page information.

    Raises:
        ValueError: If `ppt_data` is empty or not a dictionary.
    """
    if not isinstance(ppt_data, dict):
        raise ValueError("Invalid input: `ppt_data` must be a dictionary.")
    if not ppt_data:
        raise ValueError("Invalid input: `ppt_data` cannot be empty.")

    if verbose:
        print("Building context from PowerPoint data...")

    context = ""

    for page_number, page_content in sorted(ppt_data.items()):
        if not isinstance(page_number, int):
            if verbose:
                print(f"Skipping invalid page number: {page_number}")
            continue
        if not isinstance(page_content, str):
            if verbose:
                print(f"Skipping invalid page content for page {page_number}")
            continue

        if verbose:
            print(f"Adding content for Page-{page_number}...")

        context += f"### Page-{page_number}\n{page_content.strip()}\n\n"

    if not context:
        raise ValueError("Context generation failed: No valid content found in `ppt_data`.")

    if verbose:
        print("Context generation complete.")
    
    return context


In [6]:
context = build_context_from_ppt_data(ppt_data=ppt_data,verbose=True)

Building context from PowerPoint data...
Adding content for Page-1...
Adding content for Page-2...
Adding content for Page-3...
Adding content for Page-4...
Adding content for Page-5...
Adding content for Page-6...
Adding content for Page-7...
Adding content for Page-8...
Adding content for Page-9...
Context generation complete.


In [19]:
print(context)

### Page-1
Machine Learning Model Deployment
Introduction to ML Pipeline
https://bit.ly/bert_nlp

### Page-2
What is Machine Learning Pipeline?

### Page-3
Type of ML Deployment
Batch: In batch deployment, ML models process large volumes of data at scheduled intervals, ideal for tasks like end-of-day reporting or monthly analytics.
Stream: Stream deployment enables ML models to process and analyze data in real-time as it flows in, suitable for applications like fraud detection or live social media analysis.
Realtime: Realtime deployment allows ML models to provide instant predictions or decisions in response to incoming data, essential for use cases like recommendation systems or autonomous driving.
Edge: Edge deployment involves running ML models on local devices close to the data source, reducing latency and bandwidth usage, which is crucial for IoT applications and smart devices.

### Page-4
Infrastructure and Integration
Hardware and Software: Setting up the right environment for m

In [10]:
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

In [11]:
load_dotenv()

True

In [12]:
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.8)

In [21]:
system_prompt = SystemMessagePromptTemplate.from_template("""You are helpful AI assistant who create script from the given context 
                                                          and context is extracted from a PPT.""")

human_prompt = HumanMessagePromptTemplate.from_template("""Create a script based on the provided context ONLY! If you do not know the answer, just say "I don't know".
            ### Context:
            ```{context}```

            ### Question:
            ```{question}```

            ### Answer:""")


messages = [system_prompt, human_prompt]

template = ChatPromptTemplate.from_messages(messages=messages)

In [22]:
qna_chain = template | llm | StrOutputParser()

def ask_llm(context, question):
    return qna_chain.invoke(
        {
            'context': context,
            'question': question
        }
    )

In [None]:
question =("For each PowerPoint slide provided above, write a 3-4 minute script "
          "that effectively conveys the key points. Ensure a smooth flow between slides, "
          "maintaining a clear and engaging narrative.")

response = ask_llm(context=context,
                   question=question)

print(response)

**Slide 1: Introduction to Machine Learning Model Deployment**

Welcome everyone to our presentation on Machine Learning Model Deployment. Today, we will dive into the crucial components of the Machine Learning (ML) pipeline, highlighting the processes involved in deploying ML models effectively. For further reading, you can check out the link provided. Let's get started!

---

**Slide 2: What is Machine Learning Pipeline?**

First, let’s clarify what we mean by a Machine Learning pipeline. It is a streamlined series of steps that include data collection, processing, model training, evaluation, and deployment. Each component plays a vital role in ensuring that our models are effective and reliable once they are deployed in real-world applications.

---

**Slide 3: Types of ML Deployment**

Now, let's explore the different types of ML deployment. 

1. **Batch Deployment** processes large volumes of data at scheduled intervals, making it ideal for tasks like end-of-day reporting or month

***

# Final code with Python OOP's

In [24]:
from typing import Dict
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain_core.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)
from langchain.schema import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

load_dotenv()

True

In [25]:
class PowerPointProcessor:
    """
    A class to handle PowerPoint content extraction and context generation.
    """

    def __init__(self, verbose: bool = False):
        """
        Initialize the processor.

        Args:
            verbose (bool): Whether to enable verbose logging.
        """
        self.verbose = verbose

    def extract_data(self, file_path: str, mode: str = "elements") -> Dict[int, str]:
        """
        Extracts content from a PowerPoint file.

        Args:
            file_path (str): Path to the PowerPoint file.
            mode (str): Mode for loading the PowerPoint file. Default is 'elements'.

        Returns:
            Dict[int, str]: A dictionary where keys are page numbers and values
                            are the concatenated content of each page.

        Raises:
            FileNotFoundError: If the file path is invalid.
            ValueError: If the loader fails to process the file.
        """
        if self.verbose:
            print(f"Initializing loader for file: {file_path}, mode: {mode}...")

        try:
            loader = UnstructuredPowerPointLoader(file_path=file_path, mode=mode)
            docs = loader.load()
            if self.verbose:
                print(f"Successfully loaded {len(docs)} documents.")
        except FileNotFoundError:
            raise FileNotFoundError(f"The file '{file_path}' does not exist.")
        except Exception as e:
            raise ValueError(f"Failed to load PowerPoint file. Error: {str(e)}")

        ppt_data = {}
        for idx, doc in enumerate(docs, start=1):
            if isinstance(doc, Document):
                page_number = doc.metadata.get("page_number")
                if page_number:
                    ppt_data[page_number] = ppt_data.get(page_number, "") + "\n" + doc.page_content
                if self.verbose:
                    print(f"Processed document {idx}/{len(docs)}: Page {page_number if page_number else 'unknown'}")

        if not ppt_data:
            raise ValueError("No valid content extracted from the PowerPoint file.")

        return ppt_data

    def build_context(self, ppt_data: Dict[int, str]) -> str:
        """
        Builds a formatted context string from PowerPoint data.

        Args:
            ppt_data (Dict[int, str]): A dictionary where keys are page numbers
                                       and values are the corresponding page content.

        Returns:
            str: A formatted context string with page information.

        Raises:
            ValueError: If `ppt_data` is empty or not a dictionary.
        """
        if not isinstance(ppt_data, dict) or not ppt_data:
            raise ValueError("Invalid `ppt_data`: must be a non-empty dictionary.")

        if self.verbose:
            print("Building context from PowerPoint data...")

        context = ""
        for page_number, page_content in sorted(ppt_data.items()):
            if isinstance(page_number, int) and isinstance(page_content, str):
                if self.verbose:
                    print(f"Adding content for Page-{page_number}...")
                context += f"### Page-{page_number}\n{page_content.strip()}\n\n"
            elif self.verbose:
                print(f"Skipping invalid content for Page-{page_number}.")

        if not context.strip():
            raise ValueError("No valid content to generate context.")

        if self.verbose:
            print("Context generation complete.")

        return context

In [None]:
class ScriptGenerator:
    """
    A class to handle LLM-based script generation from context.
    """

    def __init__(self, model_name: str = "gpt-4o-mini", temperature: float = 0.8):
        """
        Initialize the script generator.

        Args:
            model_name (str): The name of the LLM model to use.
            temperature (float): Sampling temperature for the model.
        """
        self.llm = ChatOpenAI(model=model_name, temperature=temperature)
        self.qna_chain = self._initialize_chain()

    def _initialize_chain(self):
        """
        Initialize the LLM chain.

        Returns:
            Callable: A chain that processes prompts and generates responses.
        """
        system_prompt = SystemMessagePromptTemplate.from_template(
            "You are a highly skilled assistant. Your task is to create a script "
            "based on the context extracted from a PowerPoint presentation. "
            "Ensure clarity, engagement, and continuity in the script."
        )

        human_prompt = HumanMessagePromptTemplate.from_template(
            "Based on the context provided below, write a detailed, engaging script for each slide. "
            "Ensure the narrative flows smoothly and connects the key points.\n\n"
            "### Context:\n```{context}```\n\n### Question:\n```{question}```\n\n### Script:"
        )

        template = ChatPromptTemplate.from_messages(messages=[system_prompt, human_prompt])
        return template | self.llm | StrOutputParser()

    def generate_script(self, context: str, question: str) -> str:
        """
        Generate a script based on the given context and question.

        Args:
            context (str): The context string.
            question (str): The question or instruction for script creation.

        Returns:
            str: The generated script.

        Raises:
            ValueError: If script generation fails.
        """
        try:
            response = self.qna_chain.invoke({"context": context, "question": question})
            return response
        except Exception as e:
            raise ValueError(f"Script generation failed. Error: {str(e)}")


# Example usage
if __name__ == "__main__":
    file_path = "./ml_course.pptx"
    verbose = True

    processor = PowerPointProcessor(verbose=verbose)
    ppt_data = processor.extract_data(file_path=file_path)
    context = processor.build_context(ppt_data)

    question = (
        "For each PowerPoint slide provided above, write a 3-4 minute script "
        "that effectively conveys the key points. Ensure a smooth flow between slides, "
        "maintaining a clear and engaging narrative."
    )

    script_generator = ScriptGenerator()
    response = script_generator.generate_script(context=context, question=question)

    print(response)


Initializing loader for file: ./ml_course.pptx, mode: elements...
Successfully loaded 47 documents.
Processed document 1/47: Page 1
Processed document 2/47: Page 1
Processed document 3/47: Page 1
Processed document 4/47: Page 1
Processed document 5/47: Page 2
Processed document 6/47: Page 2
Processed document 7/47: Page 3
Processed document 8/47: Page 3
Processed document 9/47: Page 3
Processed document 10/47: Page 3
Processed document 11/47: Page 3
Processed document 12/47: Page 3
Processed document 13/47: Page 4
Processed document 14/47: Page 4
Processed document 15/47: Page 4
Processed document 16/47: Page 4
Processed document 17/47: Page 5
Processed document 18/47: Page 5
Processed document 19/47: Page 5
Processed document 20/47: Page 6
Processed document 21/47: Page 6
Processed document 22/47: Page 6
Processed document 23/47: Page 6
Processed document 24/47: Page 6
Processed document 25/47: Page 6
Processed document 26/47: Page 6
Processed document 27/47: Page 6
Processed document