In [1]:
# Required imports
from langchain.document_loaders import UnstructuredMarkdownLoader, NotebookLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from typing import List, Dict
from datetime import datetime
import glob
from dotenv import load_dotenv, find_dotenv
import os

# Load environment variables
# find_dotenv() locates a .env file and load_dotenv() loads its entries into the
# process environment. Presumably this is where the OpenAI credentials used by
# OpenAIEmbeddings / ChatOpenAI come from -- TODO confirm.
load_dotenv(find_dotenv())

# Custom prompt template
# The template exposes exactly two placeholders, {context} and {question},
# matching the input_variables declared on PROMPT below.
# NOTE(review): the template asks for "[Source X]" citations, but nothing in the
# template itself numbers the retrieved chunks inside {context}; verify that the
# chain labels them, otherwise the source numbers are chosen by the model.
custom_prompt_template = """You are a helpful AI assistant specialized in demand forecasting and related topics.
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}

Question: {question}

Please provide your answer following these guidelines:
1. Use ONLY information from the provided context
2. ALWAYS cite your sources using [Source X] format where X is the source number
3. If multiple sources support a statement, cite all relevant sources: [Source 1,2]
4. If the context doesn't contain enough information, clearly state that
5. Structure your response in a clear, logical manner
6. Keep the answer focused and relevant to demand forecasting

Remember: Every significant statement should have a source citation.

Answer: Let me help you with that.
"""

# Create the prompt template
# PROMPT is the module-level object handed to the QA chain in setup_qa_chain().
PROMPT = PromptTemplate(
    template=custom_prompt_template, input_variables=["context", "question"]
)


def load_markdown_files(directory: str = "./data/docs/") -> List:
    """Load all markdown documents found under ``directory``.

    Args:
        directory: Root folder searched with the ``**/*.md`` glob pattern.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces for the matched files,
        each file being read through ``UnstructuredMarkdownLoader``.
    """
    markdown_loader = DirectoryLoader(
        directory,
        loader_cls=UnstructuredMarkdownLoader,
        glob="**/*.md",
    )
    loaded_docs = markdown_loader.load()
    return loaded_docs


def load_jupyter_notebooks(directory: str = "./demand_forecast_notebooks/") -> List:
    """Collect documents from every ``.ipynb`` file below ``directory``.

    Paths containing ``.ipynb_checkpoints`` are ignored. Loading is best-effort:
    a notebook that fails to load is reported on stdout and skipped, so one bad
    file does not abort the whole run.

    Args:
        directory: Root folder searched recursively for notebooks.

    Returns:
        A flat list with the documents of all notebooks that loaded successfully.
    """
    collected = []
    pattern = f"{directory}/**/*.ipynb"

    for path in glob.glob(pattern, recursive=True):
        # Skip Jupyter autosave copies.
        if ".ipynb_checkpoints" in path:
            continue

        try:
            notebook_loader = NotebookLoader(
                path,
                include_outputs=True,
                max_output_length=50,
                remove_newline=True,
            )
            collected.extend(notebook_loader.load())
        except Exception as exc:
            print(f"Error loading notebook {path}: {exc}")

    return collected


def process_documents(documents: List, chunk_size: int = 1000, chunk_overlap: int = 20):
    """Break documents into smaller chunks for embedding.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, measured with ``len``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Separators are tried in order: markdown headings (##, ###, ####) first,
    # then newlines, spaces and finally the empty string.
    ordered_separators = ["\n## ", "\n### ", "\n#### ", "\n", " ", ""]

    splitter = RecursiveCharacterTextSplitter(
        separators=ordered_separators,
        length_function=len,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    chunks = splitter.split_documents(documents)
    return chunks


def create_vector_store(documents: List, persist_directory: str = "./data/chroma_db"):
    """Embed ``documents`` into a Chroma store and persist it.

    Args:
        documents: Chunks to embed and index.
        persist_directory: Folder handed to Chroma as its persistence location.

    Returns:
        The Chroma vector store built from ``documents``.
    """
    # NOTE(review): if persist_directory already holds a collection from an
    # earlier run, this presumably appends the same chunks again (duplicate
    # retrieval hits) -- confirm against the Chroma documentation.
    store = Chroma.from_documents(
        persist_directory=persist_directory,
        embedding=OpenAIEmbeddings(),
        documents=documents,
    )
    store.persist()
    return store


def setup_qa_chain(vectorstore, k: int = 3):
    """Build the RetrievalQA chain that answers questions with the custom prompt.

    Args:
        vectorstore: Vector store whose ``as_retriever`` supplies the context.
        k: Number of chunks the retriever fetches per question. Defaults to 3,
            the value that was previously hard-coded.

    Returns:
        A ``RetrievalQA`` chain of type ``"stuff"`` that uses ``PROMPT`` and
        returns the retrieved source documents alongside the answer.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    # Fail early, before any model client is constructed.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # temperature=0 keeps the sampling temperature at its minimum.
    llm = ChatOpenAI(
        model_name="gpt-3.5-turbo",
        temperature=0,
    )

    # Create the QA chain with custom prompt
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": k}),
        chain_type_kwargs={"prompt": PROMPT, "verbose": True},
        return_source_documents=True,
    )

    return qa_chain


def extract_relevant_context(doc) -> Dict:
    """Summarise a retrieved document as a plain dictionary.

    Args:
        doc: Object exposing ``page_content`` and a ``metadata`` mapping.

    Returns:
        A dict with the keys ``content``, ``source``, ``page`` and ``chunk``.
        ``source`` falls back to ``"Unnamed Source"``; ``page`` and ``chunk``
        fall back to ``None`` when absent from the metadata.
    """
    summary = {"content": doc.page_content}
    metadata = doc.metadata
    summary["source"] = metadata.get("source", "Unnamed Source")
    for optional_key in ("page", "chunk"):
        summary[optional_key] = metadata.get(optional_key)
    return summary


def format_citation(source_info: Dict) -> str:
    """Format source information into a citation string.

    Args:
        source_info: Mapping with a required ``"source"`` entry and an optional
            ``"page"`` entry.

    Returns:
        The source name, followed by ``", page N"`` when a page is known.

    Raises:
        KeyError: If ``source_info`` has no ``"source"`` entry.
    """
    citation = source_info["source"]
    page = source_info.get("page")
    # Compare against None/"" explicitly: a plain truthiness test would drop
    # page 0, which is a valid page number for zero-indexed loaders.
    if page is not None and page != "":
        citation += f", page {page}"
    return citation


def ask_question(qa_chain, question: str) -> str:
    """Ask a question and return a Markdown answer with a reference list.

    Args:
        qa_chain: Chain object. Its ``invoke`` method is used when present;
            otherwise the object itself is called. Either way it receives
            ``{"query": question}`` and must return a mapping with a
            ``"result"`` entry and, optionally, ``"source_documents"``.
        question: The user's question.

    Returns:
        Markdown text with ``Question``, ``Answer`` and, when sources were
        returned, ``References`` sections. Each reference shows the citation
        and an excerpt of at most 200 characters. If anything fails, a
        one-line Markdown error message is returned instead of raising.
    """
    excerpt_limit = 200
    try:
        # Prefer `invoke`: calling the chain object directly is deprecated in
        # recent LangChain releases. Plain callables are still supported.
        run_chain = getattr(qa_chain, "invoke", qa_chain)
        result = run_chain({"query": question})

        # Extract answer and sources (a missing or None source list means "no sources")
        answer = result["result"]
        sources = result.get("source_documents") or []

        # Collect the Markdown pieces and join once at the end.
        parts = [f"### Question\n{question}\n\n", f"### Answer\n{answer}\n\n"]

        if sources:
            parts.append("### References\n\n")
            for idx, doc in enumerate(sources, 1):
                source_info = extract_relevant_context(doc)
                citation = format_citation(source_info)
                content = source_info["content"]
                excerpt = content[:excerpt_limit].replace("\n", " ").strip()
                # Only mark the excerpt as truncated when text was actually cut.
                if len(content) > excerpt_limit:
                    excerpt += "..."
                parts.append(f"[Source {idx}] {citation}\n> {excerpt}\n\n")

        return "".join(parts)
    except Exception as e:
        # Boundary for the interactive session: report the failure as text so
        # one bad question does not end the loop.
        return f"**An error occurred:** {str(e)}"


def interactive_qa(qa_chain):
    """Run an interactive Q&A loop with Markdown output and citations.

    Questions are read with ``input`` until the user types ``exit``
    (case-insensitive) or the input stream ends / is interrupted. Blank
    questions are ignored. Output is rendered through IPython's ``Markdown``
    display when IPython is importable, and printed as plain text otherwise.

    Args:
        qa_chain: Chain object passed through to ``ask_question``.
    """
    try:
        from IPython.display import display, Markdown

        use_markdown = True
    except ImportError:
        use_markdown = False

    def show(text: str) -> None:
        # Single place that decides between rich and plain output.
        if use_markdown:
            display(Markdown(text))
        else:
            print(text)

    session_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Built without leading indentation: Markdown treats lines indented by four
    # spaces as a code block, which would stop the heading from rendering.
    header = (
        "\n# Demand Forecasting Q&A Session\n"
        f"Session started: {session_time}\n\n"
        "Enter your questions below. Type 'exit' to end the session.\n"
    )
    show(header)

    while True:
        try:
            question = input("\nYour question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C ends the session the same way as 'exit'.
            question = "exit"

        if question.lower() == "exit":
            show("\n### Session Ended\nThank you for using the Q&A system!")
            break

        # Do not spend a model call on an empty question.
        if not question:
            continue

        if use_markdown:
            show("---\n*Processing your question...*")
        else:
            show("\nProcessing your question...")

        answer_md = ask_question(qa_chain, question)

        show(answer_md)
        show("---")


def main():
    """Build the whole pipeline and return the ready-to-use QA chain.

    Steps, each announced on stdout: load markdown files and notebooks, split
    them into chunks, create the persisted vector store, and set up the chain.

    Returns:
        The chain produced by ``setup_qa_chain``.
    """
    print("Loading documents...")
    md_docs = load_markdown_files("../data/docs/")
    nb_docs = load_jupyter_notebooks("./demand_forecast_notebooks/")
    combined = md_docs + nb_docs
    print(f"Loaded {len(md_docs)} markdown files and {len(nb_docs)} notebooks")

    print("Processing documents...")
    chunks = process_documents(combined)
    print(f"Created {len(chunks)} splits")

    print("Creating vector store...")
    store = create_vector_store(chunks)
    print("Vector store created and persisted")

    return setup_qa_chain(store)


# Entry point: build the QA chain, then hand it to the interactive session.
# qa_chain is bound at module level, so it stays available after the session ends.
if __name__ == "__main__":
    qa_chain = main()
    interactive_qa(qa_chain)

Loading documents...
Loaded 5 markdown files and 13 notebooks
Processing documents...
Created 485 splits
Creating vector store...


  embeddings = OpenAIEmbeddings()


Vector store created and persisted


  vectorstore.persist()
  llm = ChatOpenAI(



    # Demand Forecasting Q&A Session
    Session started: 2025-04-28 21:44:05

    Enter your questions below. Type 'exit' to end the session.
    

---
*Processing your question...*

  result = qa_chain({"query": question})




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a helpful AI assistant specialized in demand forecasting and related topics.
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: Machine Learning Project Documentation

Project Overview

Introduction

Welcome to the documentation for our machine learning project focused on time series analysis. This project aims to leverage modern techniques to analyze and forecast time series data, providing valuable insights and predictions.

Objectives

Develop a robust time series analysis toolkit.

Implement various forecasting methods.

Detect anomalies and change points in time series data.

Provide comprehensive documentation and tutorials.

Project Details

Time Series Analysis Toolkit

Overview

Our time series analysis toolkit inclu

### Question
what are the steps of this project?

### Answer
Based on the provided context, the steps of the machine learning project focused on time series analysis are as follows:

1. Introduction: The project aims to leverage modern techniques to analyze and forecast time series data, providing valuable insights and predictions [Source 1].

2. Objectives: The objectives of the project include developing a robust time series analysis toolkit, implementing various forecasting methods, detecting anomalies and change points in time series data, and providing comprehensive documentation and tutorials [Source 1].

3. Time Series Analysis Toolkit: The toolkit includes a variety of methods and models designed to handle different aspects of time series data, such as traditional statistical methods, machine learning models, and deep learning techniques [Source 1].

4. Methods and Models: The project utilizes exponential smoothing as a classic method for forecasting time series data, including single, double, and triple exponential smoothing [Source 1].

Unfortunately, the specific steps of the project beyond these general objectives and components are not provided in the context.

### References

[Source 1] ../data/docs/0_project_overview.md
> Machine Learning Project Documentation  Project Overview  Introduction  Welcome to the documentation for our machine learning project focused on time series analysis. This project aims to leverage mod...

[Source 2] ../data/docs/0_project_overview.md
> Machine Learning Project Documentation  Project Overview  Introduction  Welcome to the documentation for our machine learning project focused on time series analysis. This project aims to leverage mod...

[Source 3] demand_forecast_notebooks/ts-2-linear-vision.ipynb
> is split into three sections: we introduce the basic framework of linear processes, then present extensions and finally demonstrate how to solve a prediction problem from scratch. * [Basic linear proc...



---


### Session Ended
Thank you for using the Q&A system!