In [8]:
# Required imports
from langchain.document_loaders import UnstructuredMarkdownLoader, NotebookLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI  # Changed this import
from typing import List
from pathlib import Path
import glob
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

def load_markdown_files(directory: str="../data/docs/") -> List:
    """Load markdown files from directory"""
    loader = DirectoryLoader(
        directory,
        glob="**/*.md",
        loader_cls=UnstructuredMarkdownLoader
    )
    return loader.load()

def load_jupyter_notebooks(directory: str="./demand_forecast_notebooks/") -> List:
    """Load jupyter notebooks from directory"""
    documents = []
    # Find all .ipynb files in the directory and subdirectories
    for notebook_path in glob.glob(f"{directory}/**/*.ipynb", recursive=True):
        if ".ipynb_checkpoints" not in notebook_path:  # Skip checkpoint files
            try:
                loader = NotebookLoader(
                    notebook_path,
                    include_outputs=True,  # Include cell outputs
                    max_output_length=50,   # Limit output length
                    remove_newline=True     # Clean up newlines
                )
                documents.extend(loader.load())
            except Exception as e:
                print(f"Error loading notebook {notebook_path}: {e}")
    return documents

def process_documents(documents: List, chunk_size: int = 1000, chunk_overlap: int = 20):
    """Split documents into chunks"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n## ", "\n### ", "\n#### ", "\n", " ", ""]
    )
    return text_splitter.split_documents(documents)

def create_vector_store(documents: List, persist_directory: str = "../data/chroma_db"):
    """Create and persist vector store"""
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=persist_directory
    )
    vectorstore.persist()
    return vectorstore

def setup_qa_chain(vectorstore):
    """Setup the QA chain"""
    # Changed the LLM initialization to use ChatOpenAI
    llm = ChatOpenAI(
        model_name="gpt-3.5-turbo",  # Cheapest available model
        temperature=0,               # Deterministic, minimizes hallucination
    )

    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(
            search_kwargs={"k": 3}
        )
    )

def main():
    print("Loading documents...")
    # Load both markdown files and notebooks
    markdown_docs = load_markdown_files("../data/docs/")
    notebook_docs = load_jupyter_notebooks("./demand_forecast_notebooks/")

    # Combine all documents
    all_documents = markdown_docs + notebook_docs
    print(f"Loaded {len(markdown_docs)} markdown files and {len(notebook_docs)} notebooks")

    # Process documents
    print("Processing documents...")
    splits = process_documents(all_documents)
    print(f"Created {len(splits)} splits")

    # Create vector store
    print("Creating vector store...")
    vectorstore = create_vector_store(splits)
    print("Vector store created and persisted")

    # Setup QA chain
    qa_chain = setup_qa_chain(vectorstore)

    return qa_chain

# Interactive QA function
def ask_question(qa_chain, question: str) -> str:
    """
    Ask a question about the loaded documents.
    """
    try:
        response = qa_chain.run(question)
        return response
    except Exception as e:
        return f"An error occurred: {str(e)}"

def interactive_qa(qa_chain):
    """
    Interactive Q&A session with the loaded documents.
    """
    print("\nEnter your questions (type 'exit' to quit):")
    while True:
        question = input("\nQuestion: ").strip()
        if question.lower() == 'exit':
            break
        answer = ask_question(qa_chain, question)
        print("\nAnswer:", answer)

# if __name__ == "__main__":
#     # Initialize the system
#     qa_chain = main()
#     # Start interactive session
#     interactive_qa(qa_chain)

In [9]:
# Initialize the system
qa_chain = main()

Loading documents...
Loaded 5 markdown files and 13 notebooks
Processing documents...
Created 485 splits
Creating vector store...
Vector store created and persisted


  llm = ChatOpenAI(


In [None]:
question = "which model they tried turned the best accuracy overall?"
answer = ask_question(qa_chain, question)
print("\nAnswer:", answer)

  response = qa_chain.run(question)



Answer: 
There is no mention of a specific model being tried in the given context, so it is not possible to determine which model had the best accuracy overall. The context only discusses the use of information criteria to select a model with the minimal number of parameters necessary to capture the details of the data. 


In [11]:
question = "what is the project all about?"
answer = ask_question(qa_chain, question)
print("\nAnswer:", answer)


Answer: The project is focused on time series analysis using modern techniques to analyze and forecast time series data. It aims to develop a robust time series analysis toolkit, implement various forecasting methods, detect anomalies and change points in time series data, and provide comprehensive documentation and tutorials. The toolkit includes traditional statistical methods, machine learning models, and deep learning techniques to handle different aspects of time series data.


In [20]:
# Required imports
from langchain.document_loaders import UnstructuredMarkdownLoader, NotebookLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory
from typing import List
from pathlib import Path
import glob
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

# Define a custom prompt template
custom_prompt_template = """You are a helpful AI assistant specialized in demand forecasting and related topics. 
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}

Question: {question}

Please provide your answer following these guidelines:
1. Only use information from the provided context
2. If the context doesn't contain enough information to fully answer the question, clearly state that
3. If you need to make any assumptions, explicitly state them
4. If relevant, cite specific parts of the context
5. Keep the answer focused and relevant to demand forecasting

Answer: Let me help you with that.
"""

# Create the prompt template
PROMPT = PromptTemplate(
    template=custom_prompt_template,
    input_variables=["context", "question"]
)

def load_markdown_files(directory: str="../data/docs/") -> List:
    """Load markdown files from directory"""
    loader = DirectoryLoader(
        directory,
        glob="**/*.md",
        loader_cls=UnstructuredMarkdownLoader
    )
    return loader.load()

def load_jupyter_notebooks(directory: str="./demand_forecast_notebooks/") -> List:
    """Load jupyter notebooks from directory"""
    documents = []
    for notebook_path in glob.glob(f"{directory}/**/*.ipynb", recursive=True):
        if ".ipynb_checkpoints" not in notebook_path:
            try:
                loader = NotebookLoader(
                    notebook_path,
                    include_outputs=True,
                    max_output_length=50,
                    remove_newline=True
                )
                documents.extend(loader.load())
            except Exception as e:
                print(f"Error loading notebook {notebook_path}: {e}")
    return documents

def process_documents(documents: List, chunk_size: int = 1000, chunk_overlap: int = 20):
    """Split documents into chunks"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n## ", "\n### ", "\n#### ", "\n", " ", ""]
    )
    return text_splitter.split_documents(documents)

def create_vector_store(documents: List, persist_directory: str = "../data/chroma_db"):
    """Create and persist vector store"""
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=persist_directory
    )
    vectorstore.persist()
    return vectorstore

def setup_qa_chain(vectorstore):
    """Setup the QA chain with custom prompt"""
    # Initialize the language model
    llm = ChatOpenAI(
        model_name="gpt-3.5-turbo",
        temperature=0,
    )

    # Create the QA chain with custom prompt
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(
            search_kwargs={
                "k": 4  # Number of relevant chunks to retrieve
            }
        ),
        chain_type_kwargs={
            "prompt": PROMPT,
            "verbose": True  # Set to True to see the internal workings
        },
        return_source_documents=True  # This will return the source documents used for the answer
    )

    return qa_chain

def main():
    print("Loading documents...")
    markdown_docs = load_markdown_files("../data/docs/")
    notebook_docs = load_jupyter_notebooks("./demand_forecast_notebooks/")

    all_documents = markdown_docs + notebook_docs
    print(f"Loaded {len(markdown_docs)} markdown files and {len(notebook_docs)} notebooks")

    print("Processing documents...")
    splits = process_documents(all_documents)
    print(f"Created {len(splits)} splits")

    print("Creating vector store...")
    vectorstore = create_vector_store(splits)
    print("Vector store created and persisted")

    qa_chain = setup_qa_chain(vectorstore)
    return qa_chain

# def ask_question(qa_chain, question: str) -> str:
#     """
#     Ask a question about the loaded documents with improved response formatting.
#     """
#     try:
#         # Get both the answer and source documents
#         result = qa_chain({"query": question})
        
#         # Format the response
#         response = "\nAnswer: " + result['result']
        
#         # Add source information if available
#         if 'source_documents' in result:
#             response += "\n\nSources used:"
#             for i, doc in enumerate(result['source_documents'], 1):
#                 # Extract source information from metadata if available
#                 source = doc.metadata.get('source', f'Document {i}')
#                 response += f"\n{i}. {source}"
        
#         return response
#     except Exception as e:
#         return f"An error occurred: {str(e)}"

def ask_question(qa_chain, question: str) -> str:
    """
    Ask a question about the loaded documents and return a Markdown-formatted answer.
    """
    try:
        # Get both the answer and source documents
        result = qa_chain({"query": question})

        # Format the response as Markdown
        md = f"## Answer\n\n{result['result']}\n"
        if 'source_documents' in result and result['source_documents']:
            md += "\n---\n\n### Sources Used\n"
            for i, doc in enumerate(result['source_documents'], 1):
                source = doc.metadata.get('source', f'Document {i}')
                snippet = doc.page_content[:200].replace('\n', ' ')
                md += f"**{i}. {source}**\n\n> {snippet}...\n\n"
        return md
    except Exception as e:
        return f"**An error occurred:** {str(e)}"

# def interactive_qa(qa_chain):
#     """
#     Interactive Q&A session with improved formatting and source tracking.
#     """
#     print("\nWelcome to the Demand Forecasting Q&A System!")
#     print("Enter your questions (type 'exit' to quit)")
#     print("------------------------------------------------")
    
#     while True:
#         question = input("\nYour question: ").strip()
#         if question.lower() == 'exit':
#             print("\nThank you for using the Q&A system!")
#             break
        
#         print("\nProcessing your question...")
#         answer = ask_question(qa_chain, question)
#         print(answer)
#         print("\n------------------------------------------------")

def interactive_qa(qa_chain):
    """
    Interactive Q&A session with Markdown output.
    """
    try:
        from IPython.display import display, Markdown
        use_markdown = True
    except ImportError:
        use_markdown = False

    print("\nWelcome to the Demand Forecasting Q&A System!")
    print("Enter your questions (type 'exit' to quit)")
    print("------------------------------------------------")

    while True:
        question = input("\nYour question: ").strip()
        if question.lower() == 'exit':
            print("\nThank you for using the Q&A system!")
            break

        print("\nProcessing your question...")
        answer_md = ask_question(qa_chain, question)
        if use_markdown:
            display(Markdown(answer_md))
        else:
            print(answer_md)
        print("\n------------------------------------------------")

if __name__ == "__main__":
    qa_chain = main()
    interactive_qa(qa_chain)

Loading documents...
Loaded 5 markdown files and 13 notebooks
Processing documents...
Created 485 splits
Creating vector store...
Vector store created and persisted

Welcome to the Demand Forecasting Q&A System!
Enter your questions (type 'exit' to quit)
------------------------------------------------

Processing your question...


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a helpful AI assistant specialized in demand forecasting and related topics. 
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: Machine Learning Project Documentation

Project Overview

Introduction

Welcome to the documentation for our machine learning project focused on time series analysis. This project aims to leverage modern techniques to analyze and forecast time series data, providing valua

## Answer

The project is focused on machine learning and time series analysis. It aims to develop a robust time series analysis toolkit, implement various forecasting methods, detect anomalies and change points in time series data, and provide comprehensive documentation and tutorials. The toolkit includes traditional statistical methods, machine learning models, and deep learning techniques, such as exponential smoothing for forecasting time series data.

---

### Sources Used
**1. ../data/docs/0_project_overview.md**

> Machine Learning Project Documentation  Project Overview  Introduction  Welcome to the documentation for our machine learning project focused on time series analysis. This project aims to leverage mod...

**2. ../data/docs/0_project_overview.md**

> Machine Learning Project Documentation  Project Overview  Introduction  Welcome to the documentation for our machine learning project focused on time series analysis. This project aims to leverage mod...

**3. ../data/docs/0_project_overview.md**

> Machine Learning Project Documentation  Project Overview  Introduction  Welcome to the documentation for our machine learning project focused on time series analysis. This project aims to leverage mod...

**4. ../data/docs/0_project_overview.md**

> Machine Learning Project Documentation  Project Overview  Introduction  Welcome to the documentation for our machine learning project focused on time series analysis. This project aims to leverage mod...




------------------------------------------------

Thank you for using the Q&A system!


In [17]:
qa_chain = main()

Loading documents...
Loaded 5 markdown files and 13 notebooks
Processing documents...
Created 485 splits
Creating vector store...
Vector store created and persisted


In [18]:
question = "what is the project all about?"
answer = ask_question(qa_chain, question)
print("\nAnswer:", answer)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a helpful AI assistant specialized in demand forecasting and related topics. 
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: Machine Learning Project Documentation

Project Overview

Introduction

Welcome to the documentation for our machine learning project focused on time series analysis. This project aims to leverage modern techniques to analyze and forecast time series data, providing valuable insights and predictions.

Objectives

Develop a robust time series analysis toolkit.

Implement various forecasting methods.

Detect anomalies and change points in time series data.

Provide comprehensive documentation and tutorials.

Project Details

Time Series Analysis Toolkit

Overview

Our time series analysis toolkit inc