In [29]:
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
import os
from typing import List
from pathlib import Path

In [17]:
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

True

In [7]:
!pwd

/home/alireza/projects/genai_docs_helper/Notebooks


In [20]:
# Initialize document loader
loader = DirectoryLoader('../data/docs/', glob="**/*.md", loader_cls=UnstructuredMarkdownLoader)
# Load documents
documents = loader.load()
print(f"Loaded {len(documents)} documents")

Loaded 5 documents


In [26]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
    length_function=len,
    separators=["\n## ", "\n### ", "\n#### ", "\n", " ", ""]  # Respects markdown headers
)

In [27]:
# Split documents into chunks
print("Splitting documents...")
splits = text_splitter.split_documents(documents)
print(f"Created {len(splits)} splits")
print(splits[0])

Splitting documents...
Created 19 splits
page_content='Time Series Forecasting with Facebook Prophet

Overview

This document details the process and results of a time series forecasting project using Facebook Prophet. The goal was to build a robust, interpretable model for predicting future values based on historical time series data. The workflow includes data preparation, exploratory analysis, model training, forecasting, and evaluation.

Project Objectives

Develop a time series forecasting model using Facebook Prophet.

Visualize and interpret the forecast and its components.

Evaluate model performance and identify areas for improvement.

Data Preparation

The dataset was loaded and preprocessed to fit Prophet’s requirements. The key steps included:

Loading Data: The time series data was imported, ensuring the date column was in the correct datetime format.

Renaming Columns: Prophet requires columns to be named ds (datestamp) and y (value to forecast).' metadata={'source': '../

In [31]:
# Initialize embeddings
embeddings = OpenAIEmbeddings()

In [33]:
# Create and persist vector store
print("Creating vector store...")
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory="../data/chroma_db"  # This will save the vector store locally
)
vectorstore.persist()
print("Vector store created and persisted")

Creating vector store...
Vector store created and persisted


In [34]:
# Initialize retrieval chain
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=vectorstore.as_retriever(
        search_kwargs={"k": 3}  # Retrieve top 3 most relevant chunks
    )
)

  llm=OpenAI(),


In [35]:
# Example query function with better formatting
def ask_question(question: str) -> str:
    """
    Ask a question about the loaded documents.

    Args:
        question (str): The question to ask

    Returns:
        str: The answer from the model
    """
    try:
        response = qa_chain.run(question)
        return response
    except Exception as e:
        return f"An error occurred: {str(e)}"

In [36]:
# Example usage
if __name__ == "__main__":
    # Test the system with a sample question
    sample_question = "What is this document about?"
    print("\nTesting with sample question:", sample_question)
    print("\nAnswer:", ask_question(sample_question))


Testing with sample question: What is this document about?


  response = qa_chain.run(question)



Answer:  This document is about the basics of time series analysis, including patterns, dependence, and stationarity.
