In [1]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import LanceDB
import lancedb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import getpass
import os
from dotenv import load_dotenv

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [8]:
# Define a function to process and index documents from multiple websites
def process_and_index_websites(web_paths):
    # Initialize the vector store
    for web_path in web_paths:
        loader = WebBaseLoader(
            web_paths=(web_path,),
            bs_kwargs=dict(
                parse_only=bs4.SoupStrainer(
                    class_=("post-content", "post-title", "post-header")
                )
            ),
        )
        docs = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_documents(docs)

        # Here, we assume from_documents can tag each document with its source URL
        vectorstore = LanceDB.from_documents(documents=splits, embedding=OpenAIEmbeddings(api_key=OPENAI_API_KEY), source_url=web_path)

    return vectorstore

# Example web paths to process
web_paths = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    # Add more web_paths as needed
]

# Process and index documents from the specified websites
vectorstore = process_and_index_websites(web_paths)

# The rest of the retrieval and generation pipeline can remain the same
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

def format_docs(docs):
    # Adjusted to include source URL in the formatted document
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


In [9]:
rag_chain.invoke("What is Task Decomposition?")

AttributeError: 'list' object has no attribute 'source_url'