In [1]:
import bs4
import os
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader,TextLoader,JSONLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
os.environ['OPENAI_API_KEY'] = ''

In [3]:
def load_processed_files(directory: str):
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith('.json'):
            file_path = os.path.join(directory, filename)
            
            # Define a custom jq schema to extract both content and metadata
            jq_schema = '.chunks[] | {content: .content, metadata: .metadata}'
            
            loader = JSONLoader(
    file_path=file_path,
    jq_schema=jq_schema,
    content_key="content",
    metadata_func=lambda metadata, _: {**metadata, "source": file_path}  # Add the second argument
)
            
            documents.extend(loader.load())
    
    return documents
loaded_documents = load_processed_files('../pages/json')

In [4]:
from langchain.schema import Document 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Container for split documents
split_documents = []

# Loop through each loaded document
for doc in loaded_documents:
    # Extract content and metadata
    content = doc.page_content
    metadata = doc.metadata

    
    # Split the content
    split_content = text_splitter.split_text(content)
    
    # Re-associate each split with the original metadata
    for split in split_content:
        split_documents.append({
            'page_content': split,
            'metadata': metadata  # Keep the same metadata for each chunk
        })
def flatten_metadata(metadata):
    # Flatten metadata by joining key-value pairs into a single string
    return {key: str(value) if isinstance(value, dict) else value for key, value in metadata.items()}

# Convert split_documents (which are dictionaries) into Document objects with flattened metadata
documents_as_objects = [
    Document(
        page_content=doc['page_content'], 
        metadata=flatten_metadata(doc['metadata'])  # Flatten the metadata
    )
    for doc in split_documents
]

In [5]:
vectorstore = Chroma.from_documents(documents=documents_as_objects, 
                                    embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever()

In [6]:
prompt = hub.pull("rlm/rag-prompt")

  prompt = loads(json.dumps(prompt_object.manifest))


In [19]:
prompt.format_prompt(```You are a helpful assistant who provides detailed and accurate information.
Context: {context}
Question: {question}
Answer:``)

KeyError: 'question'

In [71]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [73]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Question
response = rag_chain.invoke("Where is USF located?")

In [74]:
response

'USF is located in the Tampa Bay region, with campuses in Tampa, St. Petersburg, and Sarasota-Manatee. The campuses together comprise over 1,600 acres and nearly 12 million square feet of building space. USF is one of the fastest growing universities in the nation.'