In [None]:
from langchain_community.document_loaders import TextLoader
# Directory that holds the uploaded source files; later cells reuse this path.
storage_path = "/workspace/data/uploaded_files"
file_path1 = f"{storage_path}/guidelines-list.md"
file_path2 = f"{storage_path}/lecture_notes.txt"

# Load every file in order and collect the results into one flat list.
documents = []
for source_path in (file_path1, file_path2):
    loaded = TextLoader(source_path).load()
    documents.extend(loaded)

# Quick summary of what was loaded, one line per document.
print(f"Loaded {len(documents)} documents from {storage_path}")
for doc in documents:
    print(f"Filename: {doc.metadata['source']}")

Loaded 2 documents from /workspace/data/uploaded_files
Filename: /workspace/data/uploaded_files/guidelines-list.md
Metadata: {'source': '/workspace/data/uploaded_files/guidelines-list.md'}
Filename: /workspace/data/uploaded_files/lecture_notes.txt
Metadata: {'source': '/workspace/data/uploaded_files/lecture_notes.txt'}


In [34]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma

# NOTE: despite the name, `embeddings` holds the embedding *model name* (a string),
# not an embeddings object; it is passed to OpenAIEmbeddings below.
embeddings = "openai.text-embedding-3-large"

# Chroma vector store for the "uploaded_files" collection, persisted in a
# "chroma" sub-directory of the upload directory.
vectorstore = Chroma(
    collection_name="uploaded_files",
    embedding_function=OpenAIEmbeddings(model=embeddings),
    persist_directory=f"{storage_path}/chroma",
)

# Similarity-search retriever over the store, configured with k=3.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

def format_docs(docs):
    """Merge the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in input order, separated by a blank line
        (``"\\n\\n"``). An empty input yields an empty string.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)



In [None]:
# DESTRUCTIVE: empties the "uploaded_files" collection. Run this cell only when the
# index should be rebuilt from scratch; queries in later cells will return nothing
# until documents are added again.
#
# reset_collection() deletes the collection and recreates an empty one, so a separate
# delete_collection() call beforehand is redundant. NOTE(review): the explicit delete
# appears to leave the store without a collection handle, which can make the
# following reset fail -- confirm against the installed langchain_chroma version.
vectorstore.reset_collection()

In [31]:
# Ad-hoc sanity check: query the vector store directly (bypassing the retriever)
# and display the matching documents as the cell output.
vectorstore.similarity_search(query="alt tags")

[Document(metadata={'source': '/workspace/data/uploaded_files/guidelines-list.md'}, page_content='* If the `<img>` tag or `role="img"` is used   \n  * There must be an `alt` attribute on the image. (Siteimprove and WAVE detect this.)  \n  * Every `<img>` tag in a group of images must have the `alt` attribute.  \n* If the `<svg>` tag is used   \n  * The attribute and value `role="img"` must be applied.  \n* If the `<figure>` tag is used   \n  * The image(s) contained within use the conventions of the aforementioned `<img>` and  `<svg>` tags.  \n* If there is a `title` attribute present on an image, that does **not** factor into alternative text.  This attribute only provides hover-over text and is not available to assistive technology.   \n* CSS background images must not be used to present informative or grouped images.\n\n#### Determine if the alternative text value is appropriate.\n\nUse WAVE, a Screen Reader, or the browser\'s inspector tools (F12) to view the alternative text for e

In [None]:
# Run one sample question through the retriever and show each hit's source and text.
question = "What should I do if there is not an alt tag for an image?"
retrieved_docs = retriever.invoke(question)
print(f"Retrieved {len(retrieved_docs)} documents")

separator = "-" * 40
for doc in retrieved_docs:
    # One print per document: source line, full text, then a divider line.
    print(f"Filename: {doc.metadata['source']}", doc.page_content, separator, sep="\n")

In [35]:
# Retrieval-only chain: the retriever's documents are piped into format_docs,
# which flattens them into a single context string.
rag_chain = retriever | format_docs

# Display the combined context produced for a sample question.
sample_question = "What should I do if there is not an alt tag for an image?"
rag_chain.invoke(sample_question)


'* If the `<img>` tag or `role="img"` is used   \n  * There must be an `alt` attribute on the image. (Siteimprove and WAVE detect this.)  \n  * Every `<img>` tag in a group of images must have the `alt` attribute.  \n* If the `<svg>` tag is used   \n  * The attribute and value `role="img"` must be applied.  \n* If the `<figure>` tag is used   \n  * The image(s) contained within use the conventions of the aforementioned `<img>` and  `<svg>` tags.  \n* If there is a `title` attribute present on an image, that does **not** factor into alternative text.  This attribute only provides hover-over text and is not available to assistive technology.   \n* CSS background images must not be used to present informative or grouped images.\n\n#### Determine if the alternative text value is appropriate.\n\nUse WAVE, a Screen Reader, or the browser\'s inspector tools (F12) to view the alternative text for each image.\n\nConsider an image of Vincent Van Gogh\'s *The Starry Night*  \n![Oil painting with 