In [1]:
pip install ipywidgets langchain langchain-community langchain-core langchainhub tiktoken chromadb pysqlite3-binary sentence-transformers lxml


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Set Up The Model
In this block, we import chromadb and the other dependencies.  Chroma requires sqlite3, so the pysqlite3 package is imported and registered under the name `sqlite3` before chromadb is loaded.

The LLM that is used is Mistral:Instruct running in Ollama which is hosted in OpenShift.

Huggingface Embeddings are used as well and are configured to take advantage of local GPUs.

In [9]:
# Load the pysqlite3 package and re-register it in sys.modules under the name
# 'sqlite3', so that any later `import sqlite3` resolves to pysqlite3 instead of
# the standard-library module. The swap is done before chromadb is imported.
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
import chromadb

import os.path
import bs4
from typing import List
from bs4 import BeautifulSoup as Soup
from langchain import hub
from langchain_community.document_loaders import BSHTMLLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chat model: the "mistral:instruct" model served by the in-cluster Ollama endpoint.
# temperature=0 asks for the least random sampling the backend offers.
model = ChatOllama(
    model="mistral:instruct",
    base_url="http://ollama-api-service.ollama-llm.svc.cluster.local:11434",
    temperature=0,
)

import torch
from langchain_community.embeddings import HuggingFaceEmbeddings

# Prefer the GPU, but fall back to the CPU so the notebook still runs on hosts
# without a usable CUDA device instead of failing while loading the embedding model.
embedding_device = 'cuda' if torch.cuda.is_available() else 'cpu'

embd = HuggingFaceEmbeddings(model_kwargs={'device': embedding_device})

# Gather Data to Query

Let's grab all of the man1 user command pages and load them into the docs variable.

In [3]:
check_file = "False"

path = 'man7.org/man-pages/man1/index.html.tmp'

check_file = os.path.isfile(path)

if check_file is False:

    !wget --quiet -np -r -l1 --cut-dirs=1 -e robots=off --accept-regex [A-Za-z]*.*1.html  -R "index.html" https://man7.org/linux/man-pages/man1/ 

    loader = DirectoryLoader(
        "man7.org/man-pages/man1/", 
        glob="**/*.html", 
        loader_cls=BSHTMLLoader,
        show_progress=True,
    )
    docs = loader.load()

# Split and Store the data in the vector store

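If a persisted Chroma database already exists in the `man-db` folder, we reuse it.  Otherwise, we split the documents into chunks, embed them, and store them in a new vector store that is persisted to that folder.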

In [4]:
persist_dir = "man-db"

# SQLite file inside the persist directory; its presence is used as the signal
# that a vector store was already written by an earlier run.
path = 'man-db/chroma.sqlite3'
check_file = os.path.isfile(path)

if check_file:
    # Reopen the persisted store instead of re-embedding every document.
    vectorstore = Chroma(persist_directory=persist_dir, embedding_function=embd)
else:
    # Split the loaded pages into 1750-character chunks (100 characters of overlap),
    # then embed the chunks and persist them.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1750,
        chunk_overlap=100,
        length_function=len,
    )
    splits = text_splitter.split_documents(docs)
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embd,
        persist_directory=persist_dir,
    )

retriever = vectorstore.as_retriever()

# Run the RAG

In [7]:
# Prompt template for the RAG chain.
# {question} is filled with the user's question and {source_docs} with the
# formatted retrieved passages. The instructions tell the model to answer only
# from those passages, to admit when it does not know, and to list its sources.
rag_template = """
Given a question write an answer.
Use only the supplied source docs.
If you don't know the answer, just say that you don't know.  Do not fake the answer.
If the answer is relevant, then ALWAYS include a "SOURCES" part in your answer.

QUESTION: {question}
=========
{source_docs}
=========
ANSWER:
"""

def format_docs(docs: List[Document]) -> str:
    """Render retrieved documents as a single text block for the prompt.

    Each document becomes a ``Content: ...`` line followed by a ``Source: ...``
    line, and consecutive documents are separated by a blank line.

    Args:
        docs: Documents exposing ``page_content`` and a ``metadata`` mapping.

    Returns:
        The joined text, or an empty string when ``docs`` is empty.
    """
    def _source_label(doc: Document) -> str:
        # Prefer the page title. Fall back to the source path when the title is
        # missing or empty, so a document without a title neither raises
        # KeyError nor produces a blank citation.
        metadata = doc.metadata or {}
        return metadata.get('title') or metadata.get('source') or 'unknown'

    return "\n\n".join(
        f"Content: {doc.page_content}\nSource: {_source_label(doc)}" for doc in docs
    )

# Chat prompt built from the template; it exposes the {question} and {source_docs} slots.
rag_prompt = ChatPromptTemplate.from_template(rag_template)


def _sources_as_text(inputs):
    """Return the formatted text for the documents stored under ``source_docs``."""
    return format_docs(inputs["source_docs"])


# Answering sub-chain: swap the retrieved documents for their text form, fill the
# prompt, call the model, and reduce the model output to a plain string.
rag_chain_from_docs = (
    RunnablePassthrough.assign(source_docs=_sources_as_text)
    | rag_prompt
    | model
    | StrOutputParser()
)

# Full chain: run retrieval and pass the question through side by side, then
# attach the generated text under the "answer" key next to both inputs.
rag_chain = RunnableParallel(
    source_docs=retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

# Ask one question; the chain result holds the retrieved documents, the
# question, and the generated answer.
question = "What command can be used to change the ownership of a file?"
results = rag_chain.invoke(question)

# Show only the generated answer text.
answer = results["answer"]
print(answer)

 The command to change the ownership of a file in Linux is `chown`. You can specify the new owner and optionally the new group by providing the user name or numeric user ID followed by a colon and the group name or numeric group ID, with no spaces between them. For example, `chown john:group filename` would change the ownership of "filename" to user "john" and group "group".

Sources:
- chown(1) - Linux manual page
