In [1]:
pip install ipywidgets langchain langchain-community langchain-core langchainhub tiktoken chromadb pysqlite3-binary sentence-transformers pypdf

Collecting langchain
  Downloading langchain-0.1.13-py3-none-any.whl (810 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m810.5/810.5 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-community
  Downloading langchain_community-0.0.29-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core
  Downloading langchain_core-0.1.36-py3-none-any.whl (273 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m273.9/273.9 kB[0m [31m183.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchainhub
  Downloading langchainhub-0.1.15-py3-none-any.whl (4.6 kB)
Collecting tiktoken
  Downloading tiktoken-0.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chromadb
  Downloading

# Set Up The Model
In this block, we import chromadb and the other dependencies.  Chroma requires sqlite3, so the `pysqlite3` package is imported and registered under the name `sqlite3` before Chroma is loaded.

The LLM is Mistral:Instruct, hosted by an Ollama container running in OpenShift.

HuggingFace Embeddings are used since they can be run locally and can be configured to take advantage of available GPUs.

In [9]:
# Load the pysqlite3 package and re-register it in sys.modules under the name
# 'sqlite3', so that any later `import sqlite3` resolves to pysqlite3.
# The swap has to run before chromadb is imported, so keep this statement order.
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
import chromadb

import bs4
import os.path
from langchain import hub
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pprint import pprint

# Chat model: Mistral Instruct served by Ollama.
# The endpoint and model name can be overridden with the OLLAMA_BASE_URL and
# OLLAMA_MODEL environment variables so the notebook can also be pointed at a
# different Ollama instance; the defaults are the original in-cluster values.
# `os` is available through the `import os.path` statement earlier in this cell.
model = ChatOllama(
    model=os.environ.get("OLLAMA_MODEL", "mistral:instruct"),
    base_url=os.environ.get(
        "OLLAMA_BASE_URL",
        "http://ollama-api-service.ollama-llm.svc.cluster.local:11434",
    ),
    temperature=0,
)

from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding model used for both indexing and retrieval.
# It runs on the CPU here; change the device value to 'cuda' to run it on a GPU.
embedding_model_kwargs = {'device': 'cpu'}
embd = HuggingFaceEmbeddings(model_kwargs=embedding_model_kwargs)

# Gather Data, Chunk it and Store it in the vector store

If the vector database is not present on disk, create it by downloading the documentation pages, chunking them, and storing the embedded chunks.  If it is already present, just load it.

In [10]:
# Directory in which Chroma persists the vector store.
persist_dir = "db"

# SQLite file inside persist_dir; its presence is used as the marker that the
# store has already been built. It is derived from persist_dir so the check
# and the write location cannot drift apart.
path = os.path.join(persist_dir, "chroma.sqlite3")

# True when a previously persisted store exists on disk.
check_file = os.path.isfile(path)

if not check_file:
    # No persisted store yet: download the pages, chunk them and index them.
    urls = [
        r'https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/9/html-single/performing_a_standard_rhel_9_installation/index',
        r'https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/9/html-single/performing_an_advanced_rhel_9_installation/index',
        r'https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/9/html-single/configuring_basic_system_settings/index',
        r'https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/9/html-single/security_hardening/index',
        r'https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/9/html-single/composing_a_customized_rhel_system_image/index',
        r'https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/9/html-single/upgrading_from_rhel_8_to_rhel_9/index',
        r'https://www.redhat.com/en/resources/red-hat-enterprise-linux-subscription-guide'
    ]

    loader = WebBaseLoader(urls)
    docs = loader.load()

    # Split the loaded pages into chunks (chunk_size=1250, no overlap).
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1250, chunk_overlap=0)
    splits = text_splitter.split_documents(docs)

    # Embed the chunks and write the store to persist_dir.
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embd,
        persist_directory=persist_dir,
    )
else:
    # A persisted store exists: open it with the same embedding function.
    vectorstore = Chroma(persist_directory=persist_dir, embedding_function=embd)

# Retriever used by the RAG chain to fetch source documents for a question.
retriever = vectorstore.as_retriever()

# Run the RAG

In [16]:
from langchain_core.runnables import RunnableParallel
from typing import List
from langchain_core.documents import Document


# Prompt template for the answering step.
# It has two placeholders: {question} and {source_docs}; the source docs are
# placed between the "=========" separator lines.
# The instructions tell the model to answer only from the supplied docs, to
# admit when it does not know, and to include a "SOURCES" part in the answer.
rag_template = """
Given a question write an answer.
Use only the supplied source docs.
If you don't know the answer, just say that you don't know.  Do not fake the answer.
If the answer is relevant, then ALWAYS include a "SOURCES" part in your answer.

QUESTION: {question}
=========
{source_docs}
=========
ANSWER: 
"""

# Build the chat prompt object from the template string above.
rag_prompt = ChatPromptTemplate.from_template(rag_template)

def format_docs(docs: List[Document]) -> str:
    """Render documents as a single text block for the prompt.

    Each document becomes a two-line entry: a ``Content:`` line holding its
    ``page_content`` and a ``Source:`` line holding ``metadata['source']``.
    Entries are separated by one blank line.

    Args:
        docs: Documents to render.

    Returns:
        The joined entries, or an empty string when ``docs`` is empty.

    Raises:
        KeyError: If a document's metadata has no ``'source'`` key.
    """
    entries = []
    for document in docs:
        entry = f"Content: {document.page_content}\nSource: {document.metadata['source']}"
        entries.append(entry)
    return "\n\n".join(entries)

# Step 1 of the answering chain: replace the "source_docs" entry of the input
# mapping with its formatted text form, leaving the other keys untouched.
render_source_docs = RunnablePassthrough.assign(
    source_docs=lambda inputs: format_docs(inputs["source_docs"])
)

# Answering chain: formatted inputs -> prompt -> chat model -> plain string.
rag_chain_from_docs = render_source_docs | rag_prompt | model | StrOutputParser()

# Entry stage: the incoming question is sent to the retriever (giving
# "source_docs") and also passed through unchanged (giving "question").
retrieve_and_forward = RunnableParallel(
    {
        "source_docs": retriever,
        "question": RunnablePassthrough(),
    }
)

# Full chain: adds an "answer" key produced by the answering chain to the
# mapping built by the entry stage.
rag_chain = retrieve_and_forward.assign(answer=rag_chain_from_docs)

# Run the chain for one question and print only the generated answer text.
question = "How does SELinux work?"

results = rag_chain.invoke(question)
answer = results["answer"]
print(answer)

 SELinux (Security-Enhanced Linux) is a security extension for the Linux operating system that helps protect against unauthorized access to files, processes, and other resources. It uses a Mandatory Access Control (MAC) model, which sets strict access control rules that must be followed by the system and its applications.

SELinux labels are used to identify different types of data and processes, and access control decisions are based on these labels. The labels define the security context for each resource, including the file type, process, or user. SELinux policies specify the allowed access between different security contexts.

When a process attempts to access a resource, SELinux checks the security context of both the process and the resource against the policy rules. If the access is allowed, the operation is permitted; otherwise, it is denied. This helps prevent unauthorized access or modification of sensitive data.

SELinux policies can be managed using various tools like `sets