#### Importing relevant libraries

In [22]:
import warnings
# ignore warnings
warnings.filterwarnings("ignore")

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from sentence_transformers import CrossEncoder
from textwrap import dedent
from dotenv import load_dotenv
from hashlib import sha256
from typing import List
from IPython.display import display, Markdown
import chromadb
import numpy as np
import os
import shutil

load_dotenv()

True

#### Defining some functions to organise the code

In [2]:
def load_documents(file_directory: str) -> List[Document]:
    """Load the documents that ``PyPDFDirectoryLoader`` reads from a directory.

    Args:
        file_directory: Directory path handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever the loader's ``load()`` call produces (an empty result means
        the loader found nothing to read).
    """
    return PyPDFDirectoryLoader(file_directory).load()


def prepare_documents(documents: List[Document], chunk_size: int = 500, chunk_overlap: int = 200,
                      raw_dir: str = "../guides/raw", saved_dir: str = "../guides/saved") -> List[Document]:
    """Split documents into chunks, then move the raw source files aside.

    Args:
        documents: Documents to split.
        chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``CharacterTextSplitter``.
        raw_dir: Directory whose entries are moved after splitting. Defaults to
            the previously hard-coded ``../guides/raw``.
        saved_dir: Directory the entries are moved into; created (including
            parents) when missing. Defaults to ``../guides/saved``.

    Returns:
        The chunks returned by ``split_documents``.

    Raises:
        FileNotFoundError: If ``raw_dir`` does not exist (raised by ``os.listdir``).
    """
    # Breaking down documents into chunks
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = splitter.split_documents(documents)

    # Resolve both directories once instead of on every loop iteration
    source_root = os.path.abspath(raw_dir)
    target_root = os.path.abspath(saved_dir)

    # exist_ok replaces the separate exists() check and its check-then-create gap
    os.makedirs(target_root, exist_ok=True)

    # NOTE(review): every entry in raw_dir is moved, not only the files that were
    # loaded into `documents`, and the move happens before anything is indexed.
    for entry in os.listdir(source_root):
        shutil.move(os.path.join(source_root, entry), os.path.join(target_root, entry))

    return docs


def add_to_vector_db(docs: List[Document], collection_name: str, 
                     embeddings: HuggingFaceInferenceAPIEmbeddings) -> Chroma:
    """Embed documents into a persistent Chroma collection and return the store.

    Args:
        docs: Chunks to index. Each chunk's id is the SHA-256 hex digest of its
            ``page_content``; chunks with identical content are indexed once
            (the first occurrence is kept).
        collection_name: Name of the collection the chunks are written to.
        embeddings: Embedding client used to vectorise the chunks.

    Returns:
        The ``Chroma`` store returned by ``Chroma.from_documents``.
    """
    # Initialise ChromaDB client (directory is created on first use)
    os.makedirs("./chroma_db_index", exist_ok=True)
    chroma_client = chromadb.PersistentClient(path="./chroma_db_index")

    # Content-hash ids: identical chunks map to the same id, so collapse them
    # here instead of sending repeated ids in one batch.
    # NOTE(review): chromadb is expected to reject a batch with duplicate ids -
    # confirm against the installed version.
    unique_docs = {}
    for doc in docs:
        doc_id = sha256(doc.page_content.encode('utf-8')).hexdigest()
        unique_docs.setdefault(doc_id, doc)

    # Adding documents to the collection named by the caller, using the
    # caller's embedding client (both were previously ignored).
    db = Chroma.from_documents(
        client=chroma_client,
        collection_name=collection_name,
        documents=list(unique_docs.values()),
        embedding=embeddings,
        ids=list(unique_docs),
    )

    return db


def get_vector_db(collection_name: str, embeddings: HuggingFaceInferenceAPIEmbeddings) -> Chroma:
    """Open a collection from the vector database already persisted on disk.

    Used when there are no new documents to add.

    Args:
        collection_name: Name of the collection to open.
        embeddings: Embedding client passed to ``Chroma`` as ``embedding_function``.

    Returns:
        A ``Chroma`` store bound to the persistent client and collection.

    Raises:
        FileNotFoundError: If ``./chroma_db_index`` does not exist.
    """
    index_path = "./chroma_db_index"

    if os.path.exists(index_path):
        persistent_client = chromadb.PersistentClient(path=index_path)
        return Chroma(
            client=persistent_client,
            collection_name=collection_name,
            embedding_function=embeddings,
        )

    raise FileNotFoundError("No vector database found. Please add documents to the vector database.")

#### Loading our documents and storing them in a vector database (ChromaDB) 

In [3]:
# Pick up whatever is waiting in the raw directory
documents = load_documents("../guides/raw/")

# Embedding client shared by the indexing and the reopen path
embeddings = HuggingFaceInferenceAPIEmbeddings(
    model="sentence-transformers/all-MiniLM-l6-v2",
    api_key=os.getenv("HF_TOKEN"),
)

if not documents:
    # Nothing new was loaded: reopen the existing vector database
    db = get_vector_db("pbs-user-guide", embeddings=embeddings)
else:
    # New material: chunk it (this also moves the raw files), show a sample, index it
    docs = prepare_documents(documents)
    print(f"Number of documents: {len(docs)}")
    print("Example of a document: \n", docs[0].page_content)
    db = add_to_vector_db(docs, "pbs-user-guide", embeddings=embeddings)

# Similarity retriever returning 10 candidates per query
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 10})

In [4]:
# Sanity check: query the retriever on its own (no reranking, no LLM)
response = retriever.invoke("What is a PBS job?")
# Bare last expression so the notebook displays the retrieved documents
response

[Document(page_content='PBS Professional \n  \n2021.1.2 \n  \nUser’s \n Guide \nUG-v \nContents \nAbout PBS Documentation \n ix\n1 \nGetting Started with PBS \n1 \n1.1 \nWhy Use PBS? \n . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  \n1 \n1.2 \nPBS Tasks and Components \n . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  \n1 \n1.3 \nInterfaces to PBS \n . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  \n3 \n1.4 \nSetting Up Your Environment \n . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  \n4 \n2 \nSubmitting a PBS Job \n11 \n2.1 \nIntroduction to the PBS Job \n. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  \n11 \n2.2 \nThe PBS Job Script \n. . . . . . .

#### Reranking using a cross encoder

In [5]:
# Cross-encoder whose predict() scores [query, passage] pairs in the reranking step
cross_encoder = CrossEncoder(model_name='cross-encoder/ms-marco-MiniLM-L-6-v2')

In [23]:
# Number of documents to use as context in the prompt
num_docs = 3

def reranker(retrieved_documents: List[Document]) -> List[Document]:
    """Re-order retrieved documents by cross-encoder score and keep the best ones.

    Args:
        retrieved_documents: Candidate documents coming from the retriever.

    Returns:
        At most ``num_docs`` documents, highest cross-encoder score first.
        An empty input yields an empty list.

    Note:
        The question text is read from the notebook-level ``query`` variable,
        not from an argument, so ``query`` must hold the question being asked
        before this function runs.
    """
    # Nothing to score
    if not retrieved_documents:
        return []

    pairs = [[query, doc.page_content] for doc in retrieved_documents]
    scores = cross_encoder.predict(pairs)

    # argsort is ascending: reverse it for best-first, then keep the first
    # num_docs positions. (The previous version filtered on the document index
    # and indexed by rank, which returned the wrong documents.)
    best_first = np.argsort(scores)[::-1][:num_docs]

    return [retrieved_documents[int(position)] for position in best_first]

#### Preparing our prompt and LLM

In [27]:
# The text starts on its own line (after the line-continuation backslash) so
# that every line shares the same indentation; otherwise dedent() finds an
# empty common margin and leaves the leading spaces in the prompt.
template = dedent(
    """\
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to aid in answering the question. 
    Keep the answer clear and concise, and support with examples if possible. Return the answer in a markdown format. 
    Question: {question}
    Context: {context}
    Answer:"""
)

# The whole instruction text is sent as a single human message;
# the system-role variant below is left disabled.
prompt = ChatPromptTemplate.from_messages([
    # ("system", template),
    ("human", template)
])

# Gemini chat model: temperature 0, output capped at 2048 tokens
model = ChatGoogleGenerativeAI(model="gemini-pro", max_output_tokens=2048, temperature=0)



#### Creating our chain

In [28]:
def format_docs(docs: List[Document]) -> str:
    """Join the ``page_content`` of each document into one string for the LLM.

    Documents are separated by a blank line; an empty input gives ``""``.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# "context" branch: retrieve -> rerank -> join into one string.
# "question" branch: the chain input passed through unchanged.
# NOTE(review): reranker scores passages against the notebook-level `query`
# variable rather than this chain's input, so `query` must equal the string
# passed to invoke() for the reranking to be meaningful.
rag_chain = (
    {"context": retriever | RunnableLambda(reranker) | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [29]:
# `query` is a notebook-level name that reranker also reads when scoring passages
query = "How do I submit multiprocessor jobs?"

response = rag_chain.invoke(query)
# Render the answer string as markdown
display(Markdown(response))

To submit multiprocessor jobs, you can use the following steps:

1. Create a job script that specifies the number of processors and the MPI command to be executed.
2. Use the `qsub` command to submit the job script to PBS.
3. PBS will then schedule the job and allocate the necessary resources.

For example, the following job script requests 2 processors and runs the `mpiexec` command:

```
#!/bin/sh
#PBS -l select=2:ncpus=2:mpiprocs=2
#PBS -l walltime=00:10:00
mpiexec -n 2 my_program
```

To submit this job script, you would use the following command:

```
qsub job_script.sh
```