In [None]:
# Install required dependencies
! pip install openai num2words matplotlib plotly scipy scikit-learn pandas tiktoken langchain pypdf faiss-cpu

In [2]:
# Set up the Azure OpenAI connection settings

import keys

# Endpoint of the Azure OpenAI resource, and the API key used with it.
gpt_endpoint = "https://raid-ses-openai.openai.azure.com/"
gpt_key = keys.gpt_key  # the secret lives in the local `keys` module, not in this notebook

In [2]:
# TODO: add logic here to split the documents more intelligently than just one chunk per page

In [4]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain

def load_docs(filepath):
    """Load every PDF found under ``filepath`` (matched with ``**/*.pdf``).

    Args:
        filepath: Directory to search for PDF files.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files,
        each file being parsed with ``PyPDFLoader``.
    """
    return DirectoryLoader(filepath, glob='**/*.pdf', loader_cls=PyPDFLoader).load()

def vector_load(docs, key, endpoint, api_version="2023-05-15", deployment="swiftfaq-ada002"):
    """Embed ``docs`` through an Azure OpenAI embedding deployment and index them in FAISS.

    Args:
        docs: Documents to index, e.g. the result of ``load_docs``.
        key: Azure OpenAI API key.
        endpoint: Base URL of the Azure OpenAI resource.
        api_version: Azure OpenAI API version. Defaults to the value that was
            previously hard-coded in this function.
        deployment: Name of the embedding deployment. Defaults to the value
            that was previously hard-coded in this function.

    Returns:
        A new FAISS vector store built from ``docs``.

    Raises:
        ValueError: If ``docs`` is empty.
    """
    # Nothing to embed: fail early with a clear message instead of building a
    # client and handing an empty list to the vector store.
    if not docs:
        raise ValueError("vector_load() received no documents to embed")

    embedding_model = OpenAIEmbeddings(
        openai_api_type="azure",
        openai_api_key=key,
        openai_api_base=endpoint,
        openai_api_version=api_version,
        deployment=deployment,
    )

    # TODO: check whether a saved store already exists - if so add to it, otherwise create it
    return FAISS.from_documents(docs, embedding_model)
    

In [6]:
# Load the PDFs under ./data/124 and embed them into a new FAISS store.
docs = load_docs(filepath='./data/124')
db = vector_load(docs, key=gpt_key, endpoint=gpt_endpoint)

In [15]:
# Persist the store to the "vecstore_124" folder so it can be reloaded without re-embedding.
db.save_local(folder_path="vecstore_124")

In [4]:
# Embedding client configured with the same settings that vector_load uses,
# passed to FAISS when loading the saved index from disk.
# NOTE(review): this reads "dbstore", while the save cell above writes
# "vecstore_124" - confirm this is the intended store.
_embeddings = OpenAIEmbeddings(
    deployment="swiftfaq-ada002",
    openai_api_type="azure",
    openai_api_version="2023-05-15",
    openai_api_base=gpt_endpoint,
    openai_api_key=gpt_key,
)
db = FAISS.load_local("dbstore", _embeddings)

In [7]:
from langchain.chat_models import AzureChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationSummaryBufferMemory

# Chat model (Azure OpenAI deployment); it is handed to both the memory and the chain below.
llm = AzureChatOpenAI(
    deployment_name="raidGPT",
    openai_api_type="azure",
    openai_api_base=gpt_endpoint,
    openai_api_key=gpt_key,
    openai_api_version="2023-05-15",
    temperature=0.0,
)

# Retriever over the vector store, configured with k=10 results per query.
retriever = db.as_retriever(search_kwargs={"k": 10})

# Conversation memory: stored under "chat_history", recording the chain's
# "question" input and "answer" output.
memory = ConversationSummaryBufferMemory(
    llm=llm,
    memory_key="chat_history",
    input_key="question",
    output_key="answer",
    return_messages=True,
)

# Conversational retrieval chain that also returns the source documents.
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
)

## Testing custom prompt

In [8]:
from langchain.prompts.prompt import PromptTemplate

In [11]:
# Prompt text for the custom prompt test. It has three placeholders:
# {context}, {chat_history} and {question}.
custom_template = """
You are a bot designed to answer military helicopter pilot trainees' questions from various flying handbooks and rulebooks. Use the context provided below to answer their questions. If you don't know the answer, just say that you don't know, don't try to make up an answer. 

{context}

Additionally, this was the chat history of your conversation with the user.
{chat_history}

Question: {question}

"""

# Wrap the raw text in a PromptTemplate; it is passed to the chain as its "prompt".
PROMPT = PromptTemplate.from_template(template=custom_template)

In [12]:
# Rebuild the chain with the custom PROMPT supplied to the combine-documents step.
# The llm, retriever and memory objects are the ones created earlier.
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
    combine_docs_chain_kwargs={"prompt": PROMPT},
)

In [13]:
# First turn: the cell output shows the question, chat history, answer and source documents.
qa({"question": "what are the types of autorotation?"})

{'question': 'what are the types of autorotation?',
 'chat_history': [HumanMessage(content='what are the types of autorotation?', additional_kwargs={}, example=False),
  AIMessage(content='The types of autorotation covered in the handbook are:\n\n1. Glide autorotation\n2. 180° autorotation\n3. 360° autorotation\n4. Low level autorotation', additional_kwargs={}, example=False)],
 'answer': 'The types of autorotation covered in the handbook are:\n\n1. Glide autorotation\n2. 180° autorotation\n3. 360° autorotation\n4. Low level autorotation',
 'source_documents': [Document(page_content='OFFICIAL (CLOSED)   \n11-1 \nOFFICIAL (CLOSED)   PART B  \n \nCHAPTER 11  \n \nADVANCED AUTOROTATION  \n \nINTRODUCTION  \n \n11.1 Advanced autorotation is an extension of basic autorotation. In advanced \nautorotation, the terminal phase of landing the helicopter along with varying \nprofiles through which an autorotation can be conducted will be covered.  \n \n11.2 The learning obj ectives are:  (1) unde

In [14]:
# Follow-up turn on the same chain; the previous exchange appears in its chat_history.
qa({"question": "Tell me more about glide autorotation"})

{'question': 'Tell me more about glide autorotation',
 'chat_history': [HumanMessage(content='what are the types of autorotation?', additional_kwargs={}, example=False),
  AIMessage(content='The types of autorotation covered in the handbook are:\n\n1. Glide autorotation\n2. 180° autorotation\n3. 360° autorotation\n4. Low level autorotation', additional_kwargs={}, example=False),
  HumanMessage(content='Tell me more about glide autorotation', additional_kwargs={}, example=False),
 'source_documents': [Document(page_content='OFFICIAL (CLOSED)   \n11-11 \nOFFICIAL (CLOSED)   Figure 11-5: 180 autos  \ne. Turn . On intercepting the point where the downwind meets the \nautorotative glideslope for the 180 turn, input 30o AOB with the cyclic \nand commence the turn towards the platform. Maintain the AAABC scan \nand adjust AOB (max up to 4 5o) to intercept the extended centreline and \nglideslope. Scan the NR as it may rise rapidly due to the turn and disc \nloading.  \n  \nf. 400ft AGL . Simi

## Versioning Control Tests

## Alternate embedding models

In [27]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain

def load_docs(filepath):
    """Collect all PDFs below ``filepath`` (pattern ``**/*.pdf``) and load them.

    NOTE: this repeats the ``load_docs`` defined earlier in the notebook and
    replaces it once this cell has run.

    Args:
        filepath: Directory to search for PDF files.

    Returns:
        The result of ``DirectoryLoader.load()``, with ``PyPDFLoader`` used
        as the per-file loader.
    """
    pdf_loader = DirectoryLoader(filepath, loader_cls=PyPDFLoader, glob='**/*.pdf')
    return pdf_loader.load()

from langchain.embeddings import HuggingFaceBgeEmbeddings
# Settings for the HuggingFace BGE embedding model (run on CPU).
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = dict(device='cpu')
# Keep normalize_embeddings True to compute cosine similarity.
encode_kwargs = dict(normalize_embeddings=True)

def vector_load(docs, key=None, endpoint=None):
    """Embed ``docs`` with the HuggingFace BGE model and index them in FAISS.

    NOTE: this redefines the ``vector_load`` declared earlier in the notebook;
    after this cell runs, calls use this BGE-backed version.

    The model is configured from the module-level ``model_name``,
    ``model_kwargs`` and ``encode_kwargs`` settings.

    Args:
        docs: Documents to index, e.g. the result of ``load_docs``.
        key: Unused. Kept so existing three-argument calls keep working;
            now optional.
        endpoint: Unused. Kept so existing three-argument calls keep working;
            now optional.

    Returns:
        A new FAISS vector store built from ``docs``.

    Raises:
        ValueError: If ``docs`` is empty.
    """
    # Nothing to embed: fail early with a clear message before any model is created.
    if not docs:
        raise ValueError("vector_load() received no documents to embed")

    embedding_model = HuggingFaceBgeEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
        query_instruction="Represent the question for retrieving supporting documents: ",
    )

    # TODO: check whether a saved store already exists - if so add to it, otherwise create it
    return FAISS.from_documents(docs, embedding_model)
   

In [None]:
# Was not able to finish: after running for 15 minutes it was still not done transforming.
# `embed_key` / `embed_endpoint` are not defined anywhere in this notebook, and the
# BGE-backed `vector_load` never reads its `key` / `endpoint` arguments, so explicit
# placeholders are passed instead of the undefined names.
docs = load_docs('./data/')
db_bge = vector_load(docs, key=None, endpoint=None)