<a href="https://colab.research.google.com/github/tractorjuice/MLOpsAIKB/blob/main/Building_MLOps_AI_Body_of_Knowledge_Part_4_Query_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MLOps AI Body of Knowledge Using Langchain & OpenAI
## Part 4, query the vector database using ChatGPT

This example shows how to create and query an internal knowledge base using ChatGPT.

This does not require a GPU runtime.

## Set Up


Mount Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
# The knowledge-base folders configured in the next cell live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import os

# Folder layout of the knowledge base on the mounted Google Drive.
KB_FOLDER = "/content/gdrive/MyDrive/MLOpsKB"  # Google Drive folder to save the knowledge base
YT_DATASTORE = os.path.join(KB_FOLDER, "youtube/datastore")  # Sub-directory for YouTube FAISS datastore files
YT_AUDIO_FOLDER = os.path.join(KB_FOLDER, "youtube/audio")  # Sub-directory for audio files
TRANSCRIPTS_FOLDER = os.path.join(YT_AUDIO_FOLDER, "transcripts")  # Sub-directory for transcripts of audio files
TRANSCRIPTS_TEXT_FOLDER = os.path.join(TRANSCRIPTS_FOLDER, "text")  # Sub-directory for text of audio files
TRANSCRIPTS_WHISPER_FOLDER = os.path.join(TRANSCRIPTS_FOLDER, "whisper_chunks")  # Sub-directory for Whisper chunks of audio files

# Create every folder that does not exist yet.
# exist_ok=True keeps the cell safe to re-run and avoids the separate
# "check, then create" step; makedirs also creates missing parent folders.
for folder in (
    KB_FOLDER,
    YT_DATASTORE,
    YT_AUDIO_FOLDER,
    TRANSCRIPTS_FOLDER,
    TRANSCRIPTS_TEXT_FOLDER,
    TRANSCRIPTS_WHISPER_FOLDER,
):
    os.makedirs(folder, exist_ok=True)

Load required dependencies

In [None]:
!pip install -q langchain
!pip install -q openai
!pip install -q tiktoken

Use Pinecone or FAISS for the Vector Database

In [None]:
# Name of the vector database backend; the cells below branch on this value.
vectorstore = 'FAISS' # Set to 'Pinecone' or 'FAISS' for the vector database

In [None]:
if vectorstore == 'Pinecone':
    !pip install -q pinecone-client
    from langchain.vectorstores import Pinecone
    from tqdm.autonotebook import tqdm
    import pinecone

    # initialize pinecone
    pinecone.init(
        api_key="",  # find at app.pinecone.io
        environment="us-west4-gcp-free"  # next to api key in console
        )

    index_name = "knowledge" # Put your Pincecone index name here
    name_space = "mlopskb" # Put your Pincecone namespace here

else:
    !pip install -q faiss-cpu
    from langchain.vectorstores import FAISS


Set up OPENAI_API_KEY and necessary variables

In [None]:
import os
from getpass import getpass

# Never store the key in the notebook. Reuse OPENAI_API_KEY when it is already
# set in the environment; otherwise prompt for it (getpass does not echo input).
# Assigning an empty string here would overwrite a key that was already set.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Chat model used to answer the queries; uncomment exactly one line to switch.
#MODEL = "gpt-3"
#MODEL = "gpt-3.5-turbo"
#MODEL = "gpt-3.5-turbo-0613"
#MODEL = "gpt-3.5-turbo-16k"
MODEL = "gpt-3.5-turbo-16k-0613"
#MODEL = "gpt-4"
#MODEL = "gpt-4-0613"
#MODEL = "gpt-4-32k-0613"

# Query using the vector store with ChatGPT integration

Set up access to the Pinecone or FAISS vector database

In [None]:
# Embedding model handed to the vector store in the next cell.
# OpenAIEmbeddings() is built without arguments; presumably it picks up
# OPENAI_API_KEY from the environment — TODO confirm.
from langchain.embeddings.openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

In [None]:
# Open the vector database selected by `vectorstore` and bind it to `vector_store`.
if vectorstore == 'Pinecone':
    # Connect to the existing Pinecone index and namespace configured earlier.
    vector_store = Pinecone.from_existing_index(index_name, embeddings, namespace=name_space)

else:
    # Open datastore
    from langchain.vectorstores import FAISS

    # The datastore folder itself is created by the set-up cell, so its existence
    # says nothing about the index; check for the two index files instead.
    required_files = ("index.faiss", "index.pkl")
    missing_files = [
        name for name in required_files
        if not os.path.isfile(os.path.join(YT_DATASTORE, name))
    ]
    if missing_files:
        # Fail here with a clear message instead of leaving vector_store
        # undefined, which would only surface later as a NameError.
        raise FileNotFoundError(
            f"Missing files: {', '.join(missing_files)}. "
            f"Upload index.faiss and index.pkl files to {YT_DATASTORE} first"
        )

    # NOTE(review): index.pkl is presumably unpickled on load — only use
    # datastore files you created yourself or otherwise trust.
    # Reuse the shared `embeddings` object rather than building a second one.
    vector_store = FAISS.load_local(YT_DATASTORE, embeddings)


Set up the prompt

In [None]:
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

# System instructions for the assistant. The {summaries} placeholder is left
# for the chain to fill in; the text is sent to the model as written.
system_template = """
    You are MLOpsGPT a mlops specialist bot.
    You use examples from MLOps in your answers.
    Your language should be for an 12 year old to understand.
    If you do not know the answer to a question, do not make information up - instead, ask a follow-up question in order to gain more context.
    Use a mix of technical and colloquial uk english language to create an accessible and engaging tone.
    Use the following pieces of context to answer the users question.
    Take note of the sources and include them in the answer in the format: "SOURCES: source1 source2", use "SOURCES" in capital letters regardless of the number of sources.
----------------
{summaries}
"""

# Chat prompt: the system instructions first, then the user's question.
system_message = SystemMessagePromptTemplate.from_template(system_template)
human_message = HumanMessagePromptTemplate.from_template("{question}")
messages = [system_message, human_message]
prompt = ChatPromptTemplate.from_messages(messages)

Initialise the LLM API

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

RETRIEVER_TOP_K = 3  # Number of source chunks retrieved for each question

chain_type_kwargs = {"prompt": prompt}
llm = ChatOpenAI(model_name=MODEL, temperature=0)  # Change MODEL in the set-up cell to use another model, e.g. GPT-4

# Use MMR search and return RETRIEVER_TOP_K sources per question.
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": RETRIEVER_TOP_K})

chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,  # keep the retrieved documents in the result so their metadata can be printed
    chain_type_kwargs=chain_type_kwargs
)

#### Use the chain to query

Print the sources so we can find the YouTube videos

In [None]:
query = "How can I learn about MLOps?"
result = chain(query)

In [None]:
print(result['question'])
print(result['answer'])

source_documents = result['source_documents']
for index, document in enumerate(source_documents):
    print(f"\nSource {index + 1}:")
    print(f"Video title: {document.metadata['title']}")
    print(f"Video author: {document.metadata['author']}")
    print(f"Source video: https://youtu.be/{document.metadata['source_url']}?t={int(document.metadata['source'])}")
    print(f"Content: {document.page_content}")

In [None]:
query = "what are the key components of MLOps?"
result = chain(query)

In [None]:
print(result['question'])
print(result['answer'])

source_documents = result['source_documents']
for index, document in enumerate(source_documents):
    print(f"\nSource {index + 1}:")
    print(f"Video title: {document.metadata['title']}")
    print(f"Video author: {document.metadata['author']}")
    print(f"Source video: https://youtu.be/{document.metadata['source_url']}?t={int(document.metadata['source'])}")
    print(f"Content: {document.page_content}")

In [None]:
query = "Who leads the MLOps Community?"
result = chain(query)

In [None]:
print(result['question'])
print(result['answer'])

source_documents = result['source_documents']
for index, document in enumerate(source_documents):
    print(f"\nSource {index + 1}:")
    print(f"Video title: {document.metadata['title']}")
    print(f"Video author: {document.metadata['author']}")
    print(f"Source video: https://youtu.be/{document.metadata['source_url']}?t={int(document.metadata['source'])}")
    print(f"Content: {document.page_content}")