<a href="https://colab.research.google.com/github/tractorjuice/MLOpsAIKB/blob/main/Building_MLOps_AI_Body_of_Knowledge_Part_4_Query_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MLOps AI Body of Knowledge Using Langchain & OpenAI
## Part 4, query the vector database using ChatGPT

This example shows how to create and query an internal knowledge base using ChatGPT.

This does not require a GPU runtime.

## Set Up


Mount Google Drive

In [None]:
# Mount Google Drive at /content/gdrive; the knowledge-base folders used below live under it.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os

# Folder layout of the knowledge base on Google Drive.
KB_FOLDER = "/content/gdrive/MyDrive/MLOpsKB"  # Google drive folder to save the knowledgebase
YT_DATASTORE = os.path.join(KB_FOLDER, "youtube/datastore")  # Sub-directory for YouTube FAISS datastore files
YT_AUDIO_FOLDER = os.path.join(KB_FOLDER, "youtube/audio")  # Sub-directory for audio files
TRANSCRIPTS_FOLDER = os.path.join(YT_AUDIO_FOLDER, "transcripts")  # Sub-directory for transcripts of audio files
TRANSCRIPTS_TEXT_FOLDER = os.path.join(TRANSCRIPTS_FOLDER, "text")  # Sub-directory for text of audio files
TRANSCRIPTS_WHISPER_FOLDER = os.path.join(TRANSCRIPTS_FOLDER, "whisper_chunks")  # Sub-directory for Whisper chunks of audio files

# Create every folder that does not exist yet. `exist_ok=True` makes the cell safe to
# re-run and avoids the check-then-create race of a separate os.path.exists() test.
# Parents are listed before children, although os.makedirs creates intermediate
# directories on its own.
for folder in (
    KB_FOLDER,
    YT_DATASTORE,
    YT_AUDIO_FOLDER,
    TRANSCRIPTS_FOLDER,
    TRANSCRIPTS_TEXT_FOLDER,
    TRANSCRIPTS_WHISPER_FOLDER,
):
    os.makedirs(folder, exist_ok=True)

Use Pinecone or FAISS for the Vector Database

In [None]:
!pip install -q langchain
!pip install -q openai
!pip install -q tiktoken

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
vectorstore = 'FAISS' # Set to 'Pinecone' or 'FAISS' for the vector database

In [None]:
# Install and import only the client for the vector database selected in `vectorstore`.
if vectorstore == 'Pinecone':
    !pip install -q pinecone-client
    from langchain.vectorstores import Pinecone
    from tqdm.autonotebook import tqdm
    import pinecone

    # Initialize Pinecone. Fill in your own key before running and avoid committing it.
    pinecone.init(
        api_key="",  # find at app.pinecone.io
        environment="us-west4-gcp-free"  # next to api key in console
        )

    index_name = "knowledge" # Put your Pinecone index name here
    name_space = "mlopskb" # Put your Pinecone namespace here

else:
    # Any value other than 'Pinecone' falls back to the local FAISS backend.
    !pip install -q faiss-cpu
    from langchain.vectorstores import FAISS


Set up OPENAI_API_KEY and the necessary variables

In [None]:
import os
from getpass import getpass

# Keep a key that is already present in the environment; otherwise ask for it
# interactively so the secret never has to be written into (and saved with) the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Model used by the chat LLM below. Uncomment exactly one alternative to switch;
# which of these are usable depends on your OpenAI account.
#MODEL = "gpt-3"
#MODEL = "gpt-3.5-turbo"
#MODEL = "gpt-3.5-turbo-0613"
#MODEL = "gpt-3.5-turbo-16k"
MODEL = "gpt-3.5-turbo-16k-0613"
#MODEL = "gpt-4"
#MODEL = "gpt-4-0613"
#MODEL = "gpt-4-32k-0613"

# Query using the vector store with ChatGPT integration

Set up access to the Pinecone or FAISS vector database

In [None]:
# Embeddings object handed to the vector store when it is opened in the next cell.
from langchain.embeddings.openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

In [None]:
if vectorstore == 'Pinecone':
    # Connect to the existing Pinecone index / namespace configured earlier.
    vector_store = Pinecone.from_existing_index(index_name, embeddings, namespace=name_space)

else:
    # Open the local FAISS datastore.
    from langchain.vectorstores import FAISS

    # The set-up cell always creates YT_DATASTORE, so testing that the folder exists
    # proves nothing; check for the index files themselves.
    required_files = ("index.faiss", "index.pkl")
    missing_files = [
        file_name for file_name in required_files
        if not os.path.isfile(os.path.join(YT_DATASTORE, file_name))
    ]
    if missing_files:
        # Stop here with a clear message instead of leaving `vector_store` undefined
        # and failing with a NameError when the chain is built.
        raise FileNotFoundError(
            f"Missing files: {', '.join(missing_files)}. "
            f"Upload index.faiss and index.pkl files to {YT_DATASTORE} first"
        )

    # NOTE: loading the datastore unpickles index.pkl, which can execute arbitrary
    # code — only load datastore files you created or otherwise trust.
    vector_store = FAISS.load_local(YT_DATASTORE, embeddings)


Set up the prompt

In [None]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# System message: sets the bot's persona and tone, tells it not to invent answers,
# and asks for a trailing "SOURCES: ..." line.
# {summaries} is a template placeholder; presumably the chain fills it with the
# retrieved context — verify against the chain's expected variable name.
system_template="""
    You are MLOpsGPT a mlops specialist bot.
    You use examples from MLOps in your answers.
    Your language should be for an 12 year old to understand.
    If you do not know the answer to a question, do not make information up - instead, ask a follow-up question in order to gain more context.
    Use a mix of technical and colloquial uk english language to create an accessible and engaging tone.
    Use the following pieces of context to answer the users question.
    Take note of the sources and include them in the answer in the format: "SOURCES: source1 source2", use "SOURCES" in capital letters regardless of the number of sources.
----------------
{summaries}
"""
# Chat prompt = the system instructions followed by the user's {question}.
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}")
]
prompt = ChatPromptTemplate.from_messages(messages)

Initialise the LLM API

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

# Hand the custom chat prompt to the chain through its chain-type keyword arguments.
chain_type_kwargs = {"prompt": prompt}
llm = ChatOpenAI(model_name=MODEL, temperature=0)  # Change MODEL above to switch models (e.g. GPT-4, if you have access)
# Question-answering chain backed by the vector store's retriever.
# return_source_documents=True is set so the query cells below can read
# result['source_documents'] and print where each answer came from.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

#### Use the chain to query

Print the sources so we can find the YouTube videos

In [None]:
# Run the chain on the question; `result` is read by the next cell.
query = "How can I learn about MLOps?"
result = chain(query)

In [None]:
# Echo the question and the model's answer.
for key in ('question', 'answer'):
    print(result[key])

# List every retrieved chunk with a link that opens the video at the chunk's timestamp.
source_documents = result['source_documents']
for index, document in enumerate(source_documents):
    source_lines = [
        f"\nSource {index + 1}:",
        f"Video title: {document.metadata['title']}",
        f"Video author: {document.metadata['author']}",
        f"Source video: https://youtu.be/{document.metadata['source_url']}?t={int(document.metadata['source'])}",
        f"Content: {document.page_content}",
    ]
    print("\n".join(source_lines))

How can I learn about MLOps?
If you're interested in learning about MLOps, there are several ways you can get started. Here are a few suggestions:

1. Online Courses: There are many online courses available that can teach you the fundamentals of MLOps. Platforms like Coursera, Udemy, and edX offer courses specifically focused on MLOps. These courses usually cover topics such as deploying machine learning models, managing data pipelines, and monitoring model performance.

2. Books and Blogs: There are several books and blogs written by experts in the field of MLOps that can provide you with valuable insights. Some popular books include "Building Machine Learning Powered Applications" by Emmanuel Ameisen and "MLOps: Continuous Delivery and Automation Pipelines in Machine Learning" by Mark Treveil. Blogs like Towards Data Science and Medium also have a wealth of articles on MLOps.

3. MLOps Community: Engaging with the MLOps community can be a great way to learn from others and stay updat

In [None]:
# Run the chain on the question; `result` is read by the next cell.
query = "what are the key components of MLOps?"
result = chain(query)

In [None]:
# Echo the question and the model's answer.
for key in ('question', 'answer'):
    print(result[key])

# List every retrieved chunk with a link that opens the video at the chunk's timestamp.
source_documents = result['source_documents']
for index, document in enumerate(source_documents):
    source_lines = [
        f"\nSource {index + 1}:",
        f"Video title: {document.metadata['title']}",
        f"Video author: {document.metadata['author']}",
        f"Source video: https://youtu.be/{document.metadata['source_url']}?t={int(document.metadata['source'])}",
        f"Content: {document.page_content}",
    ]
    print("\n".join(source_lines))

what are the key components of MLOps?
The key components of MLOps are:

1. Data Management: This involves collecting, storing, and organizing the data that is used to train and test machine learning models. Good data management ensures that the data is accurate, complete, and properly labeled.

2. Model Development: This is the process of creating and fine-tuning machine learning models. It involves selecting the right algorithms, training the models on the data, and evaluating their performance.

3. Model Deployment: Once a machine learning model is developed, it needs to be deployed into a production environment where it can be used to make predictions or automate tasks. This involves setting up the necessary infrastructure, such as servers or cloud platforms, and integrating the model into existing systems.

4. Monitoring and Maintenance: After a model is deployed, it needs to be continuously monitored to ensure that it is performing as expected. This includes monitoring its accurac

In [None]:
# Run the chain on the question; `result` is read by the next cell.
query = "Who leads the MLOps Community?"
result = chain(query)

In [None]:
# Echo the question and the model's answer.
for key in ('question', 'answer'):
    print(result[key])

# List every retrieved chunk with a link that opens the video at the chunk's timestamp.
source_documents = result['source_documents']
for index, document in enumerate(source_documents):
    source_lines = [
        f"\nSource {index + 1}:",
        f"Video title: {document.metadata['title']}",
        f"Video author: {document.metadata['author']}",
        f"Source video: https://youtu.be/{document.metadata['source_url']}?t={int(document.metadata['source'])}",
        f"Content: {document.page_content}",
    ]
    print("\n".join(source_lines))

Who leads the MLOps Community?
The MLOps Community is a collaborative community, so there isn't a single person who leads it. Instead, it is led by a group of passionate individuals who are experts in the field of MLOps. They work together to organize events, share knowledge, and facilitate discussions within the community. You can find these leaders actively participating in various platforms such as the MLOps Community Slack, where they help answer questions and provide guidance to members. They also contribute to the MLOps Community podcast, where they share insights and interview industry experts. So, it's a collective effort by many dedicated individuals who are passionate about advancing MLOps practices. (

Source 1:
Video title: Using LLMs to Punch Above your Weight! // Cameron Feenstra // LLMs in Production Conference Talk
Video author: MLOps.community
Source video: https://youtu.be/1_NTxx3CJXg?t=1859
Content: in the MLOps community,

Source 2:
Video title: $360k Question - Und