<a href="https://colab.research.google.com/github/tractorjuice/Building_BoK/blob/main/Building_Wardley_Mapping_Body_of_Knowledge_Part_15_Query_Podcast_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Wardley Mapping Body of Knowledge Using Langchain & OpenAI
## Part 15, query the podcast vector database using ChatGPT

This example shows how to create and query an internal knowledge base using ChatGPT.

This does not require a GPU runtime.

## Set Up


Mount Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the knowledge-base folder configured in the
# next cell (KB_FOLDER) lives under its MyDrive directory.
drive.mount('/content/gdrive')

In [None]:
import os

# Root of the knowledge base on the mounted Google Drive.
KB_FOLDER = "/content/gdrive/MyDrive/AI/WardleyKB"

# Wardley map research files and their FAISS datastore.
MAPS = os.path.join(KB_FOLDER, "maps/research2022")
MAPS_DATASTORE = os.path.join(KB_FOLDER, "maps/datastore")

# YouTube material: datastore, audio, and the transcripts derived from the audio.
YT = os.path.join(KB_FOLDER, "youtube")
YT_DATASTORE = os.path.join(YT, "datastore")
YT_AUDIO = os.path.join(YT, "audio")
YT_TRANSCRIPTS = os.path.join(YT_AUDIO, "transcripts")
YT_TRANSCRIPTS_TEXT = os.path.join(YT_TRANSCRIPTS, "full_text")
YT_TRANSCRIPTS_WHISPER = os.path.join(YT_TRANSCRIPTS, "whisper_chunks")
YT_TRANSCRIPTS_DATASTORE = os.path.join(YT_TRANSCRIPTS, "datastore")

# Podcast material; PODCAST_DATASTORE is the FAISS datastore queried later in this notebook.
PODCAST = os.path.join(KB_FOLDER, "podcast")
PODCAST_DATASTORE = os.path.join(PODCAST, "datastore")
PODCAST_AUDIO = os.path.join(PODCAST, "audio")
PODCAST_TRANSCRIPTS = os.path.join(PODCAST, "transcripts")

# Book material: datastore and the files of the pages from the Wardley book.
BOOKS = os.path.join(KB_FOLDER, "books")
BOOKS_DATASTORE = os.path.join(BOOKS, "datastore")
BOOK = os.path.join(BOOKS, "book")

# Create every working directory that does not exist yet. exist_ok=True makes the cell
# safe to re-run and avoids the check-then-create race of os.path.exists + os.makedirs.
# YT is created implicitly as the parent of YT_DATASTORE; YT_TRANSCRIPTS_DATASTORE is
# defined above but not created by this cell.
for _directory in (
    KB_FOLDER,
    MAPS,
    MAPS_DATASTORE,
    YT_DATASTORE,
    BOOKS_DATASTORE,
    YT_AUDIO,
    YT_TRANSCRIPTS,
    YT_TRANSCRIPTS_TEXT,
    YT_TRANSCRIPTS_WHISPER,
    BOOKS,
    BOOK,
    PODCAST,
    PODCAST_DATASTORE,
    PODCAST_AUDIO,
    PODCAST_TRANSCRIPTS,
):
    os.makedirs(_directory, exist_ok=True)

Use Pinecone or FAISS for the Vector Database

In [None]:
# Vector database backend used by the rest of the notebook: 'Pinecone' or 'FAISS'.
vectorstore = "FAISS"

In [None]:
!pip install -q langchain
!pip install -q openai
!pip install -q tiktoken

In [None]:
if vectorstore == 'Pinecone':
    !pip install -q pinecone-client
    from langchain.vectorstores import Pinecone
    from tqdm.autonotebook import tqdm
    import pinecone

    # initialize pinecone
    pinecone.init(
        api_key="",  # find at app.pinecone.io
        environment="us-west4-gcp-free"  # next to api key in console
        )

    index_name = "knowledge" # Put your Pincecone index name here
    name_space = "wardleykb" # Put your Pincecone namespace here

else:
    !pip install -q faiss-cpu
    from langchain.vectorstores import FAISS


Set up OPENAI_API_KEY and the other necessary variables

In [None]:
import os
from getpass import getpass

# Do not store the key in the notebook. Keep an OPENAI_API_KEY that is already set in the
# environment (the previous code overwrote it with an empty string); otherwise prompt
# for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Name of the chat model passed to ChatOpenAI further down.
# Alternatives that can be swapped in here:
#   "gpt-3", "gpt-3.5-turbo", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k",
#   "gpt-4", "gpt-4-0613", "gpt-4-32k-0613"
MODEL = "gpt-3.5-turbo-16k-0613"

# Query using the vector store with ChatGPT integration

Set up access to the Pinecone or FAISS vector database

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding client; it is handed to Pinecone.from_existing_index in the vector-store cell.
embeddings = OpenAIEmbeddings()

In [None]:
if vectorstore == 'Pinecone':
    # Attach to the existing Pinecone index and namespace chosen during set-up.
    vector_store = Pinecone.from_existing_index(index_name, embeddings, namespace=name_space)

else:
    # Open FAISS datastore
    from langchain.vectorstores import FAISS

    # The set-up cell always creates the PODCAST_DATASTORE directory, so testing for the
    # directory proves nothing; look for the two index files themselves.
    required_files = ("index.faiss", "index.pkl")
    missing_files = [
        file_name
        for file_name in required_files
        if not os.path.isfile(os.path.join(PODCAST_DATASTORE, file_name))
    ]
    if missing_files:
        # Fail here with a clear message instead of printing and leaving vector_store
        # undefined, which would surface later as a confusing NameError.
        raise FileNotFoundError(
            f"Missing {', '.join(missing_files)} in {PODCAST_DATASTORE}. "
            "Upload index.faiss and index.pkl files to the datastore directory first."
        )

    # NOTE(review): load_local unpickles index.pkl, so only load datastores you created
    # yourself. Reuse the shared `embeddings` client rather than building a second one.
    vector_store = FAISS.load_local(PODCAST_DATASTORE, embeddings)

Set up the prompt

In [None]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# System prompt: persona and answer rules, followed by the {summaries} placeholder that is
# filled in when the chain runs (presumably with the retrieved context — confirm against
# the chain's document variable name).
system_template = """
    You are an Data Mesh researcher based in the UK with well over twenty years research in Data arcitecture, DataOps and Data Mesh.
    You use examples from Data Mesh in your answers.
    Your language should be for a 12 year old to understand.
    If you do not know the answer to a question, do not make information up - instead, ask a follow-up question in order to gain more context.
    Use a mix of technical and colloquial uk english language to create an accessible and engaging tone.
    Use the following pieces of context to answer the users question.
    Take note of the sources and include them in the answer in the format: "SOURCES: source1 source2", use "SOURCES" in capital letters regardless of the number of sources.
----------------
{summaries}
"""

# The chat prompt is the system message above followed by the user's question.
system_message = SystemMessagePromptTemplate.from_template(system_template)
human_message = HumanMessagePromptTemplate.from_template("{question}")
messages = [system_message, human_message]
prompt = ChatPromptTemplate.from_messages(messages)

Initialise the LLM API

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

# Chat model that writes the answers; change MODEL if you have access to GPT-4.
llm = ChatOpenAI(model_name=MODEL, temperature=0)

# Retrieve with MMR search, asking for k=10 documents per query.
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 10})

# "stuff" chain using the custom chat prompt; return_source_documents=True because the
# result cells read result['source_documents'].
chain_type_kwargs = {"prompt": prompt}
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)

#### Use the chain to query

In [None]:
# Run the chain on a sample question; the result cell below reads the 'question',
# 'answer' and 'source_documents' entries of `result`.
query = "what are the key concepts of a datamesh?"
result = chain(query)

Print the sources so we can find the Google Podcasts

In [None]:
# Echo the question, then the generated answer.
print(result['question'])
print(result['answer'])

# Common prefix of the episode links on Google Podcasts for this feed.
episode_base_url = "https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5jYXB0aXZhdGUuZm0vZGF0YS1tZXNoLXJhZGlv/episode/"

# List each retrieved document with a link back to its episode and its text.
source_documents = result['source_documents']
for position, document in enumerate(source_documents, start=1):
    print(f"\nSource {position}:")
    metadata = document.metadata
    # The file name without its extension identifies the episode in the link.
    episode_id, _extension = os.path.splitext(metadata['source_url'])
    # metadata['source'] becomes the `t` query parameter (presumably a start offset — confirm).
    offset = int(metadata['source'])
    print(f"Source podcast: {episode_base_url}{episode_id}?t={offset}")
    print(f"Content: {document.page_content}")

In [None]:
# Run the chain on the next sample question; `result` is replaced and printed by the following cell.
query = "what does a typical datamesh instructure look like?"
result = chain(query)

In [None]:
# Echo the question, then the generated answer.
print(result['question'])
print(result['answer'])

# Common prefix of the episode links on Google Podcasts for this feed.
episode_base_url = "https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5jYXB0aXZhdGUuZm0vZGF0YS1tZXNoLXJhZGlv/episode/"

# List each retrieved document with a link back to its episode and its text.
source_documents = result['source_documents']
for position, document in enumerate(source_documents, start=1):
    print(f"\nSource {position}:")
    metadata = document.metadata
    # The file name without its extension identifies the episode in the link.
    episode_id, _extension = os.path.splitext(metadata['source_url'])
    # metadata['source'] becomes the `t` query parameter (presumably a start offset — confirm).
    offset = int(metadata['source'])
    print(f"Source podcast: {episode_base_url}{episode_id}?t={offset}")
    print(f"Content: {document.page_content}")

In [None]:
# Run the chain on the next sample question; `result` is replaced and printed by the following cell.
query = "what is domain-oriented decentralized data ownership?"
result = chain(query)

In [None]:
# Echo the question, then the generated answer.
print(result['question'])
print(result['answer'])

# Common prefix of the episode links on Google Podcasts for this feed.
episode_base_url = "https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5jYXB0aXZhdGUuZm0vZGF0YS1tZXNoLXJhZGlv/episode/"

# List each retrieved document with a link back to its episode and its text.
source_documents = result['source_documents']
for position, document in enumerate(source_documents, start=1):
    print(f"\nSource {position}:")
    metadata = document.metadata
    # The file name without its extension identifies the episode in the link.
    episode_id, _extension = os.path.splitext(metadata['source_url'])
    # metadata['source'] becomes the `t` query parameter (presumably a start offset — confirm).
    offset = int(metadata['source'])
    print(f"Source podcast: {episode_base_url}{episode_id}?t={offset}")
    print(f"Content: {document.page_content}")

In [None]:
# Run the chain on the last sample question; `result` is replaced and printed by the following cell.
query = "what tools enable control and autonomy over data?"
result = chain(query)

In [None]:
# Echo the question, then the generated answer.
print(result['question'])
print(result['answer'])

# Common prefix of the episode links on Google Podcasts for this feed.
episode_base_url = "https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5jYXB0aXZhdGUuZm0vZGF0YS1tZXNoLXJhZGlv/episode/"

# List each retrieved document with a link back to its episode and its text.
source_documents = result['source_documents']
for position, document in enumerate(source_documents, start=1):
    print(f"\nSource {position}:")
    metadata = document.metadata
    # The file name without its extension identifies the episode in the link.
    episode_id, _extension = os.path.splitext(metadata['source_url'])
    # metadata['source'] becomes the `t` query parameter (presumably a start offset — confirm).
    offset = int(metadata['source'])
    print(f"Source podcast: {episode_base_url}{episode_id}?t={offset}")
    print(f"Content: {document.page_content}")