<a href="https://colab.research.google.com/github/tractorjuice/Building_BoK/blob/main/Building_Wardley_Mapping_Body_of_Knowledge_Part_8_GitHub_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Wardley Mapping Body of Knowledge Using Langchain & OpenAI
## Part 8, adding open Wardley Maps to the body of knowledge
This example shows how to create and query an internal knowledge base using ChatGPT.

This does not require a GPU runtime.

## Set Up


Mount Google Drive for data storage

In [None]:
# Mount Google Drive at /content/gdrive (Colab only); the folder constants
# defined in the next cell all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Set up the file structure

In [None]:
import os

# Root folder on the mounted Google Drive; every other folder below is nested inside it
DOCS_FOLDER = "/content/gdrive/MyDrive/WardleyKB"
MAPS_FOLDER = os.path.join(DOCS_FOLDER, "maps/research2022")  # Wardley Map files downloaded from GitHub
DATASTORE = os.path.join(DOCS_FOLDER, "maps/datastore")  # Saved FAISS index files (index.faiss / index.pkl)
# The audio/transcript folders are not referenced by the other cells shown in this
# notebook; they are still created so the on-disk layout stays unchanged.
AUDIO_FOLDER = os.path.join(DOCS_FOLDER, "audio")  # Sub-directory for audio files
TRANSCRIPTS_FOLDER = os.path.join(AUDIO_FOLDER, "transcripts")  # Sub-directory for transcripts
TRANSCRIPTS_TEXT_FOLDER = os.path.join(TRANSCRIPTS_FOLDER, "text")  # Sub-directory for text transcripts
TRANSCRIPTS_WHISPER_FOLDER = os.path.join(TRANSCRIPTS_FOLDER, "whisper_chunks")  # Sub-directory for whisper chunks

# Create every folder that does not exist yet. exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of a separate os.path.exists() test.
for folder in (
    DOCS_FOLDER,
    MAPS_FOLDER,
    DATASTORE,
    AUDIO_FOLDER,
    TRANSCRIPTS_FOLDER,
    TRANSCRIPTS_TEXT_FOLDER,
    TRANSCRIPTS_WHISPER_FOLDER,
):
    os.makedirs(folder, exist_ok=True)

Install required dependencies

In [None]:
# Install the packages imported later in this notebook (langchain, openai, tiktoken).
# NOTE(review): versions are not pinned, so a fresh install may pull releases whose
# import paths differ from the ones used below - consider pinning.
!pip install -q langchain
!pip install -q openai
!pip install -q tiktoken

Set up the required API keys

In [None]:
import os
from getpass import getpass

# Ask for the key interactively instead of pasting it into the notebook, and only
# when it is not already present, so a key exported in the environment is not
# overwritten with an empty string.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Chat model passed to ChatOpenAI further down; leave exactly one line uncommented.
#MODEL = "gpt-3"
#MODEL = "gpt-3.5-turbo"
#MODEL = "gpt-3.5-turbo-0613"
#MODEL = "gpt-3.5-turbo-16k"
MODEL = "gpt-3.5-turbo-16k-0613"
#MODEL = "gpt-4"
#MODEL = "gpt-4-0613"
#MODEL = "gpt-4-32k-0613"

Install required dependencies for GitHub

In [None]:
# PyGithub provides the `github` module imported in the next cell
!pip -q install PyGithub

In [None]:
from github import Github
import base64

Define the required GitHub repo and set up the GitHub API key

In [None]:
# Repository (owner/name) that is walked for map files, and the token handed to the
# PyGithub client in the next section.
# NOTE(review): avoid committing a real token in this cell.
GITHUBREPO = "swardley/Research2022" # Source of Wardley Maps
GITHUB = "" # Put your GitHub API key here

## Wardley Map Data Collection

In [None]:
# Initiate GitHub repository

# Create the PyGithub client from the token in GITHUB, then get a handle on the
# repository named by GITHUBREPO; `repo` is used by the listing and download cells.
g = Github(GITHUB)
repo = g.get_repo(GITHUBREPO)

In [None]:
# Walk the repository breadth-first and collect the path of every map file

file_list = []
contents = repo.get_contents("")
while contents:
    entry = contents.pop(0)

    # A directory is expanded: its children are queued behind the pending entries
    if entry.type == "dir":
        contents.extend(repo.get_contents(entry.path))
        continue

    # Files that are not maps: all-uppercase names, dot-files and README.md (any case)
    name = entry.name
    is_excluded = name.isupper() or name.startswith('.') or name.lower() == 'readme.md'
    if not is_excluded:
        file_list.append(entry.path)

print (file_list)

In [None]:
# Get the files and save them to the Google Drive

for file in file_list:
    maps_filename = f'{MAPS_FOLDER}/{file}.owm'

    # Check the local copy first so that re-running the cell does not spend a GitHub
    # API call on a map that has already been saved.
    if os.path.isfile(maps_filename):
        print (f"File already exists: {maps_filename}")
        continue

    # The API returns the file body base64-encoded; decode it to text
    file_item = repo.get_contents(file)
    file_content = base64.b64decode(file_item.content).decode('utf-8')

    # Repository sub-directories are mirrored under MAPS_FOLDER
    os.makedirs(os.path.dirname(maps_filename), exist_ok=True)

    # Write with the same encoding the content was decoded with, independent of the
    # platform default.
    with open(maps_filename, 'w', encoding='utf-8') as f:
        f.write(file_content)
    print (file)

## Split text and upsert the maps into Pinecone or FAISS vector database

Initialise preferred vector database

In [None]:
# Selects the vector database backend. The cells below only test for 'Pinecone';
# any other value takes the FAISS branch.
vectorstore = 'FAISS' # Set to 'Pinecone' or 'FAISS' for the vector database. If using FAISS, no GPU required

In [None]:
# Install and import only the client for the backend chosen in `vectorstore`
if vectorstore == 'Pinecone':
    !pip install -q pinecone-client
    from langchain.vectorstores import Pinecone
    from tqdm.auto import tqdm  # not used by the other cells shown in this notebook
    import pinecone

    # Initialize Pinecone
    pinecone.init(
        api_key="",  # find at app.pinecone.io
        environment="us-west4-gcp-free"  # next to api key in console
        )

    # Both names are passed to the Pinecone calls in the upsert and query cells
    index_name = "knowledge" # Put your Pinecone index name here
    name_space = "researchmaps" # Put your Pinecone namespace here

else:
    !pip install -q faiss-cpu
    from langchain.vectorstores import FAISS


Import the required libraries

In [None]:
#import json
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
import tiktoken

### Walk through files and upsert into preferred vector database

In [None]:
# Split every saved map into chunks, embed the chunks and add them to the selected
# vector database, one map at a time.

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20, separator="\n")
embeddings = OpenAIEmbeddings()

total_maps = len(file_list)

for counter, files in enumerate(file_list, start=1):
    maps_filename = f'{MAPS_FOLDER}/{files}.owm'

    # Open the file and read its content (the download cell decodes the maps as UTF-8)
    with open(maps_filename, 'r', encoding='utf-8') as f:
        file_content = f.read()

    print(f'\n{counter} of {total_maps}: Loading {maps_filename} ......')

    docs = text_splitter.split_text(file_content)
    if not docs:
        # Nothing to embed for this map
        print("No text to upsert, skipping\n")
        continue

    # One independent metadata dict per chunk; `[d] * n` would alias a single dict
    metadatas = [{"source": maps_filename} for _ in docs]

    # Upsert one map at a time, this handles errors with OpenAI API better.
    # `except Exception` (rather than a bare except) keeps the best-effort behaviour
    # but still lets KeyboardInterrupt stop the loop, and the cause is reported.
    try:
        if vectorstore == 'Pinecone':
            vector_store = Pinecone.from_texts(docs, embeddings, metadatas=metadatas, index_name=index_name, namespace=name_space)
        else:
            vector_store = FAISS.from_texts(docs, embeddings, metadatas=metadatas)
            if os.path.exists(f"{DATASTORE}/index.faiss"):
                # Merge this map into the index already saved on disk
                existing_index = FAISS.load_local(f"{DATASTORE}", embeddings)
                existing_index.merge_from(vector_store)
                existing_index.save_local(f"{DATASTORE}")
            else:
                # First map: this creates index.faiss and index.pkl in DATASTORE
                vector_store.save_local(f"{DATASTORE}")
    except Exception as error:
        print(f"Error upserting data into the vectorstore: {error}\n")



### Query using the vector store with ChatGPT integration

Set up access to the preferred vector database

In [None]:
# Embedding client handed to the vector store in the next cell. The upsert cell
# creates the same object; it is repeated here, presumably so that the query
# section can be run without re-running the upsert.
from langchain.embeddings.openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

In [None]:
if vectorstore == 'Pinecone':
    # Connect to the index and namespace configured when Pinecone was initialised
    vector_store = Pinecone.from_existing_index(index_name, embeddings, namespace=name_space)

else:
    # Open FAISS datastore
    from langchain.vectorstores import FAISS

    # The DATASTORE directory itself is created by the setup cell, so its existence
    # says nothing about the index; test for the index files instead.
    missing = [
        name for name in ("index.faiss", "index.pkl")
        if not os.path.exists(os.path.join(DATASTORE, name))
    ]
    if missing:
        # Fail here rather than continue with an undefined (or stale) vector_store
        raise FileNotFoundError(
            f"Missing files {missing}. Upload index.faiss and index.pkl files to {DATASTORE} first"
        )

    vector_store = FAISS.load_local(f"{DATASTORE}", embeddings)

## Example Queries (Q&A)

Set up the prompt

In [None]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# System message: fixes the persona and tone of the answers. `{summaries}` is a
# template placeholder - presumably filled with the retrieved map chunks by the
# chain built in the next cell (confirm against the chain's expected variable name).
system_template="""
    You are SimonGPT a strategy researcher based in the UK.
    “Researcher” means in the style of a strategy researcher with well over twenty years research in strategy, wardley mapping and cloud computing.
    You use examples from Wardley Mapping in your answers.
    Your language should be for an 12 year old to understand.
    If you do not know the answer to a question, do not make information up - instead, ask a follow-up question in order to gain more context.
    Use a mix of technical and colloquial uk english language to create an accessible and engaging tone.
    Use the following pieces of context to answer the users question.
    ----------
    {summaries}
    """
# Chat prompt = the system message above followed by the user's question
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}")
]
prompt = ChatPromptTemplate.from_messages(messages)

Initialise the LLM API

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

# Hand the custom chat prompt to the chain in place of its default prompt
chain_type_kwargs = {"prompt": prompt}
llm = ChatOpenAI(model_name=MODEL, temperature=0, max_tokens=1000)  # Modify model_name if you have access to GPT-4
# Question-answering chain over the vector store opened above.
# return_source_documents=True makes result['source_documents'] available to the
# printing cells that follow.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

Example question

In [None]:
# Run the chain on a sample question; `result` is read by the printing cell below
query = "how is AI used in these maps?"
result = chain(query)

Print the answer and sources

In [None]:
# Show the question and the answer, then the source file of each retrieved chunk
print('Question:', result['question'])
print('Answer:  ', result['answer'],'\n')

for index, doc in enumerate(result['source_documents']):
    # Chunks without a 'source' entry are skipped but still counted in the numbering
    if 'source' not in doc.metadata:
        continue
    print(f"Source {index + 1}:", doc.metadata['source'])

In [None]:
# Run the chain on a sample question; `result` is read by the printing cell below
query = "Are there any common components across these maps?"
result = chain(query)

In [None]:
# Show the question and the answer, then the source file of each retrieved chunk
print('Question:', result['question'])
print('Answer:  ', result['answer'],'\n')

for index, doc in enumerate(result['source_documents']):
    # Chunks without a 'source' entry are skipped but still counted in the numbering
    if 'source' not in doc.metadata:
        continue
    print(f"Source {index + 1}:", doc.metadata['source'])

In [None]:
# Run the chain on a sample question; `result` is read by the printing cell below
query = "What are the opportunities for SMEs within these maps?"
result = chain(query)

In [None]:
# Show the question and the answer, then the source file of each retrieved chunk
print('Question:', result['question'])
print('Answer:  ', result['answer'],'\n')

for index, doc in enumerate(result['source_documents']):
    # Chunks without a 'source' entry are skipped but still counted in the numbering
    if 'source' not in doc.metadata:
        continue
    print(f"Source {index + 1}:", doc.metadata['source'])

In [None]:
# Run the chain on a sample question; `result` is read by the printing cell below
query = "I want to build a component that can be used by all of these maps, what should I create?"
result = chain(query)

In [None]:
# Show the question and the answer, then the source file of each retrieved chunk
print('Question:', result['question'])
print('Answer:  ', result['answer'],'\n')

for index, doc in enumerate(result['source_documents']):
    # Chunks without a 'source' entry are skipped but still counted in the numbering
    if 'source' not in doc.metadata:
        continue
    print(f"Source {index + 1}:", doc.metadata['source'])

In [None]:
# Delete the Pinecone namespace if required

#vector_store.delete(delete_all=True, namespace=name_space)