<a href="https://colab.research.google.com/github/tractorjuice/Building_BoK/blob/main/Building_Wardley_Mapping_Body_of_Knowledge_Part_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Wardley Mapping Body of Knowledge Using Langchain & OpenAI
## Part 8, adding open Wardley Maps to the body of knowledge
This example shows how to create and query an internal knowledge base using ChatGPT.

This does not require a GPU runtime.

## Set Up


Mount Google Drive for data storage

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive
# (the folders used for storage later in the notebook live under this mount point)
from google.colab import drive
drive.mount('/content/gdrive')

Setup file structure

In [None]:
import os

DOCS_FOLDER = "/content/gdrive/MyDrive/WardleyKB"  # Google Drive folder that holds the knowledge-base files
MAPS_FOLDER = os.path.join(DOCS_FOLDER, "maps")  # Sub-directory for the downloaded Wardley Maps

# Create the maps directory (and any missing parents); does nothing if it already exists
os.makedirs(MAPS_FOLDER, exist_ok=True)


Install required dependencies for GitHub

In [None]:
!pip -q install PyGithub

In [None]:
from github import Github
import base64


Define the required GitHub repo and setup API keys

In [None]:
import os

GITHUBREPO = "swardley/Research2022"  # Source repository of the Wardley Maps

# GitHub API key. It is read from the GITHUB_TOKEN environment variable so the
# secret does not have to be saved inside the notebook; when the variable is
# unset this falls back to the empty placeholder (replace it with your key if
# you prefer to enter it here).
GITHUB = os.environ.get("GITHUB_TOKEN", "")

## Wardley Map Data Collection

In [None]:
# Initiate GitHub repository

# An empty API key is passed on as None rather than as an empty-string token,
# so that the client is created without credentials (anonymous access, which is
# subject to GitHub's lower unauthenticated rate limit) instead of sending an
# invalid token.
g = Github(GITHUB or None)
repo = g.get_repo(GITHUBREPO)


In [None]:
# Walk the repository breadth-first and collect the path of every map file

file_list = []
pending = repo.get_contents("")
for entry in pending:
    if entry.type == "dir":
        # Queue the directory's children at the end; the loop reaches them
        # after the entries that are already waiting
        pending.extend(repo.get_contents(entry.path))
        continue

    name = entry.name
    # Files that are not maps: all-uppercase names, dotfiles and the README
    is_not_map = name.isupper() or name.startswith('.') or name.lower() == 'readme.md'
    if not is_not_map:
        file_list.append(entry.path)

print (file_list)

In [None]:
# Download the map files and save them to the Google Drive
# (maps that were already saved on a previous run are skipped)

for file in file_list:
    maps_filename = f'{MAPS_FOLDER}/{file}.owm'

    # Check for a local copy first, so that no GitHub API request is spent on
    # a map that has already been downloaded
    if os.path.isfile(maps_filename):
        print (f"File already exists: {maps_filename}")
        continue

    file_item = repo.get_contents(file)
    file_content = base64.b64decode(file_item.content).decode('utf-8')

    # The repository path can include sub-directories; mirror them locally
    os.makedirs(os.path.dirname(maps_filename), exist_ok=True)

    with open(maps_filename, 'w') as f:
        f.write(file_content)
    print (file)

## Split text and upsert the maps into the Pinecone vector database

Setup required API keys

In [None]:
import os

OPENAI_API_KEY = ""  # Put your OpenAI API key here, or leave empty to use the key already set in the environment

# Export the key only when one was entered, so that an OPENAI_API_KEY that is
# already present in the environment is not overwritten with an empty string
if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

Use Pinecone for the Vector Database

In [None]:
!pip install -q langchain
!pip install -q pinecone-client
!pip install -q openai
!pip install -q tiktoken
from langchain.vectorstores import Pinecone
from tqdm.autonotebook import tqdm
import pinecone

In [None]:
import os

# Initialise the Pinecone client.
# The credentials are read from the PINECONE_API_KEY and PINECONE_ENVIRONMENT
# environment variables so they do not have to be saved inside the notebook;
# when a variable is unset the empty placeholder is used (replace it with your
# own value if you prefer to enter it here).
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY", ""),  # find at app.pinecone.io
    environment=os.environ.get("PINECONE_ENVIRONMENT", "")  # next to api key in console
    )

index_name = "knowledge" # Put your Pinecone index name here
name_space = "wardleymaps" # Put your Pinecone namespace here

Import the text splitter, embeddings and tokenizer modules

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
import tiktoken

### Walk through files and upsert into Pinecone

In [None]:
# Split every downloaded map into chunks and upsert the chunks into Pinecone

counter = 0  # stays 0 when there are no maps to process

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0, separator="\n")
embeddings = OpenAIEmbeddings()

total_maps = len(file_list)

for counter, files in enumerate(file_list, start=1):
    maps_filename = f'{MAPS_FOLDER}/{files}.owm'

    # Open the file and read its content
    with open(maps_filename, 'r') as f:
        file_content = f.read()

    print(f'\n{counter} of {total_maps}: Loading {maps_filename} ......')

    # Chunks of this map only; the lists are rebuilt for every file
    docs = text_splitter.split_text(file_content)

    # One *separate* metadata dict per chunk. Multiplying a one-element list
    # would repeat a single shared dict object, and LangChain's Pinecone store
    # writes each chunk's text into the metadata dict it is given - with a
    # shared dict every chunk of the map would be stored with the same text.
    metadatas = [{"source": maps_filename} for _ in docs]

    # Upsert one map at a time, this handles errors with OpenAI API better
    vector_store = Pinecone.from_texts(docs, embeddings, metadatas=metadatas, index_name=index_name, namespace=name_space)


### Query using the vector store with ChatGPT integration

Setup access to the Pinecone vector database

In [None]:
# Create the OpenAI embeddings client and connect to the existing Pinecone index
# named by `index_name`, scoped to the `name_space` namespace.
# NOTE(review): presumably the same embedding model must be used for querying as
# was used when the maps were upserted - confirm if the model is ever changed.
from langchain.embeddings.openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
vector_store = Pinecone.from_existing_index(index_name, embeddings, namespace=name_space)

Setup the prompt

In [None]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# System prompt that defines the assistant's persona and answer style.
# `{summaries}` is a template placeholder - presumably filled by the chain with
# the text of the retrieved documents; confirm against the chain's input variables.
system_template="""
    You are SimonGPT a strategy researcher based in the UK.
    “Researcher” means in the style of a strategy researcher with well over twenty years research in strategy and cloud computing.
    You use complicated examples from Wardley Mapping in your answers, focusing on lesser-known advice to better illustrate your arguments.
    Your language should be for an 12 year old to understand.
    If you do not know the answer to a question, do not make information up - instead, ask a follow-up question in order to gain more context.
    Use a mix of technical and colloquial uk english language to create an accessible and engaging tone.
    Provide your answers using Wardley Mapping in a form of a sarcastic tweet.
    Use the following pieces of context to answer the users question.
    ----------
    {summaries}
    """
# Chat prompt made of two messages: the system prompt above, followed by a
# human message that consists solely of the `{question}` placeholder.
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}")
]
prompt = ChatPromptTemplate.from_messages(messages)

Initialise the LLM API

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

# Chat model that writes the answers (modify model_name if you have access to GPT-4)
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo-16k-0613",
    temperature=0,
    max_tokens=256,
)

# Extra arguments for the "stuff" chain: use the custom chat prompt
chain_type_kwargs = dict(prompt=prompt)

# Question-answering chain that retrieves from the vector store and also
# returns the documents it retrieved
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    retriever=vector_store.as_retriever(),
    return_source_documents=True,
)


#### Use the chain to query

In [None]:
# Question to put to the knowledge base
query = "how is AI used in these maps?"
# Run the chain on the question; the returned mapping is read below via its
# 'question', 'answer' and 'source_documents' keys
result = chain(query)


Print the answer and sources

In [None]:
# Show the question and then the generated answer, each followed by a blank line
for key in ('question', 'answer'):
    print(result[key],'\n')

# List where each retrieved document came from (documents without a
# 'source' metadata entry are left out)
sources = [
    doc.metadata['source']
    for doc in result['source_documents']
    if 'source' in doc.metadata
]
for source in sources:
    print("\nSource: ", source,"\n")


In [None]:
# Delete the namespace if required
# WARNING: this is destructive - it is called with delete_all=True for the
# `name_space` namespace, i.e. it asks the vector store to remove everything
# stored there. Skip this cell (e.g. when using "Run all") to keep the data.

vector_store.delete(delete_all=True, namespace=name_space)