<a href="https://colab.research.google.com/github/tractorjuice/Building_Wardley_BoK/blob/main/Building_Wardley_Mapping_Body_of_Knowledge_Part_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Build an AI Body of Knowledge Using Langchain & OpenAI
## Part 3, create the vector database

This example shows how to create and query an internal knowledge base using ChatGPT.

This does not require a GPU runtime.

### Runtime Checks

In [None]:
try:
  gpu_info = !nvidia-smi
except:
  print('No GPU')
else:
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
  print(gpu_info)

In [None]:
from psutil import virtual_memory

# Size of the runtime's memory in decimal gigabytes (1 GB = 1e9 bytes).
# Note this reads the `.total` field, even though the message says "available".
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

# Runtimes with 20 GB or more are reported as high-RAM.
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('Not using a high-RAM runtime')

## Set Up


In [None]:
# Make sure a settings.ini file with the required searches and/or playlists exists in your knowledge-base folder. Example contents:

"""
; System settings
[System]
KB_FOLDER = "/content/gdrive/MyDrive/AI/WardleyKB"

[YouTube]
YT_PLAYLISTS = "                                     "
YT_SEARCHES = "DataOps", "FinOps", "MLOps"
"""

Mount Google Drive

In [None]:
# Mount Google Drive at /content/gdrive.
# Any failure, including the google.colab import itself failing, is reported
# with a message instead of stopping the notebook.
try:
    from google.colab import drive
    drive.mount('/content/gdrive')
except Exception as mount_error:
    print(f"Failed to mount Google Drive. Reason: {mount_error}")

Get required secrets and set up any keys

In [None]:
import os
from google.colab import userdata
# Copy the 'OPENAI_API_KEY' value returned by Colab's userdata store into the
# process environment (presumably where the OpenAI client looks for it — TODO confirm).
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

Check / create directory structure

In [None]:
# NOTE(review): `debug` is not referenced in the cells visible here — confirm it is still used.
debug = False  # set this to True if you only want the first 5 files
# Path of the settings.ini file that is passed to configparser.
SETTINGS = "/content/gdrive/MyDrive/AI/WardleyKB/settings.ini"

In [None]:
# Read the settings.ini file to get key configuration
import configparser
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means SETTINGS was not loaded.
loaded_settings = config.read(SETTINGS)
if not loaded_settings:
    print(f"Warning: could not read settings file: {SETTINGS}")

In [None]:
import os
import datetime

# Get the current time at the start of the program and format as a string
start_time = datetime.datetime.now()
start_time_str = start_time.strftime("%Y%m%d%H%M")

# Read the knowledge-base root folder from the parsed settings.
# The value is stripped of surrounding double quotes because the example
# settings.ini stores it quoted.
# The error is re-raised after the message: every path below depends on
# KB_FOLDER, so continuing would only fail later with an unrelated NameError.
try:
    KB_FOLDER = config.get('System', 'KB_FOLDER').strip('"')
except configparser.NoOptionError:
    print("Missing configuration key.")
    raise
except Exception as e:
    print(f"Failed to read configuration. Reason: {e}")
    raise

print("Root Folder: ", KB_FOLDER)
YT = os.path.join(KB_FOLDER, "youtube")  # All YouTube files
YT_DATASTORE = os.path.join(YT, "datastore")  # Sub-directory for YouTube datastore files
YT_VIDEOS = os.path.join(YT, "videos")  # Sub-directory for video files
YT_AUDIO = os.path.join(YT_VIDEOS, "audio")  # Sub-directory for audio files
YT_TRANSCRIPTS = os.path.join(YT_VIDEOS, "transcripts")  # Sub-directory for transcripts of audio files
YT_TRANSCRIPTS_TEXT = os.path.join(YT_TRANSCRIPTS, "full_text")  # Sub-directory for text of audio files
YT_TRANSCRIPTS_WHISPER = os.path.join(YT_TRANSCRIPTS, "whisper_chunks")  # Sub-directory for Whisper chunks of audio files
YT_TRANSCRIPTS_WHISPER_DISTIL = os.path.join(YT_TRANSCRIPTS, "distil_whisper_chunks")  # Sub-directory for Distil Whisper chunks of audio files
YT_TRANSCRIPTS_COMBINED = os.path.join(YT_TRANSCRIPTS, "combined_transcripts")  # Sub-directory for combined transcripts
YT_TRANSCRIPTS_DATASTORE = os.path.join(YT_TRANSCRIPTS, "datastore")  # Sub-directory for transcript datastore files
PODCAST = os.path.join(KB_FOLDER, "podcast")  # All podcast files
PODCAST_DATASTORE = os.path.join(PODCAST, "datastore")  # Sub-directory for podcast datastore files
PODCAST_AUDIO = os.path.join(PODCAST, "audio")  # Sub-directory for podcast audio files
PODCAST_TRANSCRIPTS = os.path.join(PODCAST, "transcripts")  # Sub-directory for podcast transcripts
MAPS = os.path.join(KB_FOLDER, "maps")  # All maps files
MAPS_DATASTORE = os.path.join(MAPS, "datastore")  # Sub-directory for maps datastore files

directories = [
    YT,
    YT_DATASTORE,
    YT_VIDEOS,
    YT_AUDIO,
    YT_TRANSCRIPTS,
    YT_TRANSCRIPTS_TEXT,
    YT_TRANSCRIPTS_WHISPER,
    YT_TRANSCRIPTS_WHISPER_DISTIL,
    YT_TRANSCRIPTS_COMBINED,
    YT_TRANSCRIPTS_DATASTORE,
    PODCAST,
    PODCAST_DATASTORE,
    PODCAST_AUDIO,
    PODCAST_TRANSCRIPTS,
    MAPS,
    MAPS_DATASTORE,
]

# Create every folder that does not exist yet. A failure is reported and the
# loop carries on with the remaining folders.
for directory in directories:
    print ("Folders    : ", directory)
    try:
        # exist_ok=True makes this a no-op for folders that are already there,
        # without a separate (race-prone) os.path.exists() check.
        os.makedirs(directory, exist_ok=True)
    except OSError as e:
        print(f"Failed to create {directory}. Reason: {e}")


Install required dependencies

In [None]:
# Install or upgrade the LangChain packages quietly (IPython shell commands).
!pip install -q -U langchain
!pip install -q -U langchain-community
!pip install -q -U langchain_openai

Set up OpenAI Model and necessary variables

In [None]:
# OpenAI model name; keep exactly one MODEL line uncommented.
#MODEL = "gpt-3.5-turbo-16k" # Legacy
#MODEL = "gpt-3.5-turbo-1106" # Current model
MODEL = "gpt-3.5-turbo-0125" # Latest model
#MODEL = "gpt-4-0125-preview" # Latest model

Initialise preferred vectorstore

In [None]:
# Vector store backend. The value is compared case-sensitively by the cells that use it.
vs = 'PineconeServerless' # Set to 'Pinecone', 'PineconeServerless', 'FAISS' or 'CHROMA' for the vector database. If using FAISS, no GPU required

In [None]:
if vs == 'Pinecone':
    !pip install -q pinecone-client
    from langchain.vectorstores import Pinecone
    from tqdm.auto import tqdm
    import pinecone

    # initialize pinecone
    pinecone.init(
        api_key = userdata.get('PINECONE_API_KEY'), # find at app.pinecone.io
        environment="gcp-starter"  # next to api key in console
        )

    index_name = "knowledge" # Put your Pincecone index name here
    name_space = "wardleykb" # Put your Pincecone namespace here

elif vs == 'PineconeServerless':
    !pip install -q -U pinecone-client
    from pinecone import Pinecone
    api_key = userdata.get('PINECONE_API_KEY')
    import os
    os.environ["PINECONE_API_KEY"] = api_key

    # initialize pinecone
    pc = Pinecone(api_key=api_key)

    from pinecone import ServerlessSpec, PodSpec
    spec = ServerlessSpec(cloud='AWS', region='us-west-2')
    index_name = 'wardleykb'

elif vs == 'FAISS':
    !pip install -q faiss-cpu
    from langchain.vectorstores import FAISS

elif vs == "CHROMA":
    !pip install chromadb
    from langchain.vectorstores import Chroma



# Build the datastore

## Split text and create chunks, create metadata and upsert embeddings to vectorstore

In [None]:
import json
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

### Upsert video embeddings to preferred vector store

In [None]:
# For every video id listed in videos.txt: load its Distil-Whisper transcript
# and its .info.json metadata, split the transcript into text chunks, and
# upsert the chunks with their metadata into the vector store selected by `vs`.

# `docs`, `metadatas`, `texts` and `start_times` are reset for every video
# below. `embedding_data`, `transcriptions` and `embeddings_file` are not used
# in this cell; they are kept in case another cell refers to them.
docs = []
metadatas = []
embedding_data = []
unique_video_ids = []
transcriptions = []
counter = 0
texts = []
start_times = []

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0, separator="\n")
embeddings = OpenAIEmbeddings()
# From here on `Pinecone` is LangChain's vector store class, replacing any
# earlier `from pinecone import Pinecone` binding.
from langchain.vectorstores import Pinecone # Flip over to LangChain Pinecone
embeddings_file = f'{YT_DATASTORE}/embeddings.json'

with open(f'{YT_VIDEOS}/videos.txt', 'r') as file:
    for line in file:
        # strip() removes the line break without cutting the last character
        # of an id when the final line has no trailing newline.
        curr_place = line.strip()
        # Add item to the list, ignoring blank lines
        if curr_place:
            unique_video_ids.append(curr_place)

total_videos = len(unique_video_ids)

for counter, video_id in enumerate(unique_video_ids, start=1):
    transcript_filename = f'{YT_TRANSCRIPTS_WHISPER_DISTIL}/' + video_id + '_large.txt'
    url = "https://www.youtube.com/watch?v=" + video_id  # not used below
    try:
        file = open(transcript_filename, 'r')
    except OSError:
        print(f'{counter} of {total_videos}: File does not exist {transcript_filename}, skipping.')
        continue

    print(f'{counter} of {total_videos}: Loading {transcript_filename} ......\n')
    # `with` closes the transcript file once it has been parsed.
    with file:
        transcription = json.load(file)

    texts = []
    start_times = []
    docs = []
    metadatas = []

    # Keep only the chunks that have a start timestamp.
    for chunk in transcription['chunks']:
        if chunk['timestamp'][0] is not None:
            texts.append(chunk['text'])
            start_times.append(int(chunk['timestamp'][0]))

    # Load the JSON file
    video_id_no_ext = video_id.replace('.webm', '')
    info_filename = f'{YT_AUDIO}/{video_id_no_ext}.info.json'
    try:
        with open(info_filename, "r") as file:
            video_info = json.load(file)
    except OSError as e:
        # Skip this video, as for a missing transcript, instead of aborting
        # the whole run part-way through.
        print(f'{counter} of {total_videos}: Cannot read {info_filename}, skipping. Reason: {e}')
        print('-' * 50)
        continue

    # Extracting general video details from the JSON data
    general_video_details = {
        #"ID": video_info.get("id", None), # Try and fix error with FAISS. ID Error
        "Title": video_info.get("fulltitle", None),
        "Description": video_info.get("description", None),
        "Duration": video_info.get("duration_string", None),
        "Uploader": video_info.get("uploader", None),
        "Upload Date": video_info.get("upload_date", None),
        "View Count": video_info.get("view_count", None),
        "Like Count": video_info.get("like_count", None),
        "Dislike Count": video_info.get("dislike_count", None),
        "Average Rating": video_info.get("average_rating", None),
    }

    # Filtering out any None values for cleaner presentation
    general_video_details = {k: v for k, v in general_video_details.items() if v is not None}

    video_title = general_video_details.get("Title")
    video_description = general_video_details.get("Description")
    video_duration = general_video_details.get("Duration")
    video_uploader = general_video_details.get("Uploader")
    video_upload_date = general_video_details.get("Upload Date")
    video_view_count = general_video_details.get("View Count")
    video_like_count = general_video_details.get("Like Count")
    video_dislike_count = general_video_details.get("Dislike Count")
    video_average_rating = general_video_details.get("Average Rating")

    print("Title      :",video_title)
    print("Uploader   :", video_uploader)
    print("Duration   :", video_duration, "\n")

    for text, start in zip(texts, start_times):
        chunk_metadata = {
            key: value for key, value in {
                "source": "YouTube",
                "source_video": video_id,
                "start_time": start,
                "title": video_title,
                "author": video_uploader,
                "description": video_description,
                "upload_date": video_upload_date,
                "view_count": video_view_count,
                "like_count": video_like_count,
                "dislike_count": video_dislike_count,
                "average_rating": video_average_rating,
            }.items() if value is not None # Remove 'null' values, as not supported by Pinecone Serverless
        }
        # split_text() may return zero, one or several pieces. Append one
        # metadata dict per piece so `docs` and `metadatas` stay aligned.
        # Each piece gets its own copy in case a vector store wrapper mutates
        # the dicts it is given.
        for split in text_splitter.split_text(text):
            docs.append(split)
            metadatas.append(dict(chunk_metadata))

    # Upsert this video's chunks. Failures are reported with their cause and
    # the loop continues with the next video.
    if vs == 'Pinecone':
        try:
            vector_store = Pinecone.from_texts(docs, embeddings, metadatas=metadatas, index_name=index_name, namespace=name_space)
        except Exception as e:
            print(f"Error upserting data into the vectorstore. Reason: {e}\n")
    elif vs == 'PineconeServerless':
        try:
            print("Contents   :",docs)
            print("             Saving data to the serverless vectorstore")
            vector_store = Pinecone.from_texts(docs, embeddings, metadatas=metadatas, index_name=index_name)
            print("             Vectorstore save complete")
        except Exception as e:
            print(f"              Error upserting data into the vectorstore. Reason: {e}\n")
    elif vs == "FAISS":
        try:
            vector_store = FAISS.from_texts(docs, embeddings, metadatas=metadatas)
            if os.path.exists(f"{YT_DATASTORE}/index.faiss"):
                # Merge into the index saved by earlier iterations / runs.
                # NOTE(review): newer LangChain releases may require
                # allow_dangerous_deserialization=True for load_local — confirm
                # against the installed version.
                existing_index=FAISS.load_local(f"{YT_DATASTORE}", embeddings)
                existing_index.merge_from(vector_store)
                existing_index.save_local(f"{YT_DATASTORE}")
            else:
                vector_store.save_local(f"{YT_DATASTORE}") # Download the files `$DATA_STORE_DIR/index.faiss` and `$DATA_STORE_DIR/index.pkl` to local

        except Exception as e:
            print(f"Error upserting data into the vectorstore. Reason: {e}\n")
    elif vs == "CHROMA":
        try:
            vector_store = Chroma.from_texts(docs, embeddings, metadatas=metadatas, persist_directory=YT_DATASTORE)
        except Exception as e:
            print(f"Error upserting data into the vectorstore. Reason: {e}\n")

    print('-' * 50)  # Just a separator line for clarity