<a href="https://colab.research.google.com/github/tractorjuice/Building_BoK/blob/main/Building_Wardley_Mapping_Body_of_Knowledge_Part_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building a Body of Knowledge using Pinecone, Langchain and OpenAI
## Part 3, create the vector database

This example shows how to create and query an internal knowledge base using ChatGPT.

This requires a GPU runtime.

### Runtime Checks

In [None]:
try:
  gpu_info = !nvidia-smi
except:
  print('No GPU')
else:
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
  print(gpu_info)

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

# 20 GB is the threshold this notebook uses to classify a runtime as high-RAM.
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('Not using a high-RAM runtime')

## Set Up


### Mount Google Drive

In [None]:
# Mount Google Drive at /content/gdrive. The DOCS_FOLDER path defined in the
# next cell lives under this mount point, so this cell must run first.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import os

# Google Drive folder to save the audio clips from YouTube videos
DOCS_FOLDER = "/content/gdrive/MyDrive/WardleyKB"
# Sub-directory for audio files; videos.txt and the transcripts/ folder are
# read from here later in this notebook.
AUDIO_FOLDER = os.path.join(DOCS_FOLDER, "audio")

# os.makedirs creates any missing parent directories, so this single call
# ensures DOCS_FOLDER exists as well. exist_ok=True makes it a no-op when the
# directory is already present and avoids the check-then-create race of
# testing os.path.exists() first.
os.makedirs(AUDIO_FOLDER, exist_ok=True)


Use Pinecone for the Vector Database

In [None]:
from tqdm.autonotebook import tqdm
!pip install -q langchain
!pip install -q pinecone-client
!pip install -q openai
!pip install -q tiktoken
from langchain.vectorstores import Pinecone
from tqdm.autonotebook import tqdm
import pinecone

In [None]:
import os
from getpass import getpass

# initialize pinecone
# Credentials come from the PINECONE_API_KEY / PINECONE_ENVIRONMENT environment
# variables when they are set; otherwise the user is prompted. This keeps the
# secrets out of the saved notebook instead of pasting them into string literals.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY")
    or getpass("Pinecone API key (find at app.pinecone.io): "),
    environment=os.environ.get("PINECONE_ENVIRONMENT")
    or input("Pinecone environment (next to api key in console): "),
)

index_name = "knowledge" # The name of your Pinecone index
name_space = "wardley" # The name of your Pinecone namespace

### Set up OPENAI_API_KEY and necessary variables

In [None]:
import os
from getpass import getpass

# Keep an OPENAI_API_KEY that is already present in the environment; otherwise
# prompt for it with hidden input so the key is never written into the notebook.
# (Previously getpass was imported but unused and the variable was overwritten
# with an empty string.)
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Build the datastore
*(Skip to next section to load data store from files if it has been saved locally to save cost of embeddings)*

In [None]:
import json
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
import tiktoken

## Split text and create chunks, create metadata and upsert embeddings to pinecone vectorstore

In [None]:
# pytube is used further down in this notebook to look up each video's title
# and author, which are stored in the metadata of every upserted chunk.
!pip install -q pytube
import pytube

## Upsert embeddings to preferred vector store

In [None]:
# Build the vector store: for every video ID listed in videos.txt, load its
# transcript JSON, split each transcript chunk into text pieces, attach
# metadata (start time, video ID, title, author) to every piece and upsert the
# embeddings into the Pinecone index/namespace configured earlier.

# Notebook-level names. docs/metadatas/texts/start_times are rebuilt for every
# video inside the loop; they are initialised here so the names exist even if
# no transcript is found.
docs = []
metadatas = []
unique_video_ids = []
transcriptions = []
counter = 0
texts = []
start_times = []

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0, separator="\n")
embeddings = OpenAIEmbeddings()

# videos.txt holds one YouTube video ID per line.
with open(f'{AUDIO_FOLDER}/videos.txt', 'r') as file:
    for line in file:
        # strip() rather than line[:-1]: slicing unconditionally drops the last
        # character, which truncates the final ID when the file does not end
        # with a newline. Blank lines are skipped.
        curr_place = line.strip()
        if curr_place:
            unique_video_ids.append(curr_place)

total_videos = len(unique_video_ids)

for counter, videos in enumerate(unique_video_ids, start=1):
    transcript_filename = f'{AUDIO_FOLDER}/transcripts/' + videos + '_large.txt'
    url = "https://www.youtube.com/watch?v=" + videos
    try:
        file = open(transcript_filename, 'r')
    except OSError:
        # Only file-access problems mean "skip this video"; anything else
        # (including a kernel interrupt) is allowed to propagate.
        print(f'{counter} of {total_videos}: File does not exist {transcript_filename}, skipping.')
        continue

    print(f'{counter} of {total_videos}: Loading {transcript_filename} ......\n')
    # The context manager guarantees the transcript file is closed, even if
    # json.load raises.
    with file:
        transcription = json.load(file)

    texts = []
    start_times = []
    docs = []
    metadatas = []

    # Each transcript chunk carries its text and a timestamp whose first
    # element is used as the start time (truncated to an int).
    for chunk in transcription['chunks']:
        text = chunk['text']
        start = int(chunk['timestamp'][0])
        texts.append("".join(text))
        start_times.append(start)

    # Title/author lookup is best-effort: any failure (including failing to
    # build the YouTube object) leaves the corresponding field empty.
    try:
        yt = pytube.YouTube(url)
    except Exception:
        yt = None
    try:
        video_title = yt.title
    except Exception:
        video_title = ""
    try:
        video_author = yt.author
    except Exception:
        video_author = ""

    for i, d in enumerate(texts):
        splits = text_splitter.split_text(d)
        docs.extend(splits)
        # One separate metadata dict per split, so docs and metadatas always
        # have the same length. Appending a single dict per chunk let the two
        # lists drift apart whenever a chunk produced zero or several splits.
        metadatas.extend(
            {"source": start_times[i], "source_url": videos, "title": video_title, "author": video_author}
            for _ in splits
        )

    vector_store = Pinecone.from_texts(docs, embeddings, metadatas=metadatas, index_name=index_name, namespace=name_space)