<a href="https://colab.research.google.com/github/tractorjuice/MLOpsAIKB/blob/main/Building_MLOps_AI_Body_of_Knowledge_Part_3_Upsert_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MLOps AI Body of Knowledge Using Langchain & OpenAI
## Part 3, create the vector database

This example shows how to create and query an internal knowledge base using ChatGPT.

This does require a GPU/TPU runtime.

### Runtime Checks

In [None]:
try:
  gpu_info = !nvidia-smi
except:
  print('No GPU')
else:
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
  print(gpu_info)

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
print(
  'You are using a high-RAM runtime!'
  if ram_gb >= 20
  else 'Not using a high-RAM runtime'
)

## Set Up


Mount Google Drive

In [None]:
# Mount Google Drive at /content/gdrive; the knowledge-base folders configured
# in this notebook all live under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import os

KB_FOLDER = "/content/gdrive/MyDrive/MLOpsKB"  # Google drive folder to save the knowledgebase
YT_DATASTORE = os.path.join(KB_FOLDER, "youtube/datastore")  # Sub-directory for YouTube FAISS datastore files
YT_AUDIO_FOLDER = os.path.join(KB_FOLDER, "youtube/audio")  # Sub-directory for audio files
TRANSCRIPTS_FOLDER = os.path.join(YT_AUDIO_FOLDER, "transcripts")  # Sub-directory for transcripts of audio files
TRANSCRIPTS_TEXT_FOLDER = os.path.join(TRANSCRIPTS_FOLDER, "text")  # Sub-directory for text of audio files
TRANSCRIPTS_WHISPER_FOLDER = os.path.join(TRANSCRIPTS_FOLDER, "whisper_chunks")  # Sub-directory for Whisper chunks of audio files

# Create the whole folder tree. `exist_ok=True` makes the cell safe to re-run
# and removes the check-then-create gap of a separate os.path.exists() test.
# Unlike that test, it raises FileExistsError if one of the paths already
# exists as a regular file rather than a directory.
for kb_dir in (
    KB_FOLDER,
    YT_DATASTORE,
    YT_AUDIO_FOLDER,
    TRANSCRIPTS_FOLDER,
    TRANSCRIPTS_TEXT_FOLDER,
    TRANSCRIPTS_WHISPER_FOLDER,
):
    os.makedirs(kb_dir, exist_ok=True)

Install the LangChain, OpenAI and tiktoken packages (Pinecone or FAISS is selected as the vector database further below)

In [None]:
!pip install -q langchain
!pip install -q openai
!pip install -q tiktoken

Set up OPENAI_API_KEY and necessary variables

In [None]:
import os
from getpass import getpass

# Never store the key in the notebook itself. If OPENAI_API_KEY is already set
# in the environment it is kept as-is; otherwise it is requested interactively
# (the input is not echoed and is not saved with the notebook).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

# OpenAI model name. Exactly one assignment should be active; uncomment an
# alternative below (and comment out the current one) to switch models.
#MODEL = "gpt-3"
#MODEL = "gpt-3.5-turbo"
#MODEL = "gpt-3.5-turbo-0613"
#MODEL = "gpt-3.5-turbo-16k"
MODEL = "gpt-3.5-turbo-16k-0613"
#MODEL = "gpt-4"
#MODEL = "gpt-4-0613"
#MODEL = "gpt-4-32k-0613"

# Build the datastore

### Initialise preferred vectorstore

In [None]:
# Selects the vector database backend. The cells below only test for the exact
# string 'Pinecone'; any other value takes the FAISS branch.
vectorstore = 'FAISS' # Set to 'Pinecone' or 'FAISS' for the vector database

In [None]:
if vectorstore == 'Pinecone':
    !pip install -q pinecone-client
    from langchain.vectorstores import Pinecone
    from tqdm.auto import tqdm
    import pinecone

    # initialize pinecone
    pinecone.init(
        api_key="",  # find at app.pinecone.io
        environment="us-west4-gcp"  # next to api key in console
        )

    index_name = "knowledge" # Put your Pincecone index name here
    name_space = "mlopskb" # Put your Pincecone namespace here

else:
    !pip install -q faiss-cpu
    from langchain.vectorstores import FAISS


In [None]:
import json
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
import tiktoken

## Split text and create chunks, create metadata and upsert embeddings to vectorstore

In [None]:
#Required for YouTube transcript extraction
!pip install -q pytube
import pytube

### Upsert embeddings to preferred vector store

In [None]:
# Embed every Whisper transcript listed in videos.txt and upsert the result
# into the backend selected by `vectorstore`, one video at a time.
#
# NOTE(review): nothing here checks whether a video was upserted before, so
# re-running this cell adds the same chunks to the store again.

def _attr_or_blank(obj, attr_name):
    """Return ``getattr(obj, attr_name)``, or "" if the lookup raises."""
    try:
        return getattr(obj, attr_name)
    except Exception:
        return ""


docs = []
metadatas = []
unique_video_ids = []
transcriptions = []
counter = 0
texts = []
start_times = []

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0, separator="\n")
embeddings = OpenAIEmbeddings()

with open(f'{YT_AUDIO_FOLDER}/videos.txt', 'r') as file:
    # strip() rather than line[:-1]: the last line may have no trailing newline,
    # in which case slicing would cut off the final character of the video ID.
    # Blank lines are ignored.
    unique_video_ids = [line.strip() for line in file if line.strip()]

total_videos = len(unique_video_ids)

for counter, video_id in enumerate(unique_video_ids, start=1):
    transcript_filename = f'{TRANSCRIPTS_WHISPER_FOLDER}/' + video_id + '_large.txt'
    url = "https://www.youtube.com/watch?v=" + video_id

    try:
        transcript_file = open(transcript_filename, 'r')
    except FileNotFoundError:
        print(f'{counter} of {total_videos}: File does not exist {transcript_filename}, skipping.')
        continue
    except OSError as err:
        print(f'{counter} of {total_videos}: Cannot open {transcript_filename} ({err}), skipping.')
        continue

    print(f'{counter} of {total_videos}: Loading {transcript_filename} ......\n')
    # The context manager closes the handle even if parsing fails.
    with transcript_file:
        try:
            transcription = json.load(transcript_file)
        except json.JSONDecodeError as err:
            print(f'{counter} of {total_videos}: Invalid JSON in {transcript_filename} ({err}), skipping.')
            continue

    # Per-video working lists (reset for every video).
    texts = []
    start_times = []
    docs = []
    metadatas = []

    # Keep only chunks that have a start timestamp; the start is truncated to int.
    for chunk in transcription['chunks']:
        if chunk['timestamp'][0] is not None:
            texts.append(chunk['text'])
            start_times.append(int(chunk['timestamp'][0]))

    # Title/author are best-effort: any failure leaves them as "".
    try:
        yt = pytube.YouTube(url)
    except Exception:
        yt = None
    video_title = _attr_or_blank(yt, 'title')
    video_author = _attr_or_blank(yt, 'author')

    for start_time, chunk_text in zip(start_times, texts):
        splits = text_splitter.split_text(chunk_text)
        docs.extend(splits)
        # One metadata dict per split so `docs` and `metadatas` always have the
        # same length, even when a chunk yields zero or several splits.
        metadatas.extend(
            {"source": start_time, "source_url": video_id, "title": video_title, "author": video_author}
            for _ in splits
        )

    if not docs:
        print(f'{counter} of {total_videos}: No text to embed in {transcript_filename}, skipping.')
        continue

    try:
        if vectorstore == 'Pinecone':
            vector_store = Pinecone.from_texts(docs, embeddings, metadatas=metadatas, index_name=index_name, namespace=name_space)
        else:
            vector_store = FAISS.from_texts(docs, embeddings, metadatas=metadatas)
            if os.path.exists(f"{YT_DATASTORE}/index.faiss"):
                # Merge this video's vectors into the index already saved on disk.
                existing_index = FAISS.load_local(f"{YT_DATASTORE}", embeddings)
                existing_index.merge_from(vector_store)
                existing_index.save_local(f"{YT_DATASTORE}")
            else:
                vector_store.save_local(f"{YT_DATASTORE}") # Download the files `$YT_DATA_STORE/index.faiss` and `$YT_DATA_STORE/index.pkl` to local
    except Exception as err:
        # Keep going with the next video, but report what actually went wrong.
        print(f"Error upserting data into the vectorstore: {err}\n")


## Store the chunks for processing later

In [None]:
import json

# Write the plain-text transcript of every video listed in videos.txt to
# TRANSCRIPTS_TEXT_FOLDER, taking it from the 'text' field of the Whisper JSON.
unique_video_ids = []
transcriptions = []
counter = 0
texts = []

with open(f'{YT_AUDIO_FOLDER}/videos.txt', 'r') as file:
    # strip() rather than line[:-1]: the last line may have no trailing newline,
    # in which case slicing would cut off the final character of the video ID.
    # Blank lines are ignored.
    unique_video_ids = [line.strip() for line in file if line.strip()]

total_videos = len(unique_video_ids)

for counter, video_id in enumerate(unique_video_ids, start=1):
    transcript_filename = f'{TRANSCRIPTS_WHISPER_FOLDER}/' + video_id + '_large.txt'

    try:
        transcript_file = open(transcript_filename, 'r')
    except FileNotFoundError:
        print(f'{counter} of {total_videos}: File does not exist {transcript_filename}, skipping.')
        continue
    except OSError as err:
        print(f'{counter} of {total_videos}: Cannot open {transcript_filename} ({err}), skipping.')
        continue

    print(f'{counter} of {total_videos}: Loading {transcript_filename} ......\n')
    # The context manager closes the handle even if parsing fails.
    with transcript_file:
        try:
            transcription = json.load(transcript_file)
        except json.JSONDecodeError as err:
            print(f'{counter} of {total_videos}: Invalid JSON in {transcript_filename} ({err}), skipping.')
            continue
    text = transcription['text']

    # Write text to file
    with open(f'{TRANSCRIPTS_TEXT_FOLDER}/' + video_id + '_large.txt', 'w') as output_file:
        output_file.write(text)