<a href="https://colab.research.google.com/github/tractorjuice/MLOpsAIKB/blob/main/Building_MLOps_AI_Body_of_Knowledge_Part_1_Collect_YouTube_Audio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MLOps AI Body of Knowledge Using Langchain & OpenAI
## Part 1, data collection. Collect the required data for processing

This example shows how to create and query an internal knowledge base using ChatGPT.

This does not require a GPU/TPU runtime.

## Set Up


### Mount Google Drive

In [None]:
# Mount Google Drive in the Colab runtime at /content/gdrive; the knowledge-base
# folders configured in the next cell all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import os

# Folder layout for the knowledge base on Google Drive.
KB_FOLDER = "/content/gdrive/MyDrive/AI/MLOpsKB"  # Google Drive folder to save the knowledge base
YT_DATASTORE = os.path.join(KB_FOLDER, "youtube/datastore")  # Sub-directory for YouTube FAISS datastore files
YT_AUDIO_FOLDER = os.path.join(KB_FOLDER, "youtube/audio")  # Sub-directory for audio files
TRANSCRIPTS_FOLDER = os.path.join(YT_AUDIO_FOLDER, "transcripts")  # Sub-directory for transcripts of audio files
TRANSCRIPTS_TEXT_FOLDER = os.path.join(TRANSCRIPTS_FOLDER, "text")  # Sub-directory for text of audio files
TRANSCRIPTS_WHISPER_FOLDER = os.path.join(TRANSCRIPTS_FOLDER, "whisper_chunks")  # Sub-directory for Whisper chunks of audio files

# Create every folder up front. exist_ok=True makes the cell safe to re-run and
# avoids the check-then-create pattern of calling os.path.exists() first;
# os.makedirs also creates any missing parent directories.
for folder in (
    KB_FOLDER,
    YT_DATASTORE,
    YT_AUDIO_FOLDER,
    TRANSCRIPTS_FOLDER,
    TRANSCRIPTS_TEXT_FOLDER,
    TRANSCRIPTS_WHISPER_FOLDER,
):
    os.makedirs(folder, exist_ok=True)

## Find all MLOps YouTube Videos

In [None]:
!pip install -q scrapetube
import scrapetube

In [None]:
# Playlist: MLOps LLMs in Production Part II
mlops_llms_prod_partii_videos = scrapetube.get_playlist("PL3vkEKxWd-uupBSWL-DbVJuCMqXO9Z3Z4")

# Keep only the 'videoId' field of each playlist entry.
mlops_llms_prod_partii_video_ids = list(
    map(lambda entry: entry['videoId'], mlops_llms_prod_partii_videos)
)
print(mlops_llms_prod_partii_video_ids)

In [None]:
# Playlist: MLOps LLMs in Production Part I
mlops_llms_prod_parti_videos = scrapetube.get_playlist("PL3vkEKxWd-us5YvvuvYkjP_QGlgUq3tpA")

# Keep only the 'videoId' field of each playlist entry.
mlops_llms_prod_parti_video_ids = list(
    map(lambda entry: entry['videoId'], mlops_llms_prod_parti_videos)
)
print(mlops_llms_prod_parti_video_ids)

In [None]:
# Merge both playlists (Part I first, then Part II) and drop duplicate IDs.
# dict.fromkeys() keeps the first occurrence of each ID in its original
# position, so the resulting order is the same on every run; list(set(...))
# gives an arbitrary order that can change between kernel sessions.
unique_video_ids = list(
    dict.fromkeys(mlops_llms_prod_parti_video_ids + mlops_llms_prod_partii_video_ids)
)
print(unique_video_ids)

## Store the list of videos in a local file for processing later

In [None]:
# Write one video ID per line to videos.txt for the later processing steps.
videos_path = f'{YT_AUDIO_FOLDER}/videos.txt'
total_videos = len(unique_video_ids)  # Get the total number of videos

# Mode 'w' discards any previous list. The file is opened once for the whole
# loop instead of being reopened in append mode for every video.
with open(videos_path, 'w') as f:
    for idx, video_id in enumerate(unique_video_ids, start=1):
        url = "https://www.youtube.com/watch?v=" + video_id
        print(f"Processing video {idx} of {total_videos}: {url}")
        f.write(f'{video_id}\n')


### Download the audio from all the videos and store it on Google Drive

In [None]:
!pip install -q yt-dlp

In [None]:
import glob
import os

import yt_dlp as yt


def _find_downloaded_clip(clips_dir, video_id):
    """Return the path of a finished download for ``video_id``, or ``None``.

    The output template below ends in ``%(ext)s``, so the file extension
    depends on which audio format yt-dlp selected. Match on the video ID with
    any extension instead of assuming ``.webm``.
    """
    pattern = os.path.join(glob.escape(clips_dir), glob.escape(video_id) + '.*')
    for candidate in sorted(glob.glob(pattern)):
        # Ignore partial-download / bookkeeping files from an interrupted run
        # (yt-dlp's '.part' and '.ytdl' suffixes), so they are fetched again.
        if not candidate.endswith(('.part', '.ytdl')):
            return candidate
    return None


total_videos = len(unique_video_ids)
clips_folder = os.path.join(YT_AUDIO_FOLDER, 'clips')

# Define download options
ydl_opts = {
    'format': 'bestaudio/best',
    'outtmpl': f'{YT_AUDIO_FOLDER}/clips/%(id)s.%(ext)s',
}

# Create directories if they do not exist
os.makedirs(clips_folder, exist_ok=True)

for counter, video_id in enumerate(unique_video_ids, start=1):
    url = "https://www.youtube.com/watch?v=" + video_id
    path = _find_downloaded_clip(clips_folder, video_id)

    if path is not None:
        print(counter, "of", total_videos, ": Existing file: " + path)
        continue

    print(counter, "of", total_videos, ": Fetching new audio file " + video_id)

    # A single private/removed video must not abort the whole batch; report it
    # and move on to the next ID.
    try:
        with yt.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
    except yt.utils.DownloadError as err:
        print(f'Video {url} is unavailable, skipping. ({err})')
        continue

    # Only log the URL as processed once the download has succeeded.
    with open(os.path.join(YT_AUDIO_FOLDER, 'processed_videos.txt'), 'a') as f:
        f.write(url + "\n")


### Alternative download for the audio using PyTube

In [None]:
# Optional: PyTube is an alternative for YouTube audio extraction.
# Uncomment the next line to install it if the yt-dlp download fails.
#!pip install -q pytube

In [None]:
from pytube import YouTube

total_videos = len(unique_video_ids)
clips_folder = os.path.join(YT_AUDIO_FOLDER, 'clips')

# Create directories if they do not exist
os.makedirs(clips_folder, exist_ok=True)

for counter, video_id in enumerate(unique_video_ids, start=1):
    url = "https://www.youtube.com/watch?v=" + video_id
    path = os.path.join(clips_folder, f'{video_id}.mp4')

    if os.path.isfile(path):
        print(counter, "of",total_videos,": Existing file " + video_id)
        continue

    print(counter, "of", total_videos, ": Fetching new audio file " + video_id)

    try:
        # Named `video`, not `yt`: the yt-dlp cell uses `yt` as its alias for
        # the yt_dlp module, and rebinding it here would shadow that alias.
        video = YouTube(url)
        # Download the audio stream
        audio = video.streams.get_audio_only()
        if audio is None:
            print(f'Error: Video {video_id} has no audio-only stream, skipping.')
            continue
        audio.download(output_path=clips_folder, filename=f'{video_id}.mp4')
    except Exception as err:
        # `except Exception` lets Ctrl-C / kernel interrupt through, and the
        # message includes the underlying error instead of hiding it.
        print(f'Error: Video {video_id} is unavailable, skipping. ({err!r})')
        continue

    # Only log the URL as processed once the download has succeeded.
    with open(os.path.join(YT_AUDIO_FOLDER, 'processed_videos.txt'), 'a') as f:
        f.write(url + "\n")