<a href="https://colab.research.google.com/github/tractorjuice/Building_BoK/blob/main/Building_Wardley_Mapping_Body_of_Knowledge_Part_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building a Body of Knowledge using Pinecone, Langchain and OpenAI
## Part 1, collect the required data for processing

This example shows how to create and query an internal knowledge base using ChatGPT.

This does not require a GPU runtime.

## Set Up


### Mount Google Drive

In [None]:
# Make Google Drive available to this Colab runtime under /content/gdrive.
from google.colab import drive

drive.mount('/content/gdrive')

In [None]:
import os

DOCS_FOLDER = "/content/gdrive/MyDrive/WardleyKB"  # Google Drive folder to save the audio clips from YouTube videos
AUDIO_FOLDER = os.path.join(DOCS_FOLDER, "audio")  # Sub-directory for audio files

# Create both folders in a single call: os.makedirs() also creates the missing
# parents (DOCS_FOLDER included), and exist_ok=True makes the cell safe to
# re-run without a separate, race-prone os.path.exists() check.
os.makedirs(AUDIO_FOLDER, exist_ok=True)


## Find all Wardley Mapping YouTube Videos

In [None]:
!pip install -q scrapetube
import scrapetube

In [None]:
# MapCamp 2020 playlist: fetch at most the first 10 videos.
mapcamp_2020_videos = scrapetube.get_playlist(
    "PLP0vnsXbJsRXpKWEFe956zjGrawwQ0wb3",
    limit=10,
)

# Keep only the YouTube ID of each returned entry.
mapcamp_2020_video_ids = [entry['videoId'] for entry in mapcamp_2020_videos]
print(mapcamp_2020_video_ids)

In [None]:
# MapCamp 2021 (Use Case Edition) playlist: fetch at most the first 10 videos.
mapcamp_2021_videos = scrapetube.get_playlist(
    "PLP0vnsXbJsRX11ANTNANzKQQoxTLKRvV1",
    limit=10,
)

# Keep only the YouTube ID of each returned entry.
mapcamp_2021_video_ids = [entry['videoId'] for entry in mapcamp_2021_videos]
print(mapcamp_2021_video_ids)


In [None]:
# Wardley Mapping Meetups playlist: fetch at most the first 10 videos.
wardley_meetup_videos = scrapetube.get_playlist(
    "PLP0vnsXbJsRWkXBHkdWTVbByRfY92YYR5",
    limit=10,
)

# Keep only the YouTube ID of each returned entry.
wardley_meetup_video_ids = [entry['videoId'] for entry in wardley_meetup_videos]
print(wardley_meetup_video_ids)

In [None]:
# YouTube search for "Wardley Mapping": fetch at most the first 10 results.
all_videos = scrapetube.get_search(
    "Wardley Mapping",
    limit=10,
)

# Keep only the YouTube ID of each returned entry.
all_video_ids = [entry['videoId'] for entry in all_videos]
print(all_video_ids)


In [None]:
# YouTube search for "wardleymaps": fetch at most the first 10 results.
all_videos_hash = scrapetube.get_search(
    "wardleymaps",
    limit=10,
)

# Keep only the YouTube ID of each returned entry.
all_video_hash_ids = [entry['videoId'] for entry in all_videos_hash]
print(all_video_hash_ids)

In [None]:
# Platform Design Toolkit videos: fetch at most the first 10 videos.
# NOTE(review): this playlist ID is identical to the one used for the
# "Wardley Mapping Meetups" playlist, so this cell fetches the same videos.
# Looks like a copy-paste slip - confirm the intended playlist ID.
pdt_leaders_videos = scrapetube.get_playlist(
    "PLP0vnsXbJsRWkXBHkdWTVbByRfY92YYR5",
    limit=10,
)

# Keep only the YouTube ID of each returned entry.
pdt_leaders_videos_ids = [entry['videoId'] for entry in pdt_leaders_videos]
print(pdt_leaders_videos_ids)

In [None]:
# Mark Craddock videos playlist: fetch at most the first 10 videos.
mc_videos = scrapetube.get_playlist(
    "PLFwBSJ0MOI-351lB7tPAiUs_a_2lIo_-J",
    limit=10,
)

# Keep only the YouTube ID of each returned entry.
mc_video_ids = [entry['videoId'] for entry in mc_videos]
print(mc_video_ids)

In [None]:
# Simon Wardley videos playlist: fetch at most the first 10 videos.
swardley_videos = scrapetube.get_playlist(
    "PLFwBSJ0MOI-27N3Tnx5_CCNPZqrq0gOU-",
    limit=10,
)

# Keep only the YouTube ID of each returned entry.
swardley_videos_ids = [entry['videoId'] for entry in swardley_videos]
print(swardley_videos_ids)

In [None]:
# Deduplicate the lists
#
# The ID lists are merged in a fixed order and dict.fromkeys() keeps the first
# occurrence of every ID, so the result has the same order on every run.
# list(set(...)) returned the IDs in an arbitrary order that changed between
# kernel sessions, which made videos.txt and the download order unrepeatable.
video_id_lists = [
    mapcamp_2020_video_ids,
    mapcamp_2021_video_ids,
    wardley_meetup_video_ids,
    all_video_ids,
    all_video_hash_ids,
    pdt_leaders_videos_ids,
    swardley_videos_ids,
    mc_video_ids,
]

unique_video_ids = list(dict.fromkeys(
    video_id for id_list in video_id_lists for video_id in id_list
))
print(unique_video_ids)


## Store the list of videos in a local file for processing later

In [None]:
# Write one video ID per line to videos.txt inside AUDIO_FOLDER.
# Opening the file once in 'w' mode erases any existing content and keeps a
# single handle for the whole loop, instead of reopening it for every video.
total_videos = len(unique_video_ids)  # Get the total number of videos

with open(os.path.join(AUDIO_FOLDER, 'videos.txt'), 'w') as f:
    for idx, video_id in enumerate(unique_video_ids, start=1):
        url = "https://www.youtube.com/watch?v=" + video_id
        print(f"Processing video {idx} of {total_videos}: {url}")
        f.write(f'{video_id}\n')


### Download the audio from all the videos and store it on Google Drive

In [None]:
!pip install -q yt-dlp

In [None]:
import glob
import os
import yt_dlp as yt

# Download the audio track of every video in unique_video_ids into
# AUDIO_FOLDER/clips, skipping videos that already have a downloaded file, and
# append the URL of each successful download to processed_videos.txt.

counter = 0
total_videos = len(unique_video_ids)

clips_folder = os.path.join(AUDIO_FOLDER, 'clips')

# Define download options
ydl_opts = {
    'format': 'bestaudio/best',
    'outtmpl': f'{AUDIO_FOLDER}/clips/%(id)s.%(ext)s',
}

# Create directories if they do not exist
os.makedirs(clips_folder, exist_ok=True)

for video_id in unique_video_ids:
    counter += 1
    url = "https://www.youtube.com/watch?v=" + video_id

    # The output template uses %(ext)s, so the saved extension depends on the
    # format yt-dlp selects and is not guaranteed to be .webm. Look for any
    # file named after the video ID instead (this also picks up a .mp4 saved
    # by the PyTube alternative), ignoring yt-dlp's unfinished-download files.
    pattern = os.path.join(glob.escape(clips_folder), glob.escape(video_id) + '.*')
    existing_files = [
        found for found in sorted(glob.glob(pattern))
        if not found.endswith(('.part', '.ytdl'))
    ]

    if existing_files:
        print(counter, "of", total_videos, ": Existing file: " + existing_files[0])
        continue

    print(counter, "of", total_videos, ": Fetching new audio file " + video_id)

    try:
        with yt.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        with open(os.path.join(AUDIO_FOLDER, 'processed_videos.txt'), 'a') as f:
            f.write(url + "\n")
    except Exception as err:
        # Best-effort: report the failure and move on to the next video.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupt working, and the reason is shown instead of being hidden.
        print(f'Video {url} is unavailable, skipping. Reason: {err}')


### Alternative download for the audio using PyTube

In [None]:
#Required for YouTube audio extraction. Try PyTube if yt-dlp fails
!pip install -q pytube

In [None]:
from pytube import YouTube

# Alternative downloader: save the audio-only stream of every video in
# unique_video_ids as AUDIO_FOLDER/clips/<video_id>.mp4, skipping files that
# already exist, and append each successful URL to processed_videos.txt.

counter = 0
total_videos = len(unique_video_ids)

# Create directories if they do not exist
clips_folder = os.path.join(AUDIO_FOLDER, 'clips')
os.makedirs(clips_folder, exist_ok=True)

for video_id in unique_video_ids:
    counter += 1
    url = "https://www.youtube.com/watch?v=" + video_id
    path = os.path.join(clips_folder, f'{video_id}.mp4')

    if os.path.isfile(path):
        print(counter, "of", total_videos, ": Existing file " + video_id)
        continue

    print(counter, "of", total_videos, ": Fetching new audio file " + video_id)

    try:
        # Build the YouTube object inside the try so a failure here skips the
        # video instead of aborting the loop. It is named `video`, not `yt`,
        # so it does not overwrite the `yt` alias the yt-dlp cell gives to
        # the yt_dlp module.
        video = YouTube(url)
        # Download the audio stream
        audio = video.streams.get_audio_only()
        audio.download(output_path=clips_folder, filename=f'{video_id}.mp4')
        with open(os.path.join(AUDIO_FOLDER, 'processed_videos.txt'), 'a') as f:
            f.write(url + "\n")
    except Exception as err:
        # Best-effort: report the failure and move on to the next video.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupt working, and the reason is shown instead of being hidden.
        print(f'Error: Video {video_id} is unavailable, skipping. Reason: {err}')