# Motivation
I've recently gotten a bunch of embeddings for each of Anthony Fantano's videos. In this notebook, I want to develop a rudimentary prototype for semantic search. 

# Setup
The cells below will help to set up the rest of the notebook. 

I'll start by changing my working directory. 

In [None]:
# Move up one directory so the relative "data/..." paths used by the cells below
# resolve from the parent folder (presumably the project root -- confirm)
%cd ..

Now, I'll import some libraries.

In [None]:
# Import statements
import requests
import os
from requests.structures import CaseInsensitiveDict
import numpy as np
import json
from numpy import dot
from numpy.linalg import norm
from tqdm import tqdm
from pathlib import Path
import pandas as pd
import math
import traceback
from time import sleep
from IPython.display import display, Markdown

# Loading Data
Next, I'm going to load in all of the data. 

In [None]:
# Create a DataFrame containing all of the data scraped for each of the videos
def _load_json_or_default(json_path, default):
    """Return the parsed JSON stored at json_path, or default if it can't be loaded."""
    try:
        with open(json_path, "r") as json_file:
            return json.load(json_file)
    except (OSError, ValueError):
        # OSError: the file is missing or unreadable
        # ValueError: the file is not valid JSON (json.JSONDecodeError) or not decodable text
        return default


tnd_data_df_records = []
for child_dir in tqdm(list(Path("data/theneedledrop_scraping/").iterdir())):

    # Only folders hold per-video data; skip any stray files in the scraping folder
    if not child_dir.is_dir():
        continue

    # Extract the video ID from the name of the folder
    cur_video_id = child_dir.name

    # Load in each of the per-video JSON files. A missing or unreadable file falls
    # back to an empty value so that the video still gets a record.
    cur_details_dict = _load_json_or_default(child_dir / "details.json", {})
    cur_transcription_dict = _load_json_or_default(child_dir / "transcription.json", {})
    whole_video_embedding = _load_json_or_default(child_dir / "whole_video_embedding.json", None)

    # The fallback here has to reset the *enriched* dict. Resetting cur_details_dict
    # instead would throw away the details loaded above and leave this variable
    # undefined (or holding the previous video's data).
    cur_enriched_details_dict = _load_json_or_default(child_dir / "enriched_details.json", {})

    # Create a "record" for this video
    tnd_data_df_records.append({
        "video_id": cur_video_id,
        "details_dict": cur_details_dict,
        "transcription_dict": cur_transcription_dict,
        "whole_video_embedding": whole_video_embedding,
        "enriched_details_dict": cur_enriched_details_dict,
    })

# Now, we want to create a DataFrame from the tnd_data_df_records. The columns are
# listed explicitly so that they exist even if no video folders were found.
tnd_data_df = pd.DataFrame.from_records(
    tnd_data_df_records,
    columns=[
        "video_id",
        "details_dict",
        "transcription_dict",
        "whole_video_embedding",
        "enriched_details_dict",
    ],
)

# Making the embeddings ndarrays instead of lists
tnd_data_df["whole_video_embedding"] = tnd_data_df["whole_video_embedding"].apply(
    lambda x: np.asarray(x) if x is not None else None)

# Add a "transcription string" column (None when there is no transcription text)
tnd_data_df["transcription_str"] = tnd_data_df["transcription_dict"].apply(lambda x: x.get("text"))

# Add a couple of columns indicating how long each of the transcriptions are.
# The token count is a rough estimate that assumes ~3.5 characters per token.
tnd_data_df["transcription_length"] = tnd_data_df["transcription_str"].apply(
    lambda x: len(x) if x is not None else None)
tnd_data_df["transcription_approx_tokens"] = tnd_data_df["transcription_str"].apply(
    lambda x: int(math.ceil(len(x) / 3.5)) if x is not None else None)

# Add a couple of columns grabbing the title and URL of the video. The title is
# None (rather than a KeyError) for videos whose details.json could not be loaded.
tnd_data_df["video_title"] = tnd_data_df["details_dict"].apply(lambda x: x.get("title"))
tnd_data_df["video_url"] = tnd_data_df["video_id"].apply(lambda x: f"https://www.youtube.com/watch?v={x}")

I'll also load in all of the segment embeddings for each video.

In [None]:
# Load in all of the JSON files containing the video segment embeddings
video_segment_emb_dict = {}
tnd_scraping_folder_path = Path("data/theneedledrop_scraping/")
for child_folder in tqdm(list(tnd_scraping_folder_path.iterdir())):
    if not child_folder.is_dir():
        continue

    # Use .name (not .stem) so the ID is the full folder name, matching the
    # video_id values stored in tnd_data_df
    cur_video_id = child_folder.name
    video_segment_emb_path = child_folder / "video_segment_embeddings.json"
    if video_segment_emb_path.exists():
        with open(video_segment_emb_path, "r") as json_file:
            video_segment_emb_dict[cur_video_id] = json.load(json_file)

# Loading all of the embedding dictionaries into a single DataFrame, tagging
# each segment with the video that it came from
segment_emb_df_list = []
for cur_video_id, segment_dict_list in video_segment_emb_dict.items():
    segment_df = pd.DataFrame(segment_dict_list)
    segment_df["video_id"] = cur_video_id
    segment_emb_df_list.append(segment_df)

if segment_emb_df_list:
    segment_emb_df = pd.concat(segment_emb_df_list)
else:
    # pd.concat raises on an empty list; fall back to an empty frame that still
    # has the columns the later cells read
    segment_emb_df = pd.DataFrame(columns=["text", "embedding", "video_id"])

# Remove all of the segments without embeddings
segment_emb_df = segment_emb_df[segment_emb_df["embedding"].notna()].copy()

# Methods
Below, I'm going to write a couple of methods. 

In [None]:
# This method will return a list of ndarrays, each representing text embeddings of
# the text in each index of the input_text_list list
def generate_embeddings(input_text_list, print_exceptions=False, timeout=60):
    """Embed a batch of strings with the OpenAI embeddings endpoint.

    Args:
        input_text_list: List of strings to embed.
        print_exceptions: When True, print the body of a non-200 response.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list with one entry per response item: an ndarray for each embedding
        if the API answered with HTTP 200, otherwise a list holding one None
        per input string. An empty input returns an empty list.

    Raises:
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    # Nothing to embed, so skip the network round-trip entirely
    if len(input_text_list) == 0:
        return []

    # Get the OpenAI API key from the environment variables
    api_key = os.getenv("OPENAI_API_KEY", "")

    # Build the API request. The body is serialized in one go from a dict
    # rather than being stitched together from string fragments.
    url = "https://api.openai.com/v1/embeddings"
    headers = CaseInsensitiveDict()
    headers["Content-Type"] = "application/json"
    headers["Authorization"] = "Bearer " + api_key
    data = json.dumps({"input": input_text_list, "model": "text-embedding-ada-002"})

    # Send the API request. Without a timeout, a stalled connection would hang forever.
    resp = requests.post(url, headers=headers, data=data, timeout=timeout)

    # If the request was successful, return ndarrays of the embeddings
    if resp.status_code == 200:
        return [np.asarray(data_object['embedding']) for data_object in resp.json()['data']]

    # Otherwise, return None objects (optionally printing what went wrong)
    if print_exceptions:
        try:
            print(resp.json())
        except ValueError:
            # The error body is not always JSON; fall back to the raw text
            print(resp.text)
    return [None for _ in input_text_list]
    
# Convenience wrapper: embed one string instead of a whole list
def generate_embedding(txt_input):
    """Return the embedding for txt_input.

    The string is sent through generate_embeddings as a one-item batch and the
    single result is unwrapped, so the return value is whatever that call
    produced for the item (None if the request did not succeed).
    """
    single_item_batch = [txt_input]
    batch_result = generate_embeddings(single_item_batch)
    return batch_result[0]
    
# Cosine similarity between two vectors
def cosine_sim(a, b):
    """Return the dot product of a and b divided by the product of their norms."""
    inner_product = dot(a, b)
    magnitude_product = norm(a) * norm(b)
    return inner_product / magnitude_product

# Whole-Video Search Prototype

In [None]:
# Indicate the search string, and then generate an embedding for it
# (search_txt_emb will be None if the embeddings request does not return HTTP 200)
search_txt = "taylor swift"
search_txt_emb = generate_embedding(search_txt)

Now that we have this embedding, we can search the Fantano videos! 

In [None]:
# Score every video that has a whole-video embedding against the search embedding
def _similarity_to_search(video_embedding):
    """Cosine similarity between the current search embedding and one video embedding."""
    return cosine_sim(search_txt_emb, video_embedding)


has_embedding_mask = tnd_data_df["whole_video_embedding"].notna()
tnd_data_with_embs_df = tnd_data_df[has_embedding_mask].copy()

# assign() works on a copy, so tnd_data_with_embs_df itself is left untouched
search_result_sim = tnd_data_with_embs_df.assign(
    cosine_sim_to_search=tnd_data_with_embs_df["whole_video_embedding"].apply(_similarity_to_search)
).sort_values("cosine_sim_to_search", ascending=False)

# Render the top_n most similar videos as Markdown links, best match first
top_n = 10
for rank, video_row in enumerate(search_result_sim.head(top_n).itertuples(), start=1):
    result_markdown = f"**#{rank}:** [{video_row.video_title}]({video_row.video_url})<br>Similarity: {video_row.cosine_sim_to_search:.3f}<br>"
    display(Markdown(result_markdown))

# Segment Search Prototype
Next, I want to try and enable searching for videos on a segment level. 

I'll start again by specifying my search string, and generating the embedding for it. 

In [None]:
# Indicate the search string, and then generate an embedding for it
# (search_txt_emb will be None if the embeddings request does not return HTTP 200)
search_txt = "radiohead hyperpop meteorite godsend"
search_txt_emb = generate_embedding(search_txt)

Next, I'll iterate through each video's segments, and determine the similarity between that video's segments and the `search_txt_emb`. 

In [None]:
# Score every segment against the search embedding
def _segment_similarity_to_search(segment_embedding):
    """Cosine similarity between the current search embedding and one segment embedding."""
    return cosine_sim(search_txt_emb, segment_embedding)


segment_emb_sim_df = segment_emb_df.assign(
    cosine_sim_to_search=segment_emb_df["embedding"].apply(_segment_similarity_to_search))

# Most similar segments first
sorted_segment_emb_sim_df = segment_emb_sim_df.sort_values(
    by="cosine_sim_to_search", ascending=False)

# Mean segment similarity per video, ordered from most to least similar
mean_sim_per_video = sorted_segment_emb_sim_df.groupby("video_id")["cosine_sim_to_search"].mean()
avg_segment_sim_df = mean_sim_per_video.reset_index().sort_values(
    by="cosine_sim_to_search", ascending=False)

# Attach the title/URL columns and render the n_to_show best videos
n_to_show = 5
videos_to_show = avg_segment_sim_df.merge(tnd_data_df, on="video_id").head(n_to_show)
for rank, video_row in enumerate(videos_to_show.itertuples(), start=1):
    result_markdown = f"**#{rank}:** [{video_row.video_title}]({video_row.video_url})<br>Similarity: {video_row.cosine_sim_to_search}"
    display(Markdown(result_markdown))

I also want to see the highest similarity segments: 

In [None]:
# Render the n_to_show most similar segments, with a link back to their video
n_to_show = 3
segments_with_video_info = sorted_segment_emb_sim_df.merge(tnd_data_df, on="video_id")
for segment_rank, segment_row in enumerate(segments_with_video_info.head(n_to_show).itertuples(), start=1):
    segment_markdown = f"**Segment #{segment_rank}** (from [{segment_row.video_title}]({segment_row.video_url}))<br>Similarity: {segment_row.cosine_sim_to_search}<br>{segment_row.text}<br><br>"
    display(Markdown(segment_markdown))

Finally, another thing I can check out: the single highest-similarity segment from each video. 

In [None]:
# For every video, keep only the segment(s) whose similarity equals that video's
# maximum. The string alias "max" is used instead of the Python builtin because
# passing builtins to transform is deprecated in newer pandas.
max_sim_per_video = sorted_segment_emb_sim_df.groupby(
    "video_id")["cosine_sim_to_search"].transform("max")
high_sim_segments_per_video_df = sorted_segment_emb_sim_df[
    max_sim_per_video == sorted_segment_emb_sim_df["cosine_sim_to_search"]].copy()

# The source frame is already sorted by similarity, so the first rows here are
# the best segments of the best-matching videos
n_to_show = 5
for idx, row in enumerate(high_sim_segments_per_video_df.merge(tnd_data_df, on="video_id").head(n_to_show).itertuples()):
    markdown_str = f"**Segment #{idx+1}** (from [{row.video_title}]({row.video_url}))<br>Similarity: {row.cosine_sim_to_search}<br>{row.text}<br><br>"
    display(Markdown(markdown_str))

### "Better" Segment Search
Another idea that I had: what if I grabbed the top 10 most similar segments from each video, and then got the average segment similarity for that video? That would filter out brief mentions, and return videos that were more generally relevant to the query. 

In [None]:
# Indicate the search string, and then generate an embedding for it
search_txt = "masterful meteorite disintegration"
search_txt_emb = generate_embedding(search_txt)

# Calculate the similarity between the segment embeddings and the search embedding
segment_emb_sim_df = segment_emb_df.copy()
segment_emb_sim_df["cosine_sim_to_search"] = segment_emb_sim_df["embedding"].apply(
    lambda x: cosine_sim(search_txt_emb, x))

# Sort this DataFrame by the similarity to the search embedding
sorted_segment_emb_sim_df = segment_emb_sim_df.sort_values(
    "cosine_sim_to_search", ascending=False).copy()

# Grab the n_top_segments most similar segments of each video. The frame is already
# sorted by similarity, so groupby.head picks exactly those rows. Unlike
# groupby.apply, it always keeps the "video_id" column, which the next step
# groups on. The result is then laid out video by video, best segment first.
n_top_segments = 10
grouped_sorted_segment_df = sorted_segment_emb_sim_df.groupby("video_id")
top_segments_per_video = grouped_sorted_segment_df.head(n_top_segments).sort_values(
    ["video_id", "cosine_sim_to_search"], ascending=[True, False]).reset_index(drop=True)

# Average those top segments per video, rank the videos by that average, and
# attach the title/URL columns
top_videos_by_top_segments = top_segments_per_video.groupby("video_id")[
    "cosine_sim_to_search"].mean().reset_index().sort_values(
    "cosine_sim_to_search", ascending=False).merge(tnd_data_df, on="video_id").copy()

# NOTE(review): the label below says "Segment" although each row is a video -- confirm intended wording
n_to_show = 5
for idx, row in enumerate(top_videos_by_top_segments.head(n_to_show).itertuples()):
    markdown_str = f"**Segment #{idx+1}** (from [{row.video_title}]({row.video_url}))<br>Similarity: {row.cosine_sim_to_search}"
    display(Markdown(markdown_str))

# Main Method
Now that I've done a couple of different experiments, I'm going to try and create a "main" method. When given a textual input, this method ought to search the entirety of the video corpus. 

This main method will assume that all of the data has been loaded. 

I'm going to start by embedding the user's input:

In [None]:
# Indicate the search string, and then generate an embedding for it
# (search_txt_emb will be None if the embeddings request does not return HTTP 200)
search_txt = "masterful meteorite disintegration"
search_txt_emb = generate_embedding(search_txt)

Next, I'm going to search each of the segments. 

In [None]:
# Calculate the similarity between the segment embeddings and the search embedding
segment_emb_sim_df = segment_emb_df.copy()
segment_emb_sim_df["cosine_sim_to_search"] = segment_emb_sim_df["embedding"].apply(
    lambda x: cosine_sim(search_txt_emb, x))

# Sort this DataFrame by the similarity to the search embedding
sorted_segment_emb_sim_df = segment_emb_sim_df.sort_values(
    "cosine_sim_to_search", ascending=False).copy()

# Grab the n_top_segments most similar segments of each video. The frame is already
# sorted by similarity, so groupby.head picks exactly those rows. Unlike
# groupby.apply, it always keeps the "video_id" column, which the next step
# groups on. The result is then laid out video by video, best segment first.
n_top_segments = 10
grouped_sorted_segment_df = sorted_segment_emb_sim_df.groupby("video_id")
top_segments_per_video = grouped_sorted_segment_df.head(n_top_segments).sort_values(
    ["video_id", "cosine_sim_to_search"], ascending=[True, False]).reset_index(drop=True)

# Average those top segments per video, rank the videos by that average, and
# attach the title/URL columns
top_videos_by_top_segments = top_segments_per_video.groupby("video_id")[
    "cosine_sim_to_search"].mean().reset_index().sort_values(
    "cosine_sim_to_search", ascending=False).merge(tnd_data_df, on="video_id").copy()