# Motivation
I wanted to experiment with using the OpenAI embedding endpoint to create embeddings for all of Fantano's videos! This notebook will establish a workflow for creating those embeddings. 

# Setup
The cells below will help to set up the rest of the notebook. 

I'll start by changing my working directory. 

In [1]:
# Move up one directory so the relative "data/..." paths used later in the notebook resolve
# from the parent folder.
# NOTE(review): the target is relative to the current directory, so running this cell a
# second time in the same kernel moves up one more level.
%cd ..

C:\Data\Personal Study\Programming\neural-needle-drop


Now, I'll import some libraries.

In [2]:
# Import statements
import requests
import os
from requests.structures import CaseInsensitiveDict
import numpy as np
import json
from numpy import dot
from numpy.linalg import norm
from tqdm import tqdm
from pathlib import Path
import pandas as pd
import math
import traceback
from time import sleep

# Methods
I've copied the methods below from the **OpenAI Embedding Experiment** notebook! They're going to help with getting the embeddings. 

In [3]:
def generate_embeddings(input_text_list, print_exceptions=False, timeout=300):
    """Return one embedding per text in ``input_text_list`` using the OpenAI embeddings endpoint.

    Parameters
    ----------
    input_text_list : list of str
        The texts to embed; they are all sent in a single request.
    print_exceptions : bool, default False
        When True, print the error details if the request fails.
    timeout : float, default 300
        Seconds to wait for the HTTP request before giving up, so a stalled
        connection cannot hang the notebook forever.

    Returns
    -------
    list
        On success, a list of ndarrays, one per input text and in input order.
        On any failure (network error or non-200 status), a list of ``None``
        with one entry per input text.
    """
    # Nothing to embed, so there is no reason to make a request
    if not input_text_list:
        return []

    # Get the OpenAI API key from the environment variables
    api_key = os.getenv("OPENAI_API_KEY", "")

    # Build the API request; serialising the whole payload at once avoids hand-assembled JSON
    url = "https://api.openai.com/v1/embeddings"
    headers = CaseInsensitiveDict()
    headers["Content-Type"] = "application/json"
    headers["Authorization"] = "Bearer " + api_key
    data = json.dumps({"input": input_text_list, "model": "text-embedding-ada-002"})

    # Send the API request; connection problems are reported the same way as API errors
    try:
        resp = requests.post(url, headers=headers, data=data, timeout=timeout)
    except requests.RequestException as request_error:
        if print_exceptions:
            print(request_error)
        return [None for _ in input_text_list]

    # If the request was successful, return ndarrays of the embeddings
    if resp.status_code == 200:
        # Each returned object is expected to carry an "index" naming the input it belongs to;
        # sorting on it (stable, defaulting to 0) keeps outputs aligned with inputs
        embedding_objects = sorted(resp.json()["data"], key=lambda obj: obj.get("index", 0))
        return [np.asarray(obj["embedding"]) for obj in embedding_objects]

    # Otherwise, return None objects
    if print_exceptions:
        try:
            print(resp.json())
        except ValueError:
            # The error body was not JSON (e.g. a proxy/gateway error page)
            print(resp.status_code, resp.text)
    return [None for _ in input_text_list]
    
def cosine_sim(a, b):
    """Return the cosine similarity of the two ndarrays ``a`` and ``b``."""
    inner_product = dot(a, b)
    magnitude_product = norm(a) * norm(b)
    return inner_product / magnitude_product

# Generating Embeddings
Now that I've got the methods in place to generate embeddings, I want to get embeddings for each of the reviews. 

### Loading Data
I'll start by loading in all of the data into a DataFrame. 

In [None]:
# Create a DataFrame containing all of the data scraped for each of the videos
def _load_json_or_empty(json_path):
    """Return the parsed JSON at ``json_path``, or ``{}`` if the file is missing or unreadable."""
    try:
        with open(json_path, "r") as json_file:
            return json.load(json_file)
    except (OSError, ValueError):
        # OSError: the file is absent/inaccessible; ValueError: it is not valid JSON.
        # Either way this part of the video's data is treated as not available.
        return {}


# Rough characters-per-token ratio used to estimate how many tokens a transcription has
APPROX_CHARS_PER_TOKEN = 3.5

scraping_dir = Path("data/theneedledrop_scraping/")
tnd_data_df_records = []
for child_dir in tqdm(list(scraping_dir.iterdir())):

    # Only folders hold per-video files; anything else in here is not a video
    if not child_dir.is_dir():
        continue

    # Extract the video ID from the name of the folder
    cur_video_id = child_dir.name

    # Load in the details.json and transcription.json files
    cur_details_dict = _load_json_or_empty(child_dir / "details.json")
    cur_transcription_dict = _load_json_or_empty(child_dir / "transcription.json")

    # Create a "record" for this video
    tnd_data_df_records.append({
        "video_id": cur_video_id,
        "details_dict": cur_details_dict,
        "transcription_dict": cur_transcription_dict
    })

# Now, we want to create a DataFrame from the tnd_data_df_records. Naming the columns keeps
# the column lookups below working even when no video folders were found.
tnd_data_df = pd.DataFrame(
    tnd_data_df_records,
    columns=["video_id", "details_dict", "transcription_dict"],
)

# Add a "transcription string" column
tnd_data_df["transcription_str"] = tnd_data_df["transcription_dict"].apply(
    lambda x: x['text'] if 'text' in x else None
)

# Add a couple of columns indicating how long each of the transcriptions are
tnd_data_df["transcription_length"] = tnd_data_df["transcription_str"].apply(
    lambda x: len(x) if x is not None else None
)
tnd_data_df["transcription_approx_tokens"] = tnd_data_df["transcription_str"].apply(
    lambda x: int(math.ceil(len(x) / APPROX_CHARS_PER_TOKEN)) if x is not None else None
)

# Determining how many segments are in a transcription
tnd_data_df["transcription_segment_amt"] = tnd_data_df["transcription_dict"].apply(
    lambda x: len(x['segments']) if 'segments' in x else None
)

### Simple Whole-Video Embeddings
First order of business: getting embeddings for the videos that we believe are under a certain number of tokens. According to [the OpenAI Tokenizer site](https://beta.openai.com/tokenizer), a token is *roughly* 4 characters. If I assume ~3.5 characters/token instead (a more conservative estimate, since it yields a higher token count), I can calculate the approximate number of tokens in each transcription. 

The [embeddings endpoint](https://beta.openai.com/docs/api-reference/embeddings/create) has a limit of 8192 tokens per text input. So, I'm going to try and get the embeddings for each transcription under ~7,500 tokens in length. 

In [None]:
# Subset the DataFrame to only show videos w/ at most 7,500 (approximate) tokens in their transcription
tnd_data_df_simple_token_subset = tnd_data_df.query("transcription_approx_tokens<=7500").copy()

# Leave blank transcriptions out of the requests: there is nothing to embed, and because
# generate_embeddings returns None for every text when a request fails, one bad input
# would cost the whole chunk its embeddings.
# NOTE(review): the embeddings endpoint is believed to reject empty strings -- confirm.
has_text = tnd_data_df_simple_token_subset["transcription_str"].map(lambda x: bool(x.strip())).astype(bool)
embeddable_df = tnd_data_df_simple_token_subset[has_text]

# Pacing used to space out requests.
# NOTE(review): 150000 is presumably the account's tokens-per-minute rate limit -- confirm.
tokens_per_minute = 150000
sleep_safety_factor = 1.1

# We're going to iterate through this DataFrame in chunks
master_emb_df_list = []
chunk_size = 20
chunk_amt = int(math.ceil(len(embeddable_df)/chunk_size))
for cur_chunk in tqdm(list(range(chunk_amt))):
    df_chunk = embeddable_df[(cur_chunk*chunk_size):((cur_chunk+1)*chunk_size)]
    transcription_list = [x.strip() for x in list(df_chunk["transcription_str"])]
    emb_list = generate_embeddings(transcription_list, print_exceptions=True)

    # Create a new DataFrame with these embeddings
    cur_df_chunk_with_embeddings = df_chunk[["video_id"]].copy()
    cur_df_chunk_with_embeddings["whole_video_embedding"] = emb_list
    master_emb_df_list.append(cur_df_chunk_with_embeddings)

    # Determine how long we ought to sleep after this request: the share of a minute that
    # this chunk's approximate tokens use up, padded by the safety factor
    total_approx_tokens = sum(df_chunk["transcription_approx_tokens"])
    sleep_amt = int(math.ceil(((total_approx_tokens/tokens_per_minute)*60)*sleep_safety_factor))
    sleep(sleep_amt)

# Now, make the master embedding DataFrame (pd.concat raises on an empty list, so fall back
# to an empty frame with the same columns when there was nothing to embed)
if master_emb_df_list:
    simple_token_subset_emb_df = pd.concat(master_emb_df_list)
else:
    simple_token_subset_emb_df = pd.DataFrame(columns=["video_id", "whole_video_embedding"])

Now, with these embeddings in hand, I'm going to save them. 

In [None]:
# Saving all of the whole-video embeddings, one JSON file per video folder
for row in tqdm(list(simple_token_subset_emb_df.itertuples())):

    # generate_embeddings returns None instead of an ndarray for every text in a failed
    # request, so there is nothing to write for those videos; report them and move on
    if not isinstance(row.whole_video_embedding, np.ndarray):
        print(f"Skipping {row.video_id}: no embedding was generated")
        continue

    video_folder = Path(f"data/theneedledrop_scraping/{row.video_id}/")
    with open(video_folder / "whole_video_embedding.json", "w") as json_file:
        json.dump(row.whole_video_embedding.tolist(), json_file, indent=2)

### Complex Whole-Video Embeddings

### Video Sequence Embeddings

In [None]:
# We're going to step through each of the videos in our tnd_data_df and generate embeddings for
# each "segment chunk".

# Results are kept across runs of this cell so an interrupted run can pick up where it left
# off, but on a fresh kernel the dictionary has to be created first.
if "video_segment_embedding_dict" not in globals():
    video_segment_embedding_dict = {}

# How many transcription segments are joined into one text, and how many of those texts are
# sent to the embeddings endpoint per request
segments_per_chunk = 4
chunk_size = 20

for row in tqdm(list(tnd_data_df.itertuples())):

    # If the transcription dictionary isn't complete, skip it
    if ("segments" not in row.transcription_dict):
        continue

    # If the video's already had its segments embedded, skip it
    if (row.video_id in video_segment_embedding_dict):
        continue

    # Grab a list of the segments associated with this video
    segment_list = row.transcription_dict["segments"]

    # A transcription without any segments has nothing to embed (and pd.concat below would
    # raise on an empty list), so skip it
    if not segment_list:
        continue

    # We're going to break up the segment list into chunks, keyed by (first id, last id)
    segment_chunks_dict = {}
    for segment_start in range(0, len(segment_list), segments_per_chunk):
        segment_list_chunk = segment_list[segment_start:(segment_start + segments_per_chunk)]
        segment_id_range = (segment_list_chunk[0]['id'], segment_list_chunk[-1]['id'])
        joined_segment_text = " ".join([segment['text'].strip() for segment in segment_list_chunk])
        segment_chunks_dict[segment_id_range] = joined_segment_text

    # Create a DataFrame out of the segment chunks we've found
    segment_chunk_df = pd.DataFrame([{"segment_range": key, "text": val} for key, val in segment_chunks_dict.items()])

    # Now, we're going to generate embeddings for these segment chunks, one request-sized
    # batch at a time
    master_emb_df_list = []
    for batch_start in range(0, len(segment_chunk_df), chunk_size):

        # Get the subset of the DataFrame corresponding with this batch
        cur_chunk_df = segment_chunk_df[batch_start:(batch_start + chunk_size)].copy()
        cur_chunk_text_list = [x.strip() for x in list(cur_chunk_df["text"])]

        # Grab the embeddings for this batch
        emb_list = generate_embeddings(cur_chunk_text_list, print_exceptions=True)

        # Add these embeddings to the cur_chunk_df
        cur_chunk_df["embedding"] = emb_list

        # Add this batch to the list that gets concatenated for the video
        master_emb_df_list.append(cur_chunk_df)

        # Determine how long to sleep, and then sleep: the approximate token share of a
        # minute (3.5 characters per token, 10% padding) plus a flat 5 seconds per request
        total_approx_tokens = len(" ".join(cur_chunk_text_list))/3.5
        sleep_amt = int(math.ceil(((total_approx_tokens/150000)*60)*1.1)) + 5
        sleep(sleep_amt)

    # Concatenate all of the batches together
    cur_video_segment_embedding_df = pd.concat(master_emb_df_list)
    video_segment_embedding_dict[row.video_id] = cur_video_segment_embedding_df

Now, we're going to save all of these embeddings. 

In [None]:
# Write each video's segment embeddings to a JSON file in that video's folder
for video_id, embedding_df in video_segment_embedding_dict.items():

    # generate_embeddings returns None for every text in a failed request, so a video can
    # have missing embeddings. (Comparing the column itself to None is never true, which is
    # why the check is done per element.) Report such videos and don't write a partial file.
    if embedding_df["embedding"].apply(lambda x: x is None).any():
        print(video_id)
        continue

    # ndarrays aren't JSON-serialisable, so convert them to lists; values that are already
    # lists (from an earlier run of this cell) are left alone
    embedding_df["embedding"] = embedding_df["embedding"].apply(lambda x: x.tolist() if (not isinstance(x, list) and x is not None) else x)

    # Figure out where to save this
    save_file_path = f"data/theneedledrop_scraping/{video_id}/video_segment_embeddings.json"
    embedding_df.to_json(save_file_path, orient="records", indent=2)