## In this experiment we're searching a prepared corpus based on similarity between the query and the pre-processed embeddings of the corpus sections
* Things to try this time:
  * Using a vector database for storing/querying prepared embeddings
  * Including URLs with the prepared embedding vectors so that query responses can link to the source document
* Stretch
  * Analyze the similarity of the responses to understand whether there are multiple interpretations of the query/response

In [1]:
import numpy as np
import openai
import pandas as pd
import pickle
import tiktoken

# Names of the OpenAI models used by this notebook.
# NOTE(review): COMPLETIONS_MODEL is not referenced in the cells shown here;
# presumably it is used for completion requests elsewhere — confirm.
COMPLETIONS_MODEL = "text-davinci-003"
# Default model name for get_embedding().
EMBEDDING_MODEL = "text-embedding-ada-002"

In [2]:
# This dataset has already been split into sections, one row for each section of the Wikipedia page.

# Read the sections CSV and index the rows by their (title, heading) pair.
df = pd.read_csv('olympics-data/olympics_sections.csv')
df = df.set_index(["title", "heading"])
print(f"{len(df)} rows in the data.")
# Show five randomly sampled rows as a quick look at the data.
df.sample(5)

3941 rows in the data.


Unnamed: 0_level_0,Unnamed: 1_level_0,content,tokens
title,heading,Unnamed: 2_level_1,Unnamed: 3_level_1
Ukraine at the 2020 Summer Olympics,Shooting,Ukrainian shooters achieved quota places for t...,66
Italy at the 2020 Summer Olympics,Sport climbing,Italy entered three sport climbers into the Ol...,151
Aruba at the 2020 Summer Olympics,Shooting,Aruba received an invitation from the Triparti...,54
2020 Summer Olympics opening ceremony,Let the Games Begin & Time to Shine,A video sequence showed the history of the Oly...,374
Tajikistan at the 2020 Summer Olympics,Swimming,Tajikistan received a universality invitation ...,50


In [10]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list:
    """
    Request an embedding for the given text from the OpenAI Embeddings API.

    model is the name of the embedding model to use; it defaults to EMBEDDING_MODEL.

    Returns the "embedding" field of the first item in the response's "data" list.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

def compute_doc_embeddings(df: pd.DataFrame) -> dict:
    """
    Create an embedding for the content of every row in the dataframe by calling get_embedding.

    Return a dictionary that maps each row's index to the embedding computed for that row.
    """
    embeddings_by_index = {}
    for row_index, row in df.iterrows():
        # One get_embedding call per row, in dataframe order.
        embeddings_by_index[row_index] = get_embedding(row.content)
    return embeddings_by_index

def load_embeddings(fname: str) -> dict:
    """
    Load the document embeddings and their keys from a CSV.

    fname is the path to a CSV whose header row has exactly these columns:
        "title", "heading", "0", "1", ... one per embedding dimension.

    Return a dictionary that maps each (title, heading) pair to its embedding,
    a list of values ordered by dimension number.
    """
    table = pd.read_csv(fname, header=0)

    # Every column other than the two key columns is named after a dimension number.
    dimension_numbers = [
        int(name) for name in table.columns if name not in ("title", "heading")
    ]
    # Columns are looked up by name in numeric order, so their order in the file does not matter.
    dimension_columns = [str(i) for i in range(max(dimension_numbers) + 1)]

    embeddings = {}
    for _, row in table.iterrows():
        key = (row["title"], row["heading"])
        embeddings[key] = [row[column] for column in dimension_columns]
    return embeddings


In [11]:
# Load the document embeddings from CSV into a dict keyed by (title, heading).
document_embeddings = load_embeddings('olympics-data/olympics_sections_document_embeddings.csv')


In [12]:
# An example embedding: take the first (key, vector) pair and print the key,
# the first five components of the vector, and the vector's total length.
example_entry = list(document_embeddings.items())[0]
print(f"{example_entry[0]} : {example_entry[1][:5]}... ({len(example_entry[1])} entries)")

('2020 Summer Olympics', 'Summary') : [0.0037565305829048, -0.0061981128528714, -0.0087078781798481, -0.0071364338509738, -0.0025227521546185]... (1536 entries)
