In [76]:
import openai
from string import Template
import pandas as pd
import numpy as np
import re
from typing import Set
from transformers import GPT2TokenizerFast

import numpy as np
from nltk.tokenize import sent_tokenize
import tiktoken

import nltk

# Only fall back to the local key file when no credential has been configured on the
# openai module yet, so the notebook is not tied to one machine's absolute path.
# NOTE(review): this assumes the installed openai package populates `api_key` /
# `api_key_path` from OPENAI_API_KEY / OPENAI_API_KEY_PATH at import time — confirm.
if not (getattr(openai, "api_key", None) or getattr(openai, "api_key_path", None)):
    openai.api_key_path = '/home/tim/projects/openai/apikey.txt'

# Model used to generate the final answer from the assembled prompt.
COMPLETIONS_MODEL = "text-davinci-003"
# Model used to embed both the document sections and the question.
EMBEDDING_MODEL = "text-embedding-ada-002"

## First, convert the processed text sections into embedding vectors


In [16]:
# Load a dataset prepared by one of the "Extract data from wikipedia" notebooks.
# It is already split into sections: one row per section of a Wikipedia page.
# Indexing by (title, heading) lets later cells look a section up by that pair.

df = pd.read_csv('olympics-data/olympics_sections.csv').set_index(["title", "heading"])
print(f"{len(df)} rows in the data.")
df.sample(5)

3941 rows in the data.


Unnamed: 0_level_0,Unnamed: 1_level_0,content,tokens
title,heading,Unnamed: 2_level_1,Unnamed: 3_level_1
Swimming at the 2020 Summer Olympics – Women's 800 metre freestyle,Competition format,The competition consisted of two rounds: heats...,48
Germany at the 2020 Summer Olympics,BMX,Germany received two quota spots for BMX at th...,59
Badminton at the 2020 Summer Olympics – Men's singles,Background,This was the 8th appearance of the event as a ...,106
Tunisia at the 2020 Summer Olympics,Tennis,Tunisia entered one tennis player into the Oly...,65
Athletics at the 2020 Summer Olympics – Women's 4 × 100 metres relay,Final,"Setting a national record, Jamaica won the gol...",50


In [17]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL):
    """
    Request an embedding for `text` from the OpenAI Embeddings API.

    `model` selects the embedding model. The "embedding" field of the first item
    in the response's "data" list is returned.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

def compute_doc_embeddings(df: pd.DataFrame):
    """
    Embed the `content` of every row of `df` with get_embedding().

    Return a dictionary mapping each row's index value to the embedding computed
    for that row. One embedding request is made per row.
    """
    embeddings_by_index = {}
    for row_index, row in df.iterrows():
        embeddings_by_index[row_index] = get_embedding(row.content)
    return embeddings_by_index

def load_embeddings(fname: str):
    """
    Read the document embeddings and their keys from a CSV.

    fname is anything pandas.read_csv accepts, pointing at a CSV with the columns
        "title", "heading", "0", "1", ... up to the length of the embedding vectors.
    Any other column whose name is not a plain non-negative integer (for example the
    unnamed index column that DataFrame.to_csv writes by default) is ignored.

    Returns a dictionary mapping (title, heading) to the embedding as a list of
    values ordered by dimension. If a key occurs more than once, the last row wins.

    Raises ValueError if the file has no numeric dimension columns, and KeyError if
    a dimension between 0 and the largest one found is missing.
    """

    df = pd.read_csv(fname, header=0)

    # Only purely numeric column names are embedding dimensions; calling int() on
    # every remaining column would fail on extras such as "Unnamed: 0".
    dim_columns = [c for c in df.columns if str(c).isascii() and str(c).isdigit()]
    if not dim_columns:
        raise ValueError(f"No embedding dimension columns found in {fname!r}")

    max_dim = max(int(c) for c in dim_columns)
    ordered_columns = [str(i) for i in range(max_dim + 1)]

    # Slice all dimensions at once instead of indexing every row dimension by dimension.
    vectors = df[ordered_columns].to_numpy().tolist()
    return dict(zip(zip(df["title"], df["heading"]), vectors))

In [None]:
# Option 1 (disabled): load precomputed embeddings instead of calling the API.
# document_embeddings = load_embeddings("https://cdn.openai.com/API/examples/data/olympics_sections_document_embeddings.csv")

# ===== OR, use the line below to recalculate the embeddings from scratch. ========
# Note: this makes one Embeddings API request per row of df.

document_embeddings  = compute_doc_embeddings(df)



In [28]:
# document_embeddings is a dictionary with the tuple (title, heading) as key and an embedding vector as value.
# At first it was not clear to me why all of the vectors have the same length (1536).
#
# The explanation: the output dimension of the ada-002 embeddings model is 1536,
# see https://beta.openai.com/docs/guides/embeddings/what-are-embeddings
# I'd like to understand better how that works.

# Take the first key in the dictionary and show its vector and length.
stuff = iter(document_embeddings)
first_vector_key = next(stuff)
print(f"Example section vector for '{first_vector_key}', length: {len(document_embeddings[first_vector_key])}")
document_embeddings[first_vector_key]


Example section vector for '('2020 Summer Olympics', 'Summary')', length: 1536


[0.006666478700935841,
 -0.010420418344438076,
 -0.011656628921627998,
 -0.004915718920528889,
 -0.0018009201157838106,
 0.0145626962184906,
 -0.02350742742419243,
 -0.004572686739265919,
 -0.007825021632015705,
 -0.02884061075747013,
 0.01676328107714653,
 0.00819394364953041,
 -0.01427791453897953,
 -0.0194169282913208,
 -0.02067255601286888,
 -0.013786018826067448,
 0.025513842701911926,
 -0.01636199839413166,
 -0.003211883595213294,
 -0.015274649485945702,
 -0.026407022029161453,
 0.008718200959265232,
 -0.0058930376544594765,
 0.014303803443908691,
 -0.008064497262239456,
 -0.0011973772197961807,
 0.015132259577512741,
 -0.017086897045373917,
 0.033992569893598557,
 -0.01996060274541378,
 -0.012925202026963234,
 0.011197094805538654,
 -0.009572545066475868,
 0.010498085990548134,
 -0.0016763281309977174,
 -0.0170480627566576,
 -0.01631021872162819,
 -0.013863686472177505,
 0.005093707237392664,
 -0.015404095873236656,
 0.007773243356496096,
 0.02104794979095459,
 -0.00685417558997

In [33]:
# Looking at these embeddings: one row per (title, heading) key.
# Note: the 'tokens' column holds len() of the embedding vector, i.e. the number of
# dimensions of each embedding, not a token count of the section text.
r = [{'title': t[0], 'heading': t[1], 'tokens': len(document_embeddings[t])} for t in document_embeddings]
pd.DataFrame(r)

Unnamed: 0,title,heading,tokens
0,2020 Summer Olympics,Summary,1536
1,2020 Summer Olympics,Host city selection,1536
2,2020 Summer Olympics,Impact of the COVID-19 pandemic,1536
3,2020 Summer Olympics,Qualifying event cancellation and postponement,1536
4,2020 Summer Olympics,Effect on doping tests,1536
...,...,...,...
3936,Serbia at the 2020 Summer Olympics,Table tennis,1536
3937,Serbia at the 2020 Summer Olympics,Taekwondo,1536
3938,Serbia at the 2020 Summer Olympics,Tennis,1536
3939,Serbia at the 2020 Summer Olympics,Wrestling,1536


In [65]:
# Convert the document embeddings into a form that can be saved as a csv:
# one row per section, with "title" and "heading" followed by one column per embedding dimension.
r = [{**{'title': t[0], 'heading': t[1]}, **dict(enumerate(document_embeddings[t], 0))} for t in document_embeddings]
embeddings = pd.DataFrame(r)
# index=False keeps the DataFrame's row index out of the file, so the CSV contains only
# the columns described in load_embeddings' docstring (title, heading, 0..N-1).
embeddings.to_csv('olympics-data/my_olympics_sections_embeddings.csv', index=False)
embeddings

Unnamed: 0,title,heading,0,1,2,3,4,5,6,7,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,2020 Summer Olympics,Summary,0.006666,-0.010420,-0.011657,-0.004916,-0.001801,0.014563,-0.023507,-0.004573,...,0.007676,0.001538,-0.006705,-0.025902,-0.033734,-0.015132,0.019637,-0.015171,0.000244,-0.037358
1,2020 Summer Olympics,Host city selection,0.009945,-0.012757,-0.010687,-0.004917,-0.025436,0.019487,-0.035511,-0.007680,...,0.003066,0.005080,0.006769,-0.044884,-0.037646,-0.025970,0.013629,-0.013994,0.000980,-0.022012
2,2020 Summer Olympics,Impact of the COVID-19 pandemic,-0.003252,-0.021974,-0.007119,0.002855,-0.013825,0.008023,-0.027079,0.006321,...,-0.009239,-0.017135,0.008993,-0.027132,-0.029352,-0.027837,0.005577,-0.014982,-0.004633,-0.029246
3,2020 Summer Olympics,Qualifying event cancellation and postponement,-0.008643,-0.037240,0.002775,-0.011113,-0.028577,0.018712,-0.018976,-0.018105,...,0.015701,-0.016560,0.005345,-0.020905,-0.037002,-0.018752,-0.006880,-0.010776,-0.007309,-0.020165
4,2020 Summer Olympics,Effect on doping tests,-0.006274,-0.022417,-0.003617,-0.024952,-0.018364,0.008740,-0.020054,-0.019882,...,0.001642,0.005449,0.034273,-0.026866,-0.021216,-0.032794,-0.024173,-0.019698,-0.001923,0.006852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3936,Serbia at the 2020 Summer Olympics,Table tennis,0.011825,-0.010692,0.016339,-0.016824,0.000577,0.036243,-0.025302,0.006214,...,0.022013,-0.008930,0.002283,-0.028722,-0.032862,-0.008189,0.005372,-0.011996,-0.010417,-0.008222
3937,Serbia at the 2020 Summer Olympics,Taekwondo,-0.008083,-0.012065,-0.002836,-0.017353,0.005135,0.021235,-0.038628,-0.015606,...,0.011018,0.001516,0.012471,-0.048872,-0.025757,-0.012451,0.010017,-0.013025,-0.026744,-0.009317
3938,Serbia at the 2020 Summer Olympics,Tennis,-0.007109,-0.023274,-0.002258,-0.002707,-0.016719,0.035728,-0.035491,-0.003821,...,0.009360,-0.007102,0.005838,-0.032647,-0.036438,0.008800,-0.007602,-0.014876,-0.013836,-0.008833
3939,Serbia at the 2020 Summer Olympics,Wrestling,0.035901,-0.003644,-0.002491,-0.041342,-0.011929,0.019311,-0.033580,0.006331,...,0.007488,-0.017777,-0.005695,-0.031953,-0.036274,0.019471,0.001149,-0.008535,0.000156,-0.008849


## Compare the embedding of a question vs the store of embeddings created above 

In [70]:
def vector_similarity(x, y) -> float:
    """
    Compute the dot product of two vectors as their similarity score.

    OpenAI Embeddings are normalized to length 1, so for them the dot product
    equals the cosine similarity.
    """
    left = np.array(x)
    right = np.array(y)
    return left.dot(right)

def order_document_sections_by_query_similarity(query: str, contexts):
    """
    Rank the pre-calculated document embeddings by similarity to `query`.

    The query is embedded with get_embedding() and scored against every vector in
    `contexts` (a mapping of section key -> embedding) with vector_similarity().

    Returns a list of (similarity, section key) tuples, most similar first.
    """
    query_vector = get_embedding(query)

    scored_sections = []
    for section_key, section_vector in contexts.items():
        score = vector_similarity(query_vector, section_vector)
        scored_sections.append((score, section_key))

    # Tuples sort by similarity first; equal scores fall back to comparing the keys.
    scored_sections.sort(reverse=True)
    return scored_sections

In [71]:
# Show the five best-matching sections for a specific question as (similarity, (title, heading)) tuples.
order_document_sections_by_query_similarity("Who won the men's high jump?", document_embeddings)[:5]

[(0.884864308450607,
  ("Athletics at the 2020 Summer Olympics – Men's high jump", 'Summary')),
 (0.8633938355935528,
  ("Athletics at the 2020 Summer Olympics – Men's pole vault", 'Summary')),
 (0.8617234695303775,
  ("Athletics at the 2020 Summer Olympics – Men's long jump", 'Summary')),
 (0.8560523857031269,
  ("Athletics at the 2020 Summer Olympics – Men's triple jump", 'Summary')),
 (0.8469039130441249,
  ("Athletics at the 2020 Summer Olympics – Men's 110 metres hurdles",
   'Summary'))]

In [72]:
# Same ranking for a vaguer question; the question does not name an event, so compare the matches with the previous cell.
order_document_sections_by_query_similarity("How many different countries competed?", document_embeddings)[:5]

[(0.8619061430358894,
  ('Softball at the 2020 Summer Olympics – Qualification',
   "2018 Women's Softball World Championship")),
 (0.8586178000819126,
  ("Athletics at the 2020 Summer Olympics – Men's 110 metres hurdles",
   'Background')),
 (0.856590380396488,
  ("Athletics at the 2020 Summer Olympics – Men's 400 metres hurdles",
   'Background')),
 (0.8562393272970783,
  ("Athletics at the 2020 Summer Olympics – Men's 100 metres", 'Background')),
 (0.8552774624239372, ('Beach volleyball at the Summer Olympics', '1996'))]

In [77]:
# Token budget for the context sections placed in a prompt (checked in construct_prompt).
MAX_SECTION_LEN = 500
# Text placed in front of every context section; renders each section as a bullet.
SEPARATOR = "\n* "
# NOTE(review): the finished prompt is sent to COMPLETIONS_MODEL, whose tokenizer may
# differ from this encoding — confirm the separator count is still what you want.
ENCODING = "cl100k_base"  # encoding for text-embedding-ada-002

encoding = tiktoken.get_encoding(ENCODING)
# Token cost of one separator; construct_prompt adds it to the budget once per section.
separator_len = len(encoding.encode(SEPARATOR))

f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [78]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Build a completion prompt: instructions, relevant context sections, then the question.

    Sections are ranked with order_document_sections_by_query_similarity() and taken
    in that order. Each one adds its `tokens` value plus separator_len to a running
    total; selection stops at the first section that would push the total past
    MAX_SECTION_LEN (that section is not included).

    The number and index keys of the selected sections are printed as diagnostics.
    """
    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    context_parts = []
    selected_keys = []
    tokens_used = 0

    for _score, section_key in ranked_sections:
        row = df.loc[section_key]

        # Charge the section (and its separator) before deciding whether it fits.
        tokens_used += row.tokens + separator_len
        if tokens_used > MAX_SECTION_LEN:
            break

        # Flatten newlines so each section stays on a single bullet line.
        single_line_content = row.content.replace("\n", " ")
        context_parts.append(SEPARATOR + single_line_content)
        selected_keys.append(str(section_key))

    # Useful diagnostic information
    selected_count = len(context_parts)
    print(f"Selected {selected_count} document sections:")
    print("\n".join(selected_keys))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""

    return "".join([header, *context_parts, "\n\n Q: ", question, "\n A:"])

In [79]:
# Build the prompt for a specific question and print it to inspect the selected context.
prompt = construct_prompt(
    "Who won the 2020 Summer Olympics men's high jump?",
    document_embeddings,
    df
)

print("===\n", prompt)

Selected 2 document sections:
("Athletics at the 2020 Summer Olympics – Men's high jump", 'Summary')
("Athletics at the 2020 Summer Olympics – Men's long jump", 'Summary')
===
 Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* The men's high jump event at the 2020 Summer Olympics took place between 30 July and 1 August 2021 at the Olympic Stadium. 33 athletes from 24 nations competed; the total possible number depended on how many nations would use universality places to enter athletes in addition to the 32 qualifying through mark or ranking (no universality places were used in 2021). Italian athlete Gianmarco Tamberi along with Qatari athlete Mutaz Essa Barshim emerged as joint winners of the event following a tie between both of them as they cleared 2.37m. Both Tamberi and Barshim agreed to share the gold medal in a rare instance where the athletes of different nations h

In [80]:
# Build the prompt for a vaguer question; the selected sections come from several unrelated events.
prompt = construct_prompt(
    "How many different countries competed?",
    document_embeddings,
    df
)

print("===\n", prompt)

Selected 5 document sections:
('Softball at the 2020 Summer Olympics – Qualification', "2018 Women's Softball World Championship")
("Athletics at the 2020 Summer Olympics – Men's 110 metres hurdles", 'Background')
("Athletics at the 2020 Summer Olympics – Men's 400 metres hurdles", 'Background')
("Athletics at the 2020 Summer Olympics – Men's 100 metres", 'Background')
('Beach volleyball at the Summer Olympics', '1996')
===
 Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* The World Championships featured 16 teams, with the winner earning an Olympic qualification spot (awarded to the runner-up instead if Japan wins). The United States advanced to the final against already qualified host Japan, securing Olympic qualification.
* This was the 29th appearance of the event, which is one of 12 athletics events to have been held at every Summer Olympics. Comoros and Hong Kong co

## Use the found context to answer the question

In [86]:
# Keyword arguments passed to every openai.Completion.create call in this notebook.
COMPLETIONS_API_PARAMS = dict(
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    temperature=0.0,
    max_tokens=1200,
    model=COMPLETIONS_MODEL,
)

def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings,
    show_prompt: bool = False
) -> str:
    """
    Answer `query` using context retrieved from the embedded document sections.

    The prompt is assembled by construct_prompt() from `document_embeddings` and `df`,
    optionally printed when `show_prompt` is true, and sent to the Completions API
    with COMPLETIONS_API_PARAMS.

    Returns the text of the first choice with leading/trailing spaces and newlines removed.
    """
    prompt = construct_prompt(query, document_embeddings, df)

    if show_prompt:
        print(prompt)

    completion = openai.Completion.create(prompt=prompt, **COMPLETIONS_API_PARAMS)
    first_choice = completion["choices"][0]
    return first_choice["text"].strip(" \n")

In [83]:
# End-to-end check: retrieve context for a specific question and ask the completions model.
answer_query_with_context("Who won the 2020 Summer Olympics men's high jump?", df, document_embeddings)

Selected 2 document sections:
("Athletics at the 2020 Summer Olympics – Men's high jump", 'Summary')
("Athletics at the 2020 Summer Olympics – Men's long jump", 'Summary')


'Gianmarco Tamberi and Mutaz Essa Barshim emerged as joint winners of the event following a tie between both of them as they cleared 2.37m. Both Tamberi and Barshim agreed to share the gold medal.'

In [84]:
# Vague question: the selected sections come from unrelated events, so treat the recorded number with caution.
answer_query_with_context("How many different countries competed?", df, document_embeddings)

Selected 5 document sections:
('Softball at the 2020 Summer Olympics – Qualification', "2018 Women's Softball World Championship")
("Athletics at the 2020 Summer Olympics – Men's 110 metres hurdles", 'Background')
("Athletics at the 2020 Summer Olympics – Men's 400 metres hurdles", 'Background')
("Athletics at the 2020 Summer Olympics – Men's 100 metres", 'Background')
('Beach volleyball at the Summer Olympics', '1996')


'46 countries.'

In [87]:
# NOTE(review): the recorded answer enumerates nations although only two sections were selected —
# check whether that list is actually supported by the retrieved context.
answer_query_with_context("Please list all the different countries that competed in the 2020 Summer olympics", df, document_embeddings)

Selected 2 document sections:
('List of gymnasts at the 2020 Summer Olympics', 'Summary')
('2020 Summer Olympics medal table', 'Summary')


"206 nations participated in the 2020 Summer Olympics: Afghanistan, Albania, Algeria, Andorra, Angola, Antigua and Barbuda, Argentina, Armenia, Australia, Austria, Azerbaijan, Bahamas, Bahrain, Bangladesh, Barbados, Belarus, Belgium, Belize, Benin, Bermuda, Bhutan, Bolivia, Bosnia and Herzegovina, Botswana, Brazil, Brunei, Bulgaria, Burkina Faso, Burundi, Cabo Verde, Cambodia, Cameroon, Canada, Central African Republic, Chad, Chile, China, Colombia, Comoros, Congo, Costa Rica, Cote d'Ivoire, Croatia, Cuba, Cyprus, Czech Republic, Democratic Republic of the Congo, Denmark, Djibouti, Dominica, Dominican Republic, Ecuador, Egypt, El Salvador, Equatorial Guinea, Eritrea, Estonia, Eswatini, Ethiopia, Fiji, Finland, France, Gabon, Gambia, Georgia, Germany, Ghana, Greece, Grenada, Guatemala, Guinea, Guinea-Bissau, Guyana, Haiti, Honduras, Hungary, Iceland, India, Indonesia, Iran, Iraq, Ireland, Israel, Italy, Jamaica, Japan, Jordan, Kazakhstan, Kenya, Kiribati, Kosovo, Kuwait, Kyrgyzstan, Lao

In [89]:
# The recorded run selected 0 sections. construct_prompt stops at the first section that would push
# the total past MAX_SECTION_LEN, so a too-large top-ranked section leaves the prompt without
# context — presumably what happened here; verify against that section's token count.
answer_query_with_context("Can you tell me which of the worlds countries did not compete in the 2020 Summer olympics?", df, document_embeddings)

Selected 0 document sections:



"I don't know."

In [91]:
# In the recorded run the top section is from the 2020 Winter Youth Olympics, and the answer follows
# that section rather than the Summer Olympics.
answer_query_with_context("In which events did Russia compete?", df, document_embeddings)

Selected 2 document sections:
('Russia at the 2020 Winter Youth Olympics', 'Summary')
('Russian Olympic Committee athletes at the 2020 Summer Olympics', 'Table tennis')


'Russia competed in 16 sports at the 2020 Winter Youth Olympics in Lausanne, Switzerland, including table tennis.'

In [None]:
# This is an interesting technique.  It looks like it would not be hard to keep track of which document
# each section comes from, and return a link, along with a highlighted section that answers the question.
# That would be a pretty standard search.