# OpenAI embedding

https://platform.openai.com/docs/guides/embeddings/what-are-embeddings?lang=python

In [1]:
from openai import OpenAI
client = OpenAI()

response = client.embeddings.create(
    input="Help me search LLM technologies",
    model="text-embedding-3-small"
)

print(response.data[0].embedding)

[0.01788897067308426, -0.004116015043109655, 0.02295375056564808, -0.017931293696165085, 0.007646547164767981, -0.029513979330658913, -0.04302946478128433, 0.03910743445158005, 0.013966940343379974, 0.018199346959590912, 0.017705567181110382, -0.00393613800406456, -0.03038867749273777, 0.011067742481827736, 0.010595123283565044, -0.004634484648704529, -0.006408568471670151, 0.0126478411257267, 0.03027581237256527, 0.09864328056573868, -0.013176891952753067, 0.018298102542757988, -0.003989042714238167, 0.017733782529830933, -0.023250019177794456, -0.020964518189430237, 0.017747890204191208, 0.00636624451726675, 0.03724517673254013, -0.013000541366636753, 0.04929342493414879, -0.029655059799551964, -0.03685015067458153, 0.016534600406885147, -0.011307578533887863, 0.0565449483692646, 0.0003943632764276117, 0.03676550090312958, -0.00959345418959856, 0.008366056717932224, -0.03337957710027695, -0.033040985465049744, -0.01574455201625824, -0.006461473647505045, -0.0011180606670677662, -0.02

In [2]:
embedding=response.data[0].embedding
len(embedding)

1536

In [14]:
embedding

[0.01788897067308426,
 -0.004116015043109655,
 0.02295375056564808,
 -0.017931293696165085,
 0.007646547164767981,
 -0.029513979330658913,
 -0.04302946478128433,
 0.03910743445158005,
 0.013966940343379974,
 0.018199346959590912,
 0.017705567181110382,
 -0.00393613800406456,
 -0.03038867749273777,
 0.011067742481827736,
 0.010595123283565044,
 -0.004634484648704529,
 -0.006408568471670151,
 0.0126478411257267,
 0.03027581237256527,
 0.09864328056573868,
 -0.013176891952753067,
 0.018298102542757988,
 -0.003989042714238167,
 0.017733782529830933,
 -0.023250019177794456,
 -0.020964518189430237,
 0.017747890204191208,
 0.00636624451726675,
 0.03724517673254013,
 -0.013000541366636753,
 0.04929342493414879,
 -0.029655059799551964,
 -0.03685015067458153,
 0.016534600406885147,
 -0.011307578533887863,
 0.0565449483692646,
 0.0003943632764276117,
 0.03676550090312958,
 -0.00959345418959856,
 0.008366056717932224,
 -0.03337957710027695,
 -0.033040985465049744,
 -0.01574455201625824,
 -0.006461

By default, the length of the embedding vector will be 1536 for text-embedding-3-small or 3072 for text-embedding-3-large. You can reduce the dimensions of the embedding by passing in the dimensions parameter without the embedding losing its concept-representing properties.

https://github.com/openai/openai-cookbook/blob/main/examples/Get_embeddings_from_dataset.ipynb

In [3]:
!pwd

/Users/kaikailiu/Documents/MyRepo/DeepDataMiningLearning/dataapps


In [4]:
!ls ../sampledata/

Ohana_Resources.pdf      diamonds.csv             sample.txt
Sunflower.jpg            elections.csv            shakespeare.txt
baby.csv                 fake-hate.csv            sjsuimag1.jpg
bus.jpg                  fine_food_reviews_1k.csv sjsupeople.jpg


In [6]:
import pandas as pd
import tiktoken #pip install tiktoken

# load & inspect dataset
input_datapath = "../sampledata/fine_food_reviews_1k.csv"  # to save space, we provide a pre-filtered dataset
df = pd.read_csv(input_datapath, index_col=0)
df.head(10)

Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text
0,1351123200,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...
1,1351123200,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos..."
2,1351123200,B000JMBE7M,AQX1N6A51QOKG,4,"It isn't blanc mange, but isn't bad . . .",I'm not sure that custard is really custard wi...
3,1351123200,B004AHGBX4,A2UY46X0OSNVUQ,3,These also have SALT and it's not sea salt.,I like the fact that you can see what you're g...
4,1351123200,B001BORBHO,A1AFOYZ9HSM2CZ,5,Happy with the product,My dog was suffering with itchy skin. He had ...
5,1351123200,B008PSM0BQ,A3OUFIMGL2K6RS,4,Good Sauce,This is a good all purpose sauce. Has good fl...
6,1351123200,B008YA1LQK,A9YEAAQVHFUTX,5,Blackcat,Great coffee! Love all Green Mountain coffee ...
7,1351123200,B001KP6B98,ABWCUS3HBDZRS,5,Excellent product,After scouring every store in town for orange ...
8,1351123200,B008YA1LQK,A2RSB6FVQ9K9OD,5,Bulk k-Cups,This is the best way to buy coffee for my offi...
9,1351123200,B001E5E2QI,A23WYVBCNE75X1,3,It's Okay,"Next time, I will buy Gevalia Irish Cream deca..."


In [7]:
df = df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
df = df.dropna()
df["combined"] = (
    "Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
)
df.head(10)

Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text,combined
0,1351123200,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...,Title: where does one start...and stop... wit...
1,1351123200,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos...",Title: Arrived in pieces; Content: Not pleased...
2,1351123200,B000JMBE7M,AQX1N6A51QOKG,4,"It isn't blanc mange, but isn't bad . . .",I'm not sure that custard is really custard wi...,"Title: It isn't blanc mange, but isn't bad . ...."
3,1351123200,B004AHGBX4,A2UY46X0OSNVUQ,3,These also have SALT and it's not sea salt.,I like the fact that you can see what you're g...,Title: These also have SALT and it's not sea s...
4,1351123200,B001BORBHO,A1AFOYZ9HSM2CZ,5,Happy with the product,My dog was suffering with itchy skin. He had ...,Title: Happy with the product; Content: My dog...
5,1351123200,B008PSM0BQ,A3OUFIMGL2K6RS,4,Good Sauce,This is a good all purpose sauce. Has good fl...,Title: Good Sauce; Content: This is a good all...
6,1351123200,B008YA1LQK,A9YEAAQVHFUTX,5,Blackcat,Great coffee! Love all Green Mountain coffee ...,Title: Blackcat; Content: Great coffee! Love ...
7,1351123200,B001KP6B98,ABWCUS3HBDZRS,5,Excellent product,After scouring every store in town for orange ...,Title: Excellent product; Content: After scour...
8,1351123200,B008YA1LQK,A2RSB6FVQ9K9OD,5,Bulk k-Cups,This is the best way to buy coffee for my offi...,Title: Bulk k-Cups; Content: This is the best ...
9,1351123200,B001E5E2QI,A23WYVBCNE75X1,3,It's Okay,"Next time, I will buy Gevalia Irish Cream deca...","Title: It's Okay; Content: Next time, I will b..."


In [8]:
# subsample to 1k most recent reviews and remove samples that are too long
top_n = 1000
df = df.sort_values("Time").tail(top_n * 2)  # first cut to first 2k entries, assuming less than half will be filtered out
df.drop("Time", axis=1, inplace=True)

In [9]:
df.head(10)

Unnamed: 0,ProductId,UserId,Score,Summary,Text,combined
0,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...,Title: where does one start...and stop... wit...
297,B003VXHGPK,A21VWSCGW7UUAR,4,"Good, but not Wolfgang Puck good","Honestly, I have to admit that I expected a li...","Title: Good, but not Wolfgang Puck good; Conte..."
296,B008JKTTUA,A34XBAIFT02B60,1,Should advertise coconut as an ingredient more...,"First, these should be called Mac - Coconut ba...",Title: Should advertise coconut as an ingredie...
295,B000LKTTTW,A14MQ40CCU8B13,5,Best tomato soup,I have a hard time finding packaged food of an...,Title: Best tomato soup; Content: I have a har...
294,B001D09KAM,A34XBAIFT02B60,1,Should advertise coconut as an ingredient more...,"First, these should be called Mac - Coconut ba...",Title: Should advertise coconut as an ingredie...
293,B001D09KAM,A1XV4W7JWX341C,5,"Loved these gluten free healthy bars, saved $$...",These Kind Bars are so good and healthy & glut...,"Title: Loved these gluten free healthy bars, s..."
292,B002JA06Z8,A3ESIUM1JTR7KK,5,These fresh berries are truly MIRACULOUS!!!,I have ordered from Ethans on three separate o...,Title: These fresh berries are truly MIRACULOU...
291,B002HQNCBO,A1UW65ZMZ3UWD3,5,Baconnaise,If you are a fan of bacon you're going to like...,Title: Baconnaise; Content: If you are a fan o...
290,B008JKTTUA,A1XV4W7JWX341C,5,"Loved these gluten free healthy bars, saved $$...",These Kind Bars are so good and healthy & glut...,"Title: Loved these gluten free healthy bars, s..."
289,B0048GRNZM,AXG287OY16WWL,1,Cute,"For some reason I thought that you got three ""...",Title: Cute; Content: For some reason I though...


In [10]:
embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
max_tokens = 8000  # the maximum for text-embedding-3-small is 8191

encoding = tiktoken.get_encoding(embedding_encoding)
encoding

<Encoding 'cl100k_base'>

Get embeddings and save them for future reuse

In [11]:
# omit reviews that are too long to embed
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens].tail(top_n)
len(df)

1000

In [12]:
from openai_embedding_utils import get_embedding



In [13]:
a = get_embedding("Help me search LLM technologies", model=embedding_model)
a

[0.01788897067308426,
 -0.004116015043109655,
 0.02295375056564808,
 -0.017931293696165085,
 0.007646547164767981,
 -0.029513979330658913,
 -0.04302946478128433,
 0.03910743445158005,
 0.013966940343379974,
 0.018199346959590912,
 0.017705567181110382,
 -0.00393613800406456,
 -0.03038867749273777,
 0.011067742481827736,
 0.010595123283565044,
 -0.004634484648704529,
 -0.006408568471670151,
 0.0126478411257267,
 0.03027581237256527,
 0.09864328056573868,
 -0.013176891952753067,
 0.018298102542757988,
 -0.003989042714238167,
 0.017733782529830933,
 -0.023250019177794456,
 -0.020964518189430237,
 0.017747890204191208,
 0.00636624451726675,
 0.03724517673254013,
 -0.013000541366636753,
 0.04929342493414879,
 -0.029655059799551964,
 -0.03685015067458153,
 0.016534600406885147,
 -0.011307578533887863,
 0.0565449483692646,
 0.0003943632764276117,
 0.03676550090312958,
 -0.00959345418959856,
 0.008366056717932224,
 -0.03337957710027695,
 -0.033040985465049744,
 -0.01574455201625824,
 -0.006461

In [16]:
from openai_embedding_utils import distances_from_embeddings, cosine_similarity
distance = distances_from_embeddings(query_embedding=a, embeddings=embedding)
similarity = cosine_similarity(a, embedding)

In [18]:
len(distance)

1536

In [17]:
similarity

0.9999999999999999

In [None]:
# This may take a few minutes
df["embedding"] = df.combined.apply(lambda x: get_embedding(x, model=embedding_model))
df.to_csv("data/fine_food_reviews_with_embeddings_1k.csv")

https://cookbook.openai.com/examples/semantic_text_search_using_embeddings

In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval

datafile_path = "data/fine_food_reviews_with_embeddings_1k.csv"

df = pd.read_csv(datafile_path)
df["embedding"] = df.embedding.apply(literal_eval).apply(np.array)


Here we compare the cosine similarity of the embeddings of the query and the documents, and show top_n best matches.

In [None]:
from openai_embedding_utils import get_embedding, cosine_similarity

# search through the reviews for a specific product
def search_reviews(df, product_description, n=3, pprint=True):
    product_embedding = get_embedding(
        product_description,
        model="text-embedding-3-small"
    )
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

    results = (
        df.sort_values("similarity", ascending=False)
        .head(n)
        .combined.str.replace("Title: ", "")
        .str.replace("; Content:", ": ")
    )
    if pprint:
        for r in results:
            print(r[:200])
            print()
    return results


results = search_reviews(df, "delicious beans", n=3)


# OpenAI Q&A
ref: https://platform.openai.com/docs/tutorials/web-qa-embeddings
https://github.com/openai/openai-cookbook/blob/main/apps/web-crawl-q-and-a/web-qa.ipynb

https://cookbook.openai.com/examples/question_answering_using_embeddings

In [23]:
# models
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"

In [24]:
article = """A US drone strike on a car in Baghdad has killed three members of the powerful Kataib Hezbollah militia, including a high-ranking commander, officials said after a string of blasts were heard in the Iraqi capital.
"""

In [25]:
query = f"""Use the below article to answer the subsequent question. If the answer cannot be found, write "I don't know."

Article:
\"\"\"
{article}
\"\"\"

Question: How many people are killed?"""

response = client.chat.completions.create(
    messages=[
        {'role': 'system', 'content': 'You answer questions about the provided article.'},
        {'role': 'user', 'content': query},
    ],
    model=GPT_MODEL,
    temperature=0,
)

print(response.choices[0].message.content)

Three people were killed in the US drone strike on a car in Baghdad.


In [19]:
# download pre-chunked text and pre-computed embeddings
# this file is ~200 MB, so may take a minute depending on your connection speed
embeddings_path = "https://cdn.openai.com/API/examples/data/winter_olympics_2022.csv"

df = pd.read_csv(embeddings_path)

In [20]:
df.head(10)

Unnamed: 0,text,embedding
0,Lviv bid for the 2022 Winter Olympics\n\n{{Oly...,"[-0.005021067801862955, 0.00026050032465718687..."
1,Lviv bid for the 2022 Winter Olympics\n\n==His...,"[0.0033927420154213905, -0.007447326090186834,..."
2,Lviv bid for the 2022 Winter Olympics\n\n==Ven...,"[-0.00915789045393467, -0.008366798982024193, ..."
3,Lviv bid for the 2022 Winter Olympics\n\n==Ven...,"[0.0030951891094446182, -0.006064314860850573,..."
4,Lviv bid for the 2022 Winter Olympics\n\n==Ven...,"[-0.002936174161732197, -0.006185177247971296,..."
5,Lviv bid for the 2022 Winter Olympics\n\n==Ven...,"[-0.0069602788425982, -0.003149641677737236, -..."
6,Damir Sharipzyanov\n\n{{short description|Russ...,"[-0.013576720841228962, -0.012421397492289543,..."
7,Damir Sharipzyanov\n\n==Playing career==\n\nOn...,"[-0.007685251533985138, -0.024144049733877182,..."
8,Damir Sharipzyanov\n\n==International play==\n...,"[-0.013910852372646332, -0.01668008230626583, ..."
9,Damir Sharipzyanov\n\n==Career statistics==\n\...,"[-0.006781470030546188, -0.007359683513641357,..."


In [21]:
import ast  # for converting embeddings saved as strings back to arrays
# convert embeddings from CSV str type back to list type
df['embedding'] = df['embedding'].apply(ast.literal_eval)

In [22]:
df.head(10)

Unnamed: 0,text,embedding
0,Lviv bid for the 2022 Winter Olympics\n\n{{Oly...,"[-0.005021067801862955, 0.00026050032465718687..."
1,Lviv bid for the 2022 Winter Olympics\n\n==His...,"[0.0033927420154213905, -0.007447326090186834,..."
2,Lviv bid for the 2022 Winter Olympics\n\n==Ven...,"[-0.00915789045393467, -0.008366798982024193, ..."
3,Lviv bid for the 2022 Winter Olympics\n\n==Ven...,"[0.0030951891094446182, -0.006064314860850573,..."
4,Lviv bid for the 2022 Winter Olympics\n\n==Ven...,"[-0.002936174161732197, -0.006185177247971296,..."
5,Lviv bid for the 2022 Winter Olympics\n\n==Ven...,"[-0.0069602788425982, -0.003149641677737236, -..."
6,Damir Sharipzyanov\n\n{{short description|Russ...,"[-0.013576720841228962, -0.012421397492289543,..."
7,Damir Sharipzyanov\n\n==Playing career==\n\nOn...,"[-0.007685251533985138, -0.024144049733877182,..."
8,Damir Sharipzyanov\n\n==International play==\n...,"[-0.013910852372646332, -0.01668008230626583, ..."
9,Damir Sharipzyanov\n\n==Career statistics==\n\...,"[-0.006781470030546188, -0.007359683513641357,..."


In [31]:
df[0:]['text']

0       Lviv bid for the 2022 Winter Olympics\n\n{{Oly...
1       Lviv bid for the 2022 Winter Olympics\n\n==His...
2       Lviv bid for the 2022 Winter Olympics\n\n==Ven...
3       Lviv bid for the 2022 Winter Olympics\n\n==Ven...
4       Lviv bid for the 2022 Winter Olympics\n\n==Ven...
                              ...                        
6054    Anaïs Chevalier-Bouchet\n\n==Personal life==\n...
6055    Uliana Nigmatullina\n\n{{short description|Rus...
6056    Uliana Nigmatullina\n\n==Biathlon results==\n\...
6057    Uliana Nigmatullina\n\n==Biathlon results==\n\...
6058    Uliana Nigmatullina\n\n==Biathlon results==\n\...
Name: text, Length: 6059, dtype: object

In [32]:
subdf=df[0:10]
subdf

Unnamed: 0,text,embedding
0,Lviv bid for the 2022 Winter Olympics\n\n{{Oly...,"[-0.005021067801862955, 0.00026050032465718687..."
1,Lviv bid for the 2022 Winter Olympics\n\n==His...,"[0.0033927420154213905, -0.007447326090186834,..."
2,Lviv bid for the 2022 Winter Olympics\n\n==Ven...,"[-0.00915789045393467, -0.008366798982024193, ..."
3,Lviv bid for the 2022 Winter Olympics\n\n==Ven...,"[0.0030951891094446182, -0.006064314860850573,..."
4,Lviv bid for the 2022 Winter Olympics\n\n==Ven...,"[-0.002936174161732197, -0.006185177247971296,..."
5,Lviv bid for the 2022 Winter Olympics\n\n==Ven...,"[-0.0069602788425982, -0.003149641677737236, -..."
6,Damir Sharipzyanov\n\n{{short description|Russ...,"[-0.013576720841228962, -0.012421397492289543,..."
7,Damir Sharipzyanov\n\n==Playing career==\n\nOn...,"[-0.007685251533985138, -0.024144049733877182,..."
8,Damir Sharipzyanov\n\n==International play==\n...,"[-0.013910852372646332, -0.01668008230626583, ..."
9,Damir Sharipzyanov\n\n==Career statistics==\n\...,"[-0.006781470030546188, -0.007359683513641357,..."


In [33]:
from scipy import spatial  # for calculating vector similarities for search

# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> tuple[list[str], list[float]]:
    """Returns a list of strings and relatednesses, sorted from most related to least."""
    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding
    strings_and_relatednesses = [
        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
        for i, row in df.iterrows()
    ]
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]


In [37]:
# examples
strings, relatednesses = strings_ranked_by_relatedness("Winter Olympics", subdf, top_n=5)

In [38]:
relatednesses

(0.861883230502706,
 0.861703469804855,
 0.853608864466378,
 0.8518601723569217,
 0.8467052278284372)

In [39]:
for string, relatedness in zip(strings, relatednesses):
    print(f"{relatedness=:.3f}")
    display(string)

relatedness=0.862


'Lviv bid for the 2022 Winter Olympics\n\n==Venues==\n\n===Mountain zone===\n\n====Venue cluster Tysovets-Panasivka====\n\nAn existing military ski training facility in [[Tysovets, Skole Raion|Tysovets]], 139&nbsp;km south of Lviv, along with two disused ski jumps, are proposed to be rebuilt to host all Nordic events.  Additionally, a ski hill would be developed to host all of the snowboard and freestyle skiing events.<ref name=concept_study />\n*Tysovets Nordic Arena - biathlon, cross country, Nordic combined\n*Tysovets Jumping Hills - ski jumping, Nordic combined\n*Panasivka Snow Park - freestyle, snowboard'

relatedness=0.862


"Lviv bid for the 2022 Winter Olympics\n\n==Venues==\n\n{{Location map+\n|Ukraine\n|border =\n|caption = Venue areas\n|float = left\n|width = 350\n|places =\n{{location map~ |Ukraine |lat=49.85 |long=24.01 |label='''[[Lviv]]''' |position=top}}\n{{location map~ |Ukraine |lat=48.9828 |long=23.2774 |label='''[[Tysovets, Skole Raion|Tysovets]]''' |position=right}}\n{{location map~ |Ukraine |lat=48.711111 |long=23.188333 |label='''[[Volovets|Borzhava]]''' |position=bottom}} }}\nThe German companies Proprojekt and AS&P conducted a feasibility study for a Lviv bid in May 2012, which proposed to hold all events in two clusters.  The Lviv Ice Zone will host all ice sports, including bobsleigh and luge, as well as the opening and closing ceremonies.  The Tysovets Snow Zone and the Borzhava Alpine Area, both located in the Carpathian Mountains, would host the snow events."

relatedness=0.854


"Lviv bid for the 2022 Winter Olympics\n\n==Venues==\n\n===Mountain zone===\n\n====Venue cluster Borzhava====\n\nA new alpine skiing centre, with a vertical drop of 900m, would be developed at Borzhava. It would include an alpine athlete's village and media accommodation.<ref name=concept_study />\n*Borzhava Resort - Alpine skiing"

relatedness=0.852


'Lviv bid for the 2022 Winter Olympics\n\n==Venues==\n\n===City zone===\n\nThe main Olympic Park would be centered around the [[Arena Lviv]], hosting the opening and closing ceremonies.  The Olympic Park would have two ice rinks (ice hockey, short track speed skating and figure skating), a temporary speed skating oval and a temporary curling rink.  The Olympic Park would also host the Olympic Village and International Broadcast Centre.  A second ice rink for hockey competitions would be located just to the north-west of the Olympic Park.  A sliding track would also be built in the east of Lviv.<ref name=concept_study />\n*[[Arena Lviv]] - opening and closing ceremonies\n*Skating Arena - figure skating, short track\n*Hockey Venue I - ice hockey\n*Hockey Venue II - ice hockey\n*Curling Sheet - curling\n*Skating Oval - Speed skating\n*Vynnycky Sliding Centre - Bobsleigh, luge, skeleton'

relatedness=0.847


'Lviv bid for the 2022 Winter Olympics\n\n==History==\n\n[[Image:Lwów - Rynek 01.JPG|thumb|right|200px|View of Rynok Square in Lviv]]\n\nOn 27 May 2010, [[President of Ukraine]] [[Viktor Yanukovych]] stated during a visit to [[Lviv]] that Ukraine "will start working on the official nomination of our country as the holder of the Winter Olympic Games in [[Carpathian Mountains|Carpathians]]".\n\nIn September 2012, [[government of Ukraine]] approved a document about the technical-economic substantiation of the national project "Olympic Hope 2022". This was announced by Vladyslav Kaskiv, the head of Ukraine´s Derzhinvestproekt (State investment project). The organizers announced on their website venue plans featuring Lviv as the host city and location for the "ice sport" venues, [[Volovets]] (around {{convert|185|km|mi|abbr=on}} from Lviv) as venue for the [[Alpine skiing]] competitions and [[Tysovets, Skole Raion|Tysovets]] (around {{convert|130|km|mi|abbr=on}} from Lviv) as venue for all 

With the search function above, we can now automatically retrieve relevant knowledge and insert it into messages to GPT.

Below, we define a function ask that:

Takes a user query
Searches for text relevant to the query
Stuffs that text into a message for GPT
Sends the message to GPT
Returns GPT's answer

In [None]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Return a message for GPT, with relevant source texts pulled from a dataframe."""
    strings, relatednesses = strings_ranked_by_relatedness(query, df)
    introduction = 'Use the below articles on the 2022 Winter Olympics to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer."'
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        next_article = f'\n\nWikipedia article section:\n"""\n{string}\n"""'
        if (
            num_tokens(message + next_article + question, model=model)
            > token_budget
        ):
            break
        else:
            message += next_article
    return message + question


def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answers a query using GPT and a dataframe of relevant texts and embeddings."""
    message = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(message)
    messages = [
        {"role": "system", "content": "You answer questions about the 2022 Winter Olympics."},
        {"role": "user", "content": message},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0
    )
    response_message = response.choices[0].message.content
    return response_message



In [2]:
import pandas as pd
outputpath="./output/text/processed"
df = pd.read_csv(outputpath+'/split.csv', index_col=0)
df.head()

Unnamed: 0,text,n_tokens
0,u cmpe faculty faculty emeriti ron mak.php. ...,160
1,u cmpe faculty tenure line ahmet bindal.php. ...,161
2,.,1
3,u cmpe research research areas. Research A...,154
4,u cmpe academic cybersecurity program cybersec...,165


In [None]:
df.to_csv(outputpath+'/embeddings.csv')
df.head()