In [18]:
from openai import OpenAI
import os
# from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec, Pinecone
import time
from datasets import load_dataset, Dataset
from tqdm.auto import tqdm

# OpenAI client; the key comes from the OPENAI_API_KEY environment variable
# (keys are issued at platform.openai.com).
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

In [8]:
# Embedding model used for every embedding request in this notebook.
MODEL = "text-embedding-3-small"

# Embed two sample phrases in one request so the response can be inspected.
res = client.embeddings.create(
    model=MODEL,
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch",
    ],
)

In [9]:
# Collect the embedding vector of every record in the response, in order.
embeds = [
    item.embedding
    for item in res.data
]
# Displayed as the cell output: how many embeddings came back.
len(embeds)

2

In [10]:
# Pinecone client (API key read from the environment) and the serverless
# deployment target used when the index has to be created.
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
spec = ServerlessSpec(cloud="aws", region="us-east-1")

index_name = "ragbot"

# Create the index only if it does not exist yet (expected on a first run).
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=len(embeds[0]),  # must equal the length of the embeddings produced above
        metric='dotproduct',
        spec=spec
    )
    # Poll until Pinecone reports the new index as ready instead of relying
    # on a fixed pause. NOTE(review): depending on client version/timeout
    # settings create_index may already block until ready; the loop is then
    # a no-op.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)  # short pause kept from the original before the first stats call
# view index stats (last expression, so the cell displays it)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'dotproduct',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [27]:
# Parse the TREC label file by hand. Each non-empty line is split at the
# first space into "<coarse>:<fine>" and the question text.
data = []
with open("data/train_5500.label", "r", encoding="latin1") as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line cannot be unpacked into (label, question); skip it
            # instead of crashing with ValueError.
            continue
        label, question = line.split(" ", 1)
        # Split only at the first ':' so a ':' inside the fine label cannot
        # break the two-way unpacking.
        coarse_label, fine_label = label.split(":", 1)
        data.append({
            "coarse_label": coarse_label,
            "fine_label": fine_label,
            "text": question
        })

# Convert to Hugging Face Dataset
trec = Dataset.from_list(data)

# Keep at most the first 1000 rows to bound the number of embedding calls below.
trec = trec.select(range(min(1000, len(trec))))

# Sanity check: show the first parsed record
print(trec[0])

{'coarse_label': 'DESC', 'fine_label': 'manner', 'text': 'How did serfdom develop in and then leave Russia ?'}


In [28]:

batch_size = 32  # number of questions embedded and upserted per request
# Read the text column once; the original re-evaluated trec['text'] on every
# iteration and in the range() bound.
texts = trec['text']
for i in tqdm(range(0, len(texts), batch_size)):
    # end position of this batch, clamped to the number of rows
    i_end = min(i + batch_size, len(texts))
    # batch of questions and their IDs (the row offsets, as strings)
    lines_batch = texts[i:i_end]
    ids_batch = [str(n) for n in range(i, i_end)]
    # create embeddings for the whole batch in one request
    res = client.embeddings.create(input=lines_batch, model=MODEL)
    embeds = [record.embedding for record in res.data]
    # keep the question text as metadata so query results can print it
    meta = [{'text': line} for line in lines_batch]
    to_upsert = list(zip(ids_batch, embeds, meta))
    # upsert (id, vector, metadata) tuples to Pinecone
    index.upsert(vectors=to_upsert)

  0%|          | 0/32 [00:00<?, ?it/s]

In [29]:
# Question to look up in the index.
query = "What caused the 1929 Great Depression?"

# Embed the question with the same model that embedded the indexed text and
# take the vector of the first record in the response.
xq = (
    client.embeddings
    .create(model=MODEL, input=query)
    .data[0]
    .embedding
)

In [31]:
# Fetch the 5 nearest matches together with their stored metadata.
# `vector` takes the embedding itself (a flat list of floats); the original
# wrapped it in an extra list.
res = index.query(vector=xq, top_k=5, include_metadata=True)

In [32]:
# Print one line per match: score with two decimals, then the stored text.
for hit in res['matches']:
    score = hit['score']
    text = hit['metadata']['text']
    print(f"{score:.2f}: {text}")

0.75: Why did the world enter a global depression in 1929 ?
0.60: When was `` the Great Depression '' ?
0.37: What crop failure caused the Irish Famine ?
0.32: What were popular songs and types of songs in the 1920s ?
0.32: When did World War I start ?


#### Harder queries

In [33]:
query = "What was the cause of the major recession in the early 20th century?"

# create the query embedding
xq = client.embeddings.create(input=query, model=MODEL).data[0].embedding

# query, returning the top 5 most similar results; `vector` takes the
# embedding itself (a flat list of floats), not a list wrapping it
res = index.query(vector=xq, top_k=5, include_metadata=True)

# one line per match: score with two decimals, then the stored text
for match in res['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")

0.63: Why did the world enter a global depression in 1929 ?
0.55: When was `` the Great Depression '' ?
0.34: What were popular songs and types of songs in the 1920s ?
0.33: What crop failure caused the Irish Famine ?
0.29: What is considered the costliest disaster the insurance industry has ever faced ?


In [34]:
query = "Why was there a long-term economic downturn in the early 20th century?"

# create the query embedding
xq = client.embeddings.create(input=query, model=MODEL).data[0].embedding

# query, returning the top 5 most similar results; `vector` takes the
# embedding itself (a flat list of floats), not a list wrapping it
res = index.query(vector=xq, top_k=5, include_metadata=True)

# one line per match: score with two decimals, then the stored text
for match in res['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")

0.62: Why did the world enter a global depression in 1929 ?
0.54: When was `` the Great Depression '' ?
0.34: What were popular songs and types of songs in the 1920s ?
0.33: What crop failure caused the Irish Famine ?
0.32: What do economists do ?
