In [2]:
from flask import Flask,request,jsonify
from flask_cors import CORS
import os 
from dotenv import load_dotenv
from openai import OpenAI
from pinecone_datasets import load_dataset
from pinecone import Pinecone
import time
from pinecone import ServerlessSpec
from datasets import load_dataset
from tqdm.auto import tqdm


  from tqdm.autonotebook import tqdm


In [3]:
load_dotenv()

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=OPENAI_API_KEY)
PINECONE_API_KEY  = os.getenv('PINECONE_API_KEY ')
pc = Pinecone(api_key=PINECONE_API_KEY)

In [4]:
MODEL = "text-embedding-3-small"

res = client.embeddings.create(
    input=input(),
      model=MODEL
)
res

CreateEmbeddingResponse(data=[Embedding(embedding=[0.019255494698882103, -0.06451909244060516, -0.0016881529008969665, 0.0781298279762268, 0.021642647683620453, -0.01554947067052126, -0.015035111457109451, 0.0457911491394043, -0.005888752173632383, -0.04526359960436821, -0.0039038537070155144, -0.02060074172914028, -0.007115300744771957, -0.013069996610283852, 0.029094260185956955, 0.03357841819524765, -0.06794815510511398, 0.022209761664271355, 0.014784527011215687, 0.035556718707084656, 0.05681689456105232, 0.01165880635380745, -0.013287609443068504, 0.01487684715539217, 0.020416099578142166, 0.005545846186578274, 0.019796229898929596, 0.022288894280791283, 0.01961158961057663, -0.052279986441135406, 0.032918982207775116, -0.0337103046476841, 0.0017128817271441221, -0.006211875006556511, 0.009970652870833874, -0.01921592839062214, -0.018833456560969353, 0.027907278388738632, 0.004006065893918276, -0.03782517462968826, 0.007187838666141033, -0.025045331567525864, 0.029147015884518623,

In [5]:
print(f"vector 0: {len(res.data[0].embedding)}\nvector 1: {len(res.data[0].embedding)}")


vector 0: 1536
vector 1: 1536


In [6]:
# we can extract embeddings to a list
embeds = [record.embedding for record in res.data]
len(embeds)


1

In [7]:
len(embeds[0])

1536

In [8]:
spec = ServerlessSpec(cloud="aws", region="us-west-2")

index_name = 'semantic-search-openai'

# check if index already exists (it shouldn't if this is your first run)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=len(embeds[0]),  # dimensionality of text-embed-3-small
        metric='dotproduct',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1000}},
 'total_vector_count': 1000}

In [9]:
trec = load_dataset('trec', split='train[:1000]')
trec

Dataset({
    features: ['text', 'coarse_label', 'fine_label'],
    num_rows: 1000
})

In [10]:
trec[0]

{'text': 'How did serfdom develop in and then leave Russia ?',
 'coarse_label': 2,
 'fine_label': 26}

In [11]:
count = 0  # we'll use the count to create unique IDs
batch_size = 32  # process everything in batches of 32
for i in tqdm(range(0, len(trec['text']), batch_size)):
    # set end position of batch
    i_end = min(i+batch_size, len(trec['text']))
    # get batch of lines and IDs
    lines_batch = trec['text'][i: i+batch_size]
    ids_batch = [str(n) for n in range(i, i_end)]
    # create embeddings
    res = client.embeddings.create(input=lines_batch, model=MODEL)
    embeds = [record.embedding for record in res.data]
    # prep metadata and upsert batch
    meta = [{'text': line} for line in lines_batch]
    to_upsert = zip(ids_batch, embeds, meta)
    # upsert to Pinecone
    index.upsert(vectors=list(to_upsert))

100%|██████████| 32/32 [01:28<00:00,  2.75s/it]


In [12]:
	
query = "What caused the 1929 Great Depression?"

xq = client.embeddings.create(input=query, model=MODEL).data[0].embedding

In [13]:
	
res = index.query(vector=[xq], top_k=5, include_metadata=True)
res

{'matches': [{'id': '932',
              'metadata': {'text': 'Why did the world enter a global '
                                   'depression in 1929 ?'},
              'score': 0.751915097,
              'values': []},
             {'id': '787',
              'metadata': {'text': "When was `` the Great Depression '' ?"},
              'score': 0.597528219,
              'values': []},
             {'id': '400',
              'metadata': {'text': 'What crop failure caused the Irish Famine '
                                   '?'},
              'score': 0.367533326,
              'values': []},
             {'id': '835',
              'metadata': {'text': 'What were popular songs and types of songs '
                                   'in the 1920s ?'},
              'score': 0.324679315,
              'values': []},
             {'id': '262',
              'metadata': {'text': 'When did World War I start ?'},
              'score': 0.321041375,
              'values': []}],
 'names

In [14]:
for match in res['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")

0.75: Why did the world enter a global depression in 1929 ?
0.60: When was `` the Great Depression '' ?
0.37: What crop failure caused the Irish Famine ?
0.32: What were popular songs and types of songs in the 1920s ?
0.32: When did World War I start ?


In [15]:
query = "What was the cause of the major recession in the early 20th century?"

# create the query embedding
xq = client.embeddings.create(input=query, model=MODEL).data[0].embedding

# query, returning the top 5 most similar results
res = index.query(vector=[xq], top_k=5, include_metadata=True)

for match in res['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")

0.63: Why did the world enter a global depression in 1929 ?
0.55: When was `` the Great Depression '' ?
0.35: What were popular songs and types of songs in the 1920s ?
0.33: What crop failure caused the Irish Famine ?
0.29: What is considered the costliest disaster the insurance industry has ever faced ?


In [16]:
query = input()

# create the query embedding
xq = client.embeddings.create(input=query, model=MODEL).data[0].embedding

# query, returning the top 5 most similar results
res = index.query(vector=[xq], top_k=5, include_metadata=True)

for match in res['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")

0.26: How do I log on to home page at Headquarters U.S. European Command ?
0.25: What is the name of the managing director of Apricot Computer ?
0.24: What is her profession ?
0.24: What product does `` Mrs. Olsen '' promote ?
0.24: Name Alvin 's brothers


In [17]:
pc.delete_index(index_name)