In [1]:
import warnings

warnings.filterwarnings("ignore")

from datasets import load_dataset
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.notebook import tqdm

import ast
import os
import pandas as pd

# Credentials are read from the process environment instead of being
# hard-coded; a variable that is not set yields None.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
OPEN_AI_API_KEY = os.environ.get("OPEN_AI_API_KEY")

In [2]:
# Connect to Pinecone using the API key read from the environment.
pinecone = Pinecone(api_key=PINECONE_API_KEY)

INDEX_NAME = 'rag-experimentation'

# Length of every vector stored in the index. 1536 matches the output size of
# OpenAI's text-embedding-ada-002, the default model of get_embeddings() in
# this notebook. The pre-computed vectors loaded from the CSV are assumed to
# have the same length -- TODO confirm.
EMBEDDING_DIMENSION = 1536

# Start from a clean slate on every run: if an index with this name already
# exists it is deleted before being recreated.
# NOTE: this is destructive -- any vectors already stored in it are lost.
existing_index_names = [existing.name for existing in pinecone.list_indexes()]
if INDEX_NAME in existing_index_names:
    pinecone.delete_index(INDEX_NAME)

pinecone.create_index(
    name=INDEX_NAME,
    dimension=EMBEDDING_DIMENSION,
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-east-1'),
)

# Handle used by the later cells for upserts and queries.
index = pinecone.Index(INDEX_NAME)

In [None]:
# Number of CSV rows to ingest, and how many vectors to send per upsert call.
max_articles_num = 500
upsert_batch_size = 250

# Each row carries an id plus 'values' and 'metadata' columns that hold Python
# literals serialised as text (they are parsed with ast.literal_eval below).
wikidata = pd.read_csv('./wiki.csv', nrows=max_articles_num)

prepped = []
for i, row in tqdm(wikidata.iterrows(), total=wikidata.shape[0]):
    metadata = ast.literal_eval(row['metadata'])
    prepped.append({'id': row['id'],
                    'values': ast.literal_eval(row['values']),
                    'metadata': metadata})
    # Send a full batch and start collecting the next one.
    if len(prepped) >= upsert_batch_size:
        index.upsert(prepped)
        prepped = []

# Flush the trailing partial batch. Without this, rows left over when the row
# count is not a multiple of the batch size would never reach the index.
if prepped:
    index.upsert(prepped)
    prepped = []

print(index.describe_index_stats())

In [4]:
# Shared OpenAI client, authenticated with the key read from the environment.
openai_client = OpenAI(api_key=OPEN_AI_API_KEY)


def get_embeddings(articles, model="text-embedding-ada-002"):
    """Request embeddings for ``articles`` from the OpenAI embeddings endpoint.

    Args:
        articles: The input passed straight through as ``input`` to the API
            (callers in this notebook pass a list of strings).
        model: Name of the embedding model to use.

    Returns:
        The raw response object from ``openai_client.embeddings.create``;
        callers read individual vectors via ``response.data[i].embedding``.
    """
    response = openai_client.embeddings.create(input=articles, model=model)
    return response

In [None]:
query = "what is the berlin wall?"

# Embed the question, then fetch the three closest vectors with their metadata.
embed = get_embeddings([query])
query_vector = embed.data[0].embedding
res = index.query(vector=query_vector, top_k=3, include_metadata=True)

# Collect the stored text of every match and show them one per line.
text = []
for match in res['matches']:
    text.append(match['metadata']['text'])
print('\n'.join(text))

In [None]:
query = "write an article titled: what is the berlin wall?"

# Retrieve the three nearest stored passages for the embedded query.
embed = get_embeddings([query])
query_vector = embed.data[0].embedding
res = index.query(vector=query_vector, top_k=3, include_metadata=True)

contexts = []
for match in res['matches']:
    contexts.append(match['metadata']['text'])

# Prompt layout: instruction header, the retrieved passages separated by a
# '---' rule, then the question and an open "Answer:" slot.
prompt_start = (
    "Answer the question based on the context below.\n\n"
    "Context:\n"
)
prompt_end = f"\n\nQuestion: {query}\nAnswer:"

context_block = "\n\n---\n\n".join(contexts)
prompt = prompt_start + context_block + prompt_end

print(prompt)

In [None]:
# Send the assembled prompt to the chat model as a single user message.
chat_messages = [{"role": "user", "content": prompt}]
res = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=chat_messages,
)

# Print a separator line followed by the text of the first returned choice.
print('-' * 80)
answer = res.choices[0].message.content
print(answer)