In [22]:
import os
import json
import openai
import pandas as pd
from tqdm import tqdm
from openai import OpenAI, AzureOpenAI
from dotenv import load_dotenv
import qa_package.dataclasses.orm as d
from sqlalchemy.engine import Engine, create_engine
from sqlalchemy.orm import Session, sessionmaker
from sqlalchemy.sql import select

# Load variables from a local .env file (if present) into the environment
# before any of the settings below are read.
load_dotenv()

# Azure OpenAI connection settings; each is None when its variable is unset.
API_BASE = os.getenv('API_BASE')
API_KEY = os.getenv('API_KEY')
API_VERSION = os.getenv('API_VERSION')
CHAT_DEPLOYMENT_NAME = os.getenv('CHAT_DEPLOYMENT_NAME')
EMBEDDING_DEPLOYMENT_NAME = os.getenv('EMBEDDING_DEPLOYMENT_NAME')

# Database URL: set DB_URL to point at another instance so real credentials do
# not have to live in the notebook. The fallback is the original local URL.
db_url = os.getenv("DB_URL", "postgresql://postgres:postgres@localhost/postgres")
engine = create_engine(db_url)

# Articles CSV: set CSV_FILE to override the machine-specific default path.
CSV_FILE = os.getenv("CSV_FILE", "/Users/spare/Documents/data/articles.csv")
df = pd.read_csv(CSV_FILE)

# Shared client used by both the embedding and the chat-completion calls.
client = AzureOpenAI(
    azure_endpoint=API_BASE,
    api_version=API_VERSION,
    api_key=API_KEY
)

def embed_docs(docs: list[str]) -> list[list[float]]:
    """Embed a batch of texts with the configured embedding deployment.

    Args:
        docs: Texts to embed.

    Returns:
        One embedding vector per input text, in the same order as ``docs``.
        An empty ``docs`` yields an empty list without calling the service.
    """
    if not docs:
        # Nothing to embed; avoid sending an empty request.
        return []
    vec = client.embeddings.create(input=docs, model=EMBEDDING_DEPLOYMENT_NAME)
    # Every returned item records the index of the input it belongs to; sort on
    # it so the output order never depends on the order of items in the response.
    ordered = sorted(vec.data, key=lambda item: item.index)
    return [item.embedding for item in ordered]

In [2]:
# Embed one sample customer question, then look up its nearest stored records.
TEST_QUERY = "Do you have the Howie shorts in stock in blue?"
VEC_QUERY = embed_docs([TEST_QUERY])

# Order records by the cosine distance between their `factors` attribute and
# the query embedding, keeping only the ids of the three closest.
nearest_ids_stmt = (
    select(d.record.id)
    .order_by(d.record.factors.cosine_distance(VEC_QUERY[0]))
    .limit(3)
)

with Session(engine) as sess:
    res = sess.execute(nearest_ids_stmt).scalars().all()

In [3]:
# Display the record ids returned by the distance-ordered query.
res

[663463002, 717196001, 651242002]

In [6]:
# Fetch the description text of the matched articles. Boolean-mask selection
# returns rows in `df` order, not in the ranked order of `res`.
df.loc[df["article_id"].isin(res), "detail_desc"].tolist()

['Shorts in a cotton weave with an elasticated drawstring waist, side pockets and a welt back pocket.',
 'Shorts in sweatshirt fabric with striped ribbing and a drawstring at the waist, side pockets and a zipped back pocket.',
 'Short shorts in sweatshirt fabric with an elasticated drawstring waist, side pockets and slits in the sides.']

In [25]:
# Grounded question-answering prompt: the model is told to answer only from the
# supplied context and to reply "I don't know" otherwise. The context and the
# question are a fixed example passage about the men's high jump.
prompt = """Answer the question as truthfully as possible using the provided text, and if the answer is not contained within the text below, say "I don't know"

Context:
The men's high jump event at the 2020 Summer Olympics took place between 30 July and 1 August 2021 at the Olympic Stadium.
33 athletes from 24 nations competed; the total possible number depended on how many nations would use universality places 
to enter athletes in addition to the 32 qualifying through mark or ranking (no universality places were used in 2021).
Italian athlete Gianmarco Tamberi along with Qatari athlete Mutaz Essa Barshim emerged as joint winners of the event following
a tie between both of them as they cleared 2.37m. Both Tamberi and Barshim agreed to share the gold medal in a rare instance
where the athletes of different nations had agreed to share the same medal in the history of Olympics. 
Barshim in particular was heard to ask a competition official "Can we have two golds?" in response to being offered a 
'jump off'. Maksim Nedasekau of Belarus took bronze. The medals were the first ever in the men's high jump for Italy and 
Belarus, the first gold in the men's high jump for Italy and Qatar, and the third consecutive medal in the men's high jump
for Qatar (all by Barshim). Barshim became only the second man to earn three medals in high jump, joining Patrik Sjöberg
of Sweden (1984 to 1992).

Q: Who won the 2020 Summer Olympics men's high jump?
A:"""

# Send the prompt as a single user turn after a generic system message.
chat_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]

# Sampling settings are pinned explicitly (temperature 0, top_p 1, no
# penalties) and the reply is capped at 300 tokens.
response = client.chat.completions.create(
    model=CHAT_DEPLOYMENT_NAME,
    messages=chat_messages,
    temperature=0,
    max_tokens=300,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
)

In [28]:
# Read the assistant's reply text straight from the response object instead of
# serializing the whole response to JSON and parsing it back.
response.choices[0].message.content

'Gianmarco Tamberi and Mutaz Essa Barshim.'