In [27]:
import os
import openai
import pandas as pd
from tqdm import tqdm
from openai import OpenAI, AzureOpenAI
from dotenv import load_dotenv
import qa_package.dataclasses.orm as d
from sqlalchemy.engine import Engine, create_engine
from sqlalchemy.orm import Session, sessionmaker
from sqlalchemy.sql import select

# Load variables from a local .env file (python-dotenv) before any lookup below.
load_dotenv()

# Azure OpenAI connection settings; each is None when its variable is unset.
API_BASE = os.getenv('API_BASE')
API_KEY = os.getenv('API_KEY')
API_VERSION = os.getenv('API_VERSION')
CHAT_DEPLOYMENT_NAME = os.getenv('CHAT_DEPLOYMENT_NAME')
EMBEDDING_DEPLOYMENT_NAME = os.getenv('EMBEDDING_DEPLOYMENT_NAME')

# Database URL. Set DB_URL in the environment (or .env) to keep real credentials
# out of the notebook; the local development URL is used when it is unset or empty.
db_url = os.getenv('DB_URL') or "postgresql://postgres:postgres@localhost/postgres"
engine = create_engine(db_url)

# Input CSV. Set CSV_FILE to point at the data on another machine; the original
# absolute path is kept as the fallback so existing runs behave the same.
CSV_FILE = os.getenv('CSV_FILE') or "/Users/spare/Documents/data/articles.csv"
df = pd.read_csv(CSV_FILE)

# Client used for every Azure OpenAI request made in this notebook.
client = AzureOpenAI(
    azure_endpoint=API_BASE,
    api_version=API_VERSION,
    api_key=API_KEY
)

def embed_docs(docs: list[str]) -> list[list[float]]:
    """Embed a batch of texts with the configured embedding deployment.

    Args:
        docs: Texts to embed; all of them are sent in a single request.

    Returns:
        One embedding vector per input text, in the same order as ``docs``.
        An empty ``docs`` returns an empty list without contacting the service.
    """
    if len(docs) == 0:
        # Nothing to embed: skip the request instead of sending an empty input.
        return []
    response = client.embeddings.create(input=docs, model=EMBEDDING_DEPLOYMENT_NAME)
    # Every returned item records the position of its input text; ordering by it
    # keeps vectors aligned with ``docs`` even if the response order differs.
    ordered = sorted(response.data, key=lambda item: item.index)
    return [item.embedding for item in ordered]

In [26]:
# Number of rows embedded per request and committed per transaction.
BATCH_SIZE = 5
# Number of batches needed to cover every row (ceiling division).
BATCH = (len(df) + BATCH_SIZE - 1) // BATCH_SIZE

# NOTE(review): every run inserts all rows again; re-running this cell against a
# table that already holds these ids presumably fails or duplicates — confirm.
# NOTE(review): 'detail_desc' values are passed on as-is; confirm the column has
# no missing entries before embedding.
with Session(engine) as sess:
    for i in tqdm(range(BATCH)):
        # Positional window of at most BATCH_SIZE rows for this batch.
        rows = df.iloc[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]
        ids = rows['article_id'].tolist()
        docs = rows['detail_desc'].tolist()
        vecs = embed_docs(docs)

        # Queue one record per (article id, embedding) pair, then persist the batch.
        sess.add_all(
            d.record(id=row_id, factors=vec) for row_id, vec in zip(ids, vecs)
        )
        sess.commit()


200
[695255001, 821115007, 553238008, 627147001, 794520001] ['Short-sleeved top in soft viscose jersey with a unique nursing feature. The design includes a double layer at the top to help retain warmth while allowing easier nursing access.', 'Short, pleated skirt in woven fabric with a high waist and concealed zip and press-stud at one side. Unlined.', 'Wide, long-sleeved top in soft cotton jersey with an open chest pocket, ribbing around the neckline and short slits in the sides. Loose fit.', 'Fully lined bikini top with hole-patterned, underwired, moulded, padded cups that lift and shape, and ties at the back of the neck.', 'Tights with a seam down the back of the legs. 30 denier.']
[697564030, 698715001, 526562001, 649099001, 762061003] ['Shirt in airy cotton with a collar, buttons down the front, long sleeves with buttoned cuffs, and a rounded hem.', 'Slip-on trainers in cotton canvas with a print motif and sequins. Glittery elastic gores in the sides and a loop at the back. Contra