In [1]:
import os
import openai
import pandas as pd
from tqdm import tqdm
from openai import OpenAI, AzureOpenAI
from dotenv import load_dotenv
import qa_package.dataclasses.orm as d
from sqlalchemy.engine import Engine, create_engine
from sqlalchemy.orm import Session, sessionmaker
from sqlalchemy.sql import select

load_dotenv()

API_BASE = os.getenv('API_BASE')
API_KEY = os.getenv('API_KEY')
API_VERSION = os.getenv('API_VERSION')
CHAT_DEPLOYMENT_NAME = os.getenv('CHAT_DEPLOYMENT_NAME')
EMBEDDING_DEPLOYMENT_NAME = os.getenv('EMBEDDING_DEPLOYMENT_NAME')

db_url = "postgresql://postgres:postgres@localhost/postgres"
engine = create_engine(db_url)

CSV_FILE = "/Users/spare/Documents/data/articles.csv"
df = pd.read_csv(CSV_FILE)

client = AzureOpenAI(
    azure_endpoint=API_BASE,
    api_version=API_VERSION,
    api_key=API_KEY
)

def embed_docs(docs: list[str]) -> list[list[float]]:
    vec = client.embeddings.create(input=docs, model=EMBEDDING_DEPLOYMENT_NAME)
    return [tmp.embedding for tmp in vec.data]

In [7]:
BATCH_SIZE = 5
BATCH = df.shape[0]//BATCH_SIZE + int(df.shape[0]%BATCH_SIZE>0)
with Session(engine) as sess:
    sess.query(d.record).delete()
    sess.commit()
    for i in tqdm(range(BATCH)):
        ids = df['article_id'][i*BATCH_SIZE:(i+1)*BATCH_SIZE].tolist()
        docs = df['detail_desc'][i*BATCH_SIZE:(i+1)*BATCH_SIZE].tolist()
        try:
            vecs = embed_docs(docs)
    
            for tmpid, tmpvec in zip(ids, vecs):
                tmprow = d.record(id=tmpid, factors=tmpvec)
                sess.add(tmprow)
        except:
            print(docs)
        sess.commit()


 26%|███████████████████████████████████████████████████████▍                                                                                                                                                         | 53/200 [00:11<00:19,  7.71it/s]

['Short satin nightslip with a V-neck, lace at the top and narrow, adjustable shoulder straps.', 'Straight-cut top in sweatshirt fabric with long balloon sleeves in tulle with ribbed cuffs. Soft brushed inside.', 'Jumper in a soft, boxy knit with a ribbed turtle neck, dropped shoulders, long sleeves and ribbing at the cuffs and hem. The polyester content of the jumper is recycled.', 'Short sports shorts in fast-drying, breathable mesh with an elasticated waist and side pockets. Unlined.', nan]


 63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                             | 126/200 [00:31<00:31,  2.38it/s]

['Sock boots in imitation suede with a soft elasticated shaft and covered heels. Imitation leather insoles. Heel 8 cm.', 'Hat in braided paper straw with a wide fabric band. Width of brim 8.5 cm.', '5-pocket jeans in washed, stretch denim with a regular waist, zip fly and button, and tapered legs. The jeans are made partly from recycled cotton.', 'Long-sleeved jumper in a fine-knit viscose blend with a round neckline, sewn-on decorative bow at the top and ribbing around the neckline, cuffs and hem.', nan]


 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 151/200 [00:34<00:04, 11.17it/s]

[nan, 'Thin, patterned socks with a short shaft.', 'Long-sleeved top in soft cotton jersey with a round neck. Slightly longer at the back.', 'Jacket in soft faux shearling with a stand-up collar, yoke at the back and zip down the front. Dropped shoulders, diagonal jetted chest pockets, discreet pockets in the side seams and a trimmed hem.', 'Fitted jumper in soft, fine-knit, ribbed cashmere with 3/4-length sleeves and ribbing around the neckline.']


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:45<00:00,  4.36it/s]
