In [1]:
import os
import base64
import openai
import json
import time
import pandas as pd
import requests
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import openai
from transformers import pipeline
# from openai import OpenAI, AzureOpenAI
from dotenv import load_dotenv
import qa_package.dataclasses.orm as d
from sqlalchemy import func
from sqlalchemy.engine import Engine, create_engine
from sqlalchemy.orm import Session, sessionmaker
from sqlalchemy.sql import select
import numpy as np
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from qa_package.services.openai import OpenAI
from qa_package.services.guardrails import guard_image_search
from sklearn.cluster import KMeans
from sqlalchemy_utils import Ltree

load_dotenv()

API_BASE = os.getenv('API_BASE')
API_KEY = os.getenv('API_KEY')
API_VERSION = os.getenv('API_VERSION')
CHAT_DEPLOYMENT_NAME = os.getenv('CHAT_DEPLOYMENT_NAME')
EMBEDDING_DEPLOYMENT_NAME = os.getenv('EMBEDDING_DEPLOYMENT_NAME')
NEW_API_KEY = os.getenv('MY_API_KEY')
n_clusters = 10
BATCH_SIZE = 16
root_img_path = "/Users/spare/Documents/data/images/"
openai.api_key = API_KEY
openai.api_type = "azure"
openai.api_base = API_BASE
openai.api_version = API_VERSION

CSV_FILE = "/Users/spare/Documents/data/articles.csv"
df = pd.read_csv(CSV_FILE)

In [2]:
df.head(5)

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,garment_group_name,detail_desc
0,695255001,Siv t-shirt,T-shirt,Garment Upper body,All over pattern,Dark Blue,Jersey Fancy,Short-sleeved top in soft viscose jersey with ...
1,821115007,RICHIE SKIRT,Skirt,Garment Lower body,Check,Pink,Skirts,"Short, pleated skirt in woven fabric with a hi..."
2,553238008,THORN LS TEE,Sweater,Garment Upper body,Solid,White,Jersey Basic,"Wide, long-sleeved top in soft cotton jersey w..."
3,627147001,Bling Me Up Push,Bikini top,Swimwear,Lace,Dark Red,Swimwear,"Fully lined bikini top with hole-patterned, un..."
4,794520001,Plus seam at back 1p tights,Underwear Tights,Socks & Tights,Solid,Black,Socks and Tights,Tights with a seam down the back of the legs. ...


In [3]:
db_url = "postgresql://postgres:postgres@localhost/postgres"
engine = create_engine(db_url)

In [4]:
client = OpenAI(
    api_key=API_KEY, api_base=API_BASE, api_version=API_VERSION
)

In [7]:
unique_colors = df.colour_group_name.unique().tolist()
print(unique_colors)
print(len(unique_colors))

['Dark Blue', 'Pink', 'White', 'Dark Red', 'Black', 'Blue', 'Light Pink', 'Red', 'Light Turquoise', 'Dark Orange', 'Light Orange', 'Orange', 'Dark Green', 'Off White', 'Light Grey', 'Yellowish Brown', 'Dark Grey', 'Light Beige', 'Beige', 'Gold', 'Yellow', 'Light Yellow', 'Dark Pink', 'Grey', 'Greenish Khaki', 'Other Yellow', 'Light Green', 'Light Blue', 'Other Pink', 'Green', 'Greyish Beige', 'Dark Purple', 'Light Purple', 'Dark Beige', 'Silver', 'Other Green', 'Dark Yellow', 'Other', 'Light Red', 'Other Red', 'Turquoise', 'Purple', 'Dark Turquoise', 'Other Orange', 'Other Purple']
45


In [6]:
BATCH = len(unique_colors) // BATCH_SIZE + int(len(unique_colors) % BATCH_SIZE > 0)
color_embeddings = []
with Session(engine) as sess:
    for i in tqdm(range(BATCH)):
        docs = unique_colors[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]
        vecs = client.create_embeddings(docs, EMBEDDING_DEPLOYMENT_NAME)
        for c,v in zip(docs, vecs):
            sess.add(d.color(name=c.lower(), factors=v))
        sess.commit()
        color_embeddings += vecs
        if i%30 == 0:
            time.sleep(10)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:10<00:00,  3.66s/it]


In [8]:
unique_patterns = df.graphical_appearance_name.unique().tolist()
print(unique_patterns)
print(len(unique_patterns))

['All over pattern', 'Check', 'Solid', 'Lace', 'Stripe', 'Placement print', 'Melange', 'Front print', 'Denim', 'Treatment', 'Glittering/Metallic', 'Application/3D', 'Colour blocking', 'Dot', 'Other structure', 'Other pattern', 'Chambray', 'Mixed solid/pattern', 'Embroidery', 'Jacquard', 'Metallic', 'Mesh', 'Sequin', 'Slub', 'Contrast']
25


In [9]:
BATCH = len(unique_patterns) // BATCH_SIZE + int(len(unique_patterns) % BATCH_SIZE > 0)
pattern_embeddings = []
with Session(engine) as sess:
    for i in tqdm(range(BATCH)):
        docs = unique_patterns[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]
        vecs = client.create_embeddings(docs, EMBEDDING_DEPLOYMENT_NAME)
        for c,v in zip(docs, vecs):
            sess.add(d.pattern(name=c.lower(), factors=v))
        sess.commit()
        pattern_embeddings += vecs
        if i%30 == 0:
            time.sleep(10)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:10<00:00,  5.41s/it]


In [10]:
unique_groups = df.product_group_name.unique()
print(unique_groups)
print(len(unique_groups))

['Garment Upper body' 'Garment Lower body' 'Swimwear' 'Socks & Tights'
 'Shoes' 'Garment Full body' 'Underwear' 'Accessories' 'Nightwear'
 'Cosmetic' 'Stationery']
11


In [28]:
def replace_to_fit_ltree(string):
    return string.lower()\
        .replace(" ","_")\
        .replace("-","_")\
        .replace("/","_or_")\
        .replace("&","_and_")

In [41]:
set_groups = {
    "Garment Upper body": "set1",
    "Garment Lower body": "set1",
    "Garment Full body": "set2",
    "Swimwear": "set3",
    "Nightwear": "set4"
}
with Session(engine) as sess:
    for i in range(5):
        print(f"set{i+1}")
        gar = d.garment(name=f"set{i+1}",
             factors=[0]*1536,
             path=Ltree(f"set{i+1}"))
        sess.add(gar)
    sess.commit()
    for gp in unique_groups:
        set_name = set_groups[gp] if gp in set_groups else "set5"
        vecgp = client.create_embeddings([gp], EMBEDDING_DEPLOYMENT_NAME)
        rootname = replace_to_fit_ltree(gp)
        print(f"{set_name}.{rootname}")
        sess.add(d.garment(name=gp,
            factors=vecgp[0],
            path=Ltree(f"{set_name}.{rootname}")
        ))
        sess.commit()
        unique_garments = list(df[df.product_group_name==gp].product_type_name.unique())
        vecs = []
        BATCH = len(unique_garments) // BATCH_SIZE + int(len(unique_garments) % BATCH_SIZE > 0)
        for i in tqdm(range(BATCH)):
            docs = unique_garments[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]
            vecs += client.create_embeddings(docs, EMBEDDING_DEPLOYMENT_NAME)
        for ga, v in zip(unique_garments, vecs):
            childname = replace_to_fit_ltree(ga)
            print(f"{set_name}.{rootname}.{childname}")
            sess.add(d.garment(name=ga,
              factors=list(v),
              path=Ltree(f"{set_name}.{rootname}.{childname}")))
        sess.commit()
        time.sleep(10)

set1
set2
set3
set4
set5
set1.garment_upper_body


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.16it/s]


set1.garment_upper_body.t_shirt
set1.garment_upper_body.sweater
set1.garment_upper_body.shirt
set1.garment_upper_body.blazer
set1.garment_upper_body.top
set1.garment_upper_body.hoodie
set1.garment_upper_body.coat
set1.garment_upper_body.jacket
set1.garment_upper_body.vest_top
set1.garment_upper_body.blouse
set1.garment_upper_body.cardigan
set1.garment_upper_body.polo_shirt
set1.garment_upper_body.bodysuit
set1.garment_upper_body.tailored_waistcoat
set1.garment_lower_body


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.32it/s]

set1.garment_lower_body.skirt
set1.garment_lower_body.trousers
set1.garment_lower_body.shorts
set1.garment_lower_body.outdoor_trousers
set1.garment_lower_body.leggings_or_tights





set3.swimwear


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.73it/s]

set3.swimwear.bikini_top
set3.swimwear.swimwear_bottom
set3.swimwear.swimwear_set
set3.swimwear.swimsuit
set3.swimwear.sarong





set5.socks__and__tights


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 15.01it/s]

set5.socks__and__tights.underwear_tights
set5.socks__and__tights.socks





set5.shoes


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.47it/s]

set5.shoes.sneakers
set5.shoes.sandals
set5.shoes.boots
set5.shoes.flat_shoe
set5.shoes.ballerinas
set5.shoes.other_shoe
set5.shoes.flat_shoes
set5.shoes.flip_flop
set5.shoes.slippers





set2.garment_full_body


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.36it/s]

set2.garment_full_body.garment_set
set2.garment_full_body.dress
set2.garment_full_body.jumpsuit_or_playsuit
set2.garment_full_body.outdoor_overall
set2.garment_full_body.dungarees





set5.underwear


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.03it/s]

set5.underwear.underwear_bottom
set5.underwear.bra
set5.underwear.underwear_body
set5.underwear.robe
set5.underwear.nipple_covers





set5.accessories


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 12.08it/s]

set5.accessories.other_accessories
set5.accessories.bracelet
set5.accessories.hair_clip
set5.accessories.hair_or_alice_band
set5.accessories.hat_or_beanie
set5.accessories.earring
set5.accessories.scarf
set5.accessories.cap_or_peaked
set5.accessories.gloves
set5.accessories.wallet
set5.accessories.watch
set5.accessories.necklace
set5.accessories.hair_string
set5.accessories.bag
set5.accessories.beanie
set5.accessories.sunglasses
set5.accessories.belt
set5.accessories.hat_or_brim
set5.accessories.tie





set4.nightwear


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.93it/s]

set4.nightwear.pyjama_set
set4.nightwear.pyjama_jumpsuit_or_playsuit
set4.nightwear.pyjama_bottom
set4.nightwear.night_gown





set5.cosmetic


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.56it/s]

set5.cosmetic.fine_cosmetics





set5.stationery


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.80it/s]

set5.stationery.marker_pen





In [44]:
with Session(engine) as sess:
    stmt = select(d.garment.path)
    res = sess.execute(stmt).scalars().all()
    print(res[:10])

[Ltree('set1'), Ltree('set2'), Ltree('set3'), Ltree('set4'), Ltree('set5'), Ltree('set1.garment_upper_body'), Ltree('set1.garment_upper_body.t_shirt'), Ltree('set1.garment_upper_body.sweater'), Ltree('set1.garment_upper_body.shirt'), Ltree('set1.garment_upper_body.blazer')]


In [45]:
from sqlalchemy import func

In [50]:
with Session(engine) as sess:
    second_layer = sess.query(d.garment.path)\
        .filter(func.nlevel(d.garment.path)==2).all()
    print(second_layer, len(second_layer))
    up_body = sess.query(d.garment)\
        .filter_by(name="Garment Upper body")\
        .one()
    siblings = sess.query(d.garment.path)\
        .filter(
        d.garment.path.descendant_of(up_body.path[:-1]),
        func.nlevel(d.garment.path) == 2,
        d.garment.id != up_body.id
    ).all()
    print(siblings)

[(Ltree('set1.garment_upper_body'),), (Ltree('set1.garment_lower_body'),), (Ltree('set3.swimwear'),), (Ltree('set5.socks__and__tights'),), (Ltree('set5.shoes'),), (Ltree('set2.garment_full_body'),), (Ltree('set5.underwear'),), (Ltree('set5.accessories'),), (Ltree('set4.nightwear'),), (Ltree('set5.cosmetic'),), (Ltree('set5.stationery'),)] 11
[(Ltree('set1.garment_lower_body'),)]


### Procedure of mode 4
- Input: image
- captioner
- extract color/pattern? 
- compare color embedding
- compare garment embedding L2/L3
    - siblings != self
    - compare setn L3 with set5 L3

In [58]:
with Session(engine) as sess:
    hair_or_alice_band = sess.query(d.garment)\
        .filter_by(
            name="Hair/alice band"
        ).one()
    ancestors = sess.query(d.garment.name)\
        .filter(d.garment.path.ancestor_of(
        hair_or_alice_band.path
    )).all()
    print(ancestors)

[('set5',), ('Accessories',), ('Hair/alice band',)]
