In [122]:
import torch
import torch.nn as nn
import clip
from PIL import Image
import pandas as pd
import requests
import os.path as osp
import pickle
import random
import numpy as np
from pathlib import Path
import sys
from operator import itemgetter
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import time
import shutil

In [123]:
dataset_path = "C:/Users/aphri/Documents/t0002/work/data/w210_data/target_store_furniture_datasets.csv"
image_storage = "C:/Users/aphri/Documents/t0002/work/data/w210_data/target_images"
pickle_path = "C:/Users/aphri/Documents/t0002/work/data/w210_data/category_pickle"
model_path = "C:/Users/aphri/Documents/t0002/work/data/w210_data/model"

Path(image_storage).mkdir(parents=True, exist_ok=True)
Path(pickle_path).mkdir(parents=True, exist_ok=True)
Path(model_path).mkdir(parents=True, exist_ok=True)

In [124]:
def read_pickle(dir):
    with open(dir, 'rb') as handle:
        b = pickle.load(handle)
    return b


def write_pickle(dir, data):
    with open(dir, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [125]:
def save_processed_data(name, uid_list, des_list, eimage_list, edes_list):
    df = pd.DataFrame(data={
        "uid": uid_list,
        "description": des_list,
        "encoded_image": eimage_list,
        "encoded_description": edes_list
    })
    
    write_pickle(name, df)

In [129]:
def image_path(uid):
    return osp.join(image_storage, f"{uid}.jpg")

In [126]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

In [3]:
d1 = pd.read_csv(dataset_path)

In [22]:
len(d1)

42215

In [10]:
d1.columns

Index(['title', 'url', 'brand', 'main_image', 'sku', 'description',
       'raw_description', 'gtin13', 'currency', 'price', 'availability',
       'availableDeliveryMethod', 'available_branch', 'primary_category',
       'sub_category_1', 'sub_category_2', 'sub_category_3', 'images',
       'raw_specifications', 'specifications', 'highlights', 'raw_highlights',
       'uniq_id', 'scraped_at'],
      dtype='object')

In [None]:
d1 = d1[d1["primary_category"] == "Furniture"]

In [105]:
d1 = d1[~(d1["primary_category"].isna() & d1["sub_category_1"].isna() & d1["sub_category_2"].isna() & d1["sub_category_3"].isna())]
d1 = d1[~(d1["sub_category_1"].isna() & d1["sub_category_2"].isna() & d1["sub_category_3"].isna())]
d1 = d1[~d1["description"].isna()]

In [106]:
def combine_category(x):
    res = x["sub_category_1"]
    if not pd.isna(x["sub_category_2"]):
        res += ", " + x["sub_category_2"]
    if not pd.isna(x["sub_category_3"]):
        res += ", " + x["sub_category_3"]
    
    return res

In [107]:
d1["combined_category"] = d1.apply(combine_category, axis=1)

In [112]:
len(d1)

34411

In [108]:
d1.groupby("combined_category")["uniq_id"].count().max()

3320

In [109]:
np.unique(d1["combined_category"])[:10]

array(['Bathroom Furniture', 'Bedroom Furniture, Armoires & Wardrobes',
       'Bedroom Furniture, Bed Frames',
       'Bedroom Furniture, Bedroom Sets & Collections',
       'Bedroom Furniture, Beds', 'Bedroom Furniture, Dressers',
       'Bedroom Furniture, Headboards',
       'Bedroom Furniture, Jewelry Armoires',
       'Bedroom Furniture, Nightstands', 'Bedroom Furniture, Vanities'],
      dtype=object)

In [131]:
d1["description"][:10]

0     The Bush Furniture Salinas Small Computer Desk...
1     You can have extra storage space for your bedr...
2     Clean lines and lustrous nailheads give this u...
3     Enhance the look of your room with the Broadwa...
6     The Lexington 70” TV Stand features a cantilev...
8     Include this recliner in your home for full co...
9     The Fabian kitchen cabinet has the practical a...
10    Make a splash with this pair of extraordinary,...
11    Refresh your living room with this light gray ...
12    It began in 1893 when Edsko Hekman ventured fr...
Name: description, dtype: object

In [132]:
d1["description"][0]

"The Bush Furniture Salinas Small Computer Desk with Hutch takes timeless Mission styling and adds plenty of modern touches to help you complete your daily tasks. Technology-ready features include an integrated pullout keyboard tray or laptop shelf and a work surface with built-in wire management to keep cords and cables in order. The Hutch has plenty of open storage space for work-in-progress, books, decorations and more while the 48W Desk features a vertical storage cabinet along with a box drawer to organize your office supplies and writing utensils. An ideal Computer Desk for small spaces, the compact design boasts a sturdy construction with clean lines, tapered legs and attractive wood detailing. The durable desktop supports up to 200 pounds and offers ample space to complete projects and assignments, pay bills, or just surf the web. Find the perfect look for any work or living space with your choice of seven attractive finishes complemented by Tumbled Pewter hardware. This home o

In [111]:
len(d1[d1["description"].isna()])

0

In [113]:
cat_set = set(d1["combined_category"].values)

In [137]:
import openai
openai.api_key = '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'

In [140]:
def chatgpt_summary(description):
    prefix = "summarize the following product description for me with no more than 30 words:"
    messages = [{"role": "user", "content": f"{prefix} {description}"}]
    chat = openai.ChatCompletion.create(
        model="gpt-3.5-turbo", messages=messages
    )
    reply = chat.choices[0].message.content
    return reply

In [149]:
des_dict = dict(d1[["uniq_id", "description"]].values)

res_dict = {}
res_path = osp.join(pickle_path, "des_res.pkl")
if (osp.exists(res_path)):
    res_dict = read_pickle(res_path)
    
count = 0
for k, v in des_dict.items():
    if k in res_dict:
        continue
    res_dict[k] = chatgpt_summary(v)
    
    count += 1
    if count > 100:
        print(f"writing data...")
        write_pickle(res_path, res_dict)
        count = 0
    

writing data...


Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)

In [150]:
cat_list = []
ecat_list = []

for cat in cat_set:

    uid_list = []
    des_list = []
    eimage_list = []
    edes_list = []
    
    ecat = clip.tokenize(cat)
    cat_list.append(cat)
    ecat_list.append(ecat)

    for idx, row in d1[d1["combined_category"] == cat].iterrows():
        uid = row.uniq_id
        des = row.description[:50]

        uid_list.append(uid)
        des_list.append(des)
        image = preprocess(Image.open(image_path(uid))).unsqueeze(0)
        edes = clip.tokenize(des)

        eimage_list.append(image)
        edes_list.append(edes)
#         try:
#             image = preprocess(Image.open(image_path(uid))).unsqueeze(0)
#             edes = clip.tokenize(des)

#             eimage_list.append(image)
#             edes_list.append(edes)
#         except:
#             print(f"failed: {uid}, {cat}, {des}")

    if len(uid_list) > 0:
        print(f"saving category: {cat}")
        save_processed_data(osp.join(pickle_path, f"{cat}.pkl"), uid_list, des_list, eimage_list, edes_list)
    
print(f"saving encoded categories")
df = pd.DataFrame(data={
    "category": cat_list,
    "encoded_category": ecat_list
})

write_pickle(osp.join(pickle_path, f"categories.pkl"), df)

saving category: Living Room Furniture, Storage Furniture
saving category: Living Room Furniture, Chairs, Recliners
saving category: Bedroom Furniture, Jewelry Armoires
saving category: Dorm Furniture
saving category: Home Office Furniture, Bookshelves & Bookcases
saving category: Bedroom Furniture, Vanities
saving category: Bedroom Furniture, Bed Frames
saving category: Kitchen & Dining Furniture, Dining Room Sets & Collections
saving category: Home Office Furniture, Home Office Sets & Collections
saving category: Living Room Furniture, Room Dividers
saving category: Living Room Furniture, Sofa Beds
saving category: Bathroom Furniture
saving category: Living Room Furniture, Chairs, Gliders & Rockers
saving category: Bedroom Furniture, Headboards
saving category: Living Room Furniture, Coffee Tables
saving category: Living Room Furniture, Sofas & Couches
saving category: Living Room Furniture, Sectional Sofas
saving category: Kitchen & Dining Furniture, Bar Stools & Counter Stools
savi