In [1]:
import torch
import torch.nn as nn
import clip
from PIL import Image
import pandas as pd
import requests
import os.path as osp
import pickle
import random
import numpy as np
from pathlib import Path
import sys
from operator import itemgetter
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import time
import shutil

  warn(f"Failed to load image Python extension: {e}")


In [2]:
# File-system locations used throughout the notebook: the source CSV plus the
# image, pickle and model directories.
dataset_path = "C:/Users/aphri/Documents/t0002/work/data/w210_data/target_store_furniture_datasets.csv"
image_storage = "C:/Users/aphri/Documents/t0002/work/data/w210_data/target_images"
pickle_path = "C:/Users/aphri/Documents/t0002/work/data/w210_data/category_pickle"
model_path = "C:/Users/aphri/Documents/t0002/work/data/w210_data/model"

# Make sure every working directory exists (parents included); directories
# that are already present are left untouched.
for _directory in (image_storage, pickle_path, model_path):
    Path(_directory).mkdir(parents=True, exist_ok=True)

In [3]:
def read_pickle(dir):
    """Load and return the object pickled in the file at path ``dir``.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    from a trusted source.
    """
    with open(dir, mode="rb") as source:
        return pickle.load(source)


def write_pickle(dir, data):
    """Pickle ``data`` into the file at path ``dir``.

    The file is opened in ``'wb'`` mode, so an existing file is overwritten.
    The highest pickle protocol available to the interpreter is used.
    """
    with open(dir, mode="wb") as sink:
        pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL).dump(data)

In [4]:
def save_processed_data(name, uid_list, des_list, eimage_list, edes_list):
    """Bundle four equal-length lists into a DataFrame and pickle it to ``name``.

    The DataFrame columns are ``uid``, ``description``, ``encoded_image`` and
    ``encoded_description``, filled from the arguments in that order.
    """
    columns = {}
    columns["uid"] = uid_list
    columns["description"] = des_list
    columns["encoded_image"] = eimage_list
    columns["encoded_description"] = edes_list

    write_pickle(name, pd.DataFrame(data=columns))

In [5]:
def image_path(uid):
    """Return the path of the ``<uid>.jpg`` file inside ``image_storage``."""
    filename = "{}.jpg".format(uid)
    return osp.join(image_storage, filename)

In [6]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the CLIP ViT-B/32 model together with its preprocessing transform.
model, preprocess = clip.load("ViT-B/32", device=device)

In [7]:
# Load the dataset CSV at dataset_path into a DataFrame.
d1 = pd.read_csv(dataset_path)

In [8]:
# Number of rows currently in d1.
len(d1)

42215

In [9]:
# List the column names of d1.
d1.columns

Index(['title', 'url', 'brand', 'main_image', 'sku', 'description',
       'raw_description', 'gtin13', 'currency', 'price', 'availability',
       'availableDeliveryMethod', 'available_branch', 'primary_category',
       'sub_category_1', 'sub_category_2', 'sub_category_3', 'images',
       'raw_specifications', 'specifications', 'highlights', 'raw_highlights',
       'uniq_id', 'scraped_at'],
      dtype='object')

In [10]:
# Keep only the rows whose primary category is exactly "Furniture".
is_furniture = d1["primary_category"].eq("Furniture")
d1 = d1.loc[is_furniture]

In [11]:
# Drop rows in which all three sub-category levels are missing. This also
# covers rows where the primary category is missing as well, since those are
# a subset of the rows removed here.
sub_category_cols = ["sub_category_1", "sub_category_2", "sub_category_3"]
d1 = d1.dropna(subset=sub_category_cols, how="all")

# Drop rows that have no description.
d1 = d1.dropna(subset=["description"])

In [12]:
def combine_category(x):
    """Join a row's sub-category levels into one comma-separated label.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``"sub_category_1"``, ``"sub_category_2"`` and
        ``"sub_category_3"``.

    Returns
    -------
    The non-missing levels joined with ``", "`` in level order. When every
    level is missing, the (missing) value of ``sub_category_1`` is returned
    unchanged.
    """
    levels = (x["sub_category_1"], x["sub_category_2"], x["sub_category_3"])

    # Test each level on its own: a row may lack sub_category_1 yet still have
    # a deeper level, and concatenating a string onto NaN raises TypeError.
    present = [level for level in levels if not pd.isna(level)]
    if not present:
        return x["sub_category_1"]

    return ", ".join(present)

In [13]:
# Compute the combined category label for every row (row-wise apply).
d1["combined_category"] = d1.apply(combine_category, axis=1)

In [14]:
# Number of rows currently in d1.
len(d1)

34411

In [15]:
# Size of the largest combined category, counted as non-null uniq_id values.
d1.groupby("combined_category")["uniq_id"].count().max()

3320

In [16]:
# First ten distinct combined categories (np.unique returns them sorted).
np.unique(d1["combined_category"])[:10]

array(['Bathroom Furniture', 'Bedroom Furniture, Armoires & Wardrobes',
       'Bedroom Furniture, Bed Frames',
       'Bedroom Furniture, Bedroom Sets & Collections',
       'Bedroom Furniture, Beds', 'Bedroom Furniture, Dressers',
       'Bedroom Furniture, Headboards',
       'Bedroom Furniture, Jewelry Armoires',
       'Bedroom Furniture, Nightstands', 'Bedroom Furniture, Vanities'],
      dtype=object)

In [18]:
# Preview the first ten descriptions.
d1["description"][:10]

0     The Bush Furniture Salinas Small Computer Desk...
1     You can have extra storage space for your bedr...
2     Clean lines and lustrous nailheads give this u...
3     Enhance the look of your room with the Broadwa...
6     The Lexington 70” TV Stand features a cantilev...
8     Include this recliner in your home for full co...
9     The Fabian kitchen cabinet has the practical a...
10    Make a splash with this pair of extraordinary,...
11    Refresh your living room with this light gray ...
12    It began in 1893 when Edsko Hekman ventured fr...
Name: description, dtype: object

In [19]:
# Show one full description: the entry stored under index label 0.
d1["description"][0]

"The Bush Furniture Salinas Small Computer Desk with Hutch takes timeless Mission styling and adds plenty of modern touches to help you complete your daily tasks. Technology-ready features include an integrated pullout keyboard tray or laptop shelf and a work surface with built-in wire management to keep cords and cables in order. The Hutch has plenty of open storage space for work-in-progress, books, decorations and more while the 48W Desk features a vertical storage cabinet along with a box drawer to organize your office supplies and writing utensils. An ideal Computer Desk for small spaces, the compact design boasts a sturdy construction with clean lines, tapered legs and attractive wood detailing. The durable desktop supports up to 200 pounds and offers ample space to complete projects and assignments, pay bills, or just surf the web. Find the perfect look for any work or living space with your choice of seven attractive finishes complemented by Tumbled Pewter hardware. This home o

In [20]:
# Sanity check: number of rows whose description is missing.
len(d1[d1["description"].isna()])

0

In [43]:
def _first_words(text, limit=20):
    """Return the first ``limit`` whitespace-separated words of ``text``, joined by single spaces."""
    words = text.split()
    return " ".join(words[:limit])


# "short" holds a 20-word excerpt taken from the start of each description.
d1["short"] = d1["description"].map(_first_words)

In [44]:
# Preview the first ten shortened descriptions.
d1["short"].head(10).values

array(['The Bush Furniture Salinas Small Computer Desk with Hutch takes timeless Mission styling and adds plenty of modern touches to',
       'You can have extra storage space for your bedroom decor while adding a touch of contemporary flair with help from',
       'Clean lines and lustrous nailheads give this upholstered bed an air of sophistication. Available in a mix of bold color',
       'Enhance the look of your room with the Broadway Mirrored Side Cabinet by Lifestorey. This cabinet offers a glamorous feel',
       'The Lexington 70” TV Stand features a cantilevered shelf design that combines both contemporary and rustic looks together. The rustic',
       'Include this recliner in your home for full comfort to enjoy after a long day at work. This plush cushioned',
       "The Fabian kitchen cabinet has the practical and organizational elements you've been loofor without sacrificing style. The modern and contemporary-inspired",
       'Make a splash with this pair of extraordin

In [45]:
# Pickle the id, combined category, title and short description of every row.
meta_columns = ["uniq_id", "combined_category", "title", "short"]
d2 = d1[meta_columns]

meta_data_file = osp.join(pickle_path, f"meta_data.pkl")
write_pickle(meta_data_file, d2)

In [18]:
# Distinct combined-category labels found in d1.
cat_set = set(d1["combined_category"].values)

In [60]:
# Encode every distinct category label with CLIP's text encoder and pickle
# the labels next to their embeddings.
cat_list = list(cat_set)

# Tokenize each label separately, then stack the token rows into one batch.
category_tokens = torch.cat([clip.tokenize(label) for label in cat_list]).to(device)

with torch.no_grad():
    # Iterating the encoder output yields one embedding tensor per label.
    ecat_list = list(model.encode_text(category_tokens))

print(f"saving encoded categories")
df = pd.DataFrame(data={"category": cat_list, "encoded_category": ecat_list})

categories_file = osp.join(pickle_path, f"categories.pkl")
write_pickle(categories_file, df)

saving encoded categories


In [None]:
# Encode every distinct product title with CLIP's text encoder and pickle the
# titles next to their embeddings.

# Missing titles are dropped because clip.tokenize only accepts strings.
# unique() keeps first-appearance order, so the stored row order is the same
# on every run (iterating a set of strings is not).
title_list = d1["title"].dropna().unique().tolist()
title_set = set(title_list)

# truncate=True clips titles that exceed CLIP's context length instead of
# raising a RuntimeError part-way through the run.
etitle_list = [clip.tokenize(title, truncate=True) for title in title_list]

res = []
chunk = 100  # titles per forward pass, keeps device memory bounded
for idx in range(0, len(etitle_list), chunk):
    print(idx)  # progress: offset of the chunk being encoded
    curr_list = torch.cat(etitle_list[idx:idx+chunk]).to(device)
    with torch.no_grad():
        # Iterating the encoder output yields one embedding tensor per title.
        res += list(model.encode_text(curr_list))

print(f"saving encoded titles")
df = pd.DataFrame(data={
    "title": title_list,
    "encoded_title": res
})

write_pickle(osp.join(pickle_path, f"titles.pkl"), df)

In [19]:
# For each category, encode the images of its products with CLIP's image
# encoder and pickle the (uid, embedding) pairs to "<category>.pkl".
# Categories whose pickle already exists are skipped, so the cell can be
# re-run after an interruption.
image_chunk = 100  # images per forward pass, keeps host and device memory bounded

for cat in cat_set:

    store_path = osp.join(pickle_path, f"{cat}.pkl")
    if osp.exists(store_path):
        continue

    print(f"processing category {cat}...")
    print("processing uids...")
    uid_list = d1.loc[d1["combined_category"] == cat, "uniq_id"].tolist()

    # Nothing to encode; torch.cat on an empty list would raise.
    if len(uid_list) == 0:
        continue

    print("processing images...")
    eimage_list = []
    for start in range(0, len(uid_list), image_chunk):
        batch = []
        for uid in uid_list[start:start + image_chunk]:
            # The context manager closes the image file once it is preprocessed.
            with Image.open(image_path(uid)) as img:
                batch.append(preprocess(img).unsqueeze(0))
        image = torch.cat(batch).to(device)
        with torch.no_grad():
            # Iterating the encoder output yields one embedding tensor per image.
            eimage_list += list(model.encode_image(image))

    print(f"saving category images: {cat}")
    df = pd.DataFrame(data={
        "uid": uid_list,
        "encoded_image": eimage_list
    })
    write_pickle(store_path, df)

processing category Kitchen & Dining Furniture, Bar Carts...
processing uids...
processing images...
saving category images: Kitchen & Dining Furniture, Bar Carts
processing category Bedroom Furniture, Jewelry Armoires...
processing uids...
processing images...
saving category images: Bedroom Furniture, Jewelry Armoires
processing category Kitchen & Dining Furniture, Dining Room Sets & Collections...
processing uids...
processing images...
saving category images: Kitchen & Dining Furniture, Dining Room Sets & Collections
processing category Bathroom Furniture...
processing uids...
processing images...
saving category images: Bathroom Furniture
processing category Living Room Furniture, Loveseats...
processing uids...
processing images...
saving category images: Living Room Furniture, Loveseats
processing category Living Room Furniture, Sofas & Couches...
processing uids...
processing images...
saving category images: Living Room Furniture, Sofas & Couches
processing category Entryway F

In [None]:
# Encode every distinct short description with CLIP's text encoder and pickle
# the texts next to their embeddings.

# unique() keeps first-appearance order, so the stored row order is the same
# on every run (iterating a set of strings is not).
short_list = d1["short"].dropna().unique().tolist()
short_set = set(short_list)

short_chunk = 100  # texts per forward pass, keeps device memory bounded
eshort_list = []
for start in range(0, len(short_list), short_chunk):
    # truncate=True clips texts that exceed CLIP's context length instead of
    # raising a RuntimeError; a 20-word excerpt can still tokenize to more
    # tokens than the context holds.
    tokens = clip.tokenize(short_list[start:start + short_chunk], truncate=True).to(device)
    with torch.no_grad():
        # Iterating the encoder output yields one embedding tensor per text.
        eshort_list += list(model.encode_text(tokens))

print(f"saving encoded shorts")
df = pd.DataFrame(data={
    "short": short_list,
    "encoded_short": eshort_list
})

write_pickle(osp.join(pickle_path, f"shorts.pkl"), df)