In [1]:
import torch
import torch.nn as nn
import clip
from PIL import Image
import pandas as pd
import requests
import os.path as osp
import pickle
import random
import numpy as np
from pathlib import Path
import sys
from operator import itemgetter
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import time
import shutil

  warn(f"Failed to load image Python extension: {e}")


In [11]:
def read_pickle(dir):
    with open(dir, 'rb') as handle:
        b = pickle.load(handle)
    return b


def write_pickle(dir, data):
    with open(dir, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
        

class Timer:
    def __init__(self):

        self.t1 = None

    @staticmethod
    def delta_to_string(td):

        res_list = []

        def format():
            return ", ".join(reversed(res_list)) + " elapsed."

        seconds = td % 60
        td //= 60
        res_list.append(f"{round(seconds,3)} seconds")

        if td <= 0:
            return format()

        minutes = td % 60
        td //= 60
        res_list.append(f"{minutes} minutes")

        if td <= 0:
            return format()

        hours = td % 24
        td //= 24
        res_list.append(f"{hours} hours")

        if td <= 0:
            return format()

        res_list.append(f"{td} days")

        return format()

    def __enter__(self):

        self.t1 = time.time()

    def __exit__(self, *args, **kwargs):

        t2 = time.time()
        td = t2 - self.t1

        print(self.delta_to_string(td))


def top_n(input_dict, n):
    return dict(sorted(input_dict.items(), key=itemgetter(1), reverse=True)[:n])


def find_products(text_input, category_df, image_pickle_path):

    text_input = [text_input]

    # stage one, compare categories
    category_df = category_df[~category_df["encoded_category"].isna()]
    categories = list(category_df["category"].values)

    categories_features = torch.stack(list(category_df["encoded_category"].values))
    encoded_texts = clip.tokenize(text_input).to(device)

    with torch.no_grad():

        text_features = model.encode_text(encoded_texts)

        categories_features /= categories_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        similarity =  100 * categories_features @ text_features.T

    res = dict(zip(categories, similarity.reshape(-1).tolist()))

    res = sorted(res.items(), key=itemgetter(1), reverse=True)

    n = 10
    res = res[:n]
    res_set = set([r[0] for r in res])
    
    # do image matching
    res = []
    for cat in res_set:
        store_path = osp.join(image_pickle_path, f"{cat}.pkl")
        cat_res = read_pickle(store_path)
        res.append(cat_res)
    res = pd.concat(res, axis=0)
    
    uniq_ids = list(res["uid"].values)
    image_features = torch.stack(list(res["encoded_image"].values))
    similarity =  100 * image_features @ text_features.T
    res = dict(zip(uniq_ids, similarity.reshape(-1).tolist()))
    res = sorted(res.items(), key=itemgetter(1), reverse=True)
    
    n = 5
    res = res[:n]
    res_set = set([r[0] for r in res])
    
    return res_set


def show_images(res):
    n = len(res)
    fig, ax = plt.subplots(1, n)

    fig.set_figheight(5)
    fig.set_figwidth(5 * n)
    
    iterable = True
    try:
       it = ax[0]
    except:
        iterable = False

    if not iterable:
        img_path = image_path(res[0])
        img = mpimg.imread(img_path)
        ax.imshow(img)
        ax.axis("off")
    else:
        for i, image in enumerate(res):
            img_path = image_path(image)
            img = mpimg.imread(img_path)

            ax[i].imshow(img)
            ax[i].axis('off')
            # ax[i].set_title(get_label(image), fontsize=8)

    plt.subplots_adjust(wspace=0, hspace=0.1)
    plt.show()
    
    
def image_path(uid):
    return osp.join(image_storage, f"{uid}.jpg")


def load_data(pickle_path):
    category_df = read_pickle(osp.join(pickle_path, "categories.pkl"))
    meta_df = read_pickle(osp.join(pickle_path, "meta_data.pkl"))
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    
    return device, model, preprocess, category_df, meta_df


def combine_category(x):
    res = x["sub_category_1"]
    if not pd.isna(x["sub_category_2"]):
        res += ", " + x["sub_category_2"]
    if not pd.isna(x["sub_category_3"]):
        res += ", " + x["sub_category_3"]
    return res


def clean_data(d1):
    d1 = d1[d1["primary_category"] == "Furniture"]
    d1 = d1[~(d1["primary_category"].isna() & d1["sub_category_1"].isna() & d1["sub_category_2"].isna() & d1["sub_category_3"].isna())]
    d1 = d1[~(d1["sub_category_1"].isna() & d1["sub_category_2"].isna() & d1["sub_category_3"].isna())]
    d1 = d1[~d1["description"].isna()]
    d1 = d1[~d1["colors"].isna()]
    d1["colors"] = d1["colors"].astype(str)
    d1["combined_category"] = d1.apply(combine_category, axis=1)
    return d1

In [15]:
image_storage = "demo_data/target_images"
pickle_path = "demo_data/data3_pickle"
image_pickle_path = "demo_data/data3_image_pickle"
dataset_path = "data/cleaned_target_furniture_dataset.csv"
finetune_data_path = "C:/Users/aphri/Documents/t0002/work/data/w210_data/finetune_data"
subset_data_path = osp.join(finetune_data_path, "subset_data.pkl")

In [16]:
if not osp.exists(subset_data_path):
    d1 = pd.read_csv(dataset_path)
    d1 = clean_data(d1)
    d2 = d1[d1["combined_category"] == "Home Office Furniture, Bookshelves & Bookcases"]
    write_pickle(osp.join(finetune_data_path, "subset_data.pkl"), d2)
else:
    d2 = read_pickle(subset_data_path)

In [36]:
d2 = d2[["uniq_id", "combined_category", "title"]]

In [37]:
d2 = d2.reset_index(drop=True)

In [38]:
sub_meta_df = d2[:100]

In [41]:
import openai
openai.api_key = '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'

In [42]:
def get_chat_gpt_reply(title):

    user_query = f"I am looking for {title}"

    prefix = (
        "considering what the user asked before, what is the user looking for with the following request."
        " Only respond with the product description no more than 30 words:"
    )

    messages = [{"role": "user", "content": f"{prefix} {user_query}"}]
    chat = openai.ChatCompletion.create(
        model="gpt-3.5-turbo", messages=messages
    )
    reply = chat.choices[0].message.content
    
    return reply

In [43]:
converted_titles = set()
res = []

In [44]:
count = 0
for title in sub_meta_df["title"]:
    if title not in converted_titles:
        while True:
            try:
                res.append(get_chat_gpt_reply(title))
                converted_titles.add(title)
                break
            except:
                time.sleep(10)
    count += 1
    if count % 10 == 0:
        print(count)

10
20
30
40
50
60
70
80
90
100


In [45]:
sub_meta_df["chatgpt_title"] = res

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_meta_df["chatgpt_title"] = res


In [46]:
write_pickle(osp.join(pickle_path, "meta_data_chatgpt_finetune.pkl"), sub_meta_df)

In [47]:
sub_meta_df

Unnamed: 0,uniq_id,combined_category,title,chatgpt_title
0,931709d8-c36d-5d58-8ad7-b81127bc0524,"Home Office Furniture, Bookshelves & Bookcases","Best Choice Products 9-Cube Bookshelf, Display...",The user is looking for a black 9-cube bookshe...
1,720d7766-e873-5844-bbcf-0892f9d7e3fa,"Home Office Furniture, Bookshelves & Bookcases",Holland Kids' Bookshelf 4 Tier Book Organizer ...,The user is looking for a 4-tier book organize...
2,2bbed30a-5b93-5837-a9ee-4b159c3e0fd4,"Home Office Furniture, Bookshelves & Bookcases","Contemporary 47"" TV Stand Bookcase in Cappucci...","The user is looking for a Contemporary 47"" TV ..."
3,0bf04258-6c0e-507d-b9de-9aed74725249,"Home Office Furniture, Bookshelves & Bookcases","18"" Wide White Bookcase - Henn&Hart",The user is looking for a product description ...
4,200b88be-8e1e-5176-8ee6-e3b952dbf455,"Home Office Furniture, Bookshelves & Bookcases",Farmhouse Wood and Metal Shelving Unit White -...,The user is looking for a white Farmhouse Wood...
...,...,...,...,...
95,dd7415f3-e352-55a9-b019-ffd210e82787,"Home Office Furniture, Bookshelves & Bookcases",Furinno Turn-N-Tube 4 Tier Multipurpose Open W...,The user is looking for a Furinno Turn-N-Tube ...
96,c163ef19-c815-5bb5-9c6c-c6623305d931,"Home Office Furniture, Bookshelves & Bookcases","72"" Huntington Club Wood Bookcase Cherry - Mar...","The 72"" Huntington Club Wood Bookcase in Cherr..."
97,d44c276a-7448-5724-a06b-914247f8b2d1,"Home Office Furniture, Bookshelves & Bookcases",3 Sprouts High Quality Multipurpose Kids and T...,"The user is looking for a high quality, multip..."
98,ec4cf72a-57e3-5edc-84c3-eeaee7fd084d,"Home Office Furniture, Bookshelves & Bookcases","72"" Executive Open Wood Bookcase Brown - Marti...",The user is looking for a product description ...
