In [37]:
# Mount Google Drive so the files under /content/drive used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
import pandas as pd
import numpy as np

import os
import shutil
import re
import skimage.io
from skimage.transform import resize

from tqdm.notebook import tqdm

In [3]:
# Raw inputs: item descriptions (df1) and outfit compositions (df2).
df1 = pd.read_json(path_or_buf="/content/drive/MyDrive/Colab_Notebooks/RecSys/item.json")
df2 = pd.read_json(path_or_buf="/content/drive/MyDrive/Colab_Notebooks/RecSys/outfit.json")

In [4]:
# Flip the frame so that every row describes one item.
items = df1.transpose()
items

Unnamed: 0,title,desc,url,cate
4004971,navy ant embroidery cropped trousers,flared stretch wool-blend trousers in navy fea...,https://cldny.ccindex.cn/ssenseweb/image/uploa...,bottom
2454398,silver ring necklace,chain link and s-link chain necklace in oxidiz...,https://res-2.cloudinary.com/ssenseweb/image/u...,other
2950338,black chintz puffer jacket,long sleeve quilted down-filled nylon chintz j...,https://res-2.cloudinary.com/ssenseweb/image/u...,outer
3786049,yellow marsll edition suede strappy sandals,suede heeled sandals in yellow. smudging throu...,https://img.ssensemedia.com/image/upload/b_whi...,other
3752229,white copper elastic urban knot sneakers,low-top buffed calfskin sneakers in white. rou...,https://img.ssensemedia.com/image/upload/b_whi...,other
...,...,...,...,...
3060318,yellow writing logo tshirt,short sleeve cotton jersey tshirt in yellow. r...,https://res-3.cloudinary.com/ssenseweb/image/u...,top
2929798,black down garrot coat,long sleeve down-filled quilted nylon satin ja...,https://res-2.cloudinary.com/ssenseweb/image/u...,outer
2611608,blue medium puzzle bag,convertible classic calfskin duffle bag in ton...,https://res-2.cloudinary.com/ssenseweb/image/u...,other
4043611,silver bold earcuff,single ear cuff in polished sterling silver. l...,https://cldny.ccindex.cn/ssenseweb/image/uploa...,other


In [5]:
# Title keywords that identify footwear (regex alternation, matched as substrings).
shoes_types = 'sneakers|sandals|boots|heels|slides|slippers|loafers|derbys|mules'
# Row positions (not item ids) of the items whose title mentions a shoe type.
# na=False keeps the mask strictly boolean: without it a missing title gives NaN,
# which nonzero() treats as truthy, so the item would wrongly be tagged as shoes.
shoes_mask = items.title.str.contains(shoes_types, na=False)
shoes_indices = np.flatnonzero(shoes_mask.to_numpy())

In [6]:
# make shoes category
# shoes_indices holds row positions, not item ids, so switch to a default
# 0..n-1 index first: the .loc labels then coincide with those positions.
items = items.reset_index()
items.loc[shoes_indices,'cate'] = 'shoes'
# put the item ids (moved into the 'index' column by reset_index) back as the index
items = items.set_index('index')

In [7]:
# Flip the frame so that every row describes one outfit.
outfits = df2.transpose()
outfits

Unnamed: 0,clo,acces
N09724,"[3204429, 3014298]",[3216409]
N06137,"[3598869, 3599069]","[3606119, 3635179]"
N15122,"[4068821, 4069351]","[4075281, 4074381, 4074891]"
N06235,"[3476689, 3476559]",[3465369]
N04745,"[2933268, 2936288]","[2929748, 2931428]"
...,...,...
N14050,"[4377071, 4382551]","[4385061, 4193121]"
N03001,"[2551368, 2689918, 2755138]",[2761398]
N07201,"[3369399, 3476579]","[3365519, 3445259]"
N01552,"[2923998, 2924088, 2923668]",[2924788]


In [8]:
# Ids of items whose images fail to load; they are excluded when pairs are built.
broken_ids = [
    3378099, 3726509, 3426269,
    3460729, 3812609, 3428099,
    3763859, 3764209, 3777379,
]

In [9]:
# Number of items in every category (after the shoes relabelling).
print(items['cate'].value_counts())

top       6261
other     4365
bottom    3944
shoes     2605
outer     2345
suit       798
Name: cate, dtype: int64


In [10]:
# Build bottom-shoes pairs: for every outfit keep the first 'bottom' item and the
# first 'shoes' item found among its clothes + accessories.
#
# The category lookup is built once; running items.query(...) for every item
# rescans the whole items frame each time.
# Assumes item ids are unique in items.index - TODO confirm.
cate_by_id = items['cate'].to_dict()
broken = set(broken_ids)

pairs_shoes = {}
for outfit_id, clo, access in zip(outfits.index, outfits['clo'], outfits['acces']):
    # a look may contain several bottoms / shoes; only the first of each is kept
    bottom_id = None
    shoes_id = None

    for item_id in clo + access:
        idx = int(item_id)
        if idx in broken:
            # NOTE(review): a broken id stops the scan of this look, but a pair
            # completed *before* it is still kept - confirm this is intended
            # (the top-bottom cell skips the whole outfit instead).
            break
        cate = cate_by_id[idx]
        if cate == 'bottom' and bottom_id is None:
            bottom_id = item_id
        elif cate == 'shoes' and shoes_id is None:
            shoes_id = item_id

    if bottom_id is not None and shoes_id is not None:
        # always stored as [bottom, shoes], regardless of the order in which the
        # two were met, because the columns are later named by position
        pairs_shoes[outfit_id] = [bottom_id, shoes_id]

In [11]:
# Build top-bottom pairs: keep outfits whose clothes list is exactly
# [top, bottom] and contains no broken item.
#
# The category lookup is built once; running items.query(...) for every item
# rescans the whole items frame each time.
# Assumes item ids are unique in items.index - TODO confirm.
cate_by_id = items['cate'].to_dict()
broken = set(broken_ids)

pairs_bot = {}
for outfit_id, clo in zip(outfits.index, outfits['clo']):
    if len(clo) != 2:
        continue
    first_idx, second_idx = int(clo[0]), int(clo[1])
    if first_idx in broken or second_idx in broken:
        continue
    # the order matters: the first item must be the top, the second the bottom
    if cate_by_id[first_idx] == 'top' and cate_by_id[second_idx] == 'bottom':
        pairs_bot[outfit_id] = clo

This gives 3548 outfits with the categories top - bottom  
and 6798 outfits with the categories bottom - shoes.

However, there may be duplicates: for example, the same top and bottom can appear in several outfits that differ only in their shoes.

In [12]:
# Turn the {outfit_id: [id, id]} dicts into two-column frames (one row per
# outfit) and drop repeated pairs.
pairs_bot = (
    pd.DataFrame(pairs_bot)
    .T
    .rename(columns={0: 'top', 1: 'bottom'})
    .drop_duplicates()
)
pairs_shoes = (
    pd.DataFrame(pairs_shoes)
    .T
    .rename(columns={0: 'bottom', 1: 'shoes'})
    .drop_duplicates()
)
pairs_bot

Unnamed: 0,top,bottom
N06137,3598869,3599069
N15122,4068821,4069351
N06235,3476689,3476559
N15483,3822611,3822821
N09133,3491469,3486879
...,...,...
N06409,3389469,3673019
N06960,3494229,3729419
N08979,3458369,3458509
N11680,4151601,4337331


After removing duplicates we have 2876 top-bottom outfits and 5667 bottom-shoes outfits

In [13]:
"""
Move bottom items to separate folder

bottom_set = set(list(pairs_shoes.bottom))

i = 0
for dirpath,dirnames,filenames in os.walk(path):
    for fname in filenames:
        fpath = os.path.join(path, fname)
        
        if str(re.sub('.jpg', '', fname)) in bottom_set:
            i += 1
            try:
                shutil.copy(fpath, path_bot)
            except:
                pass
"""

"\nMove bottom items to separate folder\n\nbottom_set = set(list(pairs_shoes.bottom))\n\ni = 0\nfor dirpath,dirnames,filenames in os.walk(path):\n    for fname in filenames:\n        fpath = os.path.join(path, fname)\n        \n        if str(re.sub('.jpg', '', fname)) in bottom_set:\n            i += 1\n            try:\n                shutil.copy(fpath, path_bot)\n            except:\n                pass\n"

In [14]:
# Folders that hold the item images, one folder per category.
path_top, path_bot, path_shoes = (
    "/content/drive/MyDrive/Colab_Notebooks/RecSys/items/recs/top",
    "/content/drive/MyDrive/Colab_Notebooks/RecSys/items/recs/bottom",
    "/content/drive/MyDrive/Colab_Notebooks/RecSys/items/recs/shoes",
)

In [15]:
def make_path_dictionary(folder_path: str) -> dict:
    """
    Create dict: item_id - path

    Walks ``folder_path`` recursively and maps every file name, with a trailing
    ``.jpg`` removed, to the path of that file.

    Args:
        folder_path: directory that holds the images.

    Returns:
        dict ``{file name without '.jpg': file path}``. Files that do not end in
        ``.jpg`` are keyed by their full file name. If the same name occurs in
        several sub-folders, the one visited last wins. An empty dict is
        returned when the folder does not exist or holds no files.
    """
    jpg_ext = '.jpg'
    photo_ids = {}
    for dirpath, _dirnames, filenames in os.walk(folder_path):
        for fname in filenames:
            # strip the extension only when it really is a trailing ".jpg"; the
            # unanchored regex '.jpg' also removed e.g. "xjpg" inside the name
            item_id = fname[:-len(jpg_ext)] if fname.endswith(jpg_ext) else fname
            # join with dirpath (not folder_path) so that files found in
            # sub-folders get a path that actually exists
            photo_ids[item_id] = os.path.join(dirpath, fname)

    return photo_ids

In [16]:
# One id -> path lookup per category folder.
photo_ids_top, photo_ids_bottom, photo_ids_shoes = (
    make_path_dictionary(category_dir)
    for category_dir in (path_top, path_bot, path_shoes)
)

In [17]:
# Image paths for every top-bottom pair, in the row order of pairs_bot.
top_lst = [photo_ids_top[top_id] for top_id in pairs_bot['top']]
bot_lst = [photo_ids_bottom[bottom_id] for bottom_id in pairs_bot['bottom']]

In [18]:
# Two path columns (top, bot) that share the outfit ids of pairs_bot as index.
paths_pairs_bot = pd.DataFrame([top_lst, bot_lst]).T.rename(columns={0: 'top', 1: 'bot'})
paths_pairs_bot.index = pairs_bot.index
paths_pairs_bot

Unnamed: 0,top,bot
N06137,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...
N15122,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...
N06235,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...
N15483,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...
N09133,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...
...,...,...
N06409,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...
N06960,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...
N08979,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...
N11680,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...


In [19]:
# Image paths for every bottom-shoes pair, in the row order of pairs_shoes.
shoes_lst = [photo_ids_shoes[shoes_id] for shoes_id in pairs_shoes['shoes']]
bot_lst = [photo_ids_bottom[bottom_id] for bottom_id in pairs_shoes['bottom']]

In [20]:
# Two path columns (bot, shoes) that share the outfit ids of pairs_shoes as index.
paths_pairs_shoes = pd.DataFrame([bot_lst, shoes_lst]).T.rename(columns={0: 'bot', 1: 'shoes'})
paths_pairs_shoes.index = pairs_shoes.index
paths_pairs_shoes

Unnamed: 0,bot,shoes
N06137,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...
N15122,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...
N06235,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...
N07055,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...
N09133,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...
...,...,...
N09173,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...
N11680,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...
N07201,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...
N01552,/content/drive/MyDrive/Colab_Notebooks/RecSys/...,/content/drive/MyDrive/Colab_Notebooks/RecSys/...


In [21]:
# Persist both path tables as CSV files on Drive.
for path_table, csv_path in (
    (paths_pairs_shoes, "/content/drive/MyDrive/Colab_Notebooks/RecSys/arrays/paths_pairs_shoes.csv"),
    (paths_pairs_bot, "/content/drive/MyDrive/Colab_Notebooks/RecSys/arrays/paths_pairs_bot.csv"),
):
    path_table.to_csv(csv_path)

Make data for metrics

In [55]:
# Count the bottoms recorded for every top, then tally how many tops share each count.
bottoms_per_top = pairs_bot.groupby('top').aggregate('count')
pd.DataFrame(bottoms_per_top.value_counts()).rename(columns={0: 'count'})

Unnamed: 0_level_0,count
bottom,Unnamed: 1_level_1
1,2318
2,189
3,30
4,14
8,3
5,2


Let's see how many bottoms are paired with each number of different shoes:

In [54]:
# Count the shoes recorded for every bottom, then tally how many bottoms share each count.
shoes_per_bottom = pairs_shoes.groupby('bottom').aggregate('count')
pd.DataFrame(shoes_per_bottom.value_counts()).rename(columns={0: 'count'})

Unnamed: 0_level_0,count
shoes,Unnamed: 1_level_1
1,2161
2,575
3,224
4,113
5,76
6,30
7,23
8,21
9,12
14,4


So, according to the table above, some bottoms are even paired with as many as 14 different shoes.

In [39]:
# Save the id pairs with integer ids. astype converts whole columns at once and
# replaces DataFrame.applymap, which is deprecated in recent pandas releases.
pairs_bot.astype('int64').to_csv("/content/drive/MyDrive/Colab_Notebooks/RecSys/arrays/pairs_bot.csv")
pairs_shoes.astype('int64').to_csv("/content/drive/MyDrive/Colab_Notebooks/RecSys/arrays/pairs_shoes.csv")

Downloading and resizing images

In [21]:
# Target size passed to resize() for every image.
dimx = dimy = 128

In [24]:
"""from torchvision import transforms as T
train_transforms = T.Compose([
                              T.Resize(128),
                              T.ToTensor(),
                              T.Normalize(        
                                  mean=[0.485, 0.456, 0.406],
                                  std=[0.229, 0.224, 0.225]
                                  )
                              
])

test_transforms = T.Compose([
                             T.Resize(128),
                             T.ToTensor(),
                             T.Normalize(        
                                  mean=[0.485, 0.456, 0.406],
                                  std=[0.229, 0.224, 0.225]
                                  )
])
"""

In [33]:
%%time
# Read and resize each top image in a single pass: chaining two .apply calls
# keeps every full-resolution image in memory until the first pass has finished.
top_photos = paths_pairs_bot['top'].apply(
    lambda img_path: resize(skimage.io.imread(img_path), [dimx, dimy])
)

CPU times: user 5min 3s, sys: 3min 58s, total: 9min 1s
Wall time: 5min 9s


In [35]:
%%time
# Read and resize each bottom image in a single pass: chaining two .apply calls
# keeps every full-resolution image in memory until the first pass has finished.
bot_photos = paths_pairs_bot['bot'].apply(
    lambda img_path: resize(skimage.io.imread(img_path), [dimx, dimy])
)

CPU times: user 4min 49s, sys: 3min 51s, total: 8min 41s
Wall time: 11min 37s


In [34]:
# Store the resized top images on Drive.
np.save(
    file='/content/drive/MyDrive/Colab_Notebooks/RecSys/arrays/top_photos.npy',
    arr=top_photos.values,
)

In [36]:
# Store the resized bottom images (top-bottom pairs) on Drive.
np.save(
    file='/content/drive/MyDrive/Colab_Notebooks/RecSys/arrays/bot_photos.npy',
    arr=bot_photos.values,
)

In [37]:
%%time
# load images by batches due to lack of RAM memory
# Each image is read and resized in one step, so the full-resolution originals
# of the batch are never all held in memory at the same time.
bot_photos_shoes_1 = paths_pairs_shoes['bot'].iloc[0:1500].apply(
    lambda img_path: resize(skimage.io.imread(img_path), [dimx, dimy])
)

CPU times: user 2min 33s, sys: 2min 3s, total: 4min 36s
Wall time: 4min 21s


In [38]:
%%time
# Second batch of bottom images (rows 1500-3499), read and resized one image at
# a time so the full-resolution originals are not all kept in memory.
bot_photos_shoes_2 = paths_pairs_shoes['bot'].iloc[1500:3500].apply(
    lambda img_path: resize(skimage.io.imread(img_path), [dimx, dimy])
)

CPU times: user 3min 25s, sys: 3min 8s, total: 6min 34s
Wall time: 5min 27s


In [39]:
%%time
# Last batch of bottom images (row 3500 onwards), read and resized one image at
# a time so the full-resolution originals are not all kept in memory.
bot_photos_shoes_3 = paths_pairs_shoes['bot'].iloc[3500:].apply(
    lambda img_path: resize(skimage.io.imread(img_path), [dimx, dimy])
)

CPU times: user 3min 42s, sys: 2min 51s, total: 6min 34s
Wall time: 5min 27s


In [40]:
# Glue the three batches back together into one flat array, in batch order.
bot_photos_shoes_12 = np.concatenate(
    (np.ravel(bot_photos_shoes_1), np.ravel(bot_photos_shoes_2))
)
bot_photos_shoes = np.concatenate(
    (bot_photos_shoes_12, np.ravel(bot_photos_shoes_3))
)

In [41]:
# Store the resized bottom images (bottom-shoes pairs) on Drive.
np.save(
    file='/content/drive/MyDrive/Colab_Notebooks/RecSys/arrays/bot_photos_shoes.npy',
    arr=bot_photos_shoes,
)

In [22]:
%%time
# First batch of shoes images (rows 0-1499), read and resized one image at a
# time so the full-resolution originals are not all kept in memory.
shoes_photos_1 = paths_pairs_shoes['shoes'].iloc[:1500].apply(
    lambda img_path: resize(skimage.io.imread(img_path), [dimx, dimy])
)

CPU times: user 2min 37s, sys: 2min 9s, total: 4min 46s
Wall time: 3min 56s


In [23]:
%%time
# Second batch of shoes images (rows 1500-3499), read and resized one image at
# a time so the full-resolution originals are not all kept in memory.
shoes_photos_2 = paths_pairs_shoes['shoes'].iloc[1500:3500].apply(
    lambda img_path: resize(skimage.io.imread(img_path), [dimx, dimy])
)

CPU times: user 3min 22s, sys: 2min 39s, total: 6min 1s
Wall time: 3min 24s


In [24]:
%%time
# Last batch of shoes images (row 3500 onwards), read and resized one image at
# a time so the full-resolution originals are not all kept in memory.
shoes_photos_3 = paths_pairs_shoes['shoes'].iloc[3500:].apply(
    lambda img_path: resize(skimage.io.imread(img_path), [dimx, dimy])
)

CPU times: user 3min 50s, sys: 2min 58s, total: 6min 48s
Wall time: 3min 55s


In [25]:
# Glue the three batches back together into one flat array, in batch order.
shoes_photos_12 = np.concatenate(
    (np.ravel(shoes_photos_1), np.ravel(shoes_photos_2))
)
shoes_photos = np.concatenate(
    (shoes_photos_12, np.ravel(shoes_photos_3))
)

In [26]:
# Store the resized shoes images on Drive.
np.save(
    file='/content/drive/MyDrive/Colab_Notebooks/RecSys/arrays/shoes_photos.npy',
    arr=shoes_photos,
)

In [None]:
#top_photos = np.stack(top_photos.values)
#bot_photos = np.stack(bot_photos.values)
#bot_photos_shoes = np.stack(bot_photos_shoes.values)
#shoes_photos = np.stack(shoes_photos)