### Clean and filter metadata

In [1]:
import pandas as pd
import os

# Root of the extracted dataset; every other path below is derived from it.
base_path = "../data/fashion-product-images-small/myntradataset/"

# In order: the original metadata CSV, the repaired copy of it, and the
# directory that holds the product image files.
old_csv_path, csv_path, img_path = (
    os.path.join(base_path, leaf)
    for leaf in ("styles.csv", "styles_fixed.csv", "images")
)

In [2]:
# Fixing bad lines in csv file (due to commas in product names).
# The file has 10 columns and the product display name is the last one, so a
# row with more than 10 fields has had its display name split on the commas.

import csv

# newline='' is what the csv module asks for on files handed to reader/writer,
# so that line endings are handled by the csv machinery itself.
with open(old_csv_path, newline='') as rf, open(csv_path, 'w', newline='') as wf:
    csv_reader = csv.reader(rf, delimiter=',')
    csv_writer = csv.writer(wf, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for row in csv_reader:
        if len(row) > 10:
            # Keep the first 9 fields as they are and glue everything from
            # index 9 onward back into a single field. Slicing at 8 instead
            # would drop field 8 and leave the row one column short.
            save_row = row[:9]
            save_row.append(','.join(row[9:]))
        else:
            save_row = row
        csv_writer.writerow(save_row)

In [41]:
# Load the repaired metadata CSV (the file at csv_path written by the cell above).
styles = pd.read_csv(csv_path)

In [76]:
# Store article types as an ordered categorical so that every label has a
# stable integer code (read back later through .cat.codes).
styles['articleType'] = styles['articleType'].astype(pd.CategoricalDtype(ordered=True))

In [43]:
# Image file name for each product: "<id>.jpg".
# Built with vectorised string ops on the id column rather than a row-wise
# apply(axis=1), which runs a Python lambda once per row for the same result.
styles['image'] = styles['id'].astype(str) + ".jpg"
print(styles['image'])

0        15970.jpg
1        39386.jpg
2        59263.jpg
3        21379.jpg
4        53759.jpg
           ...    
44441    17036.jpg
44442     6461.jpg
44443    18842.jpg
44444    46694.jpg
44445    51623.jpg
Name: image, Length: 44446, dtype: object


In [44]:
# Report every metadata row whose image file is not present under img_path.
for image in styles['image']:
    if os.path.exists(os.path.join(img_path, image)):
        continue
    print("Image {} doesn't exist!".format(image))

Image 39403.jpg doesn't exist!
Image 39410.jpg doesn't exist!
Image 39401.jpg doesn't exist!
Image 39425.jpg doesn't exist!
Image 12347.jpg doesn't exist!


In [45]:
# Filter out rows for which images don't exist

# Only the 'image' column is needed for the check, so map over that column
# instead of applying a lambda to every full row.
img_exists = styles['image'].map(lambda name: os.path.exists(os.path.join(img_path, name)))

styles = styles[img_exists]
print(styles)

          id gender masterCategory subCategory            articleType  \
0      15970    Men        Apparel     Topwear                 Shirts   
1      39386    Men        Apparel  Bottomwear                  Jeans   
2      59263  Women    Accessories     Watches                Watches   
3      21379    Men        Apparel  Bottomwear            Track Pants   
4      53759    Men        Apparel     Topwear                Tshirts   
...      ...    ...            ...         ...                    ...   
44441  17036    Men       Footwear       Shoes           Casual Shoes   
44442   6461    Men       Footwear  Flip Flops             Flip Flops   
44443  18842    Men        Apparel     Topwear                Tshirts   
44444  46694  Women  Personal Care   Fragrance  Perfume and Body Mist   
44445  51623  Women    Accessories     Watches                Watches   

      baseColour  season    year   usage  \
0      Navy Blue    Fall  2011.0  Casual   
1           Blue  Summer  2012.0  C

### Create train-test splits

In [46]:
# Show the rows that have no value in the 'year' column.
print(styles.loc[styles['year'].isna()])

          id gender masterCategory subCategory articleType baseColour season  \
21285  53781    Men        Apparel     Topwear     Tshirts       Blue    NaN   

       year   usage                      productDisplayName      image  
21285   NaN  Sports  Puma Men Blue Sless Round Neck T-shirt  53781.jpg  


The `year` column has a NaN value (the row shown above). Because `NaN % 2 != 0` evaluates to True, rows with a missing year are assigned to the test set.

In [47]:
# Even years form the training split; every other row goes to the test split.
# NaN % 2 is NaN, which is not equal to 0, so rows without a year are test rows.
is_train = (styles['year'] % 2) == 0
is_test = ~is_train

full_train = styles.loc[is_train]
full_test = styles.loc[is_test]

print(full_train.shape)
print(full_test.shape)

(23787, 11)
(20654, 11)


### Sub-split training data for pre-training and fine-tuning

In [48]:
# Count training rows per article type and keep the 20 largest classes.
# reset_index() turns the result into a frame with the article type in one
# column and the count in a column named 0.
top_articleType = (
    full_train
    .groupby('articleType')
    .size()
    .sort_values(ascending=False)
    .iloc[:20]
    .reset_index()
)

print("Top 20 classes:")
print(top_articleType)

Top 20 classes:
     articleType     0
0        Tshirts  2749
1        Watches  2482
2   Casual Shoes  1551
3         Kurtas  1140
4         Shirts  1128
5           Tops  1059
6     Sunglasses  1042
7       Handbags  1006
8   Sports Shoes   881
9          Heels   775
10        Briefs   662
11       Wallets   600
12    Flip Flops   508
13         Socks   505
14         Belts   492
15       Sandals   477
16        Sarees   427
17       Dresses   371
18  Formal Shoes   359
19         Flats   342


In [49]:
# Boolean mask over full_train: True where the row's article type is one of
# the top classes selected above.
filter_topArticles = full_train['articleType'].isin(top_articleType['articleType'])

# Training rows that belong to the top classes.
train_data = full_train.loc[filter_topArticles]
print(train_data.shape)

(18556, 11)


In [50]:
# The remaining training rows: every article type outside the top classes.
fine_tune_data = full_train.loc[~filter_topArticles]
print(fine_tune_data.shape)

(5231, 11)


In [63]:
# Side frame pairing each train_data label with its integer category code.
test = train_data['articleType'].to_frame()
test = test.assign(code=test['articleType'].cat.codes)

# Distinct category codes that occur in train_data.
train_codes = test['code'].unique()
train_codes

array([140, 134, 104, 110,   7,  48,  39, 127,  19,  38,  63, 112,  15,
        41, 101,  51, 116,  28, 139, 100], dtype=int16)

In [64]:
# Side frame pairing each fine_tune_data label with its integer category code.
test = fine_tune_data['articleType'].to_frame()
test = test.assign(code=test['articleType'].cat.codes)

# Distinct category codes that occur in fine_tune_data.
fine_tune_codes = test['code'].unique()
fine_tune_codes

array([ 56,  13,  14, 107,  53,  90,  65, 102,  92, 109,   8,  94,  62,
       108,  31, 132,  12,   2,  58,  17,  18, 135,  22, 133,  85, 111,
        44, 128, 131, 121, 120,   3,  64,  81, 123,  73,  87,  80, 130,
        59, 141, 118,  55,  30,  99,  66,  89, 129,   0,  16,  20,  75,
       138,  29, 119,  88,  74, 122, 114, 103,  86,  98,  25,  57,  40,
        96,  82,  45,  11,  46,   4,  67, 105,  21, 137, 113,  26, 125,
        50,  61, 142, 136,  97,  93,  76,  95,  54], dtype=int16)

In [71]:
# Integer category code of every row's article type, as a NumPy array.
styles['articleType'].cat.codes.to_numpy()

array([104,  56, 140, ..., 134,  91, 140], dtype=int16)

In [74]:
# Full code -> label lookup: a category's code is its position in .cat.categories.
{code: label for code, label in enumerate(styles['articleType'].cat.categories)}

{0: 'Accessory Gift Set',
 1: 'Baby Dolls',
 2: 'Backpacks',
 3: 'Bangle',
 4: 'Basketballs',
 5: 'Bath Robe',
 6: 'Beauty Accessory',
 7: 'Belts',
 8: 'Blazers',
 9: 'Body Lotion',
 10: 'Body Wash and Scrub',
 11: 'Booties',
 12: 'Boxers',
 13: 'Bra',
 14: 'Bracelet',
 15: 'Briefs',
 16: 'Camisoles',
 17: 'Capris',
 18: 'Caps',
 19: 'Casual Shoes',
 20: 'Churidar',
 21: 'Clothing Set',
 22: 'Clutches',
 23: 'Compact',
 24: 'Concealer',
 25: 'Cufflinks',
 26: 'Cushion Covers',
 27: 'Deodorant',
 28: 'Dresses',
 29: 'Duffel Bag',
 30: 'Dupatta',
 31: 'Earrings',
 32: 'Eye Cream',
 33: 'Eyeshadow',
 34: 'Face Moisturisers',
 35: 'Face Scrub and Exfoliator',
 36: 'Face Serum and Gel',
 37: 'Face Wash and Cleanser',
 38: 'Flats',
 39: 'Flip Flops',
 40: 'Footballs',
 41: 'Formal Shoes',
 42: 'Foundation and Primer',
 43: 'Fragrance Gift Set',
 44: 'Free Gifts',
 45: 'Gloves',
 46: 'Hair Accessory',
 47: 'Hair Colour',
 48: 'Handbags',
 49: 'Hat',
 50: 'Headband',
 51: 'Heels',
 52: 'Highli

In [77]:
# Code -> label lookup built from the rows themselves, so the keys appear in
# the order the codes are first encountered in `styles`.
{code: label for code, label in zip(styles['articleType'].cat.codes, styles['articleType'])}

{104: 'Shirts',
 56: 'Jeans',
 140: 'Watches',
 128: 'Track Pants',
 134: 'Tshirts',
 110: 'Socks',
 19: 'Casual Shoes',
 7: 'Belts',
 39: 'Flip Flops',
 48: 'Handbags',
 127: 'Tops',
 13: 'Bra',
 100: 'Sandals',
 105: 'Shoe Accessories',
 120: 'Sweatshirts',
 27: 'Deodorant',
 41: 'Formal Shoes',
 14: 'Bracelet',
 72: 'Lipstick',
 38: 'Flats',
 63: 'Kurtas',
 138: 'Waistcoat',
 112: 'Sports Shoes',
 107: 'Shorts',
 15: 'Briefs',
 101: 'Sarees',
 91: 'Perfume and Body Mist',
 51: 'Heels',
 116: 'Sunglasses',
 53: 'Innerwear Vests',
 90: 'Pendant',
 84: 'Nail Polish',
 65: 'Laptop Bag',
 102: 'Scarves',
 92: 'Rain Jacket',
 28: 'Dresses',
 87: 'Night suits',
 109: 'Skirts',
 139: 'Wallets',
 8: 'Blazers',
 94: 'Ring',
 62: 'Kurta Sets',
 22: 'Clutches',
 108: 'Shrug',
 2: 'Backpacks',
 18: 'Caps',
 132: 'Trousers',
 31: 'Earrings',
 16: 'Camisoles',
 12: 'Boxers',
 58: 'Jewellery Set',
 30: 'Dupatta',
 17: 'Capris',
 69: 'Lip Gloss',
 5: 'Bath Robe',
 82: 'Mufflers',
 135: 'Tunics',
 55