# Preprocess dataset
Remove invalid views, images, etc.

In [1]:
import os
import re
import json
import requests

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm


Bad key "text.kerning_factor" on line 4 in
c:\anaconda\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [2]:
# Load dataset: prefer the local copy, otherwise download it from GitHub.
if os.path.exists('dataset.json'):
    # JSON files are UTF-8; be explicit so the result does not depend on the
    # platform's default encoding (e.g. cp1252 on Windows).
    with open('dataset.json', encoding='utf-8') as f:
        dataset = json.load(f)
else:
    url = 'https://raw.githubusercontent.com/thilove98/uit-shoesgan/master/dataset/dataset.json'
    # A timeout keeps the cell from hanging forever on a stalled connection.
    res = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of feeding an error page to the JSON parser.
    res.raise_for_status()
    dataset = json.loads(res.text)

## Remove invalid keys

In [3]:
# Collect the distinct view names used across the dataset
def get_all_views(dataset):
    """Return every distinct key found under ``images``, in first-seen order.

    ``dataset`` maps an item id to a record whose ``'images'`` entry is a
    mapping keyed by view name.
    """
    # A dict keeps insertion order and ignores repeated keys, which gives an
    # ordered de-duplication without scanning a list for each name.
    ordered = {}
    for record in dataset.values():
        for view_name in record['images']:
            ordered.setdefault(view_name, None)
    return list(ordered)

# Show every distinct view name currently present in the dataset.
print(get_all_views(dataset))

['PAIR', 'TOPP', 'BOTT', 'LEFT', 'BACK', 'RGHT', 'FRNT', 'MAIN']


In [4]:
# Drop every view whose name is not in the whitelist below

valid_views = ['PAIR', 'TOPP', 'BOTT', 'LEFT', 'BACK', 'RGHT', 'FRNT']

for value in dataset.values():
    images = value['images']
    # Collect the unwanted names first: a dict must not change size while it
    # is being iterated.
    unwanted = [name for name in images if name not in valid_views]
    for name in unwanted:
        del images[name]

# Confirm that only whitelisted views remain.
print(get_all_views(dataset))

['PAIR', 'TOPP', 'BOTT', 'LEFT', 'BACK', 'RGHT', 'FRNT']


## Change duplicated genders

In [5]:
# Collect the distinct gender labels used across the dataset
def get_all_genders(dataset):
    """Return every distinct ``info.gender`` value, in first-seen order."""
    labels = []
    for record in dataset.values():
        label = record['info']['gender']
        if label in labels:
            continue
        labels.append(label)
    return labels

# Show every distinct gender label currently present in the dataset.
print(get_all_genders(dataset))

['general', 'women', '["Men","Women"]', 'men', '["Women"]', 'womens', '["Womens"]', 'mens', '["Men"]', 'boys', '["Mens"]', 'kids', 'girls', 'unisex', '["Girls"]']


In [6]:
def change_gender(text):
    """Normalise a raw gender label to a single lower-case singular word.

    The raw labels come either as plain words (``'mens'``) or as JSON-like
    lists (``'["Men","Women"]'``).

    Args:
        text: The raw gender label.

    Returns:
        ``'general'`` when the label lists more than one gender or is
        ``'unisex'`` (in any letter case); otherwise the lower-cased label
        with a single trailing plural ``'s'`` removed, e.g. ``'["Womens"]'``
        becomes ``'women'``.
    """
    # Remove the list punctuation, then normalise case and surrounding
    # whitespace *before* any comparison so 'Unisex' and 'MENS' are handled
    # the same way as their lower-case forms.
    cleaned = text.translate(str.maketrans('', '', '"[]')).strip().lower()

    # A comma means several genders were listed.
    if ',' in cleaned or cleaned == 'unisex':
        return 'general'

    # Strip only the plural suffix; removing every 's' would also mangle
    # labels that contain an 's' elsewhere in the word.
    if cleaned.endswith('s'):
        cleaned = cleaned[:-1]
    return cleaned

# Rewrite every record's gender label in place with its normalised form
for value in dataset.values():
    info = value['info']
    info['gender'] = change_gender(info['gender'])

# Show the labels that remain after normalisation.
print(get_all_genders(dataset))

['general', 'women', 'men', 'boy', 'kid', 'girl']


## Remove duplicated brands

Merge duplicated brand names, e.g. `adidas Skateboarding` and `adidas Originals` are both mapped to `adidas`.

In [7]:
def get_all_brands(dataset):
    """Return every distinct ``info.brand`` value, in first-seen order."""
    unique = []
    for record in dataset.values():
        name = record['info']['brand']
        if name not in unique:
            unique += [name]
    return unique

In [8]:
def _fetch_known_brands(url='https://poshmark.com/brands', timeout=30):
    """Scrape brand names from the anchors of the brands listing page.

    Raises:
        requests.HTTPError: if the page answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Do not try to parse an error page as if it were the brand listing.
    response.raise_for_status()
    # `[^"]+` stops at the href's closing quote, so several anchors on one
    # line yield one match each instead of a single greedy match.
    slugs = re.findall(r'<a href="brand/([^"]+)" class="d--b">', response.text)
    # Slugs use '_' for spaces; removing 'amp;' turns an escaped '&amp;'
    # back into a bare '&'.
    return [slug.replace('_', ' ').replace('amp;', '') for slug in slugs]


def create_brands_map(dataset, known_brands=None):
    """Build a mapping from each raw brand name to a canonical brand name.

    The mapping is filled in successive passes; a brand resolved by an
    earlier pass is never touched by a later one (except for the hard-coded
    exceptions, which always win over passes 1 and 2):

    1. Single-word brands map to themselves; multi-word brands that equal a
       known brand (ignoring case) map to that known brand's spelling.
    2. Remaining brands that contain an already-mapped brand name as a
       substring map to that name (e.g. a sub-line of a brand).
    3. Hard-coded exceptions are applied.
    4. Remaining brands are compared with known brands ignoring case and
       spaces, using substring containment in either direction.
    5. Whatever is left maps to itself with the word ``'Kids'`` removed.

    Args:
        dataset: Mapping of item id to a record with ``info.brand``.
        known_brands: Optional iterable of well-known brand names. When
            omitted, the list is scraped from the web, which requires
            network access.

    Returns:
        dict: raw brand name -> canonical brand name, with an entry for every
        brand in ``dataset`` plus every key of the exceptions table.
    """
    if known_brands is None:
        known_brands = _fetch_known_brands()
    # Longest names first, so the most specific known brand is tried first.
    known_brands = sorted(known_brands, key=len, reverse=True)

    all_brands = get_all_brands(dataset)

    brands_map = {}

    # Pass 1: single-word brands, or exact (case-insensitive) known brands.
    # `setdefault` keeps the first known brand per lower-cased spelling, which
    # is the one a linear scan of `known_brands` would have found.
    known_by_lower = {}
    for known in known_brands:
        known_by_lower.setdefault(known.lower(), known)

    for brand in all_brands:
        if ' ' not in brand:
            brands_map[brand] = brand
            continue
        match = known_by_lower.get(brand.lower())
        if match is not None:
            brands_map[brand] = match

    # Pass 2: brands whose name contains an already-mapped brand name.
    for brand in all_brands:
        if brand in brands_map:
            continue
        # Snapshot the keys: the map grows while we scan, and entries added
        # for earlier brands are valid candidates for later ones.
        for candidate in list(brands_map):
            # '' is a substring of every string and 'On' of far too many
            # names; neither is a meaningful parent brand.
            if not candidate or candidate == 'On':
                continue
            if candidate in brand:
                brands_map[brand] = candidate
                break

    # Pass 3: names that the heuristics get wrong.
    exceptions = {
        'SOLE \\u002F SOCIETY': 'Sole Society',
        'GBG Los Angeles': 'GBG Los Angeles',
        'Cecelia New York': 'Cecelia New York',
        'EMU Australia Kids': 'Emu',
        'Iron Age': 'Iron Age',
        'Jacques Soloviere Paris': 'Jacques Soloviere Paris',
        'Red Wing Heritage': 'Red Wing',
        'STEVEN NEW YORK': 'STEVEN NEW YORK',
        'Sea Star Beachwear': 'Sea Star Beachwear',
        'Vintage Foundry': 'Vintage Foundry',
        'Walking Cradles': 'Walking Cradles',
        'Wolf & Shepherd': 'Wolf & Shepherd',
        'J&M Collection': 'J&M',
        'J&M EST. 1850': 'J&M',
    }
    brands_map.update(exceptions)

    # Pass 4: fuzzy comparison with known brands, ignoring case and spaces.
    # Very short known names (< 3 chars) are skipped because they would match
    # almost anything. The squashed forms are computed once, not per pair.
    squashed_known = [
        (known, known.lower().replace(' ', ''))
        for known in known_brands
        if len(known) >= 3
    ]
    for brand in all_brands:
        if brand in brands_map:
            continue
        squashed_brand = brand.lower().replace(' ', '')
        for known, squashed in squashed_known:
            if squashed_brand in squashed:
                # The brand is a fragment of a known brand: keep it as is.
                brands_map[brand] = brand
                break
            if squashed in squashed_brand:
                # The brand extends a known brand: collapse onto the known one.
                brands_map[brand] = known
                break

    # Pass 5: leftovers keep their own name, minus the 'Kids' qualifier.
    for brand in all_brands:
        if brand in brands_map:
            continue
        brands_map[brand] = brand.replace('Kids', '').strip()

    return brands_map

In [9]:
# Replace every record's brand with its canonical name
brands_map = create_brands_map(dataset)
for record in dataset.values():
    info = record['info']
    info['brand'] = brands_map[info['brand']]

In [10]:
# Summary statistics of the cleaned dataset: one occurrence counter per
# `info` field, plus the total number of images.
names, brands, categories, genders = {}, {}, {}, {}
total_images = 0

# Pair each counter with the `info` field it tallies.
counters = (
    (names, 'name'),
    (brands, 'brand'),
    (categories, 'category'),
    (genders, 'gender'),
)

for v in dataset.values():
    total_images += len(v['images'])

    info = v['info']
    for counts, field in counters:
        label = info[field]
        counts[label] = counts.get(label, 0) + 1

for caption, amount in (
    ('Total shoes', len(dataset)),
    ('Total images', total_images),
    ('Total kind of shoes', len(names)),
    ('Total brands', len(brands)),
    ('Total categories', len(categories)),
    ('Total genders', len(genders)),
):
    print(caption, amount)

Total shoes 71957
Total images 501863
Total kind of shoes 28271
Total brands 508
Total categories 11
Total genders 6


## Save as a new file

In [None]:
# Serialise the cleaned dataset and write it next to the notebook.
serialised = json.dumps(dataset)
with open('preprocess_dataset.json', 'w') as out_file:
    out_file.write(serialised)