In [1]:
import sys

def change_python_version(pv):
    
    if "/lib" in pv:
        pv = pv.replace("/lib", "/envs/shoe_ml_evaluator/lib")
    
    if "python36" in pv:
        pv = pv.replace("python36", "python37")
    elif "python3.6" in pv:
        pv = pv.replace("python3.6", "python3.7")

    return pv

updated_path = list(map(change_python_version, sys.path))
sys.path.extend(updated_path)

In [27]:
import cv2
import glob
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler
import webcolors

In [3]:
df = pd.read_csv("../data/combined_browse_info.csv")

In [4]:
df.columns

Index(['belowRetail', 'brand', 'category', 'colorway', 'condition', 'gender',
       'id', 'market', 'name', 'objectID', 'releaseDate', 'releaseTime',
       'retailPrice', 'shoe', 'shortDescription', 'styleId', 'tickerSymbol',
       'title', 'urlKey', 'uuid', 'year', 'imageUrl', 'absChangePercentage',
       'annualHigh', 'annualLow', 'averageDeadstockPrice',
       'averageDeadstockPriceRank', 'changePercentage', 'changeValue',
       'createdAt', 'deadstockRangeHigh', 'deadstockRangeLow', 'deadstockSold',
       'deadstockSoldRank', 'featured', 'highestBid', 'highestBidSize',
       'lastHighestBidTime', 'lastLowestAskTime', 'lastSale', 'lastSaleDate',
       'lastSaleSize', 'lowestAsk', 'lowestAskSize', 'numberOfAsks',
       'numberOfBids', 'parentLowestAsk', 'pricePremium', 'pricePremiumRank',
       'productId', 'productUuid', 'salesLast72Hours', 'salesLastPeriod',
       'salesThisPeriod', 'skuUuid', 'totalDollars', 'updatedAt',
       'volatility'],
      dtype='object')

In [25]:
df = df[["brand", "category", "colorway", "gender", "title", "averageDeadstockPrice", "deadstockSold", "imageUrl"]]

In [26]:
df = df[["title", "brand", "category", "colorway", "gender", "imageUrl", "averageDeadstockPrice", "deadstockSold"]]
df.head()

Unnamed: 0,title,brand,category,colorway,gender,imageUrl,averageDeadstockPrice,deadstockSold
0,adidas Yeezy Boost 350 V2 Cream/Triple White,adidas,adidas Yeezy,Cream White/Cream White,men,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,210,44325
1,adidas Yeezy Boost 350 V2 Zebra,adidas,adidas Yeezy,White/Core Black/Red,men,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,261,38747
2,adidas Yeezy Boost 700 Wave Runner Solid Grey,adidas,adidas Yeezy,Solid Grey/Chalk White/Core Black,men,https://stockx.imgix.net/Adidas-Yeezy-Wave-Run...,323,33708
3,adidas Yeezy Boost 350 V2 Semi Frozen Yellow,adidas,adidas Yeezy,Semi Frozen Yellow/Raw Steel/Red,men,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,233,21305
4,adidas Yeezy Boost 350 V2 Blue Tint,adidas,adidas Yeezy,Blue Tint/Grey Three/High Risk Red,men,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,307,11207


In [18]:
X, y = df.iloc[:,:-1], df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [50]:
def find_colours(colorway):
    
    colours = " ".join(colorway.lower().split("/")).split(" ")
    
    unique_colours = list(set([colour_str for colour_str in colours if colour_str in webcolors.CSS3_NAMES_TO_HEX]))
    
    return sorted(unique_colours)

df["uniqueColours"] = df["colorway"].apply(find_colours)

In [52]:
df.head()

Unnamed: 0,title,brand,category,colorway,gender,imageUrl,averageDeadstockPrice,deadstockSold,uniqueColours
0,adidas Yeezy Boost 350 V2 Cream/Triple White,adidas,adidas Yeezy,Cream White/Cream White,men,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,210,44325,[white]
1,adidas Yeezy Boost 350 V2 Zebra,adidas,adidas Yeezy,White/Core Black/Red,men,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,261,38747,"[black, red, white]"
2,adidas Yeezy Boost 700 Wave Runner Solid Grey,adidas,adidas Yeezy,Solid Grey/Chalk White/Core Black,men,https://stockx.imgix.net/Adidas-Yeezy-Wave-Run...,323,33708,"[black, grey, white]"
3,adidas Yeezy Boost 350 V2 Semi Frozen Yellow,adidas,adidas Yeezy,Semi Frozen Yellow/Raw Steel/Red,men,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,233,21305,"[red, yellow]"
4,adidas Yeezy Boost 350 V2 Blue Tint,adidas,adidas Yeezy,Blue Tint/Grey Three/High Risk Red,men,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,307,11207,"[blue, grey, red]"


In [64]:
# df[df.colorway == "Multi-Color/Multi-Color"]
df[df['uniqueColours'].map(lambda d: len(d)) == 0]

Unnamed: 0,title,brand,category,colorway,gender,imageUrl,averageDeadstockPrice,deadstockSold,uniqueColours
33,Jordan 6 Retro Wheat,Jordan,Air Jordan Six,Golden Harvest/Golden Harvest-Sail,men,https://stockx.imgix.net/Air-Jordan-6-Retro-Wh...,110,1183,[]
40,Nike HyperAdapt 1.0 Sport Royal Tinker Blue,Nike,Nike Other Running,Sport Royal/Black-White,men,https://stockx.imgix.net/Nike-Hyperadapt-1pt0-...,350,170,[]
44,adidas Tennis HU Pharrell Multi-Color,adidas,adidas Other,Multi-Color/Multi-Color,men,https://stockx.imgix.net/Adidas-Tennis-Hu-Phar...,77,168,[]
56,Air More Uptempo Light Bone,Nike,Nike Basketball Other,Light Bone/White-Light Bone,men,https://stockx.imgix.net/Nike-Air-More-Uptempo...,120,160,[]
60,Nike SB Air Force 2 Low Supreme Yellow,Nike,Nike SB Other,Varsity Maize/White-Varsity Maize,men,https://stockx.imgix.net/Nike-SB-Air-Force-2-L...,114,158,[]
65,Air VaporMax Platinum Red Black,Nike,Air Max VaporMax,Pure Platinum/University Red-Black-Anthracite,men,https://stockx.imgix.net/Nike-Air-VaporMax-Pla...,165,156,[]
78,Air Max 97 Country Camo (UK),Nike,Air Max 97,Raw Umber/Fortress Green-Black Earth,men,https://stockx.imgix.net/Nike-Air-Max-97-Count...,163,152,[]
90,Nike Kyrie S1 Hybrid What The (Multicolor),Nike,Nike Basketball Kyrie,Multi-Color/Multi-Color,men,https://stockx.imgix.net/Nike-Nike-Kyrie-S1-Hy...,130,147,[]
102,Air More Uptempo Iridescent (W),Nike,Nike Basketball Other,Dark Stucco/White-Black,women,https://stockx.imgix.net/Nike-Air-More-Uptempo...,77,143,[]
123,Air VaporMax CS Midnight Fog,Nike,Air Max VaporMax,Midnight Fog/Midnight Navy-Black,men,https://stockx.imgix.net/Nike-Air-VaporMax-CS-...,88,137,[]


In [63]:
# Ignore normalising continous values for time being

In [53]:
# TODO: https://stackoverflow.com/questions/42711861/scikit-learn-one-hot-encoding-of-column-with-list-values