In [1]:
import sys

def change_python_version(pv):
    
    if "/lib" in pv:
        pv = pv.replace("/lib", "/envs/shoe_ml_evaluator/lib")
    
    if "python36" in pv:
        pv = pv.replace("python36", "python37")
    elif "python3.6" in pv:
        pv = pv.replace("python3.6", "python3.7")

    return pv

updated_path = list(map(change_python_version, sys.path))
sys.path.extend(updated_path)

In [2]:
import cv2
import glob
from joblib import dump,load
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer
import webcolors

In [3]:
df = pd.read_csv("../data/combined_browse_info.csv")

In [4]:
df.columns

Index(['belowRetail', 'brand', 'category', 'colorway', 'condition', 'gender',
       'id', 'market', 'name', 'objectID', 'releaseDate', 'releaseTime',
       'retailPrice', 'shoe', 'shortDescription', 'styleId', 'tickerSymbol',
       'title', 'urlKey', 'uuid', 'year', 'imageUrl', 'absChangePercentage',
       'annualHigh', 'annualLow', 'averageDeadstockPrice',
       'averageDeadstockPriceRank', 'changePercentage', 'changeValue',
       'createdAt', 'deadstockRangeHigh', 'deadstockRangeLow', 'deadstockSold',
       'deadstockSoldRank', 'featured', 'highestBid', 'highestBidSize',
       'lastHighestBidTime', 'lastLowestAskTime', 'lastSale', 'lastSaleDate',
       'lastSaleSize', 'lowestAsk', 'lowestAskSize', 'numberOfAsks',
       'numberOfBids', 'parentLowestAsk', 'pricePremium', 'pricePremiumRank',
       'productId', 'productUuid', 'salesLast72Hours', 'salesLastPeriod',
       'salesThisPeriod', 'skuUuid', 'totalDollars', 'updatedAt',
       'volatility'],
      dtype='object')

In [5]:
df = df[["brand", "category", "colorway", "gender", "title", "averageDeadstockPrice", "deadstockSold", "imageUrl"]]

In [6]:
df.head()

Unnamed: 0,brand,category,colorway,gender,title,averageDeadstockPrice,deadstockSold,imageUrl
0,adidas,adidas Yeezy,Cream White/Cream White,men,adidas Yeezy Boost 350 V2 Cream/Triple White,210,44325,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...
1,adidas,adidas Yeezy,White/Core Black/Red,men,adidas Yeezy Boost 350 V2 Zebra,261,38747,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...
2,adidas,adidas Yeezy,Solid Grey/Chalk White/Core Black,men,adidas Yeezy Boost 700 Wave Runner Solid Grey,323,33708,https://stockx.imgix.net/Adidas-Yeezy-Wave-Run...
3,adidas,adidas Yeezy,Semi Frozen Yellow/Raw Steel/Red,men,adidas Yeezy Boost 350 V2 Semi Frozen Yellow,233,21305,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...
4,adidas,adidas Yeezy,Blue Tint/Grey Three/High Risk Red,men,adidas Yeezy Boost 350 V2 Blue Tint,307,11207,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...


In [7]:
def find_colours(colorway):
    
    colours = " ".join(colorway.lower().split("/")).split(" ")
    
    unique_colours = list(set([colour_str for colour_str in colours if colour_str in webcolors.CSS3_NAMES_TO_HEX]))
    
    return sorted(unique_colours)

df["uniqueColours"] = df["colorway"].apply(find_colours)

In [8]:
df.head()

Unnamed: 0,brand,category,colorway,gender,title,averageDeadstockPrice,deadstockSold,imageUrl,uniqueColours
0,adidas,adidas Yeezy,Cream White/Cream White,men,adidas Yeezy Boost 350 V2 Cream/Triple White,210,44325,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,[white]
1,adidas,adidas Yeezy,White/Core Black/Red,men,adidas Yeezy Boost 350 V2 Zebra,261,38747,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,"[black, red, white]"
2,adidas,adidas Yeezy,Solid Grey/Chalk White/Core Black,men,adidas Yeezy Boost 700 Wave Runner Solid Grey,323,33708,https://stockx.imgix.net/Adidas-Yeezy-Wave-Run...,"[black, grey, white]"
3,adidas,adidas Yeezy,Semi Frozen Yellow/Raw Steel/Red,men,adidas Yeezy Boost 350 V2 Semi Frozen Yellow,233,21305,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,"[red, yellow]"
4,adidas,adidas Yeezy,Blue Tint/Grey Three/High Risk Red,men,adidas Yeezy Boost 350 V2 Blue Tint,307,11207,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,"[blue, grey, red]"


In [9]:
# Entries where matching colour has not been found
df[df['uniqueColours'].map(lambda d: len(d)) == 0].shape

(35, 9)

In [10]:
mlb = MultiLabelBinarizer()
unique_colour_labels = mlb.fit_transform(df['uniqueColours'])
df = df.join(pd.DataFrame(unique_colour_labels, columns=mlb.classes_))
df.drop(["uniqueColours"], axis=1, inplace=True)
df.head()

Unnamed: 0,brand,category,colorway,gender,title,averageDeadstockPrice,deadstockSold,imageUrl,aqua,beige,...,orange,pink,purple,red,silver,tan,teal,violet,white,yellow
0,adidas,adidas Yeezy,Cream White/Cream White,men,adidas Yeezy Boost 350 V2 Cream/Triple White,210,44325,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,0,0,...,0,0,0,0,0,0,0,0,1,0
1,adidas,adidas Yeezy,White/Core Black/Red,men,adidas Yeezy Boost 350 V2 Zebra,261,38747,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,0,0,...,0,0,0,1,0,0,0,0,1,0
2,adidas,adidas Yeezy,Solid Grey/Chalk White/Core Black,men,adidas Yeezy Boost 700 Wave Runner Solid Grey,323,33708,https://stockx.imgix.net/Adidas-Yeezy-Wave-Run...,0,0,...,0,0,0,0,0,0,0,0,1,0
3,adidas,adidas Yeezy,Semi Frozen Yellow/Raw Steel/Red,men,adidas Yeezy Boost 350 V2 Semi Frozen Yellow,233,21305,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,0,0,...,0,0,0,1,0,0,0,0,0,1
4,adidas,adidas Yeezy,Blue Tint/Grey Three/High Risk Red,men,adidas Yeezy Boost 350 V2 Blue Tint,307,11207,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,0,0,...,0,0,0,1,0,0,0,0,0,0


In [11]:
cols_to_rename = {col: f"colour_{col}" for col in df.columns[8:]}
df.rename(index=str, columns=cols_to_rename, inplace=True)
df.columns

Index(['brand', 'category', 'colorway', 'gender', 'title',
       'averageDeadstockPrice', 'deadstockSold', 'imageUrl', 'colour_aqua',
       'colour_beige', 'colour_black', 'colour_blue', 'colour_brown',
       'colour_coral', 'colour_crimson', 'colour_gold', 'colour_goldenrod',
       'colour_green', 'colour_grey', 'colour_indigo', 'colour_ivory',
       'colour_khaki', 'colour_linen', 'colour_navy', 'colour_olive',
       'colour_orange', 'colour_pink', 'colour_purple', 'colour_red',
       'colour_silver', 'colour_tan', 'colour_teal', 'colour_violet',
       'colour_white', 'colour_yellow'],
      dtype='object')

In [12]:
def encode_categorical_features(df, category):
    
    encoded = pd.get_dummies(df[category], prefix=category)
    
    df = df.join(encoded)
    df.drop([category], axis=1, inplace=True)
    return df

In [13]:
for category in ["brand", "category", "gender"]:
    df = encode_categorical_features(df, category)

In [14]:
df.head()

Unnamed: 0,colorway,title,averageDeadstockPrice,deadstockSold,imageUrl,colour_aqua,colour_beige,colour_black,colour_blue,colour_brown,...,category_adidas NMD,category_adidas Other,category_adidas Stan Smith,category_adidas Ultra Boost,category_adidas Yeezy,gender_child,gender_infant,gender_men,gender_preschool,gender_women
0,Cream White/Cream White,adidas Yeezy Boost 350 V2 Cream/Triple White,210,44325,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,White/Core Black/Red,adidas Yeezy Boost 350 V2 Zebra,261,38747,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
2,Solid Grey/Chalk White/Core Black,adidas Yeezy Boost 700 Wave Runner Solid Grey,323,33708,https://stockx.imgix.net/Adidas-Yeezy-Wave-Run...,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
3,Semi Frozen Yellow/Raw Steel/Red,adidas Yeezy Boost 350 V2 Semi Frozen Yellow,233,21305,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,Blue Tint/Grey Three/High Risk Red,adidas Yeezy Boost 350 V2 Blue Tint,307,11207,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0


In [15]:
cols = list(df.columns.values)
cols.pop(cols.index("deadstockSold"))
df = df[cols+["deadstockSold"]]

In [16]:
df.head()

Unnamed: 0,colorway,title,averageDeadstockPrice,imageUrl,colour_aqua,colour_beige,colour_black,colour_blue,colour_brown,colour_coral,...,category_adidas Other,category_adidas Stan Smith,category_adidas Ultra Boost,category_adidas Yeezy,gender_child,gender_infant,gender_men,gender_preschool,gender_women,deadstockSold
0,Cream White/Cream White,adidas Yeezy Boost 350 V2 Cream/Triple White,210,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,44325
1,White/Core Black/Red,adidas Yeezy Boost 350 V2 Zebra,261,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,0,0,1,0,0,0,...,0,0,0,1,0,0,1,0,0,38747
2,Solid Grey/Chalk White/Core Black,adidas Yeezy Boost 700 Wave Runner Solid Grey,323,https://stockx.imgix.net/Adidas-Yeezy-Wave-Run...,0,0,1,0,0,0,...,0,0,0,1,0,0,1,0,0,33708
3,Semi Frozen Yellow/Raw Steel/Red,adidas Yeezy Boost 350 V2 Semi Frozen Yellow,233,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,21305
4,Blue Tint/Grey Three/High Risk Red,adidas Yeezy Boost 350 V2 Blue Tint,307,https://stockx.imgix.net/Adidas-Yeezy-Boost-35...,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,11207


In [17]:
df.drop(["colorway", "title", "imageUrl"], axis=1, inplace=True)

In [18]:
X, y = df.iloc[:,:-1], df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
X_train.loc[:,"averageDeadstockPrice"] = MinMaxScaler().fit_transform(X_train["averageDeadstockPrice"].values.reshape(-1, 1))
X_test.loc[:,"averageDeadstockPrice"] = MinMaxScaler().fit_transform(X_test["averageDeadstockPrice"].values.reshape(-1, 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [20]:
X_train.head()

Unnamed: 0,averageDeadstockPrice,colour_aqua,colour_beige,colour_black,colour_blue,colour_brown,colour_coral,colour_crimson,colour_gold,colour_goldenrod,...,category_adidas NMD,category_adidas Other,category_adidas Stan Smith,category_adidas Ultra Boost,category_adidas Yeezy,gender_child,gender_infant,gender_men,gender_preschool,gender_women
225,0.033349,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
94,0.0155,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
462,0.044152,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
284,0.017379,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
23,0.050728,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [21]:
y_train.head()

225     796
94      145
462     197
284     499
23     1674
Name: deadstockSold, dtype: int64

In [22]:
dump(X_train, "X_train.joblib")
dump(y_train, "y_train.joblib")
dump(X_test, "X_test.joblib")
dump(y_test, "y_test.joblib")

['y_test.joblib']

In [37]:
df.head()

Unnamed: 0,averageDeadstockPrice,colour_aqua,colour_beige,colour_black,colour_blue,colour_brown,colour_coral,colour_crimson,colour_gold,colour_goldenrod,...,category_adidas Other,category_adidas Stan Smith,category_adidas Ultra Boost,category_adidas Yeezy,gender_child,gender_infant,gender_men,gender_preschool,gender_women,deadstockSold
0,210,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,44325
1,261,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,38747
2,323,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,33708
3,233,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,21305
4,307,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,11207
