# Imports

In [None]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import optimizers, metrics
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.utils import to_categorical

# Extracting the embeddings

The following code works for both menswear & womenswear data/models

In [None]:
# Loading the data: the catalogue table (CSV) and the image array (.npy)
# NOTE(review): later cells assume row i of cdf and entry i of idf describe
# the same item - confirm against the files.
path = "..."
cdf_name = "name.csv"
cidf_name = 'name.npy'
# Read both files with the names defined just above
cdf = pd.read_csv(path + cdf_name)
idf = np.load(path + cidf_name)
print(cdf.shape)
print(idf.shape)

In [None]:
# Loading the resnet model saved on disk and printing its architecture
resnet_path = "..."
resnet_name = "name"
# The location is built from the two variables defined just above
resnet = models.load_model(resnet_path + resnet_name)
resnet.summary()

In [None]:
# Loading the custom model saved on disk and printing its architecture
custom_path = "..."
custom_name = "name"
# The location is built from the two variables defined just above
custom = models.load_model(custom_path + custom_name)
custom.summary()

In [None]:
# Resnet embeddings: re-wire the loaded model so that it stops at its
# 'avg_pool' layer, then run every entry of idf through it
resnet_pool_output = resnet.get_layer('avg_pool').output
resnetFC = Model(inputs=resnet.input, outputs=resnet_pool_output)
resnet_embeds = resnetFC.predict(idf)
resnet_embeds.shape

In [None]:
# Custom embeddings: re-wire the loaded model so that it stops at its
# 'flatten' layer, then run every entry of idf through it
custom_flatten_output = custom.get_layer('flatten').output
customFC = Model(inputs=custom.input, outputs=custom_flatten_output)
custom_embeds = customFC.predict(idf)
custom_embeds.shape

In [None]:
# Saving both embedding arrays under a timestamped file name
path = "..."
ts = datetime.datetime.now()
# Stamp layout: YYYY-MM-DD_HH-MM (no ':' so it is safe in file names)
strs = ts.strftime("%Y-%m-%d_%H-%M")

name = "Embeddings_"

resname = f"{name}resnet_{strs}.npy"
cusname = f"{name}custom_{strs}.npy"

res_path = f"{path}{resname}"
cus_path = f"{path}{cusname}"

# ResNet embeddings first, then the custom-model embeddings
np.save(res_path, resnet_embeds)
print(f"Saved {resname} with shape {resnet_embeds.shape} !")
np.save(cus_path, custom_embeds)
print(f"Saved {cusname} with shape {custom_embeds.shape} !")

# Preparing the data to be filtered

## Preparing the data to filter by category

The data does not have a variable with the category (tops, dresses, pants...) of each item. <br>
Given the wide range of vocabulary and the *variation* in the spelling (e.g. 'polo shirt', 'polo-shirt', 'polo-shirts'...) used to describe the items in their title, grouping them by the keyword in their title was impossible. <br>
Thus, I decided to train a computer vision model to classify them into 8 categories (tops, shirts, pants, dress, sweaters, underwear, coats & overall).

First, we prepare our training data by grouping items with an easily identified keyword in their title

### Data preparation

In [1]:
# Keyword -> category lookup used to read an item's category from its title.
# Every key is listed exactly once; a repeated key in a dict literal silently
# keeps only its last value.
dico_categories = {
    #Tops
    'polo-shirt': 'tops',
    'polo shirt': 'tops',
    'tops': 'tops',
    'top': 'tops',
    'crop top': 'tops',
    'crop-top': 'tops',
    'croptop': 'tops',
    't shirt': 'tops',
    't-shirt': 'tops',
    'tees': 'tops',
    'rugby-shirt': 'tops',
    'rugby shirt': 'tops',
    'henley': 'tops',
    'tee': 'tops',
    'long sleeve': 'tops',
    'long-sleeve': 'tops',
    't-shirts': 'tops',
    't shirts': 'tops',
    'polo': 'tops',
    'polos': 'tops',
    'corset': 'tops',
    'tank': 'tops',
    't': 'tops',
    #Shirts
    'shirt': 'shirts',
    'cabana': 'shirts',
    'shirts': 'shirts',
    'overshirt': 'shirts',
    'overshirts': 'shirts',
    #Pants
    'jeans': 'pants',
    'chino': 'pants',
    'chinos': 'pants',
    'trousers': 'pants',
    'leggings': 'pants',
    'sweatpants': 'pants',
    "sweatpant": 'pants',
    'pants': 'pants',
    'sweat pant': 'pants',
    'pant': 'pants',
    'jean': 'pants',
    'shorts': 'pants',
    'bermuda': 'pants',
    'joggers': 'pants',
    'jeggings': 'pants',
    'jegging': 'pants',
    'legging': 'pants',
    'short': 'pants',
    'trouser': 'pants',
    'jogger': 'pants',
    'flare': 'pants',
    'flares': 'pants',
    'sweatshort': 'pants',
    'sweatshorts': 'pants',
    'leg': 'pants',
    'legs': 'pants',
    'tight': 'pants',
    # 'tights' is listed once, under underwear below
    #Overall _ jumpsuit
    'overall': 'overall',
    'overalls': 'overall',
    'jumpsuit': 'overall',
    'jumpsuits': 'overall',
    'playsuit': 'overall',
    'playsuits': 'overall',
    'unitard': 'overall',
    'body': 'overall',
    'bodysuit': 'overall',
    # NOTE(review): this key contains an upper-case letter, while titles are
    # lower-cased before lookup, so it cannot match as written - confirm intent.
    'Jumpsuits/one pieces': 'overall',
    # Dresses
    'dress': 'dress',
    'dresses': 'dress',
    'skirt': 'dress',
    'skirts': 'dress',
    'sarong': 'dress',
    'sarongs': 'dress',
    'robe': 'dress',
    'gown': 'dress',
    # sweaters
    'sweater': 'sweaters',
    'hoodies': 'sweaters',
    'jumper': 'sweaters',
    'hoodie': 'sweaters',
    'cardigan': 'sweaters',
    'cardigans': 'sweaters',
    'pullover': 'sweaters',
    'pullovers': "sweaters",
    'sweatshirt': 'sweaters',
    'sweat-shirt': 'sweaters',
    'sweat shirt': 'sweaters',
    'hoody': 'sweaters',
    'knit': 'sweaters',
    'turtleneck': 'sweaters',
    'turtlenecks': "sweaters",
    'pull-over': 'sweaters',
    'crewneck': 'sweaters',
    'sweatshirts': 'sweaters',
    'sweaters': 'sweaters',
    'crew': 'sweaters',
    'sweat': 'sweaters',
    'sweats': 'sweaters',
    'fleece': 'sweaters',
    'fleeces': 'sweaters',
    'hood': 'sweaters',
    #Underwear & swim
    'socks': 'underwear',
    'underwear': 'underwear',
    'pyjamas': 'underwear',
    'trunks': 'underwear',
    'trunk': 'underwear',
    'boxer': 'underwear',
    'boxers': 'underwear',
    'brief': 'underwear',
    'briefs': 'underwear',
    'bra': 'underwear',
    'bralette': 'underwear',
    'bathrobe': 'underwear',
    'panty': 'underwear',
    'panties': 'underwear',
    'lingerie': 'underwear',
    'tights': 'underwear',
    'thong': 'underwear',
    'bras': 'underwear',
    'bottom': 'underwear',
    'bottoms': 'underwear',
    'bralet': 'underwear',
    'bralets': 'underwear',
    'sportsbra': 'underwear',
    'sportbra': 'underwear',
    'sportbras': 'underwear',
    'sportsbras': 'underwear',
    'bikini': 'underwear',
    'bikinis': 'underwear',
    'thongs': 'underwear',
    'bikini-top': 'underwear',
    'bikini top': 'underwear',
    'swimshort': 'underwear',
    'swimwear': 'underwear',
    'swim-short': 'underwear',
    'swimshorts': 'underwear',
    'swim-shorts': 'underwear',
    'volley-short': 'underwear',
    'volley-shorts': 'underwear',
    'volley short': 'underwear',
    'volley shorts': 'underwear',
    # NOTE(review): 'saron' may be a typo for 'sarong' (mapped to 'dress'
    # above); kept as-is so existing matches do not change.
    'saron': 'underwear',
    'swimsuit': 'underwear',
    'swim-suit': 'underwear',
    'swim suit': 'underwear',
    'swimming trunks': 'underwear',
    'beachwear': 'underwear',
    'swim trunk': 'underwear',
    'swim trunks': 'underwear',
    'swim-trunks': 'underwear',
    'pajamas': 'underwear',
    'pajama': 'underwear',
    'pyjama': 'underwear',
    'bathing': 'underwear',
    'swimming': 'underwear',
    #Coats & jackets
    'vest': 'coats',
    'gilet': 'coats',
    'coat': 'coats',
    'jacket': 'coats',
    'puffer': 'coats',
    'trench': 'coats',
    'blazer': 'coats',
    'blazers': 'coats',
    'parka': 'coats',
    'anorak': 'coats',
    'blouse': 'coats',
    'bomber': 'coats',
    'jersey': "coats",
    'bombers': 'coats',
    'jackets': 'coats',
    'coats': 'coats',
    'vests': 'coats',
    'mac': 'coats',
    'poncho': 'coats',
    'ponchos': 'coats',
    'overcoat': 'coats',
    'overcoats': 'coats',
    'windbreaker': "coats",
    "windbreakers": 'coats'
}

# Category name -> integer class id, numbered in listing order starting at 0.
# The listing order matters: other cells turn a class id back into a name by
# position in this mapping.
second_dic = {
    label: class_id
    for class_id, label in enumerate((
        'tops',
        'shirts',
        'pants',
        'dress',
        'sweaters',
        'underwear',
        'coats',
        'overall',
    ))
}


def classify_by_title(x):
    """Return the category named by the keywords of an item title.

    The title is lower-cased and split on single spaces. The words are scanned
    left to right; at each position the two-word phrase starting there is
    looked up in ``dico_categories`` first (so that e.g. "polo shirt" is read
    as one keyword rather than as "polo" followed by "shirt"), then the single
    word. When several keywords match, the last one wins.

    Parameters
    ----------
    x : the item title. Anything that is not a string (e.g. a missing value)
        is treated as a title without keyword.

    Returns
    -------
    The category name, or "NA" when no keyword is found.
    """
    if not isinstance(x, str):
        return "NA"

    words = x.lower().split(" ")
    categ = "NA"
    position = 0
    while position < len(words):
        # Two-word keys contain a space, so they can only match as a pair
        pair = " ".join(words[position:position + 2])
        if position + 1 < len(words) and pair in dico_categories:
            categ = dico_categories[pair]
            position += 2
        else:
            # Keep the previous match when this word is not a keyword
            categ = dico_categories.get(words[position], categ)
            position += 1
    return categ

In [None]:
# We only need the category keyword extracted from each title
title_categories = cdf['Title'].apply(classify_by_title)
# We can exclude the items for which no category could be extracted from their
# title. The mask is applied by position, so it does not depend on the index
# labels of cdf.
has_category = (title_categories != "NA").to_numpy()
X = idf[has_category]

# For our dependent variable, we can One-Hot encode it - each category is
# attributed an integer id. num_classes keeps one column per known category
# even if a category is absent from the labelled items.
y = to_categorical(title_categories[has_category].map(second_dic).to_numpy(),
                   num_classes=len(second_dic))

print(X.shape)
print(y.shape)

In [None]:
# Preparing train and test sets: a quarter of the labelled items is held out
splits = train_test_split(X, y, test_size=0.25)
X_img_train, X_img_test, y_train, y_test = splits

# Report the shape of each of the four arrays
print(f'X train shape: {X_img_train.shape}')
print(f'y train shape: {y_train.shape}')
print(f'X test shape: {X_img_test.shape}')
print(f'y test shape: {y_test.shape}')

Let's start from a ResNet-50 model which we train to classify items by their category

### Training a ResNet-50 model to categorize items

In [None]:
#Loading resnet
def load_model():
    """Return a ResNet50 built with the constructor's default arguments."""
    return ResNet50()


#freeze the resnet layers
def set_nontrainable_layers(model):
    """Switch off the ``trainable`` flag of *model* and return that same object.

    The flag is set on the model as a whole, not on a subset of its layers.
    """
    model.trainable = False
    return model


def load_resnet():
    """Build the category classifier: a frozen ResNet50 plus a new dense head.

    The head is attached to the output of the 'avg_pool' layer and consists of
    a flatten layer, two ReLU dense layers (500 and 250 units) and an 8-unit
    softmax prediction layer.

    Returns
    -------
    A Model going from the ResNet50 input to the new prediction layer.
    """
    backbone = set_nontrainable_layers(load_model())
    features = backbone.get_layer('avg_pool').output

    head = layers.Flatten(name='new_flatten')(features)
    # Hidden dense layers, from the widest to the narrowest
    for units, layer_name in ((500, 'dense1'), (250, 'dense2')):
        head = layers.Dense(units, activation='relu', name=layer_name)(head)
    prediction = layers.Dense(8, activation='softmax', name='prediction')(head)

    return Model(backbone.input, prediction)

In [None]:
# Build the classifier (frozen ResNet50 plus the new dense head) and print its layers
rmodel = load_resnet()
rmodel.summary()

In [None]:
#Add parameters to compile the model
# Adam optimiser whose learning rate starts at 0.001 and decays exponentially
lr_schedule = ExponentialDecay(initial_learning_rate=0.001,
                               decay_steps=5000,
                               decay_rate=0.9)
adam = Adam(learning_rate=lr_schedule)
# F1 taken from the `metrics` module imported at the top of the notebook.
# average=None reports one score per class.
# NOTE(review): keras.metrics.F1Score needs a recent Keras / TensorFlow
# release - confirm it exists in the installed version.
f1_score = metrics.F1Score(average=None, name='f1_score')

# Compiling model
rmodel.compile(optimizer=adam,
               loss='categorical_crossentropy',
               metrics=['accuracy', 'Precision', f1_score])

In [None]:
#Fitting the model
# Early stopping with a patience of 5 epochs; the best weights are restored
es = EarlyStopping(restore_best_weights=True, patience=5)

# 20% of the training set is used for validation
fit_options = dict(validation_split=0.2,
                   epochs=15,
                   verbose=1,
                   batch_size=32,
                   callbacks=[es])
rhistory = rmodel.fit(X_img_train, y_train, **fit_options)

In [None]:
# Score the model on the held-out test split only; X / y also contain the
# items the model was trained on, which would inflate the metrics
rmodel.evaluate(X_img_test, y_test)

In [None]:
# Class probabilities for every image of the loaded array
predictions = rmodel.predict(idf, verbose=1)
predictions.shape

In [None]:
# Save the trained classifier under a timestamped name
model_path = "..."

ts = datetime.datetime.now()
# Stamp layout: YYYY-MM-DD_HH-MM (no ':' so it is safe in file names)
strs = ts.strftime("%Y-%m-%d_%H-%M")

model_file_name = f"Categorization_{strs}"
rmodel.save(f"{model_path}{model_file_name}")

### Assigning categories

Once the model is trained, let's use it to assign each item a category.  <br>
The model has a 75% accuracy score. As a result, we will only use it to classify items whose category cannot be derived from their title.

In [None]:
def categorize(tdf, idf):
    """Return one category per row of ``tdf`` as a single-column DataFrame.

    A category is first read from each title with ``classify_by_title``.
    Items whose title gives no category ("NA") are then classified from their
    image by the saved categorisation model, and both sources are merged.

    Parameters
    ----------
    tdf : DataFrame with a 'Title' column. It is left unmodified.
    idf : array of model inputs, one entry per row of ``tdf``, in the same order.

    Returns
    -------
    DataFrame with a single column labelled 0 holding the category names in
    row order, on a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``tdf`` and ``idf`` do not have the same number of entries.
    """
    #Assigning categories based on the title
    # "Long Sleeve" is rewritten to the single-token keyword "long-sleeve";
    # missing titles become "" so that they are handled by the image model.
    titles = (tdf['Title'].fillna("").astype(str)
              .str.replace("Long Sleeve", "long-sleeve", regex=False))
    cat_named = [classify_by_title(title) for title in titles]
    if len(cat_named) != len(idf):
        raise ValueError(
            f"tdf has {len(cat_named)} rows but idf has {len(idf)} entries")

    needs_prediction = np.array([cat == "NA" for cat in cat_named], dtype=bool)
    num_NA = int(needs_prediction.sum())
    print(f'There are {num_NA} items without a clear category in their title -> need to be predicted ')
    if num_NA == 0:
        # Nothing to predict: no need to load the model
        return pd.DataFrame({0: cat_named})

    #Assuming the model was saved - Importing DL model
    print("Now importing the DL model to predict clothing categories")
    model_path = "..."
    catmo_name = "name"
    catmodel = models.load_model(model_path + catmo_name)
    print("Model imported, now making predictions")

    #Assigning categories of NA items - only those rows go through the model
    preds = catmodel.predict(np.asarray(idf)[needs_prediction], verbose=1)
    category_names = list(second_dic)
    predicted = iter(category_names[i] for i in np.argmax(preds, axis=1))

    # Title-based category when available, otherwise the next prediction
    # (predictions come back in the same order as the "NA" rows)
    cat_fin = [next(predicted) if cat == "NA" else cat for cat in cat_named]
    print("Done with the predictions")
    return pd.DataFrame({0: cat_fin})

In [None]:
# Derive or predict a category for every item and store it on the catalogue
catdf = categorize(cdf, idf)
# Assign by position: catdf carries a fresh 0..n-1 index, which may not match
# the index labels of cdf
cdf['Category'] = catdf[0].to_numpy()

## Preparing the data to filter by budget

In [None]:
def clean_price(x):
    """Return the price string *x* without any '$' sign or ',' separator."""
    without_dollar = x.replace('$', '')
    return without_dollar.replace(",", "")

In [None]:
# Price of each item without the '$' sign and thousands separators
cdf['New_price'] = cdf['Price'].map(clean_price)

Once the price is cleaned, we will group items by budget. <br>
Having browsed several fashion websites, I came up with 4 budget brackets, and their thresholds for each category of item. <br>
Example: Coats costing less than **£65** are considered **accessible**, those between **£65 and £135** are in the **intermediate** budget, between **£135 and £250** in the **premium** segment, while those worth more than **£250** are deemed **luxury** items. 

In [None]:
def budget(cat, price):
    """Return the budget bracket (1 to 4) of each item.

    Parameters
    ----------
    cat : sequence of category names, one per item.
    price : sequence of prices (anything ``float`` accepts), aligned with ``cat``.

    Returns
    -------
    List of ints: the level of the first threshold of the item's category that
    the price is strictly below (1, 2 or 3), or 4 when the price is below none.
    """
    top_bracket = 4
    dress_budget = {40: 1, 80: 2, 150: 3}
    # Upper price limit -> bracket, listed in increasing order per category
    thresholds_by_category = {
        'tops': {20: 1, 35: 2, 60: 3},
        'shirts': {30: 1, 60: 2, 120: 3},
        'pants': {40: 1, 70: 2, 140: 3},
        'dress': dress_budget,
        'sweaters': {30: 1, 60: 2, 120: 3},
        'underwear': {10: 1, 25: 2, 40: 3},
        'coats': {65: 1, 135: 2, 250: 3},
        # Overalls share the thresholds of dresses
        'overall': dress_budget,
    }

    budgets = []
    for position, item_cat in enumerate(cat):
        brackets = thresholds_by_category[item_cat]
        amount = float(price[position])
        level = next(
            (lvl for ceiling, lvl in brackets.items() if amount < ceiling),
            top_bracket,
        )
        budgets.append(level)

    return budgets

In [None]:
# Budget bracket of each item, from its category and cleaned price
cdf['Budget'] = budget(cdf['Category'].tolist(), cdf['New_price'].tolist())

In [None]:
# Histogram of the budget bracket assigned to each item
# (the trailing semicolon keeps the notebook from echoing the return value)
plt.hist(cdf['Budget']);

## Preparing the data to filter by size

Preparing the data to be filtered by size - i.e. grouping it by a common size scale - also proved to be complicated given the wide variety of sizes used by brands and the different websites the data was scraped from. 

In [None]:
# Raw size token -> canonical size label.
# Every spelling of a given size maps to the same label; in particular 'XXS',
# '(XXS)' and 'XX-Small' all give '2X-Small'.
dico_size = {
    'XXS': '2X-Small',
    'XS': '1X-Small',
    'S': '0X-Small',
    'M': 'Medium',
    'L': '0X-Large',
    'XL': '1X-Large',
    'XXL': '2X-Large',
    '3XL': '3X-Large',
    'XXXL': '3X-Large',
    '(XS)': '1X-Small',
    '(S)': '0X-Small',
    '(M)': 'Medium',
    '(L)': '0X-Large',
    '(XL)': '1X-Large',
    '(XXL)': '2X-Large',
    '(3XL)': '3X-Large',
    '(XXXL)': '3X-Large',
    'One Size': 'One Size',
    '(XXS)': '2X-Small',
    'X-Small': '1X-Small',
    'XX-Small': '2X-Small',
    'XXX-Small': '3X-Small',
    'Small': '0X-Small',
    'Medium': 'Medium',
    'Large': '0X-Large',
    'X-Large': '1X-Large',
    'XX-Large': "2X-Large",
    '3X-Large': '3X-Large',
    'XXX-Large': "3X-Large"}


def top_size(x):
    """Translate a raw size string to the common size scale.

    The string is split on single spaces and every token found in
    ``dico_size`` is replaced by its canonical label; the other tokens are
    dropped and the labels are joined with " | ".

    Parameters
    ----------
    x : the raw size string (used for tops: shirts, t-shirts, sweaters...).

    Returns
    -------
    The " | "-joined labels. When no token is recognised (e.g. "One Size",
    or a waist size), the input is returned unchanged. Anything that is not a
    string (e.g. a missing value) gives "".
    """
    if not isinstance(x, str):
        return ""

    recognised = [dico_size[token] for token in x.split(" ")
                  if token in dico_size]
    if recognised:
        return " | ".join(recognised)
    # No known size token: keep the original text
    return x

def pant_size(cat, size):
    """Finish the size clean-up for pants; other categories pass through.

    ``top_size`` only understands the S / M / L / XL style. This function
    takes the sizes it produced and, for pants that do not use that style,
    shortens each " | "-separated part to its first three characters
    (e.g. the 32" of a 32" or 32" x 32" size).

    Parameters
    ----------
    cat : list of category names, one per item.
    size : list of sizes already cleaned by ``top_size``, aligned with ``cat``.

    Returns
    -------
    List of sizes, one per item, in the same order.
    """
    new_size = []
    for position, item_cat in enumerate(cat):
        cleaned = size[position]
        if item_cat != 'pants':
            new_size.append(cleaned)
            continue

        parts = cleaned.split(" | ")
        if parts[0] in dico_size.values():
            # Already in the M / L / XL style: nothing more to do
            new_size.append(cleaned)
        else:
            # Waist (x inseam) style: keep the first three characters of each part
            new_size.append(" | ".join(part[:3] for part in parts))

    return new_size
                

In [None]:
# Translate the raw 'Sizing' text of the loaded catalogue to the common scale
size = cdf['Sizing'].apply(top_size)
cdf['Clean_sizes'] = size

In [None]:
# Second pass on the loaded catalogue: handle the pants that use waist sizes
top_and_pant_sizes = pant_size(list(cdf['Category']), list(cdf['Clean_sizes']))
cdf['Clean_sizes'] = top_and_pant_sizes

## Saving the modified DataFrame

In [None]:
# Save the enriched catalogue under a timestamped file name
path = "..."
ts = datetime.datetime.now()
strs = str(ts)[:10] + "_" + str(ts)[11:16]
strs = strs.replace(":","-")

name = "Clean_DataFrame_"

file_name = name + strs + ".csv"
# Write first, report afterwards, so that the message is only printed when
# the file really exists
cdf.to_csv(path + file_name)
print(f"Saved {file_name} !")