# Imports & Utils

In [None]:
#General
import datetime
import random
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


#Deep Learning - Computer Vision
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras import models
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras import optimizers, metrics
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow_addons.metrics import F1Score

#NLP
import spacy
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


In [None]:
#defining plotting function
def plot_metrics(history, title=None):
    """Plot train vs. validation curves for loss, accuracy and F1 score.

    Draws a 1x3 grid of line plots, one panel per metric.

    Parameters
    ----------
    history : object exposing a ``history`` mapping
        The mapping must contain the keys 'loss', 'accuracy' and 'f1_score'
        plus their 'val_'-prefixed counterparts; each series is plotted
        against its index (labelled 'Epoch').
    title : str, optional
        Figure-level title; skipped when falsy.

    Raises
    ------
    KeyError
        If one of the expected metric keys is missing from the mapping.
    """
    records = history.history

    # (metric key, panel title, y-axis label, fixed y-axis range)
    panels = (
        ('loss', 'Model loss', 'Loss', (0, 3)),
        ('accuracy', 'Model Accuracy', 'Accuracy', (0, 1)),
        ('f1_score', 'Model F1 Score', 'F1 Score', (0, 1)),
    )

    fig, ax = plt.subplots(1, len(panels), figsize=(15, 5))

    for panel, (key, panel_title, ylabel, ylim) in zip(ax, panels):
        panel.plot(records[key])
        panel.plot(records['val_' + key])
        panel.set_title(panel_title)
        panel.set_ylabel(ylabel)
        panel.set_xlabel('Epoch')
        panel.set_ylim(ylim)
        # The 'val_*' series are validation curves, so they are labelled
        # 'Validation' rather than 'Test'.
        panel.legend(['Train', 'Validation'], loc='best')
        panel.grid(axis="both", linewidth=0.5)

    if title:
        fig.suptitle(title)

# Loading & preprocessing data

In [None]:
#Loading data
path = "..."
#cdf = DataFrame with title, brand, description, price etc...
cdf_name = "name.csv"
#idf = images stored as numpy arrays
idf_name = 'name.npy'
# The CSV file name lives in `cdf_name`; the previously used `tdf_name`
# was never defined and raised a NameError.
cdf = pd.read_csv(path + cdf_name)
Ximg = np.load(path + idf_name)

In [None]:
#One hot encoding the target variable
#For the womenswear dataset, there are 6 different styles stored in the ['Style'] column
dicat = {'classic':0, 'edgy':1, 'glamour': 2,
        'street':3, 'minimalism':4, 'feminity':5}

# Map every style label to its integer id (an unknown label raises KeyError)
y_raw = cdf['Style'].apply(lambda x: dicat[x])
# Pin the one-hot width to the number of known styles: without num_classes the
# width is inferred from the largest label present, so a dataset that happens
# to miss the last style would yield fewer columns than the models expect.
y = to_categorical(y_raw, num_classes=len(dicat))
y.shape

In [None]:
#Train test split
#Preparing train and test sets
# A fixed random_state makes the split (and every score computed from it)
# reproducible across kernel restarts.
X_img_train, X_img_test, y_train, y_test = train_test_split(
    Ximg, y, test_size=0.25, random_state=42
)
print(f'X train shape: {X_img_train.shape}')
print(f'y train shape: {y_train.shape}')
print(f'X test shape: {X_img_test.shape}')
print(f'y test shape: {y_test.shape}')

# ResNet-50 model

First, let's use transfer learning and train a ResNet-50 model to identify the style of clothes from images

In [None]:
#Loading resnet
def load_model():
    """Return a ResNet50 instance built with the constructor's default arguments."""
    return ResNet50()

#freeze the resnet layers in order to only train the layers adapted to our task
def set_nontrainable_layers(model):
    """Flag `model` as non-trainable and return the very same object.

    The `trainable` attribute is set on the model itself (not on a subset
    of its layers), and the input object is mutated in place.
    """
    model.trainable = False
    return model

In [None]:
def load_resnet(num_classes=6):
    """Build the transfer-learning classifier on top of a frozen ResNet-50.

    The base network comes from `load_model()` and is frozen with
    `set_nontrainable_layers()`. A new head is attached to the output of the
    base layer named 'avg_pool': Flatten -> Dense(500) -> Dense(250) ->
    softmax over `num_classes`.

    Parameters
    ----------
    num_classes : int, default 6
        Number of units in the final softmax layer. The default matches the
        six styles used in this notebook.

    Returns
    -------
    Model
        Uncompiled Keras model going from the ResNet input to the new
        'prediction' layer.
    """
    base = set_nontrainable_layers(load_model())
    pooled = base.get_layer('avg_pool').output

    # Layer names are kept stable so they can be looked up by name later on
    head = layers.Flatten(name='new_flatten')(pooled)
    head = layers.Dense(500, activation='relu', name='dense1')(head)
    head = layers.Dense(250, activation='relu', name='dense2')(head)
    head = layers.Dense(num_classes, activation='softmax', name='prediction')(head)

    return Model(base.input, head)

In [None]:
# Build the ResNet-based classifier and print its layer summary
rmodel = load_resnet()
rmodel.summary()

In [None]:
#Add parameters to compile the model
lr_schedule = ExponentialDecay(initial_learning_rate=0.001, decay_steps=5000, decay_rate=0.5)
adam = Adam(learning_rate=lr_schedule)
# average='macro' collapses the per-class scores into one scalar per epoch.
# With the default (average=None) F1Score reports one value per class, which
# plot_metrics cannot draw as a single train/validation curve.
f1_score = F1Score(num_classes=6, average='macro')

# Compiling model
rmodel.compile(optimizer = adam, loss = 'categorical_crossentropy', metrics=['accuracy', 'Precision', f1_score])

In [None]:
#Fitting the model
# Stop after 5 epochs without val_loss improvement and roll back to the best weights
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

rhistory = rmodel.fit(
    x=X_img_train,
    y=y_train,
    batch_size=32,
    epochs=15,
    validation_split=0.2,
    callbacks=[es],
    verbose=1,
)

In [None]:
# Score the ResNet-based model on the held-out image test set
rmodel.evaluate(x=X_img_test, y=y_test)

In [None]:
# Learning curves of the ResNet-based model
plot_metrics(history=rhistory)

In [None]:
#Saving the model
model_path = "..."

# Timestamp formatted as YYYY-MM-DD_HH-MM (no ':' so it is safe in file names)
ts = datetime.datetime.now()
strs = ts.strftime("%Y-%m-%d_%H-%M")

model_file_name = "rmodel_women_" + strs
rmodel.save(model_path + model_file_name)

# Custom CNN model

Now let's try to train a CNN model created for the occasion - and thus specialized in this specific task

Given that the FC layer of the ResNet model has a dimension of (,2048), let's design the model to have a **similar sized FC layer** in order to have embeddings that are **comparable** for the two models 

In [None]:
def initialize_model():
    """Build the custom CNN used to classify 224x224 RGB images into 6 styles.

    Architecture: four blocks of (Conv2D -> Conv2D -> MaxPool2D -> Dropout),
    then Flatten, Dense(150), Dense(75) and a 6-unit softmax output.
    With 3x3 unpadded convolutions and 2x2 pooling on a 224x224 input, the
    flattened vector has 10 * 10 * 24 = 2400 values.

    Returns
    -------
    Sequential
        The uncompiled Keras model.
    """
    # (number of filters, dropout rate) for each convolutional block
    conv_blocks = [(64, 0.3), (64, 0.3), (32, 0.3), (24, 0.15)]

    model = models.Sequential()

    for position, (filters, drop_rate) in enumerate(conv_blocks):
        # Only the very first layer of the network declares the input shape
        extra = {'input_shape': (224, 224, 3)} if position == 0 else {}
        model.add(layers.Conv2D(filters, kernel_size=(3, 3), activation='relu', **extra))
        model.add(layers.Conv2D(filters, kernel_size=(3, 3), activation='relu'))
        model.add(layers.MaxPool2D(pool_size=(2, 2)))
        model.add(layers.Dropout(rate=drop_rate))

    #flattening before dense
    model.add(layers.Flatten())

    #dense layers
    model.add(layers.Dense(150, activation='relu'))
    model.add(layers.Dense(75, activation='relu'))

    #last classification layer
    # softmax (not sigmoid): the targets are one-hot vectors trained with
    # categorical cross-entropy, so the outputs should form a probability
    # distribution over the 6 classes - same as the other models in this notebook.
    model.add(layers.Dense(6, activation='softmax'))

    return model

In [None]:
#Compiling the model

#Decaying learning rate in the optimizer
lr_schedule = ExponentialDecay(initial_learning_rate=0.001, decay_steps=5000, decay_rate=0.5)
adam = Adam(learning_rate=lr_schedule)

# average='macro' yields a single scalar per epoch; the default (average=None)
# reports one score per class, which plot_metrics cannot draw as one curve.
f1_score = F1Score(num_classes=6, average='macro')

# instantiating model
cmodel = initialize_model()
# NOTE(review): run_eagerly=True disables graph compilation and slows training;
# presumably a debugging leftover or a workaround - confirm whether it is still needed.
cmodel.compile(optimizer = adam, loss = 'categorical_crossentropy', metrics=['accuracy', 'Precision', f1_score], run_eagerly=True)

In [None]:
# Print the layer-by-layer summary of the custom CNN
cmodel.summary()
# The flattened vector feeding the dense layers has 2400 values,
# close enough to the 2048 of the ResNet model

In [None]:
#Fitting the model
# Stop after 3 epochs without val_loss improvement and roll back to the best weights
es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

chistory = cmodel.fit(
    x=X_img_train,
    y=y_train,
    batch_size=32,
    epochs=15,
    validation_split=0.2,
    callbacks=[es],
    verbose=1,
)

In [None]:
# Score the custom CNN on the held-out image test set
cmodel.evaluate(x=X_img_test, y=y_test)

In [None]:
# plot_metrics expects the object returned by fit (chistory), not the model itself
plot_metrics(chistory)

In [None]:
#Saving the model
model_path = "..."

# Timestamp formatted as YYYY-MM-DD_HH-MM (no ':' so it is safe in file names)
ts = datetime.datetime.now()
strs = ts.strftime("%Y-%m-%d_%H-%M")

# Trailing underscore added so the prefix is separated from the timestamp,
# consistent with the "rmodel_women_" and "NLP_women_" file names.
model_file_name = "cmodel_women_" + strs
cmodel.save(model_path + model_file_name)

# NLP model (not used)

Here, I tried to train an NLP model to identify the style of clothes from their description and title

The model produced good results when trained and used on data sourced from a single website (and thus with description written in a similar vein / with the same style).
However, it failed to scale and produce good results when the textual data was diversified and came from different websites (with different description styles, i.e. vocabulary, text length...).
As a result, I ended up not using the embeddings extracted from the NLP model

## Feature Engineering

In [None]:
#Creating text column with the text of the title and description
# Missing titles/descriptions are treated as empty strings: otherwise a single
# NaN turns the whole concatenation into NaN and breaks the string cleaning below.
cdf["Text"] = cdf["Title"].fillna("") + " " + cdf["Description"].fillna("")
cdf.head(3)

In [None]:
#Switch text to lowercase
# Vectorized string accessor instead of a Python-level lambda per row
cdf['Text'] = cdf['Text'].str.lower()

#Handle punctuation
def punctualize(text):
    """Return `text` with every character of `string.punctuation` removed."""
    # One translation pass deletes all punctuation characters at once
    removal_table = str.maketrans('', '', string.punctuation)
    return text.translate(removal_table)

# Strip punctuation from every row of the Text column
cdf['Text'] = cdf['Text'].map(punctualize)

In [None]:
#Only keep text, no integers
def drop_numbers(x):
    """Return `x` with every digit character removed (other characters are kept as-is)."""
    kept_chars = [char for char in x if not char.isdigit()]
    return ''.join(kept_chars)

# Remove digit characters from every row of the Text column
cdf['Text'] = cdf['Text'].map(drop_numbers)

To lemmatize the text, I used SpaCy because it produced better results than NLTK/Wordnet, Clip's Pattern and FastText

In [None]:
#Stopwords & lemmatization

nlp = spacy.load('en_core_web_sm')

# English stop words from NLTK, built once instead of on every call
_STOP_WORDS = frozenset(stopwords.words('english'))

def stop_lemme_spacy(x):
    """Remove English stop words from `x` and lemmatize what is left with spaCy.

    The text is parsed once with the module-level `nlp` pipeline; tokens whose
    surface form is in the NLTK English stop-word list are dropped and the
    lemmas of the remaining tokens are joined with single spaces.

    Previously the stop-word-filtered token list was computed but never used,
    so the returned sentence still contained every stop word.

    Parameters
    ----------
    x : str
        Input text. The stop-word comparison is case-sensitive, so the text
        is assumed to be lowercased already.

    Returns
    -------
    str
        Space-separated lemmas of the non-stop-word tokens.
    """
    doc = nlp(x)
    return " ".join(token.lemma_ for token in doc if token.text not in _STOP_WORDS)

# Run stop_lemme_spacy on every row of the Text column
cdf['Text'] = cdf['Text'].map(stop_lemme_spacy)

## TF-IDF Embeddings

In [None]:
#I used tf-idf to vectorize the text
texts = cdf['Text']
tf_idf_vectorizer = TfidfVectorizer()
Xt = tf_idf_vectorizer.fit_transform(texts)

#Create DataFrame of vectorized text
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older versions.
if hasattr(tf_idf_vectorizer, 'get_feature_names_out'):
    features_names = tf_idf_vectorizer.get_feature_names_out()
else:
    features_names = tf_idf_vectorizer.get_feature_names()
Xt = pd.DataFrame(Xt.toarray(), columns=features_names)
Xt.shape

To limit the computational strain on my computer, below I reduce the 'vocabulary' of my vectorized word matrix <br>
The goal here is to reduce the dimension of the word matrix while still accounting for more than 90% of the variance

In [None]:
#I optimized the size of my vectorized dataframe by reducing its dimension in order to optimize computation

num_components = 800 #Reduce the 'vocabulary' of the matrix to 800 vectors
# random_state fixes the randomized solver so the reported variance is reproducible
svd = TruncatedSVD(n_components=num_components, random_state=42)
svd.fit(Xt)
print(f'explained total variance ratio: {svd.explained_variance_ratio_.sum()}')
axis = np.arange(1, (num_components + 1))
plt.plot(axis, svd.explained_variance_ratio_.cumsum());

In [None]:
#Create new matrix with truncated word vectors
# `svd` is already fitted on Xt in the previous cell: project with transform()
# instead of refitting, so the embeddings match the variance reported above.
latent_df = pd.DataFrame(svd.transform(Xt), index=cdf.index)

The latent_df matrix contains the text embedding of the description of clothes, which can be used to make recommendations based on the cosine similarity between them (but which I ended up not doing for the reasons given at the beginning of the NLP section)

## Embeddings from a RNN model

Because of the poor quality of the textual data, I had trouble training NLP models and tried different architectures and vectorizing methods. <br>
The Deep Learning architecture which produced the best results used Keras' Tokenizer and a very simple GRU architecture

In [None]:
# Plain Python list of the cleaned texts, one string per row of cdf
sentences = list(cdf["Text"])
X = list(sentences)
print(len(X))

In [None]:
# A fixed random_state keeps the split reproducible.
# NOTE: this rebinds y_train / y_test, which previously held the image-split targets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
#lets choose max length of vectors
maxlen = 200

### Let's tokenize the vocabulary 
# The tokenizer is fitted on the training texts only
tk = Tokenizer()
tk.fit_on_texts(X_train)
vocab_size = len(tk.word_index)
print(f'There are {vocab_size} different words in your corpus')

# Every sequence is padded (at the end) or truncated to the fixed `maxlen`
pad_options = dict(dtype='float32', padding='post', maxlen=maxlen)

# Train split: texts -> integer sequences of varying length -> fixed-length matrix
Xt_train = tk.texts_to_sequences(X_train)
Xp_train = pad_sequences(Xt_train, **pad_options)

# Test split
Xt_test = tk.texts_to_sequences(X_test)
Xp_test = pad_sequences(Xt_test, **pad_options)

# Whole corpus (note: `Xt` is rebound here; it previously held the TF-IDF DataFrame)
Xt = tk.texts_to_sequences(X)
Xp = pad_sequences(Xt, **pad_options)


print(f'Train shape: {Xp_train.shape}')
print(f'Test shape: {Xp_test.shape}')
print(f'Whole df shape: {Xp.shape}')

In [None]:
def rnnmodel(embedding_size):
    """Build the uncompiled text classifier: Embedding -> GRU -> 3 Dense -> softmax(6).

    Parameters
    ----------
    embedding_size : int
        Output dimension of the Embedding layer (size of each word vector).

    Notes
    -----
    Reads the module-level `vocab_size`; the Embedding input dimension is
    `vocab_size + 1`, and `mask_zero=True` is set on the Embedding layer.
    """
    return models.Sequential([
        # Word embeddings
        layers.Embedding(input_dim=vocab_size + 1,
                         output_dim=embedding_size,
                         mask_zero=True),
        # Recurrent encoder returning only its last output
        layers.GRU(150, activation='tanh', return_sequences=False),
        # Fully connected stack
        layers.Dense(1000, activation='relu'),
        layers.Dense(500, activation='relu'),
        layers.Dense(100, activation='relu'),
        # Output
        layers.Dense(6, activation='softmax'),
    ])

# NOTE(review): the embedding dimension is set to `maxlen` (the padded sequence
# length); the two quantities are unrelated - confirm this is intentional.
vrnnmodel = rnnmodel(embedding_size=maxlen)
vrnnmodel.summary()

In [None]:
lr_schedule = ExponentialDecay(initial_learning_rate=0.001, decay_steps=5000, decay_rate=0.5)
adam = Adam(learning_rate=lr_schedule)

# Fresh metric instance for this model (the previous `f1_score` object is already
# attached to another model); average='macro' gives one scalar per epoch.
f1_score = F1Score(num_classes=6, average='macro')

# The output is a 6-way softmax over one-hot targets, i.e. single-label
# multi-class classification: the matching loss is categorical (not binary)
# cross-entropy, as used for the two image models.
vrnnmodel.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy', f1_score])

In [None]:
# Stop after 5 epochs without val_loss improvement and roll back to the best weights
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

vrnnhistory = vrnnmodel.fit(
    x=Xp_train,
    y=y_train,
    batch_size=32,
    epochs=15,
    validation_split=0.2,
    callbacks=[es],
    verbose=1,
)

In [None]:
# Score the text model on the held-out padded test sequences
vrnnmodel.evaluate(x=Xp_test, y=y_test)

Accuracy = 40%, which is higher than the random baseline (1/6 = 16.7%), but not very high

In [None]:
model_path = "..."

# Timestamp formatted as YYYY-MM-DD_HH-MM (no ':' so it is safe in file names)
ts = datetime.datetime.now()
strs = ts.strftime("%Y-%m-%d_%H-%M")

model_file_name = "NLP_women_" + strs
vrnnmodel.save(model_path + model_file_name)