In [2]:
import pandas as pd
import json
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import numpy as np

%matplotlib inline 

In [3]:
# Restore the previously fitted Keras tokenizer from disk.
# NOTE(review): the parsed JSON value is handed straight to tokenizer_from_json,
# so tokenizer.json presumably stores the tokenizer config as a JSON-encoded
# string -- confirm against the notebook that wrote the file.
with open('tokenizer.json') as f:
    data = json.loads(f.read())

# The file is no longer needed once its content has been parsed.
tokenizer = tokenizer_from_json(data)

In [46]:
# Second-level taxonomy: each entry is [top-level category, [child categories]].
# One classifier is trained per entry, restricted to products whose category is
# one of the listed children.
second_levels = [
    [
        "Jewellery",
        [
            "Bangles, Bracelets & Armlets",
            "Rings",
            "Pendants & Lockets",
            "Necklaces & Chains",
            "Mangalsutras & Tanmaniyas",
            "Jewellery Sets",
            "Accessories",
            "Anklets",
            "Earrings",
            "Hair Accessories",
            "Artificial Jewellery",
            "Precious Jewellery",
            "Nose Rings & Studs",
        ],
    ],
    [
        "Clothing",
        [
            "Women's Clothing",
            "Kids' Clothing",
            "Men's Clothing",
        ],
    ],
    [
        "Footwear",
        [
            "Women's Footwear",
            "Men's Footwear",
            "Kids' & Infant Footwear",
            # NOTE(review): looks like a product name rather than a category -- confirm against the data.
            "REMSON INDIA Women Flats",
        ],
    ],
    [
        "Automotive",
        [
            "Accessories & Spare parts",
            "Car & Bike Accessories",
            "Car Accessories",
        ],
    ],
    [
        "Computers",
        [
            "Network Components",
            "Tablet Accessories",
            "Laptop Accessories",
            "Software",
            "Audio Players",
            "Computer Peripherals",
            "Storage",
            "Computer Components",
            "Laptops",
        ],
    ],
    [
        "Watches",
        [
            "Wrist Watches",
            "Watch Accessories",
            "Clocks",
        ],
    ],
]

In [47]:
# Load the labelled training set. The classifier cells below read the
# `clean_name` (model input text) and `category` (target label) columns.
products = pd.read_csv(filepath_or_buffer="train_set.csv")

# Preview the first rows as the cell's rich output.
products.head()

Unnamed: 0,name,clean_name,category
0,Karatcraft Robini Emerald Gold Diamond 18 K Ring,karatcraft robini emerald gold diamond k ring,Jewellery
1,Karatcraft Robini Emerald Gold Diamond 18 K Ring,karatcraft robini emerald gold diamond k ring,Rings
2,Fashion Flow+ Women's Leggings,fashion flow womens leggings,Clothing
3,Fashion Flow+ Women's Leggings,fashion flow womens leggings,Women's Clothing
4,Moda Vastra Casual 3/4 Sleeve Graphic Print Wo...,moda vastra casual sleeve graphic print womens...,Clothing


In [48]:
# --- Input encoding ---------------------------------------------------------
# Length that pad_sequences is asked to produce for every tokenised product name.
sequence_maxlen = 250
# Input dimension of the embedding layer.
# Confirm the count of unique tokens from tokenizer. This number should be at least that
vocabulary_size = 10_000

# --- Training loop ----------------------------------------------------------
# Upper bound on epochs per classifier and the mini-batch size passed to fit().
epochs, batch_size = 5, 64

In [49]:
def get_model(num_labels, input_length=None):
    """Build and compile the LSTM text classifier used for every category level.

    Layers, in order: Embedding(vocabulary_size, 100) -> SpatialDropout1D(0.2)
    -> LSTM(100, dropout=0.2, recurrent_dropout=0.2) -> Dense(num_labels, softmax).
    The model is compiled with categorical cross-entropy, the Adam optimizer and
    an accuracy metric.

    Args:
        num_labels: Number of output classes (width of the one-hot target).
        input_length: Length of the padded input sequences. Defaults to
            ``sequence_maxlen``, the length every ``pad_sequences`` call in this
            notebook uses.

    Returns:
        The compiled ``Sequential`` model (untrained).
    """
    # Previously this read `X.shape[1]` from the notebook's global `X`, which only
    # exists after a later cell has run and is reassigned on every training pass.
    # The configured padding length is the same value without the hidden state.
    if input_length is None:
        input_length = sequence_maxlen

    model = Sequential()
    model.add(Embedding(vocabulary_size, 100, input_length=input_length))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(num_labels, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [50]:
# Registry of everything produced per classifier; the training cells add one
# entry per model, keyed by "top" or by the sub-category name.
dataset = dict()

In [51]:
# First Level Classifier
# Sorted so that position i in this list is column i of the one-hot target Y.
first_level_labels = ["Automotive", "Clothing", "Computers", "Footwear", "Furniture", "Jewellery", "Watches", "eBooks"]
first_level_labels.sort()

print('Training the top level model')

# Filter the products for top level categories
filtered_products = products[products.category.isin(first_level_labels)]
print(len(filtered_products))

# Generate X, Y and labels
X = tokenizer.texts_to_sequences(filtered_products['clean_name'].values)
X = pad_sequences(X, maxlen=sequence_maxlen)
num_labels = len(first_level_labels)
# One-hot encode against the explicit label list rather than whatever categories
# happen to occur in the data: Y then always has exactly `num_labels` columns in
# `first_level_labels` order, even if a category has no rows. The dtype is fixed
# because the get_dummies default differs between pandas versions.
top_level_categories = pd.Categorical(filtered_products['category'], categories=first_level_labels)
Y = pd.get_dummies(top_level_categories, dtype=np.float32).values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=42)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

model = get_model(num_labels)

model_filename = 'top_classifier.h5'

# The checkpoint keeps only the weights with the lowest validation loss on disk;
# early stopping ends training once val_loss stops improving for 3 epochs.
checkpoint = ModelCheckpoint(model_filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1, callbacks=[checkpoint, early_stopping])

# Note: "model" is the in-memory model after the final epoch, while
# "model_filename" points at the best checkpoint written during training.
# "labels" records the column order of Y so predictions can be decoded.
dataset["top"] = {
    "X_train": X_train,
    "X_test": X_test,
    "Y_train": Y_train,
    "Y_test": Y_test,
    "model_filename": model_filename,
    "model": model,
    "labels": first_level_labels,
}


Training the top level model
100
(90, 250) (10, 250) (90, 8) (10, 8)
Epoch 1/5
Epoch 00001: val_loss improved from inf to 2.06174, saving model to top_classifier.h5
Epoch 2/5
Epoch 00002: val_loss improved from 2.06174 to 2.03443, saving model to top_classifier.h5
Epoch 3/5
Epoch 00003: val_loss improved from 2.03443 to 1.99429, saving model to top_classifier.h5
Epoch 4/5
Epoch 00004: val_loss improved from 1.99429 to 1.92254, saving model to top_classifier.h5
Epoch 5/5
Epoch 00005: val_loss improved from 1.92254 to 1.78591, saving model to top_classifier.h5


In [52]:
# Second Level Classifiers: one model per top-level category, trained only on
# the products labelled with one of that category's children.
for sub_category, children in second_levels:
    print(f'Training model for: {sub_category}, with children: {children}')
    # Filter the products for sub category
    filtered_products = products[products.category.isin(children)]
    print(len(filtered_products))

    if filtered_products.empty:
        # train_test_split raises on an empty input, which would abort training
        # for every remaining sub-category; skip this one instead.
        print(f'Skipping {sub_category}: no products found for its children')
        continue

    # Generate X, Y and labels
    X = tokenizer.texts_to_sequences(filtered_products['clean_name'].values)
    X = pad_sequences(X, maxlen=sequence_maxlen)
    # Only the child categories that actually occur in the data become classes.
    labels = sorted(filtered_products['category'].unique())
    num_labels = len(labels)
    # Encode against `labels` explicitly so column i of Y is guaranteed to be
    # labels[i]; the dtype is fixed because the get_dummies default differs
    # between pandas versions.
    child_categories = pd.Categorical(filtered_products['category'], categories=labels)
    Y = pd.get_dummies(child_categories, dtype=np.float32).values

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

    model = get_model(num_labels)

    model_filename = f'{sub_category}_classifier.h5'

    # Best-val_loss weights are written to disk; training stops early after
    # 3 epochs without improvement.
    checkpoint = ModelCheckpoint(model_filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)

    history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1, callbacks=[checkpoint, early_stopping])

    # "labels" records the column order of Y so predictions can be mapped back
    # to category names.
    dataset[sub_category] = {
        "X_train": X_train,
        "X_test": X_test,
        "Y_train": Y_train,
        "Y_test": Y_test,
        "model_filename": model_filename,
        "model": model,
        "labels": labels,
    }


Training model for: Jewellery, with children: ['Bangles, Bracelets & Armlets', 'Rings', 'Pendants & Lockets', 'Necklaces & Chains', 'Mangalsutras & Tanmaniyas', 'Jewellery Sets', 'Accessories', 'Anklets', 'Earrings', 'Hair Accessories', 'Artificial Jewellery', 'Precious Jewellery', 'Nose Rings & Studs']
26
Epoch 1/5
Epoch 00001: val_loss improved from inf to 1.60477, saving model to Jewellery_classifier.h5
Epoch 2/5
Epoch 00002: val_loss improved from 1.60477 to 1.59386, saving model to Jewellery_classifier.h5
Epoch 3/5
Epoch 00003: val_loss improved from 1.59386 to 1.58311, saving model to Jewellery_classifier.h5
Epoch 4/5
Epoch 00004: val_loss improved from 1.58311 to 1.57170, saving model to Jewellery_classifier.h5
Epoch 5/5
Epoch 00005: val_loss improved from 1.57170 to 1.56002, saving model to Jewellery_classifier.h5
Training model for: Clothing, with children: ["Women's Clothing", "Kids' Clothing", "Men's Clothing"]
41
Epoch 1/5
Epoch 00001: val_loss improved from inf to 1.07322,

Epoch 2/5
Epoch 00002: val_loss did not improve from 0.00000
Epoch 3/5
Epoch 00003: val_loss did not improve from 0.00000
Epoch 4/5
Epoch 00004: val_loss did not improve from 0.00000
