In [1]:
import sys
# Put the parent of the working directory on the import path so that the
# `src` package imported below can be found; this must run before those imports.
sys.path.append('../')

from src.datasets.preprocess import Preprocessor
from src.tests.util import PATH
import logging

# Let INFO-level log records through so they appear in the cell output.
logging.basicConfig(level=logging.INFO)

In [2]:
# Build the preprocessor from PATH. Construction alone triggers the steps shown
# in the log output: csv cleaning, test-train split, top-20 sub-split, category maps.
processor = Preprocessor(PATH)

INFO:fashion:Cleaning csv
INFO:fashion:Reading clean csv into df
INFO:fashion:Splitting into test-train
INFO:fashion:Sub-splitting based on top-20 classes
INFO:fashion:Create maps for categoires


In [3]:
# Inspect the master-category name -> integer id mapping.
processor.mastercat_map

{'Apparel': 0,
 'Accessories': 1,
 'Footwear': 2,
 'Personal Care': 3,
 'Free Items': 4,
 'Sporting Goods': 5,
 'Home': 6}

In [4]:
# Inspect the sub-category name -> integer id mapping.
processor.subcat_map

{'Topwear': 0,
 'Bottomwear': 1,
 'Watches': 2,
 'Socks': 3,
 'Shoes': 4,
 'Belts': 5,
 'Flip Flops': 6,
 'Bags': 7,
 'Innerwear': 8,
 'Sandal': 9,
 'Shoe Accessories': 10,
 'Fragrance': 11,
 'Jewellery': 12,
 'Lips': 13,
 'Saree': 14,
 'Eyewear': 15,
 'Nails': 16,
 'Scarves': 17,
 'Dress': 18,
 'Loungewear and Nightwear': 19,
 'Wallets': 20,
 'Apparel Set': 21,
 'Headwear': 22,
 'Mufflers': 23,
 'Skin Care': 24,
 'Makeup': 25,
 'Free Gifts': 26,
 'Ties': 27,
 'Accessories': 28,
 'Skin': 29,
 'Beauty Accessories': 30,
 'Water Bottle': 31,
 'Eyes': 32,
 'Bath and Body': 33,
 'Gloves': 34,
 'Sports Accessories': 35,
 'Cufflinks': 36,
 'Sports Equipment': 37,
 'Stoles': 38,
 'Hair': 39,
 'Perfumes': 40,
 'Home Furnishing': 41,
 'Umbrellas': 42,
 'Wristbands': 43,
 'Vouchers': 44}

### Train simple Naive Bayes (NB) models

In [10]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

def evaluate(styles, get_clf=False, split=True, test_size=0.2, random_state=None):
    """Fit a bag-of-words Multinomial Naive Bayes classifier and score it.

    The text feature is the ``productDisplayName`` column (token counts from
    ``CountVectorizer``) and the target is the ``articleType`` column.

    Parameters
    ----------
    styles : pandas.DataFrame
        Frame holding at least ``productDisplayName`` and ``articleType``.
        Rows with a missing value in *any* column are dropped first.
    get_clf : bool, default False
        When True, also return the fitted classifier and vectorizer.
    split : bool, default True
        When True, hold out ``test_size`` of the rows for validation.
        When False, the model is fitted and scored on the same rows, so the
        returned value is a training accuracy, not a generalisation estimate.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; ignored when ``split`` is False.
    random_state : int, RandomState or None, default None
        Forwarded to ``train_test_split``. Pass an int to make the split, and
        therefore the reported accuracy, reproducible. ``None`` keeps the
        original unseeded shuffling.

    Returns
    -------
    float or tuple
        The accuracy (fraction of validation rows predicted correctly), or
        ``(clf, count_vect, accuracy)`` when ``get_clf`` is True.

    Raises
    ------
    ValueError
        If no rows remain after dropping missing values.
    """
    styles = styles.dropna()
    if styles.empty:
        # Fail with a clear message instead of an obscure scikit-learn error.
        raise ValueError("evaluate() received no rows after dropping missing values")

    if split:
        styles_train, styles_val = train_test_split(
            styles, test_size=test_size, random_state=random_state
        )
    else:
        # No hold-out: validation data is the training data itself.
        styles_train = styles_val = styles

    count_vect = CountVectorizer()
    X_train = count_vect.fit_transform(styles_train.productDisplayName)
    # Reuse the training matrix when there is no separate validation set.
    X_val = count_vect.transform(styles_val.productDisplayName) if split else X_train

    # Report (n_training_rows, vocabulary_size) of the count matrix.
    print(X_train.shape)

    y_train = styles_train.articleType.values
    y_val = styles_val.articleType.values

    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_val)
    accuracy = np.mean(pred == y_val)

    if get_clf:
        return clf, count_vect, accuracy
    return accuracy

In [11]:
# Complete data: score on `processor.styles` using evaluate's default 80/20
# train/validation split. The split is unseeded, so the number varies between runs.

evaluate(processor.styles)

(35257, 7718)


0.8545660805445264

In [53]:
# Train split: score on `processor.full_train` only, again divided 80/20
# inside evaluate (unseeded, so the number varies between runs).

evaluate(processor.full_train)

0.8625552515259945

In [54]:
import pandas as pd

# Top-20 subset: fit on train + val together, then score the held-out test part.

df = pd.concat([processor.data_top20_map[part] for part in ('train', 'val')])
# split=False: the accuracy returned here is measured on the fitting rows themselves.
clf, vect, acc = evaluate(df, get_clf=True, split=False)
print("Train split acc: {}".format(acc))

# Reuse the fitted vectorizer so the test features share the training vocabulary.
df = processor.data_top20_map['test'].dropna()
X_test, y_test = vect.transform(df.productDisplayName), df.articleType.values

pred = clf.predict(X_test)
print("Test acc: {}".format(np.mean(pred == y_test)))

Train split acc: 0.9659735349716446
Test acc: 0.852290783898305


In [55]:
# Fine-tune subset: fit on train + val together, then score the held-out test part.

df = pd.concat([processor.data_ft_map[part] for part in ('train', 'val')])
# split=False: the accuracy returned here is measured on the fitting rows themselves.
clf, vect, acc = evaluate(df, get_clf=True, split=False)
print("Train split acc: {}".format(acc))

# Reuse the fitted vectorizer so the test features share the training vocabulary.
df = processor.data_ft_map['test'].dropna()
X_test, y_test = vect.transform(df.productDisplayName), df.articleType.values

pred = clf.predict(X_test)
print("Test acc: {}".format(np.mean(pred == y_test)))

Train split acc: 0.9225169006760271
Test acc: 0.42547477460195665
