In [1]:
import logging

import numpy as np

from src.datasets.preprocess import Preprocessor
from src.tests.util import PATH

# Configure the root logger at INFO level so that INFO records (such as the
# "INFO:fashion:..." progress messages) are shown in the cell output.
logging.basicConfig(level=logging.INFO)

In [2]:
# Build the shared Preprocessor from the test PATH. Judging by the log lines it
# emits, construction presumably cleans the CSV, loads it into a DataFrame and
# prepares the train/test and top-20 splits -- TODO confirm in Preprocessor.
processor = Preprocessor(PATH)

INFO:fashion:Cleaning csv
INFO:fashion:Reading clean csv into df
INFO:fashion:Splitting into test-train
INFO:fashion:Sub-splitting based on top-20 classes


In [3]:
# Display the styles table held by the preprocessor (the notebook shows only
# the first and last rows of it).
processor.styles

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,image
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt,15970.jpg
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans,39386.jpg
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch,59263.jpg
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants,21379.jpg
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt,53759.jpg
...,...,...,...,...,...,...,...,...,...,...,...
44441,17036,Men,Footwear,Shoes,Casual Shoes,White,Summer,2013.0,Casual,Gas Men Caddy Casual Shoe,17036.jpg
44442,6461,Men,Footwear,Flip Flops,Flip Flops,Red,Summer,2011.0,Casual,Lotto Men's Soccer Track Flip Flop,6461.jpg
44443,18842,Men,Apparel,Topwear,Tshirts,Blue,Fall,2011.0,Casual,Puma Men Graphic Stellar Blue Tshirt,18842.jpg
44444,46694,Women,Personal Care,Fragrance,Perfume and Body Mist,Blue,Spring,2017.0,Casual,Rasasi Women Blue Lady Perfume,46694.jpg


### Train simple Naive Bayes (NB) models

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

def evaluate(styles, get_clf=False, split=True, random_state=None):
    """Fit a bag-of-words Multinomial Naive Bayes model and report its accuracy.

    ``productDisplayName`` is converted to token counts with
    ``CountVectorizer`` and used to predict ``articleType``.

    Parameters
    ----------
    styles : pandas.DataFrame
        Table providing the ``productDisplayName`` and ``articleType``
        columns. Rows with a missing value in *any* column are dropped
        before anything else happens.
    get_clf : bool, default False
        When true, the fitted classifier and vectorizer are returned
        together with the accuracy.
    split : bool, default True
        When true, a random 20% of the rows is held out and the accuracy is
        measured on that held-out part. When false, the model is fitted and
        scored on the same rows, i.e. the result is a training accuracy.
    random_state : int, numpy.random.RandomState or None, default None
        Forwarded to ``train_test_split`` so the hold-out split can be made
        reproducible. ``None`` keeps the unseeded behaviour. Ignored when
        ``split`` is false.

    Returns
    -------
    float or tuple
        The accuracy, or ``(clf, count_vect, accuracy)`` if ``get_clf`` is
        true.

    Raises
    ------
    ValueError
        If no rows are left after dropping missing values.
    """
    styles = styles.dropna()
    if styles.empty:
        # Fail early with a readable message instead of an opaque sklearn error.
        raise ValueError("evaluate() needs at least one row without missing values")

    if split:
        styles_train, styles_val = train_test_split(
            styles, test_size=0.2, random_state=random_state
        )
    else:
        styles_train = styles_val = styles

    # The vocabulary is learned from the training rows only; validation text
    # is projected onto that vocabulary.
    count_vect = CountVectorizer()
    X_train = count_vect.fit_transform(styles_train.productDisplayName)
    X_val = count_vect.transform(styles_val.productDisplayName) if split else X_train

    y_train = styles_train.articleType.values
    y_val = styles_val.articleType.values

    clf = MultinomialNB()
    clf.fit(X_train, y_train)

    # Fraction of validation rows whose predicted label matches the true one.
    accuracy = np.mean(clf.predict(X_val) == y_val)

    if get_clf:
        return clf, count_vect, accuracy
    return accuracy

In [52]:
# Complete data: evaluate() with its defaults holds out a random 20% of the
# rows and returns the accuracy on that held-out part. The split is unseeded,
# so the exact value can differ between runs.

evaluate(processor.styles)

0.8637549631310266

In [53]:
# Train split: the same default evaluation (random 80/20 hold-out), applied to
# processor.full_train instead of the complete styles table.

evaluate(processor.full_train)

0.8625552515259945

In [54]:
import pandas as pd

# Top-20 train split: fit on train + val together, then score on the test split.

df = pd.concat([processor.data_top20_map['train'], processor.data_top20_map['val']])
# split=False makes evaluate() fit and score on the same rows, so `acc` is a
# training accuracy.
clf, vect, acc = evaluate(df, get_clf=True, split=False)
print("Train split acc: {}".format(acc))

# Held-out test rows, encoded with the vocabulary learned above.
df = processor.data_top20_map['test'].dropna()
X_test, y_test = vect.transform(df.productDisplayName), df.articleType.values

pred = clf.predict(X_test)
print("Test acc: {}".format(np.mean(pred == y_test)))

Train split acc: 0.9659735349716446
Test acc: 0.852290783898305


In [55]:
# Fine-tune train split: fit on train + val together, then score on the test split.
# NOTE(review): the recorded test accuracy is far below the training accuracy
# for this split -- presumably the fine-tune classes are rare or unevenly
# distributed across the partitions; confirm before relying on the number.

df = pd.concat([processor.data_ft_map['train'], processor.data_ft_map['val']])
# split=False makes evaluate() fit and score on the same rows, so `acc` is a
# training accuracy.
clf, vect, acc = evaluate(df, get_clf=True, split=False)
print("Train split acc: {}".format(acc))

# Held-out test rows, encoded with the vocabulary learned above.
df = processor.data_ft_map['test'].dropna()
X_test, y_test = vect.transform(df.productDisplayName), df.articleType.values

pred = clf.predict(X_test)
print("Test acc: {}".format(np.mean(pred == y_test)))

Train split acc: 0.9225169006760271
Test acc: 0.42547477460195665
