In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
import re
import datetime as dt
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
train_data = pd.read_csv('train.tsv', sep= '\t')
test_data = pd.read_csv('test.tsv', sep= '\t')

In [3]:
train_data.sample(10)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
289832,289832,"Retro 5 ""Laneys""",2,Men/Shoes/Fashion Sneakers,Jordans,56.0,0,Size 7y Very Great Condition Condition 9/10 10...
532065,532065,Vintage Cavel Hall Constellation knives,3,Home/Kitchen & Dining/Kitchen Knives & Cutlery...,,10.0,0,6 piece Cavel Hall Constellation steak knives ...
1355816,1355816,ND 3 Eyeshadow Palette Freeship,1,Beauty/Makeup/Eyes,,19.0,1,Brand new never been used! Colors: Strange Dus...
388660,388660,Special bundle,3,Women/Women's Accessories/Hats,,11.0,1,Jogger sweats & blue forever 21 beanie
570850,570850,L.A. girl Pro Concealer,1,Beauty/Makeup/Face,Sephora,13.0,1,New L.A. girl HD Pro Concealer 1.GC981 Toast 1...
731536,731536,Black and White V Cut One Piece Swimsuit,2,Women/Swimwear/One-Piece,,25.0,1,Black and White V Cut One piece Swimsuit! Hand...
392691,392691,Coach crossbody. Beige/nude,3,Women/Women's Handbags/Messenger & Crossbody,Coach,24.0,0,No description yet
943271,943271,Black Lace HALTER BRALETTE large,1,Women/Underwear/Bras,,14.0,1,Awesome NEW black lace halter bralette in a si...
39882,39882,3 jeans boy baby gap 0-3 monts,3,Kids/Boys 0-24 Mos/One-Pieces,Gap,18.0,1,Exellent conditions
462696,462696,Lisette leggings size small,2,"Women/Athletic Apparel/Pants, Tights, Leggings",,30.0,1,Fabletics legging. I took the tag off so I can...


In [4]:
# get top n category based on the number of rows
n_top_category = 40
category_list = list(train_data.category_name.value_counts()[:n_top_category].index)

In [5]:
#filter the dataset to n top category only 
train_data = train_data.loc[train_data.category_name.isin(category_list)]
test_data = test_data.loc[test_data.category_name.isin(category_list)]

In [6]:
df_aggregated = pd.DataFrame(train_data.groupby('category_name')['name'].agg(lambda s: ' '.join(s)))
df_aggregated = df_aggregated.reset_index()


In [7]:
df_aggregated[:10]

Unnamed: 0,category_name,name
0,Beauty/Fragrance/Women,New vs pi k body mists Marc Jacobs Honey Pink ...
1,Beauty/Makeup/Eyes,Glitter Eyeshadow Too Faced Better Than Sex Ma...
2,Beauty/Makeup/Face,Smashbox primer Apricot beige stick foundation...
3,Beauty/Makeup/Lips,. BBW Mentha Lip Balms Bare minerals Nude Matt...
4,Beauty/Makeup/Makeup Palettes,"Too Faced Limited ""Merry Macaroons"" Realher Ev..."
5,Beauty/Skin Care/Body,Scentsy Bath Tabs Bath & body works cool citru...
6,Beauty/Skin Care/Face,Caudalíe Beauty Elixir Spray lots of Korean Na...
7,"Electronics/Cell Phones & Accessories/Cases, C...",Otterbox Defender iPhone 6 Plus/6s Plus Iphone...
8,Electronics/Cell Phones & Accessories/Cell Pho...,Smart watch for android and iOS NEW IN BOX Bel...
9,Electronics/Video Games & Consoles/Games,Goosebumps HorrorLand DS Game NBA 2k14 BRAND N...


In [8]:
color_stopwords = ['red','green','blue','yellow','magenta','pink','brown','white','black']
english_stopwords = ['and', 'or', 'with', 'the', 'on', 'in', 'new']

stopwords = color_stopwords + english_stopwords

vectorizer = CountVectorizer(min_df= 20, max_df = 0.9, stop_words= stopwords, max_features= 7000,
                                 token_pattern= r"\b\w+\b")

In [9]:
vectorizer.fit(train_data['name'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=7000, min_df=20,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['red', 'green', 'blue', 'yellow', 'magenta', 'pink', 'brown', 'white', 'black', 'and', 'or', 'with', 'the', 'on', 'in', 'new'],
        strip_accents=None, token_pattern='\\b\\w+\\b', tokenizer=None,
        vocabulary=None)

In [10]:
len(vectorizer.vocabulary_)

7000

In [11]:
vocab = vectorizer.vocabulary_

In [12]:
vectorizer_agg = CountVectorizer(max_df = 0.9, ngram_range= (1,3), stop_words= stopwords, 
                                 vocabulary= vocab, 
                                 token_pattern= r"\b\w+\b")

In [13]:
X = vectorizer_agg.fit_transform(df_aggregated['name'])

In [14]:
vocab_inv = {value:key for key,value in vectorizer.vocabulary_.items()}

In [15]:
dict_idx_cat = {idx:cat for idx,cat in enumerate(list(df_aggregated['category_name']))}

In [16]:
df_count = pd.DataFrame(X.toarray())

In [17]:
df_prob = df_count / df_count.sum(axis = 0)

In [18]:
prob_matrix = sparse.csr_matrix(df_prob)

In [19]:
def predict_single(item_name):
    input_matrix = vectorizer_agg.transform([item_name])
    predict_vector = np.dot(input_matrix, prob_matrix.T).toarray()
    if predict_vector.sum() == 0:
        return None
    else:
        max_index = np.argmax(predict_vector)
        cat_name = dict_idx_cat[max_index]
        return cat_name

In [20]:
predict_single('ps4 slim')

'Electronics/Video Games & Consoles/Games'

In [21]:
def predict_batch(list_item_name):
    input_matrix = vectorizer_agg.transform(list_item_name)
    predict_vector = np.dot(input_matrix, prob_matrix.T).toarray()
    
    max_index_list = np.argmax(predict_vector, axis = 1)
    cat_name_list = [dict_idx_cat[max_index] for max_index in max_index_list]
    
    null_list = list(predict_vector.sum(axis = 1) != 0)
    
    result = [i*int(j) for i,j in zip(cat_name_list,null_list)]
    
    return result

In [22]:
print("Total test data: ", len(test_data))
print("Probability matrix shape: ", prob_matrix.shape)

start_time = dt.datetime.now()
test_data['category_prediction'] = predict_batch(test_data['category_name'])
end_time = dt.datetime.now()

print("total time: ", (end_time - start_time).seconds, "seconds")
print("accuracy: ", (test_data['category_prediction'] == test_data['category_name']).mean())

Total test data:  340814
Probability matrix shape:  (40, 7000)
total time:  5 seconds
accuracy:  0.8331171841532331
