In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
import re
import datetime as dt
from sklearn.feature_extraction.text import CountVectorizer

pd.options.display.max_rows = 100
pd.options.display.max_columns = 10

In [2]:
train_data = pd.read_csv('train.tsv', sep= '\t', compression= ''
test_data = pd.read_csv('test.tsv', sep= '\t')

In [3]:
train_data.sample(50)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
283876,283876,CHANCE 3 Baseball Black Hat Cap,1,Men/Men's Accessories/Hats,,10.0,1,"""100% New and High Quality. Color:Black circum..."
558408,558408,Nintendo DS lite,3,Electronics/Video Games & Consoles/Games,Nintendo,20.0,1,"White, works like new No problems, a little bi..."
1223834,1223834,Auth LV 6Key Chain Snap w/Zipper Pocket,3,Vintage & Collectibles/Accessories/Keychain,Louis Vuitton,39.0,0,***100% Vintage Authentic Pre-Owned Louis Vuit...
945703,945703,Red Mickey Minnie charm bracelet,1,Women/Jewelry/Bracelets,,25.0,1,Note: this is not pandora! Material: 925 silve...
1230850,1230850,NEW Lipsense Bundle,1,Beauty/Makeup/Lips,SeneGence,75.0,0,1 Corallina 1 mulled wine 1 Aussie Rose 1 Fire...
1058192,1058192,Vineyard Vines boys shirt,3,Kids/Boys (4+)/Top & T-shirts,,13.0,1,Vineyard Vines boys polo Light blue & white st...
141179,141179,Desert strike - SNES cart only,3,Electronics/Video Games & Consoles/Games,Nintendo,8.0,1,Cart only cart in great shape Other games for ...
1195312,1195312,Ring,2,Women/Jewelry/Rings,FOREVER 21,7.0,1,Size 8 Perfect condition
405345,405345,Women's Harley Davidson Riding Jacket,3,Women/Coats & Jackets/Motorcycle,Harley-Davidson,80.0,1,Women's functional riding jacket. Gently used....
646465,646465,Zelda Wind Waker,4,Electronics/Video Games & Consoles/Games,Nintendo,29.0,0,Missing manual Disc has some scratches but don...


In [4]:
# get top n category based on the number of rows
n_top_category = 40
category_list = list(train_data.category_name.value_counts()[:n_top_category].index)

In [5]:
#filter the dataset to n top category only 
train_data = train_data.loc[train_data.category_name.isin(category_list)]
test_data = test_data.loc[test_data.category_name.isin(category_list)]

In [6]:
df_aggregated = pd.DataFrame(train_data.groupby('category_name')['name'].agg(lambda s: ' '.join(s)))
df_aggregated = df_aggregated.reset_index()


In [7]:
df_aggregated

Unnamed: 0,category_name,name
0,Beauty/Fragrance/Women,New vs pi k body mists Marc Jacobs Honey Pink ...
1,Beauty/Makeup/Eyes,Glitter Eyeshadow Too Faced Better Than Sex Ma...
2,Beauty/Makeup/Face,Smashbox primer Apricot beige stick foundation...
3,Beauty/Makeup/Lips,. BBW Mentha Lip Balms Bare minerals Nude Matt...
4,Beauty/Makeup/Makeup Palettes,"Too Faced Limited ""Merry Macaroons"" Realher Ev..."
5,Beauty/Skin Care/Body,Scentsy Bath Tabs Bath & body works cool citru...
6,Beauty/Skin Care/Face,Caudalíe Beauty Elixir Spray lots of Korean Na...
7,"Electronics/Cell Phones & Accessories/Cases, C...",Otterbox Defender iPhone 6 Plus/6s Plus Iphone...
8,Electronics/Cell Phones & Accessories/Cell Pho...,Smart watch for android and iOS NEW IN BOX Bel...
9,Electronics/Video Games & Consoles/Games,Goosebumps HorrorLand DS Game NBA 2k14 BRAND N...


In [8]:
color_stopwords = ['red','green','blue','yellow','magenta','pink','brown','white','black']
english_stopwords = ['and', 'or', 'with', 'the', 'on', 'in', 'new']

stopwords = color_stopwords + english_stopwords

vectorizer = CountVectorizer(min_df= 20, max_df = 0.9, stop_words= stopwords, max_features= 7000,
                                 token_pattern= r"\b\w+\b")

In [9]:
vectorizer.fit(train_data['name'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=7000, min_df=20,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['red', 'green', 'blue', 'yellow', 'magenta', 'pink', 'brown', 'white', 'black', 'and', 'or', 'with', 'the', 'on', 'in', 'new'],
        strip_accents=None, token_pattern='\\b\\w+\\b', tokenizer=None,
        vocabulary=None)

In [10]:
len(vectorizer.vocabulary_)

7000

In [11]:
vocab = vectorizer.vocabulary_

In [12]:
vectorizer_agg = CountVectorizer(max_df = 0.9, ngram_range= (1,3), stop_words= stopwords, 
                                 vocabulary= vocab, 
                                 token_pattern= r"\b\w+\b")

In [13]:
X = vectorizer_agg.fit_transform(df_aggregated['name'])

In [14]:
vocab_inv = {value:key for key,value in vectorizer.vocabulary_.items()}

In [15]:
dict_idx_cat = {idx:cat for idx,cat in enumerate(list(df_aggregated['category_name']))}

In [16]:
df_count = pd.DataFrame(X.toarray())

In [17]:
df_prob = df_count / df_count.sum(axis = 0)

In [18]:
prob_matrix = sparse.csr_matrix(df_prob)

In [19]:
def predict_single(item_name):
    input_matrix = vectorizer_agg.transform([item_name])
    predict_vector = np.dot(input_matrix, prob_matrix.T).toarray()
    if predict_vector.sum() == 0:
        return None
    else:
        max_index = np.argmax(predict_vector)
        cat_name = dict_idx_cat[max_index]
        return cat_name

In [20]:
predict_single('ps4 slim')

'Electronics/Video Games & Consoles/Games'

In [21]:
def predict_batch(list_item_name):
    input_matrix = vectorizer_agg.transform(list_item_name)
    predict_vector = np.dot(input_matrix, prob_matrix.T).toarray()
    
    max_index_list = np.argmax(predict_vector, axis = 1)
    cat_name_list = [dict_idx_cat[max_index] for max_index in max_index_list]
    
    null_list = list(predict_vector.sum(axis = 1) != 0)
    
    result = [i*int(j) for i,j in zip(cat_name_list,null_list)]
    
    return result

In [35]:
print("Total test data: ", len(test_data))
print("Probability matrix shape: ", prob_matrix.shape)

start_time = dt.datetime.now()
test_data['category_prediction'] = predict_batch(test_data['category_name'])
end_time = dt.datetime.now()

print("total time: ", (end_time - start_time).seconds, "seconds")
print("accuracy: ", (test_data['category_prediction'] == test_data['category_name']).mean())

Total test data:  340814
Probability matrix shape:  (40, 7000)
total time:  5 seconds
accuracy:  0.8331171841532331
