In [1]:
import pandas as pd
import numpy as np
#from sklearn import linear_model
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

# Section 1. prepare data

## load Amazon product dataset

In [2]:
amazon = pd.read_csv('output/selectedAmazonProductData_final.csv')
amazon.head()
amazon.shape

(5589590, 2)

## load target dataset

In [4]:
loaddata = pd.read_csv('input/ecommerce_product_names.csv')
target = loaddata['Product Name']
#del loaddata
target.head

<bound method NDFrame.head of 0                      Alisha Solid Women's Cycling Shorts
1                      FabHomeDecor Fabric Double Sofa Bed
2                                               AW Bellies
3                    Sicons All Purpose Arnica Dog Shampoo
4        Eternal Gandhi Super Series Crystal Paper Weig...
5        dilli bazaaar Bellies, Corporate Casuals, Casuals
6                                           Ladela Bellies
7                                   Carrel Printed Women's
8                  Sicons All Purpose Tea Tree Dog Shampoo
9                   Freelance Vacuum Bottles 350 ml Bottle
10                                      Style Foot Bellies
11              Sicons Conditioning Conditoner Dog Shampoo
12                 dongli Printed Boy's Round Neck T-Shirt
13                                      SWAGGA Women Clogs
14       Kennel Rubber Dumbell With Bell - Small Rubber...
15                               Glus Wedding Lingerie Set
16       Veelys Shiny Whit

## convert category names to a numerical variable

In [5]:
temp = amazon['macro categories'].unique()
categoryList = {temp[i]: i for i in range(len(temp))}
categoryList

{'Movies & TV': 0,
 'Sports & Outdoors': 1,
 'Clothing': 2,
 'Toys & Games': 3,
 'CDs & Vinyl': 4,
 'Musical Instruments': 5,
 'Tools & Home Improvement': 6,
 'Software': 7,
 'Home & Kitchen': 8,
 'Health & Personal Care': 9,
 'Video Games': 10,
 'Office Products': 11,
 'Cell Phones & Accessories': 12,
 'Electronics': 13,
 'Office & School Supplies': 14,
 'Baby': 15,
 'Beauty': 16,
 'Automotive': 17,
 'Arts, Crafts & Sewing': 18,
 'Computers': 19,
 'All Electronics': 20,
 'Accessories': 21,
 'Pet Supplies': 22,
 'Grocery & Gourmet Food': 23,
 'Kitchen & Dining': 24,
 'Industrial & Scientific': 25,
 'Costumes & Accessories': 26,
 'Appliances': 27,
 'All Beauty': 28,
 'Patio, Lawn & Garden': 29,
 'Watches': 30,
 'Home Improvement': 31,
 'Baby Products': 32,
 'Amazon Fashion': 33,
 'Collectibles & Fine Art': 34,
 'Shoes': 35,
 'Jewelry': 36,
 'Kindle Store': 37,
 'Amazon Instant Video': 38,
 'International': 39,
 'Alternative Rock': 40,
 'Miscellaneous': 41,
 'Christian': 42,
 'Apps for A

In [7]:
# create a new colomn recording ID of products
amazon['category_ID'] = amazon['macro categories'].map(categoryList)
amazon.head(10)

Unnamed: 0,title,macro categories,category_ID
0,"Everyday Italian (with Giada de Laurentiis), V...",Movies & TV,0
1,Adult Ballet Tutu Cheetah Pink,Sports & Outdoors,1
2,Girls Ballet Tutu Neon Pink,Sports & Outdoors,1
3,Adult Ballet Tutu Yellow,Sports & Outdoors,1
4,Why Don't They Just Quit? DVD Roundtable Discu...,Movies & TV,0
5,Girls Ballet Tutu Zebra Hot Pink,Sports & Outdoors,1
6,Adult Ballet Tutu Purple,Sports & Outdoors,1
7,Understanding Seizures and Epilepsy DVD,Movies & TV,0
8,Live in Houston [VHS],Movies & TV,0
9,My Fair Pastry (Good Eats Vol. 9),Movies & TV,0


In [8]:
amazon.dropna()
amazon = amazon[amazon['title'] != 'NaN']
amazon.head(10)

Unnamed: 0,title,macro categories,category_ID
0,"Everyday Italian (with Giada de Laurentiis), V...",Movies & TV,0
1,Adult Ballet Tutu Cheetah Pink,Sports & Outdoors,1
2,Girls Ballet Tutu Neon Pink,Sports & Outdoors,1
3,Adult Ballet Tutu Yellow,Sports & Outdoors,1
4,Why Don't They Just Quit? DVD Roundtable Discu...,Movies & TV,0
5,Girls Ballet Tutu Zebra Hot Pink,Sports & Outdoors,1
6,Adult Ballet Tutu Purple,Sports & Outdoors,1
7,Understanding Seizures and Epilepsy DVD,Movies & TV,0
8,Live in Houston [VHS],Movies & TV,0
9,My Fair Pastry (Good Eats Vol. 9),Movies & TV,0


In [None]:
type(amazon['title'][5589584])

# Section 2. Represent text as numerical data

In [9]:
# instantiate CountVectorizer (vectorizer)
vect = CountVectorizer()

In [10]:
# define X and y 
X = amazon['title']
y = amazon['category_ID']
print(X.shape)
print(y.shape)

(5589590,)
(5589590,)


In [11]:
# split X and y into training and testing sets
# by default, it splits 75% training and 25% test
# random_state=1 for reproducibility
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4192192,)
(1397398,)
(4192192,)
(1397398,)


In [13]:
X_train.head(10)

278128                                       Go The Distance
3662218    VMG Motorola Droid RAZR MAXX 2nd Gen 2-ITEM Ha...
4450446    Hot 20000pcs Clear Rhinestone Decoration Cryst...
140084     The Pelican Brief: Music from the Motion Pictu...
4533636    Torre &amp; Tagus Cluck Salt and Pepper Shaker...
5207235    Earhoox - Earbud Attachments - for iPhone 3G/4...
2309593    Midwest Quiet Time e'Sensuals Orthopedic Nesti...
2460090                                                  NaN
389138                                            Love Songs
2361820                                                  NaN
Name: title, dtype: object

## combine all product names

In [14]:
productNames = pd.concat([amazon['title'], target])
productNames[5589590:5589595]

0                  Alisha Solid Women's Cycling Shorts
1                  FabHomeDecor Fabric Double Sofa Bed
2                                           AW Bellies
3                Sicons All Purpose Arnica Dog Shampoo
4    Eternal Gandhi Super Series Crystal Paper Weig...
dtype: object

### Note: the vocabulary is learned with all product data, including amazon dataset and target dataset

In [15]:
# fit
# learn the 'vocabulary' of the training data (occurs in-place)
vect.fit(productNames.values.astype('U'))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [17]:
# examine the fitted vocabulary
vocabulary = vect.get_feature_names()

In [18]:
len(vocabulary)

1094032

In [19]:
# transform training data
X_train_dtm = vect.transform(X_train.values.astype('U'))
X_train_dtm

<4192192x1094032 sparse matrix of type '<class 'numpy.int64'>'
	with 37158149 stored elements in Compressed Sparse Row format>

In [20]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test.values.astype('U'))
X_test_dtm

<1397398x1094032 sparse matrix of type '<class 'numpy.int64'>'
	with 12378938 stored elements in Compressed Sparse Row format>

In [25]:
frequency = np.sum(X_train_dtm, axis = 0) # count frequencies of each word in vocabulary
frequency = frequency.tolist()[0] # convert to a list
frequency = [(frequency[index], index) for index in range(len(frequency))]
frequency = sorted(frequency, key = lambda x: -x[0]) # sort as frequency
indices_frequency = [index for frequency, index in frequency] # word indices sorted by frequency
HighFreqWord = [vocabulary[i] for i in indices_frequency] # show high frquency words
HighFreqWord[:20]

['for',
 'with',
 'black',
 'quot',
 'nan',
 'and',
 'case',
 'of',
 'amp',
 'pack',
 'inch',
 'the',
 'set',
 'cover',
 'women',
 'white',
 'in',
 'by',
 'blue',
 'oz']

# Section 3. Building and evaluating a model

## naive_bayes

In [26]:
from sklearn.naive_bayes import MultinomialNB

#  instantiate a Multinomial Naive Bayes model
nb = MultinomialNB()

In [27]:
# train the model 
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [28]:
#  make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [29]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.751523188096734

In [30]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[ 10152,   1165,    213, ...,      0,      0,      1],
       [   148, 108177,   7412, ...,      0,      0,      1],
       [     1,   4628,  73180, ...,      0,      0,      0],
       ...,
       [     0,      0,      0, ...,      0,      0,      0],
       [     0,      0,      0, ...,      0,      0,      0],
       [     0,      0,      0, ...,      0,      0,      0]])

## predict the product categories in the target dataset

In [31]:
# transform target data 
target_dtm = vect.transform(target.values.astype('U'))
target_dtm

<12623x1094032 sparse matrix of type '<class 'numpy.int64'>'
	with 82048 stored elements in Compressed Sparse Row format>

In [32]:
target_pred_class = nb.predict(target_dtm)
target_pred_class.shape

(12623,)

In [33]:
categoryList

{'Movies & TV': 0,
 'Sports & Outdoors': 1,
 'Clothing': 2,
 'Toys & Games': 3,
 'CDs & Vinyl': 4,
 'Musical Instruments': 5,
 'Tools & Home Improvement': 6,
 'Software': 7,
 'Home & Kitchen': 8,
 'Health & Personal Care': 9,
 'Video Games': 10,
 'Office Products': 11,
 'Cell Phones & Accessories': 12,
 'Electronics': 13,
 'Office & School Supplies': 14,
 'Baby': 15,
 'Beauty': 16,
 'Automotive': 17,
 'Arts, Crafts & Sewing': 18,
 'Computers': 19,
 'All Electronics': 20,
 'Accessories': 21,
 'Pet Supplies': 22,
 'Grocery & Gourmet Food': 23,
 'Kitchen & Dining': 24,
 'Industrial & Scientific': 25,
 'Costumes & Accessories': 26,
 'Appliances': 27,
 'All Beauty': 28,
 'Patio, Lawn & Garden': 29,
 'Watches': 30,
 'Home Improvement': 31,
 'Baby Products': 32,
 'Amazon Fashion': 33,
 'Collectibles & Fine Art': 34,
 'Shoes': 35,
 'Jewelry': 36,
 'Kindle Store': 37,
 'Amazon Instant Video': 38,
 'International': 39,
 'Alternative Rock': 40,
 'Miscellaneous': 41,
 'Christian': 42,
 'Apps for A

In [35]:
target_category = []
for ID in target_pred_class:
    for cat, cat_ID in categoryList.items():    
        if cat_ID == ID:
            target_category.append(cat)
target_category[:20]

['Clothing',
 'Home & Kitchen',
 'Toys & Games',
 'Pet Supplies',
 'Home & Kitchen',
 'Shoes',
 'Toys & Games',
 'Clothing',
 'Pet Supplies',
 'Health & Personal Care',
 'Toys & Games',
 'Beauty',
 'Clothing',
 'Shoes',
 'Pet Supplies',
 'Home & Kitchen',
 'Sports & Outdoors',
 'Tools & Home Improvement',
 'Clothing',
 'Clothing']

## export categories of target dataset

In [40]:
df = loaddata.copy()

In [41]:
df['Categories'] = np.array(target_category)

In [None]:
df

In [42]:
export_csv = df.to_csv (r'output/ProductWithCategories.csv', index = None, header=True)

In [43]:
category = df.groupby('Categories').size()
category = category.rename('Number')
category = category.sort_values(ascending=False)
category

Categories
Clothing                     3120
Home & Kitchen               1825
Sports & Outdoors            1474
Jewelry                      1015
Electronics                   847
Automotive                    842
Shoes                         572
Toys & Games                  518
Arts, Crafts & Sewing         485
Beauty                        444
Cell Phones & Accessories     366
Tools & Home Improvement      292
Health & Personal Care        199
Office Products               160
Patio, Lawn & Garden          140
Pet Supplies                   93
CDs & Vinyl                    91
Accessories                    67
Baby                           32
Grocery & Gourmet Food         27
Movies & TV                     5
Musical Instruments             4
Watches                         3
Costumes & Accessories          2
Name: Number, dtype: int64

In [44]:
len(category)

24

### 10-fold cross validation splits

In [51]:
split = 10
skf = StratifiedKFold(n_splits = split)
skf.get_n_splits(train, target)

NameError: name 'train' is not defined

In [47]:
import gc
import xgboost as xgb 
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import normalize

import os  # for Macbook
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [None]:
### Set parameters

In [69]:
num_boost_round = 500
params = {}
params["objective"] = 'multi:softprob'
params["eta"] = 0.03 # 如同学习率
params["subsample"] = 1  # 随机采样训练样本 训练实例的子采样比
params["colsample_bytree"] = 0.7 # 生成树时进行的列采样
params["silent"] = 0 #设置成1则没有运行信息输出，最好是设置为0.是否在运行升级时打印消息。
params["max_depth"] = 5 # 构建树的深度，越大越容易过拟合
params["min_child_weight"] = 1
# 这个参数默认是 1，是每个叶子里面 h 的和至少是多少，对正负样本不均衡时的 0-1 分类而言
#，假设 h 在 0.01 附近，min_child_weight 为 1 意味着叶子节点中最少需要包含 100 个样本。
params["eval_metric"] = "mlogloss"
params["num_class"] = 44


In [65]:
D_train = xgb.DMatrix(
                    csr_matrix(X_train_dtm),
                    y_train,
                    silent = True)

print(D_train.num_col())
print(D_train.num_row())   

1094032
4192192


In [54]:
D_test = xgb.DMatrix(
                    csr_matrix(X_test_dtm),
                    y_test,
                    silent = True)
watchlist = [(D_test, 'test'), (D_train, 'train')]
print(D_test.num_col())
print(D_test.num_row())

1094032
1397398


In [None]:
# fit the classfier now
clf = xgb.train(params, D_train, num_boost_round,
                    evals = watchlist, early_stopping_rounds = 10)
    
test_prediction = clf.predict(D_test) 

[0]	test-mlogloss:3.64016	train-mlogloss:3.63655
Multiple eval metrics have been passed: 'train-mlogloss' will be used for early stopping.

Will train until train-mlogloss hasn't improved in 250 rounds.
[1]	test-mlogloss:3.52804	train-mlogloss:3.52812
[2]	test-mlogloss:3.44076	train-mlogloss:3.44121
[3]	test-mlogloss:3.36887	train-mlogloss:3.36929
[4]	test-mlogloss:3.30338	train-mlogloss:3.30307
[5]	test-mlogloss:3.24419	train-mlogloss:3.24319
[6]	test-mlogloss:3.19149	train-mlogloss:3.19051
[7]	test-mlogloss:3.1438	train-mlogloss:3.14455
[8]	test-mlogloss:3.0964	train-mlogloss:3.09713
[9]	test-mlogloss:3.05355	train-mlogloss:3.05389
[10]	test-mlogloss:3.01392	train-mlogloss:3.01334
[11]	test-mlogloss:2.97629	train-mlogloss:2.97581
[12]	test-mlogloss:2.94014	train-mlogloss:2.94069
[13]	test-mlogloss:2.90676	train-mlogloss:2.90589
[14]	test-mlogloss:2.87642	train-mlogloss:2.87526
[15]	test-mlogloss:2.84525	train-mlogloss:2.84424
[16]	test-mlogloss:2.81645	train-mlogloss:2.81542
[17]	tes

In [None]:
clf.save_model('categoryPrediction_XGBoost.model')