In [83]:
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from pandas import read_hdf, concat
from sklearn.utils import Bunch
from sklearn.metrics import f1_score, accuracy_score
from time import time
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB
%matplotlib inline 

In [84]:

def type2idx(Data_c,Type_c):
    n_samples=len(Data_c)
    target = np.empty((n_samples,), dtype=np.int)
    for idx in range(n_samples):
        if Data_c[idx] in Type_c:
            target[idx]=Type_c.index(Data_c[idx])
        else:
            target[idx] = -1
    return target


In [85]:
url = 'D:\python_projects\ServeNet_others\data\\ramdom_categorg_percent\RandomSplittedByCatagories9.h5'
TrainServices = read_hdf(url, key='Train')
TestServices = read_hdf(url, key='Test')
AllData = concat([TrainServices, TestServices])

In [86]:
data_train = list(TrainServices['Service Desciption'])
target_train = list(TrainServices['Service Classification'])
data_test = list(TestServices['Service Desciption'])
target_test = list(TestServices['Service Classification'])

X_train = data_train
Y_train = target_train
X_test = data_test
Y_test = target_test

In [87]:
Type_c = (list(np.unique(target_train)))
# Type_c

In [88]:
encoder = preprocessing.LabelEncoder()
Y_train = encoder.fit_transform(Y_train)
Y_test = encoder.fit_transform(Y_test)


In [89]:
max_features = 1500
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english', max_features=max_features)
tfidf_vectorizer.fit(list(AllData['Service Desciption']))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1500, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [90]:
X_train = tfidf_vectorizer.transform(X_train)
X_test = tfidf_vectorizer.transform(X_test)

In [91]:
clf = MultinomialNB()

t0 = time()
clf.fit(X_train, Y_train)
t1 = time()
print('Training time: ', (t1-t0))

Training time:  0.017496585845947266


In [92]:
train_top5 = clf.predict_proba(X_train)
train_top1 = clf.predict(X_train)

In [93]:
test_top5 = clf.predict_proba(X_test)
test_top1 = clf.predict(X_test)

In [94]:
test_ret = np.empty((len(Y_test),), dtype=np.int)
train_ret = np.empty((len(Y_train),), dtype=np.int)

# test top-5 accuracy
for i in range(len(Y_test)):
    Top5_test = sorted(zip(clf.classes_, test_top5[i]), key=lambda x: x[1])[-5:]
    Top5_test = list(map(lambda x: x[0], Top5_test))
    
    if Y_test[i] in Top5_test:
        test_ret[i] = Y_test[i]
    else:
        test_ret[i] = Top5_test[-1]

# train top-5 accuracy
for i in range(len(Y_train)):
    Top5_train = sorted(zip(clf.classes_, train_top5[i]), key=lambda x: x[1])[-5:]
    Top5_train = list(map(lambda x: x[0], Top5_train))
    
    if Y_train[i] in Top5_train:
        train_ret[i] = Y_train[i]
    else:
        train_ret[i] = Top5_train[-1]

f1_s = f1_score(Y_test, test_ret, average='micro')

print("=" * 60)
print("Test top5 acc:%.3f,  Train top5 acc:%.3f" % (accuracy_score(Y_test, test_ret), accuracy_score(Y_train, train_ret)))
print("Test top1 acc:%.3f,  Train top1 acc:%.3f" % (accuracy_score(Y_test, test_top1),
                                               accuracy_score(Y_train, train_top1)))
print("F1_score:%.3f" % float(f1_s))
print("=" * 60)
#################

Test top5 acc:0.757,  Train top5 acc:0.841
Test top1 acc:0.472,  Train top1 acc:0.560
F1_score:0.757


In [95]:
type_c_index = type2idx(Type_c, Type_c)
result_dict = {}
total_dict = {}
for idx in type_c_index:
    category = Type_c[idx]
    total_count = 0
    account = 0
    for i in range(len(Y_test)):
        if Y_test[i] == idx:
            total_count += 1
            if Y_test[i] == test_ret[i]:
                account += 1

    result_dict[category] = account / total_count * 1.
    total_dict[category] = total_count

In [96]:
for cate in result_dict.keys():
    total_account = total_dict[cate]
    acc = result_dict[cate]
    print("%s (%d): %.3f" % (cate, total_account, acc))

Advertising (42): 0.786
Analytics (23): 0.000
Application Development (23): 0.130
Backend (27): 0.148
Banking (20): 0.400
Bitcoin (28): 0.821
Chat (16): 0.062
Cloud (33): 0.576
Data (28): 0.143
Database (27): 0.111
Domains (16): 0.625
Education (41): 0.805
Email (48): 0.896
Enterprise (79): 0.987
Entertainment (19): 0.158
Events (21): 0.333
File Sharing (16): 0.438
Financial (130): 0.992
Games (39): 0.795
Government (55): 0.945
Images (15): 0.000
Internet of Things (22): 0.500
Mapping (67): 0.970
Marketing (16): 0.000
Media (16): 0.000
Medical (21): 0.143
Messaging (97): 1.000
Music (37): 0.811
News Services (16): 0.250
Other (29): 0.000
Payments (85): 0.988
Photos (35): 0.714
Project Management (28): 0.750
Real Estate (21): 0.524
Reference (47): 0.617
Science (55): 0.891
Search (43): 0.767
Security (47): 0.638
Shipping (26): 0.731
Social (80): 0.988
Sports (43): 0.837
Stocks (19): 0.947
Storage (19): 0.105
Telephony (57): 0.982
Tools (146): 0.993
Transportation (42): 0.857
Travel (45)