In [21]:
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from pandas import read_hdf, concat
from sklearn.metrics import f1_score, accuracy_score
import matplotlib.pyplot as plt
from time import time

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline 

In [22]:
def type2idx(Data_c,Type_c):
    """Convert labels to their integer position in a category list.

    Parameters
    ----------
    Data_c : sequence
        Labels to convert.
    Type_c : sequence
        Known categories; a label's index is the position of its first
        occurrence in this sequence.

    Returns
    -------
    numpy.ndarray
        Integer array with one entry per element of ``Data_c``. Labels that
        do not appear in ``Type_c`` are encoded as ``-1``.
    """
    # Build the lookup once instead of scanning Type_c for every label.
    # setdefault keeps the first position, matching list.index semantics.
    position_of = {}
    for position, category in enumerate(Type_c):
        position_of.setdefault(category, position)

    # The builtin int replaces the np.int alias, which NumPy 1.24 removed.
    return np.array([position_of.get(label, -1) for label in Data_c], dtype=int)

In [23]:
# Raw string literal: the Windows path contains backslash sequences such as
# "\p", "\S", "\d" and "\R" that are invalid escapes in a normal string and
# trigger warnings on recent Python versions. The runtime value is unchanged.
# NOTE(review): this is a machine-specific absolute path; consider a
# configurable data directory.
url = r'D:\python_projects\ServeNet_others\data\ramdom_categorg_percent\RandomSplittedByCatagories9.h5'

# The HDF5 file stores the two splits under separate keys.
TrainServices = read_hdf(url, key='Train')
TestServices = read_hdf(url, key='Test')

# Train rows followed by test rows in a single frame.
AllData = concat([TrainServices, TestServices])

In [24]:
# Pull the description text and the class label out of each split as plain
# Python lists. Column names are spelled exactly as they appear in the frames.
data_train, target_train = (
    list(TrainServices[column])
    for column in ('Service Desciption', 'Service Classification')
)
data_test, target_test = (
    list(TestServices[column])
    for column in ('Service Desciption', 'Service Classification')
)

# X_* / Y_* start out as aliases of the raw lists; later cells rebind them
# to the vectorised features and encoded labels.
X_train, Y_train = data_train, target_train
X_test, Y_test = data_test, target_test

In [25]:
# Distinct class names present in the training labels, as a list
# (np.unique returns them sorted).
unique_train_labels = np.unique(target_train)
Type_c = list(unique_train_labels)

In [26]:
# Learn the label -> integer mapping from the training labels only, then
# apply that same mapping to the test labels. Calling fit_transform on the
# test labels would refit the encoder, so the same integer could denote
# different classes in train and test whenever the two label sets differ.
# transform raises ValueError for a test label never seen in training.
encoder = preprocessing.LabelEncoder()
Y_train = encoder.fit_transform(Y_train)
Y_test = encoder.transform(Y_test)

In [27]:
# Size cap for the TF-IDF vocabulary.
max_features = 1500

tfidf_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    stop_words='english',
    max_features=max_features,
)

# NOTE(review): AllData is built from both the Train and Test frames, so the
# vocabulary and IDF weights are fitted on test descriptions as well --
# confirm this is intended.
# The fit call stays as the last expression so the cell still displays the
# fitted vectorizer.
tfidf_vectorizer.fit(AllData['Service Desciption'].tolist())

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1500, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [28]:
# Vectorise from the raw text lists rather than from X_train / X_test
# themselves. X_train and X_test are aliases of data_train and data_test
# until this cell runs, so a top-to-bottom run is unchanged, but the cell is
# now safe to re-run: it no longer feeds its own sparse output back into
# transform().
X_train = tfidf_vectorizer.transform(data_train)
X_test = tfidf_vectorizer.transform(data_test)

In [29]:
# Fixed seed so that repeated runs train the same ensemble. Without it the
# reported accuracies can drift from run to run.
RANDOM_STATE = 0

# Boosted depth-6 decision trees. AdaBoost's random_state is the seed it
# passes to each base tree at every boosting iteration.
clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=6),
    n_estimators=500,
    learning_rate=1.5,
    random_state=RANDOM_STATE,
)

# Wall-clock timing of the fit only.
t0 = time()
clf.fit(X_train, Y_train)
t1 = time()
print("Train time: ", t1 - t0)

Train time:  104.25097441673279


In [30]:
def _top_k_adjusted_predictions(classes, proba, y_true, k=5):
    """Return predictions that count a sample as correct on a top-k hit.

    For each row, the ``k`` classes with the highest probability are taken.
    If the true label is among them, the true label is returned; otherwise
    the single most probable class is returned.

    Parameters
    ----------
    classes : array-like, shape (n_classes,)
        Class label for each column of ``proba``.
    proba : array-like, shape (n_samples, n_classes)
        Per-class scores for each sample.
    y_true : array-like, shape (n_samples,)
        True labels.
    k : int, default 5
        Number of top candidates to consider (must be >= 1).

    Returns
    -------
    numpy.ndarray, shape (n_samples,)
        Adjusted predictions, suitable for ``accuracy_score`` / ``f1_score``.
    """
    classes = np.asarray(classes)
    proba = np.asarray(proba)
    y_true = np.asarray(y_true)

    # A stable ascending sort gives the same tie ordering as
    # sorted(zip(classes, row), key=score)[-k:]; the last column is the
    # top-1 class.
    top_k = classes[np.argsort(proba, axis=1, kind='stable')[:, -k:]]
    hit = (top_k == y_true[:, None]).any(axis=1)
    return np.where(hit, y_true, top_k[:, -1])


# Class probabilities (for top-5) and hard predictions (for top-1).
train_top5 = clf.predict_proba(X_train)
train_top1 = clf.predict(X_train)

test_pre_top5 = clf.predict_proba(X_test)
test_pre_top1 = clf.predict(X_test)

# One helper replaces the two copy-pasted loops, and the removed np.int
# alias is no longer needed for pre-allocation.
ret = _top_k_adjusted_predictions(clf.classes_, test_pre_top5, Y_test)
train_ret = _top_k_adjusted_predictions(clf.classes_, train_top5, Y_train)

# Computed on the top-5-adjusted test predictions.
f1_s = f1_score(Y_test, ret, average='micro')

print("=" * 60)
print("Test top5 acc:%f,train top5  acc:%f" % (accuracy_score(Y_test, ret), accuracy_score(Y_train, train_ret)))
print("Test top1 acc:%f,train top1 acc:%f" % ( accuracy_score(Y_test, test_pre_top1),
                                               accuracy_score(Y_train, train_top1)))
print("F1_score:%f" % float(f1_s))
print("=" * 60)

Test top5 acc:0.659874,train top5  acc:0.984119
Test top1 acc:0.348860,train top1 acc:0.588945
F1_score:0.659874


In [31]:
# Integer id of every category, in Type_c order.
type_c_index = type2idx(Type_c, Type_c)

# Per-category top-5 accuracy on the test set:
#   result_dict[category] -> fraction of that category's test samples whose
#                            top-5-adjusted prediction equals the true label
#   total_dict[category]  -> number of test samples in that category
result_dict = {}
total_dict = {}

y_true = np.asarray(Y_test)
is_correct = y_true == np.asarray(ret)

for idx in type_c_index:
    category = Type_c[idx]
    in_category = y_true == idx
    total_count = int(in_category.sum())
    account = int((in_category & is_correct).sum())

    # A category with no test samples has no defined accuracy; report NaN
    # instead of raising ZeroDivisionError.
    result_dict[category] = account / total_count if total_count else float('nan')
    total_dict[category] = total_count

for cate, acc in result_dict.items():
    total_account = total_dict[cate]
    print("%s (%d): %.3f" % (cate, total_account, acc))

Advertising (42): 0.619
Analytics (23): 0.217
Application Development (23): 0.348
Backend (27): 0.222
Banking (20): 0.350
Bitcoin (28): 0.821
Chat (16): 0.375
Cloud (33): 0.545
Data (28): 0.214
Database (27): 0.185
Domains (16): 0.625
Education (41): 0.415
Email (48): 0.667
Enterprise (79): 0.785
Entertainment (19): 0.158
Events (21): 0.524
File Sharing (16): 0.438
Financial (130): 0.923
Games (39): 0.590
Government (55): 0.564
Images (15): 0.333
Internet of Things (22): 0.500
Mapping (67): 0.746
Marketing (16): 0.438
Media (16): 0.188
Medical (21): 0.286
Messaging (97): 0.856
Music (37): 0.757
News Services (16): 0.188
Other (29): 0.172
Payments (85): 0.847
Photos (35): 0.514
Project Management (28): 0.286
Real Estate (21): 0.476
Reference (47): 0.617
Science (55): 0.855
Search (43): 0.744
Security (47): 0.617
Shipping (26): 0.500
Social (80): 0.850
Sports (43): 0.628
Stocks (19): 0.684
Storage (19): 0.211
Telephony (57): 0.684
Tools (146): 0.973
Transportation (42): 0.786
Travel (45)