In [3]:
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from pandas import read_hdf, concat
from sklearn.metrics import f1_score, accuracy_score
import matplotlib.pyplot as plt
from time import time

from sklearn.ensemble import RandomForestClassifier

%matplotlib inline 

In [4]:
def type2idx(Data_c,Type_c):
    n_samples=len(Data_c)
    target = np.empty((n_samples,), dtype=np.int)
    for idx in range(n_samples):
        if Data_c[idx] in Type_c:
            target[idx]=Type_c.index(Data_c[idx])
        else:
            target[idx] = -1
    return target

In [5]:
url = 'D:\python_projects\ServeNet_others\data\\ramdom_categorg_percent\RandomSplittedByCatagories9.h5'
TrainServices = read_hdf(url, key='Train')
TestServices = read_hdf(url, key='Test')
AllData = concat([TrainServices, TestServices])

In [6]:
data_train = list(TrainServices['Service Desciption'])
target_train = list(TrainServices['Service Classification'])
data_test = list(TestServices['Service Desciption'])
target_test = list(TestServices['Service Classification'])

X_train = data_train
Y_train = target_train
X_test = data_test
Y_test = target_test

In [7]:
Type_c = (list(np.unique(target_train)))
# Type_c

In [8]:
encoder = preprocessing.LabelEncoder()
Y_train = encoder.fit_transform(Y_train)
Y_test = encoder.fit_transform(Y_test)

In [9]:
max_features = 1500
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english', max_features=max_features)
tfidf_vectorizer.fit(list(AllData['Service Desciption']))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1500, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [10]:
X_train = tfidf_vectorizer.transform(X_train)
X_test = tfidf_vectorizer.transform(X_test)

In [11]:
clf = RandomForestClassifier(n_estimators=1000, max_depth=40, random_state=0)

t0 = time()
clf.fit(X_train, Y_train)
t1 = time()
print("Train time: ", t1 - t0)

Train time:  36.77560567855835


In [12]:
train_top5 = clf.predict_proba(X_train)
train_top1 = clf.predict(X_train)

In [13]:
test_top5 = clf.predict_proba(X_test)
test_top1 = clf.predict(X_test)

In [14]:
test_ret = np.empty((len(Y_test),), dtype=np.int)
train_ret = np.empty((len(Y_train),), dtype=np.int)

# test top-5 accuracy
for i in range(len(Y_test)):
    Top5_test = sorted(zip(clf.classes_, test_top5[i]), key=lambda x: x[1])[-5:]
    Top5_test = list(map(lambda x: x[0], Top5_test))
    
    if Y_test[i] in Top5_test:
        test_ret[i] = Y_test[i]
    else:
        test_ret[i] = Top5_test[-1]

# train top-5 accuracy
for i in range(len(Y_train)):
    Top5_train = sorted(zip(clf.classes_, train_top5[i]), key=lambda x: x[1])[-5:]
    Top5_train = list(map(lambda x: x[0], Top5_train))
    
    if Y_train[i] in Top5_train:
        train_ret[i] = Y_train[i]
    else:
        train_ret[i] = Top5_train[-1]

f1_s = f1_score(Y_test, test_ret, average='micro')

print("=" * 60)
print("Test top5 acc:%.3f,  Train top5 acc:%.3f" % (accuracy_score(Y_test, test_ret), accuracy_score(Y_train, train_ret)))
print("Test top1 acc:%.3f,  Train top1 acc:%.3f" % (accuracy_score(Y_test, test_top1),
                                               accuracy_score(Y_train, train_top1)))
print("F1_score:%.3f" % float(f1_s))
print("=" * 60)
#################

Test top5 acc:0.800,  Train top5 acc:0.955
Test top1 acc:0.541,  Train top1 acc:0.835
F1_score:0.800


In [15]:
type_c_index = type2idx(Type_c, Type_c)
result_dict = {}
total_dict = {}
for idx in type_c_index:
    category = Type_c[idx]
    total_count = 0
    account = 0
    for i in range(len(Y_test)):
        if Y_test[i] == idx:
            total_count += 1
            if Y_test[i] == test_ret[i]:
                account += 1

    result_dict[category] = account / total_count * 1.
    total_dict[category] = total_count

In [16]:
for cate in result_dict.keys():
    total_account = total_dict[cate]
    acc = result_dict[cate]
    print("%s (%d): %.3f" % (cate, total_account, acc))

Advertising (42): 0.857
Analytics (23): 0.348
Application Development (23): 0.087
Backend (27): 0.333
Banking (20): 0.850
Bitcoin (28): 0.964
Chat (16): 0.688
Cloud (33): 0.909
Data (28): 0.143
Database (27): 0.185
Domains (16): 0.750
Education (41): 0.854
Email (48): 0.917
Enterprise (79): 0.924
Entertainment (19): 0.105
Events (21): 0.905
File Sharing (16): 0.688
Financial (130): 1.000
Games (39): 0.897
Government (55): 0.764
Images (15): 0.467
Internet of Things (22): 0.636
Mapping (67): 0.910
Marketing (16): 0.750
Media (16): 0.250
Medical (21): 0.714
Messaging (97): 0.969
Music (37): 0.892
News Services (16): 0.500
Other (29): 0.034
Payments (85): 0.918
Photos (35): 0.886
Project Management (28): 0.786
Real Estate (21): 0.857
Reference (47): 0.362
Science (55): 0.909
Search (43): 0.651
Security (47): 0.468
Shipping (26): 0.962
Social (80): 0.938
Sports (43): 0.791
Stocks (19): 0.947
Storage (19): 0.632
Telephony (57): 0.842
Tools (146): 1.000
Transportation (42): 0.833
Travel (45)