In [1]:
import codecs
import operator
import re

import jieba
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB

In [2]:
# 1. Load the training data and preview its first rows.
train_csv_path = 'yuze_train.csv'
data = pd.read_csv(train_csv_path)
data.head()

Unnamed: 0,标签ID,标签,数据样本,税额,数量,加和
0,3070402000000000000,住宿服务,住宿费,37893,226,
1,3070402000000000000,住宿服务,住宿服务,7420,43,
2,3070402000000000000,住宿服务,房费,3454,38,
3,3070402000000000000,住宿服务,住宿,2388,12,
4,3070402000000000000,住宿服务,代订房费,272,2,


In [3]:
# 2. Down-sample the catch-all '其他' ("other") class to n rows, where n is the
#    mean number of rows per regular label, then show the per-label counts.

# Compute the mask once instead of re-filtering the frame three times.
is_other = data['标签'] == '其他'
regular_data = data[~is_other]
other_data = data[is_other]

other_sample = int(np.mean(regular_data['标签'].value_counts()))
# Never request more rows than exist: DataFrame.sample raises when n exceeds
# the population size (sampling is without replacement).
other_sample = min(other_sample, len(other_data))
print('sample from other ',other_sample)

# Fixed seed so the same rows are drawn on every run; without it every
# downstream score changes between executions.
other_sample_data = other_data.sample(other_sample, random_state=12)
frames = [regular_data, other_sample_data]
# NOTE: new_data keeps the original row labels of `data`, so its index is not
# 0..n-1; use .iloc for positional access.
new_data = pd.concat(frames)
new_data['标签'].value_counts()

sample from other  73


文具及类似用品       312
电子计算机及其部件     300
企业管理服务        183
通信终端设备及零部件     96
物业管理服务         94
其他             73
卫生用纸制品         73
鉴证咨询服务         73
售电             65
车用油            60
工商用制冷、空调设备     52
日用杂品           45
增值电信服务         43
交通运输服务         43
通信终端设备         20
体育用品           18
热力生产及供热        17
水              15
玩具             10
人力资源服务          9
住宿服务            9
其他燃气            8
Name: 标签, dtype: int64

In [4]:
# 3. TF-IDF feature extraction settings.
# analyzer = 'word' builds features from word tokens, 'char' from single characters.

# Alternative setting, kept for quick switching:
# analyzer = 'word'
analyzer = 'char'

min_df = 1      # passed to TfidfVectorizer as min_df
max_df = 1.0    # passed to TfidfVectorizer as max_df
ngram = 1       # upper bound of ngram_range, i.e. (1, ngram)

# The format string previously had no placeholder for ngram, so its value was
# silently dropped from the printed summary.
config_summary = 'min_df {} max_df {} ngram {}'.format(min_df,max_df,ngram)
print(config_summary)

def tokenizer(s):
    """Split a text into jieba tokens after replacing punctuation with spaces.

    Every run of whitespace and of the listed ASCII/Chinese punctuation
    characters is replaced by a single space, the result is segmented with
    ``jieba.cut``, and tokens that are exactly one space are dropped.

    Args:
        s: Text to tokenize.

    Returns:
        list: The remaining tokens, in their original order.
    """
    # Raw string: the pattern is full of regex escapes (\s, \., \[ ...) that are
    # invalid *string* escapes and raise warnings in a normal literal. The
    # character class itself is unchanged.
    s = re.sub(r"[\s\.\!\/_,$%^*(+\"\'\]\|\[—！，。“”？\?;:\(\)、~@#￥%……&*（）=《》「」]+", " ", s)
    return [word for word in jieba.cut(s) if word != ' ']
# Build the TF-IDF vectorizer and fit it on the sample texts.
# scikit-learn only applies a custom tokenizer when analyzer == 'word' (and
# warns when one is supplied otherwise), so it is passed only in that case.
vectorizer = TfidfVectorizer(ngram_range=(1,ngram),
                             tokenizer=tokenizer if analyzer == 'word' else None,
                             analyzer=analyzer,
                             min_df=min_df, max_df=max_df, strip_accents='unicode',
                             # Real booleans: recent scikit-learn validates
                             # parameter types and rejects the ints 1/0 here.
                             use_idf=True, smooth_idf=True, sublinear_tf=False,
                             norm='l2', stop_words=None)

text = new_data['数据样本']

# Cast to str so non-string cells (e.g. NaN or numbers) can be vectorized.
tfidf_doc = vectorizer.fit_transform(text.values.astype('str'))

min_df 1 max_df 1.0 ngram


In [5]:
# Preview the TF-IDF weights of the first documents, highest weight first.

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Show at most two documents, but do not fail on a smaller corpus.
for doc in range(0, min(2, tfidf_doc.shape[0])):
    print('--------------')
    print(new_data['数据样本'].iloc[doc])
    print(new_data['标签'].iloc[doc])
    # Slice the document's row once; nonzero()[1] gives the column (feature)
    # indices that carry a weight for this document.
    doc_row = tfidf_doc[doc,:]
    feature_index = doc_row.nonzero()[1]
    tfidf_scores = zip(feature_index, [doc_row[0, x] for x in feature_index])
    tfidf_scores = sorted(tfidf_scores, key=operator.itemgetter(1),reverse=True)
    for i, s in tfidf_scores:
        print(feature_names[i], s)

--------------
住宿费
住宿服务
住 0.688580325806
宿 0.688580325806
费 0.227407717164
--------------
住宿服务
住宿服务
住 0.643100158106
宿 0.643100158106
服 0.298902665351
务 0.288966382012


In [6]:
# 4. Encode the string labels as integers 0, 1, 2, ... n-1.
#    Ids are handed out in order of first appearance in new_data['标签'].
#    label_map:    label -> id
#    label_id_map: id -> label (used to decode predictions)

label_map = {}
label_id_map = {}
all_labels = []
for label in new_data['标签']:
    if label not in label_map:
        # First time this label is seen: the next free id is the current size.
        next_id = len(label_map)
        label_map[label] = next_id
        label_id_map[next_id] = label
    all_labels.append(label_map[label])
label_count = len(label_map)
all_labels = np.array(all_labels)

In [7]:
# 5. Training: 5-fold cross-validation, printing the accuracy of each fold.
# The classifier can be svm.LinearSVC() or MultinomialNB().
# Set is_print to True to also print the label map and, for the first fold only,
# one "text<TAB>predicted<TAB>actual" line per test sample.
is_print = False

# One seed for both the fold split and the classifier so scores are repeatable.
seed = 12

kf = KFold(n_splits=5, random_state=seed, shuffle=True)
for train_index, test_index in kf.split(new_data):
    train_x = tfidf_doc[train_index]
    train_y = all_labels[train_index]

    # Alternative classifier:
    # clf = MultinomialNB()
    clf = svm.LinearSVC(random_state=seed)
    clf.fit(train_x, train_y)

    test_x = tfidf_doc[test_index]
    test_y = all_labels[test_index]
    # KFold yields *positional* indices, while new_data still carries the row
    # labels of the original frame. Plain [] on the Series would look the
    # positions up as labels (wrong rows or KeyError), so use .iloc.
    test_text = new_data['数据样本'].iloc[test_index]

    print('正确率', clf.score(test_x, test_y))
    if(is_print):
        print(label_id_map)
        for idx,label in enumerate(clf.predict(test_x)):
            print('\t'.join([str(test_text.iloc[idx]), label_id_map[label],
                  label_id_map[test_y[idx]]]))
        # Detailed predictions are only dumped for the first fold.
        break

正确率 0.901234567901
正确率 0.842592592593
正确率 0.885802469136
正确率 0.885448916409
正确率 0.891640866873
