# 文本分类

## 概念
- 分词（词典与隐马尔可夫模型）
- 语言模型
- tf-idf

## 类别
- 二分类
- 多分类
- 多标签（最难）

## 应用
- 垃圾邮件识别（是否是垃圾邮件）
- 情感分析（好评、中评、差评）
- 电影分类（喜剧、战争、动作、悬疑）

## 方法
- 1 传统机器学习（sklearn）
    - LogisticRegression/DT/SVM
- 2 深度学习(keras, tensorflow)
    - 词向量/CNN(textCNN)/LSTM(textRNN)/fastText


In [1]:
import jieba
from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [2]:
sentence = "我爱北京天安门"
jieba.lcut(sentence)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.963 seconds.
Prefix dict has been built succesfully.


['我', '爱', '北京', '天安门']

##  计算词频矩阵

In [3]:
corpus = ['I come to China to travel','I like to travel in China','I like tea ']

In [4]:
stop_words = set()
countvectorizer = CountVectorizer(stop_words=stop_words)
countvectorizer.fit(corpus)
count = countvectorizer.transform(corpus)
print("词频矩阵：", count.toarray())
print("词典：", countvectorizer.get_feature_names())

词频矩阵： [[1 1 0 0 0 2 1]
 [1 0 1 1 0 1 1]
 [0 0 0 1 1 0 0]]
词典： ['china', 'come', 'in', 'like', 'tea', 'to', 'travel']


In [5]:
print("停用词表：", countvectorizer.get_stop_words())

停用词表： frozenset()


## 计算tf-idf矩阵

In [6]:
transformer = TfidfTransformer()
tfidf_matrix = transformer.fit_transform(count)
print(tfidf_matrix.toarray())

[[0.35970039 0.47296278 0.         0.         0.         0.71940077
  0.35970039]
 [0.41779577 0.         0.54935123 0.41779577 0.         0.41779577
  0.41779577]
 [0.         0.         0.         0.60534851 0.79596054 0.
  0.        ]]


In [7]:
import math

In [8]:
(math.log10(4/3)+1) * 0.2   # 怎么也算不出结果啊

0.22498774732166

## 文本分类例子

In [9]:
# 获取词集合
def get_word_set(filename, min_word_count):
    label_set = set()
    word_set = set()
    word_count_dic = dict()
    with open(filename, "r", encoding='utf-8') as f:
        for line in f:
            label, sentence = line.strip().split("\t")
            label_set.add(label)
            words = jieba.lcut(sentence)
            for word in words:
                if word not in word_count_dic:
                    word_count_dic[word] = 1
                else:
                    word_count_dic[word] += 1
                    
    word_count_dic_filter = dict()
    for w,n in word_count_dic.items():
        if n >= min_word_count:
            word_count_dic_filter[w] = n
            word_set.add(w)
            
    return word_set, word_count_dic_filter, label_set

In [10]:
data_file = "./data/news_train1.txt"
word_set, word_count_dic, label_set = get_word_set(data_file, 30)

In [12]:
word_set

{'保证',
 '简单',
 '打球',
 '大胜',
 '一支',
 '古装',
 '虽然',
 '关注',
 '追求',
 '28',
 '最新',
 '鼓励',
 '来看',
 '动作',
 '至今',
 '年度',
 '陈木胜',
 '兰多夫',
 '新人',
 '易建联',
 '快船',
 '演员',
 '出品',
 '特效',
 '2005',
 '靠',
 '主动',
 '明确',
 '等到',
 '豪斯',
 '走出',
 '万港元',
 '理解',
 '辛苦',
 '岁',
 '月份',
 '人',
 '两',
 '江湖',
 '库里',
 '女儿',
 '父母',
 '喜剧',
 '及',
 '起',
 '一天',
 '按照',
 '无奈',
 '年前',
 '韩国',
 '场景',
 '难以',
 '吸引',
 '人选',
 '寻找',
 '落后',
 '12',
 '黄渤',
 '样子',
 '哈里斯',
 '发布',
 '成为',
 '26',
 '成熟',
 '谢霆锋',
 '90',
 '有些',
 '钱',
 '投进',
 '如',
 '舞台',
 '说法',
 '接下来',
 '灰熊',
 '看法',
 '掘金',
 '消息',
 '两天',
 '太阳',
 '夺冠',
 '台湾',
 '热门',
 '第四节',
 '者',
 '换下',
 '一般',
 '各种',
 '不是',
 '尝试',
 '即将',
 '观影',
 '唐季',
 '加入',
 '期待',
 '三次',
 '网络',
 '昔日',
 '个',
 '镜头',
 '刘易斯',
 '日',
 '身',
 '这一',
 '巴丁',
 '均',
 '故事',
 '加索尔',
 '专家',
 '身为',
 '开拍',
 '训练',
 '交流',
 '通过',
 '阿联',
 '江苏',
 '讯',
 '金城武',
 '大',
 '一个月',
 '家',
 '变化',
 '缩小',
 '吴镇宇',
 '邀请',
 '创造',
 '一人',
 '表示',
 '辛里奇',
 '火箭队',
 '后者',
 '以及',
 '重庆',
 '成龙',
 '廖凡',
 '更加',
 '看来',
 '但',
 '浙江',
 '不久',
 '场面',
 '西',
 '学校',
 '伊尔',


In [13]:
label_set

{'体育', '娱乐'}

In [14]:
def gene_train_data(filename, word_set):
    label_list = []
    sentence_list = []
    with open(filename, "r", encoding='utf-8') as f:
        for line in f:
            label, sentence = line.strip().split("\t")
            label_list.append(label.strip())
            words = jieba.lcut(sentence)
            line_list = []
            for word in words:
                if word in word_set:
                    line_list.append(word)
            sentence_list.append(' '.join(line_list))
    print(label_list[499], label_list[500])
            
    countvectorizer = CountVectorizer()
    countvectorizer.fit(sentence_list)
    tf_matrix = countvectorizer.transform(sentence_list)   
    
    tfidf_transformer = TfidfTransformer()
    tfidf_transformer.fit(tf_matrix)
    tfidf_matrix = tfidf_transformer.transform(tf_matrix)
    tfidf_list = tfidf_matrix.toarray()
    
    y = [0 if i == "体育" else 1 for i in label_list]
    X_train, X_test, y_train, y_test = train_test_split(tfidf_list, y, test_size=0.3, random_state=123)
    return X_train, X_test, y_train, y_test

In [15]:
X_train, X_test, y_train, y_test = gene_train_data(data_file, word_set)

体育 娱乐


In [16]:
type(X_train)

numpy.ndarray

In [17]:
print(len(X_train), len(y_train))

700 700


In [18]:
len(X_test)

300

In [19]:
y_test[:20]

[0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]

In [20]:
model = LogisticRegression()
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [21]:
y_pred = model.predict(X_test)

In [22]:
accu = metrics.accuracy_score(y_test, y_pred)
print("Accu:", accu)

Accu: 1.0


In [23]:
for i in range(len(y_test)):
    print(y_test[i], " ", y_pred[i])

0   0
0   0
0   0
1   1
0   0
1   1
0   0
0   0
1   1
0   0
0   0
0   0
0   0
0   0
0   0
0   0
1   1
0   0
0   0
0   0
0   0
0   0
0   0
1   1
1   1
1   1
1   1
0   0
1   1
0   0
1   1
0   0
1   1
0   0
1   1
1   1
0   0
0   0
0   0
1   1
1   1
0   0
1   1
0   0
1   1
1   1
0   0
1   1
0   0
0   0
0   0
1   1
1   1
1   1
0   0
0   0
1   1
0   0
1   1
0   0
1   1
1   1
0   0
1   1
0   0
1   1
1   1
1   1
1   1
0   0
0   0
1   1
1   1
0   0
1   1
1   1
1   1
1   1
0   0
1   1
0   0
0   0
1   1
0   0
1   1
0   0
1   1
1   1
0   0
0   0
1   1
1   1
1   1
0   0
0   0
1   1
1   1
1   1
1   1
1   1
1   1
1   1
1   1
1   1
1   1
1   1
0   0
1   1
1   1
0   0
1   1
1   1
0   0
0   0
0   0
0   0
1   1
1   1
0   0
1   1
1   1
1   1
0   0
1   1
1   1
0   0
1   1
0   0
0   0
0   0
1   1
0   0
1   1
0   0
0   0
1   1
1   1
0   0
1   1
1   1
0   0
1   1
0   0
0   0
0   0
1   1
1   1
0   0
1   1
0   0
1   1
0   0
1   1
1   1
0   0
1   1
0   0
0   0
0   0
1   1
0   0
1   1
0   0
0   0
0   0
0   0
1   

In [24]:
model1 = tree.DecisionTreeClassifier()
model1.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [25]:
y_pred1 = model1.predict(X_test)

In [26]:
print("Accu1:", metrics.accuracy_score(y_test, y_pred1))

Accu1: 0.9933333333333333


## 小结
- 1 获取词汇总数的集合，过滤掉低频词汇
- 2 计算tf-idf(文本表示)
- 3 生成X_train,X_test,y_train,y_test
- 4 训练和评测