# 贝叶斯文本分类

In [1]:
import os

data_dir = './Sample'
# Class labels: each sub-directory of data_dir is one category.
# os.listdir() returns entries in arbitrary order and also lists plain files,
# so keep directories only (a stray file would break the per-category
# os.listdir() call later on) and sort them for a reproducible label order.
folder_list = sorted(
    entry for entry in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, entry))
)
# Bare expression so the notebook cell displays the labels.
folder_list

['C000008',
 'C000010',
 'C000013',
 'C000014',
 'C000016',
 'C000020',
 'C000022',
 'C000023',
 'C000024']

In [2]:
import jieba

# Load the stop-word list: one entry per line, surrounding whitespace removed.
with open('stopwords_cn.txt', 'r', encoding='utf-8') as stop_file:
    stop_words = {entry.strip() for entry in stop_file}

In [3]:
import random
import re
# content_matrix[i] is the cleaned token list of document i and
# label_matrix[i] is the name of the category folder it was read from.
content_matrix = []
label_matrix = []
# Noise filter: drop tokens containing a digit, '.', '%' or the literal text
# "nbsp". "nbsp" must be an alternation branch; inside a character class it
# would match the single letters n, b, s, p (and parentheses) and throw away
# every token that contains one of them.
pattern = re.compile(r'[\d.%]|nbsp')
for folder in folder_list:
    folder_path = os.path.join(data_dir, folder)
    files = os.listdir(folder_path)
    # Take at most 8 files per category; the remaining files are left as
    # real texts to be classified. These 8 files are later split again into
    # a training set and a test set.
    for file in files[:8]:
        file_path = os.path.join(folder_path, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        word_cut = jieba.lcut(content)
        clean_word_cut = []
        for word in word_cut:
            stripped = word.strip()
            # Keep tokens longer than one character that are neither stop
            # words, pure digit strings, nor numeric/markup noise.
            if (len(stripped) > 1
                    and word not in stop_words
                    and not word.isdigit()
                    and not pattern.search(stripped)):
                clean_word_cut.append(word)

        content_matrix.append(clean_word_cut)
        label_matrix.append(folder)

# Pair every document with its label and shuffle the pairs, so that the
# split below does not follow the folder order.
content_label_list = list(zip(content_matrix, label_matrix))
random.shuffle(content_label_list)

# The first 80% (plus one) of the shuffled pairs form the training set,
# the rest form the test set.
index = int(len(content_label_list) * 0.8) + 1
train_list, test_list = content_label_list[:index], content_label_list[index:]

# Unzip the (tokens, label) pairs back into parallel tuples.
train_word_list, train_label_list = zip(*train_list)
test_word_list, test_label_list = zip(*test_list)

# Build the feature-word dictionary. Note that it is built from the
# training set only! It merges the tokens of all training documents into
# the bag of words used for the rest of the processing.
words_dict = {}
for words in train_word_list:
    for word in words:
        words_dict[word] = words_dict.get(word, 0) + 1

# Order the vocabulary by descending frequency; feature_words fixes the
# column order of every feature vector built afterwards.
sorted_words_dict = sorted(
    words_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
feature_words = [term for term, _count in sorted_words_dict]

# Represent each training document in terms of the reference feature words
# (i.e. the text features of the training set). Put simply, this is the
# vectorization step: every document becomes a row of 0/1 flags, one per
# feature word, with 1 meaning the word occurs in the document.
train_feature_list = [
    [1 if word in present else 0 for word in feature_words]
    for present in map(set, train_word_list)
]


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\SYMBOL~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.779 seconds.
Prefix dict has been built succesfully.


In [4]:
import sklearn
from sklearn.naive_bayes import MultinomialNB

# Text features of the test set: one row of 0/1 flags per test document,
# using the same feature-word order as the training features.
test_feature_list = [
    [1 if word in present else 0 for word in feature_words]
    for present in map(set, test_word_list)
]
    
# Train a multinomial naive Bayes model on the training features.
classifier = MultinomialNB()
classifier.fit(train_feature_list, train_label_list)

# Accuracy of the classifier on the test set
test_accuracy = classifier.score(test_feature_list, test_label_list)
print(test_accuracy)

# Predicted category vs. actual category
predicted_labels = classifier.predict(test_feature_list)
print(list(zip(predicted_labels, test_label_list)))

0.571428571429
[('C000008', 'C000010'), ('C000010', 'C000013'), ('C000016', 'C000016'), ('C000008', 'C000022'), ('C000016', 'C000016'), ('C000020', 'C000022'), ('C000023', 'C000023'), ('C000020', 'C000020'), ('C000020', 'C000020'), ('C000023', 'C000023'), ('C000010', 'C000010'), ('C000008', 'C000023'), ('C000008', 'C000013'), ('C000024', 'C000024')]


In [8]:
# Mind the format of new_word_list: it is a two-dimensional list (one inner
# token list per document); likewise new_feature_list is two-dimensional.
new_word_list = [
    ['互联网','IT']
]
new_feature_list = [
    [1 if word in present else 0 for word in feature_words]
    for present in map(set, new_word_list)
]
new_prediction = classifier.predict(new_feature_list)
print(new_prediction)

['C000022']
