上接 [中文文本分类1](http://www.cnblogs.com/q735613050/p/9135789.html)

# 使用朴素贝叶斯进行分类
![](http://static.zybuluo.com/xinet/6043hb7nsr5n6c9jt2ltguzt/%E6%8D%95%E8%8E%B7.PNG)

In [1]:
import numpy as np

import os
import sys
sys.path.append('E:/xinlib')
from base.filename import FileName, nameBunch
import base.file as file

In [2]:
# Root directory of the raw (unsegmented) training corpus.
root = 'D:/MLBook/chapter02/train_corpus_small/'

f = FileName(root)
set_dict = f.target_bunch.size_dict    # number of samples in each category
set_dict

{'art': 248,
 'computer': 200,
 'economic': 325,
 'education': 220,
 'environment': 201,
 'medical': 204,
 'military': 249,
 'politics': 505,
 'sports': 450,
 'traffic': 214}

## 数据均衡处理

由上可知，数据集出现不均衡的现象，因此需要做数据均衡处理，我的做法是：
- 先统计每个类别的样本数，取其中的最小值记为 `size`
- 然后，在每个类别下随机选取 `size` 个样本，组成类别均衡的数据集
- 最后，将该均衡数据集按比例随机划分为训练集和验证集

In [3]:
class CSegment(FileName):
    '''
    Chinese word segmentation over a categorised corpus.

    Attributes
    ==========
    root::directory of the unsegmented, categorised corpus, e.g. "./train_corpus_small/"
    stopword_path::path of the stop-word list, e.g. "E:/Data/中文语料库/hlt_stop_words.txt"
    seg_root::directory that segmented corpora are written under
    train_dict, val_dict::the two values returned by ``self.split_dict()``
    '''
    def __init__(self, root, **kwargs):
        super().__init__(root, **kwargs)
        self.stopword_path = "E:/Data/中文语料库/hlt_stop_words.txt"
        self.seg_root = 'D:/seg_corpus/'
        # NOTE(review): split_dict() is not defined in this class; presumably
        # inherited from FileName -- confirm.
        self.train_dict, self.val_dict = self.split_dict()

    def make_seg_dir(self, seg_name):
        '''
        Call ``file.make_dir(self.seg_root + seg_name, name)`` for every
        category name in ``self.target_bunch.names``.

        seg_name::name of the segmented corpus directory, e.g. "train_corpus_seg/"

        Return
        =======
        seg_dir::list of the ``file.make_dir`` results, one per category
        (presumably the created directory paths -- confirm against base.file)
        '''
        return [file.make_dir(self.seg_root + seg_name, dir_name)
                for dir_name in self.target_bunch.names]

    def save_seg(self, target_dir):
        '''
        Segment every file found in ``target_dir`` and save the space-joined
        tokens under ``self.seg_dir``.

        target_dir::category directory; it is concatenated directly with each
        file name, so it must end with a path separator

        NOTE(review): ``self.seg_dir`` is never assigned in this class (only
        ``self.seg_root`` is), and ``read_file``, ``save_file`` and ``jieba``
        are not imported in the visible part of the file -- this method cannot
        run until those are provided; confirm where they are meant to come from.
        '''
        for file_path in os.listdir(target_dir):               # every file of the category directory
            fullname = target_dir + file_path        # full path of the file
            content = read_file(fullname).strip()     # file content without surrounding whitespace
            content = content.replace("\r\n", "")     # drop "\r\n" line breaks (nothing else is removed)
            content_seg = jieba.cut(content.strip())       # segment the content into words
            # Save the processed text into the segmented-corpus directory.
            save_file(self.seg_dir +  file_path, " ".join(content_seg))

    def file_list(self):
        '''
        List the files of every category directory under ``self.root``.

        Return
        =======
        file_names::one list per entry of ``self.target_names`` (same order);
        each inner list holds ``self.root + target_name + "/" + name`` for
        every name returned by ``os.listdir`` for that category directory
        '''
        # NOTE(review): make_seg_dir reads self.target_bunch.names while this
        # method reads self.target_names; presumably FileName provides both -- confirm.
        file_names = []
        for target_name in self.target_names:
            class_dir = self.root + target_name + "/"    # category sub-directory
            file_names.append([class_dir + name for name in os.listdir(class_dir)])
        return file_names

In [4]:
cs = CSegment(root)

In [5]:
# Per-category output directories for the segmented training and validation
# corpora, built under cs.seg_root by make_seg_dir.
train_dir = cs.make_seg_dir('train/')
val_dir = cs.make_seg_dir('val/')

In [37]:
# NOTE(review): unfinished scratch cell -- the loop has no body, so it does not parse.
for name in cs.target_bunch.names:
    

In [None]:
# NOTE(review): bare attribute reference with no effect; `urllib` is not
# imported in the visible cells.
urllib.request.urlopen

In [None]:
# NOTE(review): unfinished scratch cell -- the trailing ':' and the loop body
# are missing, and `target_dir` is not defined in the visible cells.
for file_path in os.listdir(target_dir)

In [20]:
# NOTE(review): balancedBunch() is not defined in the CSegment class body in
# this file; presumably inherited from FileName -- confirm.
dB = cs.balancedBunch()

In [8]:
# Entry recorded for the 'art' category (the output below shows its directory path).
dB.name_dict['art']

'D:/MLBook/chapter02/train_corpus_small/art'

In [None]:
class Segment:
    '''
    Segment a categorised corpus, bundle it into a Bunch and build its TF-IDF space.

    Attributes
    ==========
    root::base directory that the corpus and ``word_bag/`` paths are appended to
    stopword_path::path of the stop-word list
    '''
    def __init__(self, root):
        self.root = root
        self.stopword_path = "E:/Data/中文语料库/hlt_stop_words.txt"

    def get_seg(self, corpus_path):
        '''
        Segment every category directory found under ``self.root + corpus_path``.

        Parameters
        ==========
        corpus_path::unsegmented corpus directory, relative to ``self.root``,
        e.g. "train_corpus_small/"

        Return
        =======
        train_corpus_seg::``self.root + "train_corpus_seg/"``, the directory
        passed to ``make_seg_dir`` for every category
        '''
        corpus_path = self.root + corpus_path
        train_corpus_seg = self.root + "train_corpus_seg/"
        catelist = os.listdir(corpus_path)
        start = time()
        for i, class_dir in enumerate(catelist):
            class_path = corpus_path + class_dir + "/"    # category sub-directory
            file_list = os.listdir(class_path)            # every file of that category
            # NOTE(review): make_seg_dir / save_seg are called as module-level
            # helpers that this class does not define -- presumably provided by
            # the companion `segment` module; confirm.
            train_dir = make_seg_dir(train_corpus_seg, class_dir)
            save_seg(class_path, file_list, train_dir)
            if i == 0:
                print('  完成语料分词的类别依次为：')
            print('\t%i: %s' % (i, class_dir))
        print('--' * 20)
        print("总计花费时间 %g 秒，中文语料分词结束！！！" % (time() - start))
        return train_corpus_seg

    def create_bunch(self, wordbag_path, seg_path):
        '''
        Collect every segmented file into a Bunch and pickle it.

        Parameters
        ==========
        wordbag_path::file name of the pickled Bunch; it is appended to
        ``self.root + 'word_bag/'``, e.g. "train_set.dat"
        seg_path::segmented corpus directory, used as given (not appended to
        ``self.root``), e.g. "train_corpus_seg/"

        Return
        =======
        wordbag_path::full path of the pickle file that was written
        '''
        wordbag_dir = self.root + 'word_bag/'
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(wordbag_dir, exist_ok=True)
        wordbag_path = wordbag_dir + wordbag_path
        catelist = os.listdir(seg_path)
        bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
        bunch.target_name.extend(catelist)        # category names
        start = time()
        for i, mydir in enumerate(catelist):
            class_path = seg_path + mydir + "/"    # category sub-directory
            file_list = os.listdir(class_path)     # every file of that category
            for k, file_path in enumerate(file_list):
                fullname = class_path + file_path        # full path of the file
                bunch.label.append(mydir)                # category label of the file
                bunch.filenames.append(fullname)         # path of the file
                bunch.contents.append(read_file(fullname).strip())    # segmented text of the file
                # Progress banner, printed once after the very first file.
                if k == 0 and i == 0:
                    print('构建文本对象中......')
                    print('--' * 20)
            if i == 0:
                print('  文本对象构建的类别依次为：')
            print('\t%i: %s' % (i, mydir))
        print('--' * 20)
        # Persist the Bunch.
        with open(wordbag_path, "wb") as file_obj:
            pickle.dump(bunch, file_obj)
        print("总计花费时间 %g 秒，构建文本对象结束！！！" % (time() - start))
        print("")
        return wordbag_path

    def read_stopword(self):
        '''
        Read the stop-word list at ``self.stopword_path``.

        Return
        =======
        list of the lines of that file, as split by ``str.splitlines``

        Example
        =======
        stopword_path = "E:/Data/中文语料库/hlt_stop_words.txt"
        '''
        return read_file(self.stopword_path).splitlines()

    def train_tfidf(self, space_name, seg_path):
        '''
        Fit a TF-IDF vectorizer on a pickled Bunch and persist the resulting space.

        Parameters
        ==========
        space_name::file name of the TF-IDF space; it is appended to
        ``self.root + 'word_bag/'``
        seg_path::path of the pickled Bunch, passed to ``read_bunch``

        Return
        =======
        space_path::full path handed to ``write_bunch``
        '''
        start = time()
        space_path = self.root + 'word_bag/' + space_name

        # 1. Read the stop-word list.
        stpwrdlst = self.read_stopword()

        # 2. Load the Bunch holding the segmented documents.
        bunch = read_bunch(seg_path)
        # 3. Build the Bunch that will hold the TF-IDF space.
        tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                           filenames=bunch.filenames, tdm=[], vocabulary={})

        # 4. TfidfVectorizer produces the TF-IDF weights itself, so no
        #    separate TfidfTransformer instance is needed.
        vectorizer = TfidfVectorizer(
            stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)

        # Fit on the documents; keep the matrix and the vocabulary separately.
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_

        # Persist the TF-IDF space.
        write_bunch(space_path, tfidfspace)
        print("花费时间：%g 秒，TF-IDF 词向量空间创建成功！！！" % (time() - start))
        return space_path


In [None]:

# NOTE(review): this import rebinds `Segment`, replacing the class defined in
# the preceding cell with the one from the `segment` module.
from segment import Word2Vector, Segment, read_bunch # see the previous article, "中文文本分类1"


root = 'D:/MLBook/chapter02/'
# Word segmentation
S = Segment(root)
corpus_path = "train_corpus_small/"
corpus_seg_dir = S.get_seg(corpus_path)
corpus_seg_dir 

In [None]:


# Build the Bunch object, then load it back
train_name = "corpus_seg.dat" 
train_path = S.create_bunch(train_name, corpus_seg_dir)
train_bunch = read_bunch(train_path)

# NOTE(review): the next statement rebinds train_path from the path returned by
# create_bunch to a list of file names; a later cell passes it to
# WV.train_tfidf, and the Segment.train_tfidf shown in this file calls
# read_bunch() on that argument -- confirm this is intended.
# NOTE(review): train_test_split is only imported in the last cell of this file.
train_path, val_path, train_target, val_target = \
train_test_split(train_bunch.filenames, train_bunch.label, test_size = 0.2)

In [None]:
train_name

In [None]:
# NOTE(review): if Word2Vector.train_tfidf fits a fresh TfidfVectorizer on each
# call, as Segment.train_tfidf in this file does, the training and validation
# matrices get different vocabularies and column counts, so a model fit on one
# cannot score the other -- confirm.
WV = Word2Vector(root)
train_space_name = "train_tfdifspace.dat"          # file name the TF-IDF space is saved under
val_space_name = "val_tfdifspace.dat"          
train_space_path = WV.train_tfidf(train_space_name, train_path)
val_space_path = WV.train_tfidf(val_space_name, val_path)

In [None]:
# Load the training and validation spaces from the paths returned by train_tfidf.
train_set = read_bunch(train_space_path)
val_set = read_bunch(val_space_path)

In [None]:
# 导入多项式贝叶斯算法包
from sklearn.naive_bayes import MultinomialNB

In [None]:
# alpha is the additive (Laplace/Lidstone) smoothing parameter of MultinomialNB;
# it does not control an iteration count. A smaller alpha means weaker smoothing.

clf = MultinomialNB(alpha = 0.001).fit(train_set.tdm, train_set.label)

In [None]:
# Predict the category of every validation document.
predicted = clf.predict(val_set.tdm)
total = len(predicted)
rate = 0    # number of misclassified documents
for flabel, file_name, expct_cate in zip(val_set.label, val_set.filenames, predicted):
    if flabel != expct_cate:
        rate += 1
        print(file_name, '：实际类别：', flabel, '--> 预测类别：', expct_cate)

# Error rate as a percentage of the validation set; nothing is printed for an
# empty validation set, where the ratio is undefined.
if total:
    print('error rate：', float(rate) * 100 / float(total), '%')

In [None]:
from sklearn.model_selection import train_test_split

# Split the data set
# NOTE(review): `trainset` is not defined in the visible cells (earlier cells
# use `train_set`) -- this looks like a leftover cell; confirm before running.
X, Y = np.array(list(trainset.values())).T
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = .2)