<center><font size=4 style="color:#BA4A00"><strong>中文体育新闻二分类 （伯努利朴素贝叶斯）</strong></font></center>

### 导入中文新闻语料库

In [1]:
from nltk.corpus.reader import CategorizedTaggedCorpusReader

In [2]:
# Corpus reader over ./cn_news_tagged/; the directory part of each file id is
# captured by cat_pattern and used as the document's category.
creader = CategorizedTaggedCorpusReader(
    './cn_news_tagged/',
    '.*',
    cat_pattern=r'(.+)/.+txt',
)

In [3]:
import re
import string

In [4]:
# Matches a single character in the range U+4E00..U+9FA5 (CJK ideographs)
zh_char = re.compile(pattern=r'[\u4e00-\u9fa5]')

In [5]:
# Load the general stop-word list (whitespace-separated tokens).
# The encoding is passed explicitly: without it open() falls back to the
# platform locale, which can mis-decode Chinese text on some systems.
# NOTE(review): assumes stopwords.txt is UTF-8 encoded — confirm.
with open('./stopwords.txt', encoding='utf-8') as f:
    # split() with no arguments already ignores leading/trailing whitespace
    stopwords = f.read().split()

# Extra stop words specific to news text ("reporter", "report")
domain_stopwords = ['记者', '报道']

In [6]:
# Build the lookup sets once. The previous lambda concatenated the two
# stop-word lists and rebuilt the punctuation set for every token it checked,
# and tested membership with a linear list scan.
_stopword_set = frozenset(stopwords) | frozenset(domain_stopwords)
_ascii_punctuation = frozenset(string.punctuation)


def cn_filter(item):
    """Return True if the token `item` should be kept as a feature.

    A token is kept when all of the following hold:
    - it contains at least two characters matched by `zh_char`;
    - it is neither a general nor a domain stop word;
    - it contains no ASCII punctuation character.
    """
    return (
        len(zh_char.findall(item)) > 1
        and item not in _stopword_set
        and _ascii_punctuation.isdisjoint(item)
    )

### 构造标注数据集

In [7]:
# Fix the seed of Python's global random generator so that the shuffles
# below pick the same documents on every run.
import random

random.seed(a=100)

In [8]:
# Randomly sample 1200 Sports articles as the positive examples:
# shuffle the Sports file ids, take the first 1200, and keep only the
# tokens of each article that pass cn_filter.
sport_files = creader.fileids(categories=['Sports'])
random.shuffle(sport_files)
sport_docs = [
    list(filter(cn_filter, creader.words(fileids=[fid])))
    for fid in sport_files[:1200]
]

In [9]:
# Randomly sample 1200 non-Sports articles (Travel, Finance, Health) as the
# negative examples: shuffle their file ids, take the first 1200, and keep
# only the tokens of each article that pass cn_filter.
other_files = creader.fileids(categories=['Travel', 'Finance', 'Health'])
random.shuffle(other_files)
other_docs = [
    list(filter(cn_filter, creader.words(fileids=[fid])))
    for fid in other_files[:1200]
]

In [10]:
# Sanity check: both classes must contain exactly 1200 sampled documents
assert len(sport_docs) == len(other_docs) == 1200

### 文本向量化

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Full document list: all Sports documents first, then all Other documents
X = [*sport_docs, *other_docs]

In [13]:
# 'S' is the label for Sports documents, 'O' the label for Other documents.
# The label counts are taken from the document lists themselves, so y always
# lines up with X (Sports documents first, then Other documents) instead of
# depending on a separately hard-coded sample size.
y = ['S'] * len(sport_docs) + ['O'] * len(other_docs)

In [14]:
# Hold out 20% of the documents for testing; the fixed random_state keeps
# the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

In [15]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# Vectorize the training set into a binary term-presence matrix.
# Each document is already a list of tokens, so the tokenizer hands it back
# unchanged; terms appearing in fewer than 5 training documents are dropped.
count_vectorizer = CountVectorizer(
    tokenizer=lambda text: text,
    lowercase=False,
    min_df=5,
)
X_train_count_matrix = count_vectorizer.fit_transform(X_train)
# Collapse counts to 0/1: only presence or absence of a term is kept
X_train_binary_matrix = (X_train_count_matrix.toarray() > 0).astype(int)

In [17]:
# Inspect the shape of the training matrix (documents x vocabulary terms)
X_train_binary_matrix.shape

(1920, 9359)

In [18]:
# Preview the binary training matrix (NumPy abbreviates large arrays)
print(X_train_binary_matrix)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [19]:
# Vectorize the test set into a binary term-presence matrix

In [20]:
# Reuse the vocabulary fitted on the training set (transform only, no refit)
X_test_count_matrix = count_vectorizer.transform(X_test)
# Collapse counts to 0/1: only presence or absence of a term is kept
X_test_binary_matrix = (X_test_count_matrix.toarray() > 0).astype(int)

In [21]:
# Inspect the shape of the test matrix (documents x vocabulary terms)
X_test_binary_matrix.shape

(480, 9359)

In [22]:
# Preview the binary test matrix (NumPy abbreviates large arrays)
print(X_test_binary_matrix)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


### 训练和测试文本二分类器

In [23]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [24]:
# Fit a Bernoulli naive Bayes classifier on the binary training features
clf = BernoulliNB().fit(X_train_binary_matrix, y_train)

# Predict labels for the held-out test set
y_pred = clf.predict(X_test_binary_matrix)

# Score the predictions: overall accuracy and the confusion matrix
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion)

Accuracy: 0.98125
Confusion Matrix:
 [[216   8]
 [  1 255]]


In [25]:
# Show the class labels in the order the fitted classifier stores them
clf.classes_

array(['O', 'S'], dtype='<U1')

In [26]:
# How to read the confusion matrix:
# rows are the true labels
# columns are the predicted labels

In [27]:
# 255: of the 256 sports articles in the test set, 255 were correctly classified as Sports
# 8:   of the 224 other articles in the test set, 8 were wrongly classified as Sports