In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report  # per-class precision and recall report

In [3]:
# Load the 20 newsgroups corpus. subset='all' requests every post rather than
# only the predefined train or test portion; the split is done by hand below.
news = fetch_20newsgroups(subset='all')

In [6]:
# Print the description text bundled with the dataset.
print(news.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features       

In [7]:
# Human-readable names of the 20 newsgroup categories.
# NOTE(review): presumably the integer labels in news.target index into this list — confirm.
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [12]:
# Number of raw posts (documents) in the corpus.
len(news.data)

18846

In [17]:
# Sanity check: the label array should have one entry per post, so its length
# should match len(news.data). Then display the (truncated) label array itself.
print(len(news.target))
news.target

18846


array([10,  3, 17, ...,  3,  1,  7])

In [4]:
# Hold out 25% of the posts for evaluation.
# A fixed random_state makes the shuffle deterministic, so the split -- and
# therefore every score computed from it -- is reproducible across
# Restart Kernel -> Run All. Without it each run yields a different split.
x_train, x_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=42)

In [5]:
# Feature extraction: turn the raw post texts into TF-IDF matrices.
# NOTE: this cell rebinds x_train / x_test from raw text to vectorised
# matrices, so it is not idempotent -- re-run the split cell before re-running
# this one, otherwise the vectoriser is handed matrices instead of text.
tf = TfidfVectorizer()
x_train = tf.fit_transform(x_train)
x_test = tf.transform(x_test)  # NOTE: transform only, no fit -- the test set must be encoded with what was learned from the training set

In [6]:
# Peek at the training labels (integer class ids).
y_train

array([ 4,  7,  8, ..., 11, 19, 14])

In [7]:
# Create the multinomial naive Bayes estimator.
# alpha is the additive smoothing parameter; 1.0 is spelled out explicitly here.
mlt = MultinomialNB(alpha=1.0)

In [8]:
# Train the classifier on the TF-IDF training matrix and its labels.
mlt.fit(x_train, y_train)

MultinomialNB()

In [11]:
# Predicted class ids for the held-out test documents.
y_pred = mlt.predict(x_test)

In [12]:
# Accuracy on the held-out test set.
mlt.score(x_test, y_test)

0.8363752122241087

In [None]:
## print(classification_report(y_test, y_pred, target_names=news.target_names))

In [14]:
# Per-class precision, recall, F1 and support, labelled with the newsgroup names.
print(classification_report(y_test, y_pred, target_names=news.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.86      0.71      0.78       204
           comp.graphics       0.89      0.70      0.79       257
 comp.os.ms-windows.misc       0.84      0.86      0.85       229
comp.sys.ibm.pc.hardware       0.69      0.85      0.76       241
   comp.sys.mac.hardware       0.92      0.78      0.84       263
          comp.windows.x       0.95      0.84      0.89       257
            misc.forsale       0.91      0.61      0.73       251
               rec.autos       0.85      0.93      0.89       243
         rec.motorcycles       0.97      0.92      0.94       276
      rec.sport.baseball       0.94      0.94      0.94       240
        rec.sport.hockey       0.90      0.98      0.94       221
               sci.crypt       0.71      0.98      0.82       239
         sci.electronics       0.87      0.83      0.85       240
                 sci.med       0.95      0.88      0.92       240
         