# TwentyNewsGroup

20 Newsgroup 数据集包含了约 20000 篇来自于不同的新闻组的文档，最早由 Ken Lang 搜集整理。本部分包含了对于数据集的抓取、特征提取、简单分类器训练、主题模型训练等。

In [3]:
# 配置环境变量
import sys
sys.path.append('./')
sys.path.append('../')

# 引入外部的封装模块
from twenty_news_group import TwentyNewsGroup

In [4]:
# 实例化对象
twp = TwentyNewsGroup()

# 抓取数据
twp.fetch_data()

twenty_train = twp.data['train']

print("数据集结构", "->", twenty_train.keys())

print("文档数目", "->", len(twenty_train.data))

print("目标分类", "->",[ twenty_train.target_names[t] for t in twenty_train.target[:10]])

数据集结构 -> dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR', 'description'])
文档数目 -> 11314
目标分类 -> ['sci.space', 'comp.sys.mac.hardware', 'sci.electronics', 'comp.sys.mac.hardware', 'sci.space', 'rec.sport.hockey', 'talk.religion.misc', 'sci.med', 'talk.religion.misc', 'talk.politics.guns']


In [5]:
# 进行特征提取

# 构建文档-词矩阵（Document-Term Matrix）

from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(twenty_train.data)

print("DTM 结构","->",X_train_counts.shape)

# 查看某个词在词表中的下标
print("词对应下标","->", count_vect.vocabulary_.get(u'algorithm'))

DTM 结构 -> (11314, 130107)
词对应下标 -> 27366


In [6]:
# 构建文档的 TF 特征向量
from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

print("某文档 TF 特征向量","->",X_train_tf)

# 构建文档的 TF-IDF 特征向量
from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tf_transformer.transform(X_train_counts)

print("某文档 TF-IDF 特征向量","->",X_train_tfidf)

某文档 TF 特征向量 ->   (0, 6447)	0.0380693493813
  (0, 37842)	0.0380693493813
  (0, 128402)	0.0380693493813
  (0, 47566)	0.0380693493813
  (0, 110220)	0.0380693493813
  (0, 61997)	0.0380693493813
  (0, 109061)	0.0380693493813
  (0, 124596)	0.152277397525
  (0, 125288)	0.0380693493813
  (0, 104717)	0.0380693493813
  (0, 89919)	0.0380693493813
  (0, 123796)	0.0380693493813
  (0, 114508)	0.0380693493813
  (0, 124597)	0.0380693493813
  (0, 87265)	0.0380693493813
  (0, 74675)	0.0380693493813
  (0, 110321)	0.0380693493813
  (0, 106960)	0.0380693493813
  (0, 117211)	0.0380693493813
  (0, 104813)	0.0380693493813
  (0, 80638)	0.0380693493813
  (0, 71079)	0.0380693493813
  (0, 101151)	0.0380693493813
  (0, 74757)	0.0380693493813
  (0, 111533)	0.0380693493813
  :	:
  (11313, 58076)	0.0129045687035
  (11313, 113763)	0.0129045687035
  (11313, 125053)	0.0129045687035
  (11313, 109035)	0.0516182748138
  (11313, 128387)	0.0516182748138
  (11313, 86864)	0.0516182748138
  (11313, 40015)	0.0258091374069
  (113

In [8]:
# 训练分类器
twp.train_classifier()

In [16]:
# 执行预测
docs_new = ['God is love', 'OpenGL on the GPU is fast']
predicted = twp.predict(docs_new)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))
    
# 执行模型评测
twp.fetch_data(subset='test')

predicted = twp.predict(twp.data['test'].data)

import numpy as np

# 误差计算

# 简单误差均值
np.mean(predicted == twp.data['test'].target)   

# Metrics

from sklearn import metrics

print(metrics.classification_report(
    twp.data['test'].target, predicted,
    target_names=twp.data['test'].target_names))

# Confusion Matrix
metrics.confusion_matrix(twp.data['test'].target, predicted)

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => rec.autos
                          precision    recall  f1-score   support

             alt.atheism       0.79      0.50      0.61       319
           comp.graphics       0.75      0.71      0.73       389
 comp.os.ms-windows.misc       0.78      0.71      0.74       394
comp.sys.ibm.pc.hardware       0.65      0.80      0.72       392
   comp.sys.mac.hardware       0.88      0.77      0.82       385
          comp.windows.x       0.86      0.77      0.81       395
            misc.forsale       0.89      0.83      0.86       390
               rec.autos       0.84      0.92      0.88       396
         rec.motorcycles       0.87      0.96      0.92       398
      rec.sport.baseball       0.89      0.93      0.91       397
        rec.sport.hockey       0.87      0.97      0.92       399
               sci.crypt       0.73      0.95      0.83       396
         sci.electronics       0.81      0.65      0.72     

array([[158,   0,   1,   1,   0,   1,   0,   3,   7,   1,   2,   6,   1,
          8,   3, 114,   6,   7,   0,   0],
       [  1, 278,  11,  15,   9,  21,   1,   5,   1,   7,   2,  16,   8,
          2,   7,   3,   1,   1,   0,   0],
       [  0,  17, 280,  44,   2,   9,   0,   4,   5,   5,   2,  15,   1,
          0,   5,   5,   0,   0,   0,   0],
       [  0,   6,  18, 312,  12,   3,   8,   6,   1,   0,   1,   3,  17,
          0,   5,   0,   0,   0,   0,   0],
       [  0,   3,  13,  32, 295,   2,   8,   5,   2,   4,   1,   6,   9,
          1,   3,   0,   1,   0,   0,   0],
       [  1,  30,  23,  11,   1, 306,   2,   1,   2,   2,   0,   8,   0,
          2,   4,   1,   1,   0,   0,   0],
       [  0,   4,   4,  19,   7,   1, 322,   8,   2,   3,   6,   1,   7,
          3,   2,   1,   0,   0,   0,   0],
       [  0,   1,   1,   2,   0,   1,   6, 365,   4,   2,   2,   0,   5,
          1,   3,   0,   2,   0,   1,   0],
       [  0,   0,   0,   1,   0,   0,   2,   9, 384,   0,   0,  

In [35]:
# 进行主题提取

twp.topics_by_lda()


Topic 0 : stream s1 astronaut zoo laurentian maynard s2 gtoal pem fpu
Topic 1 : 145 cx 0d bh sl 75u 6um m6 sy gld
Topic 2 : apartment wpi mars nazis monash palestine ottoman sas winner gerard
Topic 3 : livesey contest satellite tamu mathew orbital wpd marriage solntze pope
Topic 4 : x11 contest lib font string contrib visual xterm ahl brake
Topic 5 : ax g9v b8f a86 1d9 pl 0t wm 34u giz
Topic 6 : printf null char manes behanna senate handgun civilians homicides magpie
Topic 7 : buf jpeg chi tor bos det que uwo pit blah
Topic 8 : oracle di t4 risc nist instruction msg postscript dma convex
Topic 9 : candida cray yeast viking dog venus bloom symptoms observatory roby
Topic 10 : cx ck hz lk mv cramer adl optilink k8 uw
Topic 11 : ripem rsa sandvik w0 bosnia psuvm hudson utk defensive veal
Topic 12 : db espn sabbath br widgets liar davidian urartu sdpa cooling
Topic 13 : ripem dyer ucsu carleton adaptec tires chem alchemy lockheed rsa
Topic 14 : ingr sv alomar jupiter borland het intergraph

[(0,
  [('stream', 0.0077211017660912575),
   ('s1', 0.0051827365319261492),
   ('astronaut', 0.0050496371416436066),
   ('zoo', 0.0047744390567267493),
   ('laurentian', 0.0043804887823957984),
   ('maynard', 0.0043653680341875188),
   ('s2', 0.0041050479793097476),
   ('gtoal', 0.0041026151529143327),
   ('pem', 0.0039248670974374065),
   ('fpu', 0.0038304011470178261)]),
 (1,
  [('145', 0.02684621934751372),
   ('cx', 0.015146954155967162),
   ('0d', 0.01120484561796793),
   ('bh', 0.0072231553690531814),
   ('sl', 0.0059666667248342306),
   ('75u', 0.0054369260017737078),
   ('6um', 0.0051805441512121509),
   ('m6', 0.0047949629236150784),
   ('sy', 0.0047903110764948124),
   ('gld', 0.0047379217243407297)]),
 (2,
  [('apartment', 0.0072609642043499501),
   ('wpi', 0.0053815745581271305),
   ('mars', 0.0043967649874930084),
   ('nazis', 0.0037905780291177587),
   ('monash', 0.0037466535736985556),
   ('palestine', 0.0035695717910101754),
   ('ottoman', 0.0031380911092148108),
   ('