- 中英文新闻数据聚类分析

# 基本设置

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# Suppress every warning for the rest of the session. Note that this also
# hides library deprecation warnings.
warnings.filterwarnings('ignore')
# Load the autoreload extension and enable mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from sklearn import metrics

def clustering_metrics(labels_pred, labels_true = None, feature = None):
    '''
    Print evaluation scores for a clustering result.

    Parameters
    ----------
    labels_pred
        Cluster label assigned to each sample.
    labels_true
        Optional ground-truth label of each sample. When given, the
        label-based scores are printed:

        * Adjusted Rand Index (ARI): symmetric in its two arguments; 1 is the
          best value (both labelings agree), values at or below 0 indicate
          labelings that look independent.
        * Adjusted Mutual Information (AMI): symmetric in its two arguments;
          1 is the best value, a labeling unrelated to ``labels_true`` scores
          non-positive.
        * Homogeneity, completeness and their harmonic mean (V-measure),
          printed together as one tuple; each goes from 0 (worst) to 1 (best).
        * Fowlkes-Mallows index (FMI): geometric mean of pairwise precision
          and recall.
    feature
        Optional feature matrix the clustering was computed on. When given,
        the label-free scores are printed:

        * Silhouette coefficient (Euclidean distance): in [-1, 1]; higher
          means samples are close to their own cluster and far from others.
        * Calinski-Harabasz index: higher is better.

    Returns
    -------
    None. The scores are only written to standard output; nothing is printed
    when both ``labels_true`` and ``feature`` are ``None``.
    '''
    # Each line is built as a single string and printed with a parenthesised
    # single argument, which behaves the same as a Python 2 print statement
    # and as a Python 3 print() call. The two spaces after each colon
    # reproduce the output of the former `print label, value` statements.
    if labels_true is not None:
        print(u'兰德指数 ARI:  {}'.format(
            metrics.adjusted_rand_score(labels_true, labels_pred)))
        print(u'互信息 AMI:  {}'.format(
            metrics.adjusted_mutual_info_score(labels_true, labels_pred)))
        print(u'同质性、完整性、两者的调和平均V-measure:  {}'.format(
            metrics.homogeneity_completeness_v_measure(labels_true, labels_pred)))
        print(u'Fowlkes-Mallows指数 FMI:  {}'.format(
            metrics.fowlkes_mallows_score(labels_true, labels_pred)))

    if feature is not None:
        print(u'轮廓系数:  {}'.format(
            metrics.silhouette_score(feature, labels_pred, metric='euclidean')))
        # scikit-learn renamed the misspelled `calinski_harabaz_score` to
        # `calinski_harabasz_score` and later removed the old name; prefer the
        # new spelling and fall back to the old one on legacy installations.
        calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None)
        if calinski_harabasz is None:
            calinski_harabasz = metrics.calinski_harabaz_score
        print(u'Calinski-Harabaz Index:  {}'.format(
            calinski_harabasz(feature, labels_pred)))


In [4]:
# Scratch check of the `labels_true is not None` guard used inside
# clustering_metrics: with labels_true set to None the expression is False.
labels_true = None
labels_true is not None

False

# English dataset

## 20newsgroups

In [5]:
# The five newsgroup categories sampled for this experiment.
sample_cate = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
    'rec.sport.baseball',
]

# Fetch the train split, then the test split, with identical options:
# restricted to the sampled categories, shuffled with a fixed seed, and with
# headers, footers and quoted replies stripped from every post.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(
        subset=split,
        categories=sample_cate,
        shuffle=True,
        random_state=42,
        remove=('headers', 'footers', 'quotes')
    )
    for split in ('train', 'test')
)

# Number of documents in the train and test splits.
print(len(newsgroups_train.data), len(newsgroups_test.data))

(2854, 1899)


In [6]:
# Preview only the first few raw test documents; displaying the whole list
# would dump every test post into the notebook output.
newsgroups_test.data[:3]

[u"It depends. If, in the judgment of the umpire the batter made no\nattempt to avoid getting hit, the batter is awarded first for a\nbase on balls. If the umpire rules he did try to get out of the\nway, he's awarded first because of a hit batsman.\n\nRyan Robbins\nPenobscot Hall\nUniversity of Maine",
 u'I read a mesg. somewhere on GENIE about Intel coming out with a \ngraphics standard called PCI, which would supplant VESA standards.  Is\nthis a rumor, or is there some substance to it. If any of y\'all have\nheard of this "standard" please e-mail me on how I might obtain more info',
 u"\n\nFine, are you willing to bet that he will bat .400 the rest of the way?\n\nThe point is that he has hurt the Rockies so far; it's that he *will* hurt\nthem, eventually.  Just as much as he hurt the Expos and the Cardinals the\npast couple seasons.\n\n\nIt has happened for the past 3+ seasons; where have you been?\n\n\nWe'll see come September.  (I have an outstanding bet with someone that\nGalarrag

In [7]:
from collections import Counter

# Ground-truth labels of each split (the `.target` attribute of the fetched
# datasets). The variable names are kept as-is because later cells use them.
train_targe, test_targe = newsgroups_train.target, newsgroups_test.target

# Number of training documents per label.
Counter(train_targe)

Counter({0: 480, 1: 584, 2: 597, 3: 594, 4: 599})

## news data

# Feature

## TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training documents only, then reuse the
# fitted vectorizer for the test documents so both matrices share columns.
vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
train_vector = vectorizer.fit_transform(newsgroups_train.data)
test_vector = vectorizer.transform(newsgroups_test.data)

# Shapes of the train and test matrices, in that order.
print(train_vector.shape)
print(test_vector.shape)

(2854, 31057)
(1899, 31057)


# Clustering

## K-means

In [9]:
from sklearn.cluster import KMeans

# Use one cluster per sampled newsgroup category.
k = len(sample_cate)
# A parenthesised single-argument print is valid both as a Python 2 print
# statement and as a Python 3 print() call; the two spaces after the colon
# reproduce the output of the former `print 'k: ',k` statement.
print('k:  {}'.format(k))
# Cluster the TF-IDF training matrix; random_state is fixed for repeatability.
labels_pred = KMeans(n_clusters=k, random_state=9).fit_predict(train_vector)

k:  5


In [10]:
# Print clustering scores for the K-means assignment. The TF-IDF matrix is
# converted to a dense array before being passed as the feature matrix.
clustering_metrics(labels_pred, labels_true = train_targe, feature = train_vector.toarray())
# NOTE(review): cluster ids produced by K-means are arbitrary and are not
# aligned with the ground-truth label ids, so the per-class precision/recall
# below is only meaningful after mapping clusters to classes. The matrix is
# best read as a contingency table (rows: true labels, columns: cluster ids).
# Parenthesised single-argument prints run on both Python 2 and Python 3.
print(metrics.classification_report(train_targe, labels_pred))
print(metrics.confusion_matrix(train_targe, labels_pred))

兰德指数 ARI:  0.23258349597157577
互信息 AMI:  0.37796164387174314
同质性、完整性、两者的调和平均V-measure:  (0.37905346928372147, 0.4793982836042738, 0.4233612010401955)
Fowlkes-Mallows指数 FMI:  0.4454366344757007
轮廓系数:  0.009611675581713221
Calinski-Harabaz Index:  12.851271959374374
             precision    recall  f1-score   support

          0       0.20      0.16      0.18       480
          1       0.00      0.00      0.00       584
          2       0.14      0.35      0.20       597
          3       1.00      0.12      0.21       594
          4       0.03      0.02      0.03       599

avg / total       0.28      0.13      0.12      2854

[[ 79   0 399   0   2]
 [  0   1 154   0 429]
 [  1 363 210   0  23]
 [  1   1 489  71  32]
 [314   2 269   0  14]]


## DBSCAN

In [44]:
from sklearn.cluster import DBSCAN

# Density-based clustering of the TF-IDF training matrix.
db = DBSCAN(eps=0.8, min_samples=4).fit(train_vector)
labels_pred = db.labels_  # cluster label of each sample (DBSCAN marks noise as -1)

# Print clustering scores for the DBSCAN assignment. The TF-IDF matrix is
# converted to a dense array before being passed as the feature matrix.
clustering_metrics(labels_pred, labels_true = train_targe, feature = train_vector.toarray())
# NOTE(review): DBSCAN cluster ids are arbitrary and include the noise label,
# so they are not aligned with the ground-truth label ids; the per-class
# report below is only meaningful after mapping clusters to classes.
# Parenthesised single-argument prints run on both Python 2 and Python 3.
print(metrics.classification_report(train_targe, labels_pred))
print(metrics.confusion_matrix(train_targe, labels_pred))

兰德指数 ARI:  0.00031786711413928754
互信息 AMI:  0.005045583740525484
同质性、完整性、两者的调和平均V-measure:  (0.007069251104499306, 0.06757518101591008, 0.012799505856320871)
Fowlkes-Mallows指数 FMI:  0.4338580571053762
轮廓系数:  -0.25076102542419115
Calinski-Harabaz Index:  3.5656796512396776
             precision    recall  f1-score   support

         -1       0.00      0.00      0.00         0
          0       0.15      0.03      0.04       480
          1       1.00      0.01      0.01       584
          2       0.00      0.00      0.00       597
          3       0.00      0.00      0.00       594
          4       0.00      0.00      0.00       599

avg / total       0.23      0.01      0.01      2854

[[  0   0   0   0   0   0]
 [461  12   0   0   7   0]
 [564  16   4   0   0   0]
 [572  25   0   0   0   0]
 [572  16   0   6   0   0]
 [590   9   0   0   0   0]]


In [45]:
# Interactive lookup of the DBSCAN class documentation (parameters such as
# eps, min_samples and metric); the help text is printed below.
help(DBSCAN)

Help on class DBSCAN in module sklearn.cluster.dbscan_:

class DBSCAN(sklearn.base.BaseEstimator, sklearn.base.ClusterMixin)
 |  Perform DBSCAN clustering from vector array or distance matrix.
 |  
 |  DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
 |  Finds core samples of high density and expands clusters from them.
 |  Good for data which contains clusters of similar density.
 |  
 |  Read more in the :ref:`User Guide <dbscan>`.
 |  
 |  Parameters
 |  ----------
 |  eps : float, optional
 |      The maximum distance between two samples for them to be considered
 |      as in the same neighborhood.
 |  
 |  min_samples : int, optional
 |      The number of samples (or total weight) in a neighborhood for a point
 |      to be considered as a core point. This includes the point itself.
 |  
 |  metric : string, or callable
 |      The metric to use when calculating distance between instances in a
 |      feature array. If metric is a string or callable, it must 