- 中英文新闻数据聚类分析

# 基本设置

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [15]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

In [23]:
from sklearn import metrics

def clustering_metrics(labels_pred, labels_true = None, feature = None):
    '''
    聚类算法结果评估
    需要真实标签：
        兰德指数 ARI: 输入参数没有顺序要求，ARI值的范围是[-1,1]，
            负的结果都是较差的，说明标签是独立分布的，相似分布的ARI结果是正的，
            1是最佳结果，说明两种标签的分布完全一致
        互信息 AMI：输入参数没有顺序要求，最好的值为1，最差的值（与labels_true不相关），其结果为非正值
        同质性、完整性、两者的调和平均V-measure：从0到1反应出最差到最优的表现
        Fowlkes-Mallows指数：针对训练集和验证集数据之间求得的查全率和查准率的几何平均值
        
    不需要真实标签：        
        轮廓系数：取值范围是[-1,1]，同类别样本距离越相近不同类别样本距离越远，分数越高。
        Calinski-Harabaz Index：分数值越大则聚类效果越好        
    '''
    
    if labels_true is not None:
        print u'兰德指数 ARI: ', metrics.adjusted_rand_score(labels_true, labels_pred)
        print u'互信息 AMI: ', metrics.adjusted_mutual_info_score(labels_true, labels_pred)
        print u'同质性、完整性、两者的调和平均V-measure: ', 
              metrics.homogeneity_completeness_v_measure(labels_true, labels_pred)
        print u'Fowlkes-Mallows指数 FMI: ', metrics.fowlkes_mallows_score(labels_true, labels_pred)
        
    if feature is not None:
        print u'轮廓系数: ', metrics.silhouette_score(feature, labels_pred, metric='euclidean')
        print u'Calinski-Harabaz Index: ', metrics.calinski_harabaz_score(feature, labels_pred) 


IndentationError: unexpected indent (<ipython-input-23-3949a61fd11a>, line 23)

In [11]:
labels_true = None
labels_true is not None

False

# English dataset

## 20newsgroups

In [3]:
sample_cate = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med', 'rec.sport.baseball']
newsgroups_train = fetch_20newsgroups(subset='train',categories=sample_cate,shuffle=True, 
                                      random_state=42,remove = ('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', categories=sample_cate,shuffle=True, 
                                     random_state=42,remove = ('headers', 'footers', 'quotes'))

# newsgroups_train.target, newsgroups_train.data
print(len(newsgroups_train.data), len(newsgroups_test.data))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


(2854, 1899)


In [20]:
train_targe = newsgroups_train.target
test_targe = newsgroups_test.target

from collections import Counter
Counter(train_targe)

Counter({0: 480, 1: 584, 2: 597, 3: 594, 4: 599})

# Feature

## TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english',lowercase=True)
train_vector = vectorizer.fit_transform(newsgroups_train.data)
print(train_vector.shape)
 
test_vector = vectorizer.transform(newsgroups_test.data)
print(test_vector.shape)

(2854, 31057)
(1899, 31057)


# Clustering

## K-means

In [8]:
from sklearn.cluster import KMeans

k = len(sample_cate)
labels_pred = KMeans(n_clusters=k, random_state=9).fit_predict(train_vector)

In [22]:
clustering_metrics(labels_pred, labels_true = train_targe, feature = train_vector.toarray())

(u'\u5170\u5fb7\u6307\u6570 ARI: ', 0.23258349597157577)
(u'\u4e92\u4fe1\u606f AMI: ', 0.37796164387174314)
(u'\u540c\u8d28\u6027\u3001\u5b8c\u6574\u6027\u3001\u4e24\u8005\u7684\u8c03\u548c\u5e73\u5747V-measure: ', (0.37905346928372147, 0.4793982836042738, 0.4233612010401955))
(u'Fowlkes-Mallows\u6307\u6570 FMI: ', 0.4454366344757007)
(u'\u8f6e\u5ed3\u7cfb\u6570: ', 0.009611675581713221)
(u'Calinski-Harabaz Index: ', 12.851271959374374)


## DBSCAN