# 基本设置

In [7]:
import jieba
import sys
import re
import time
import string

import numpy as np
import pandas as pd
# import pre_cor
import os
from sqlalchemy import create_engine
from pandas.io import sql

In [8]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from sklearn import cluster
import chardet

## 一些函数

In [56]:
import sys
from datetime import datetime
# from multiprocessing.pool import ThreadPool as Pool
from multiprocessing import Pool

def multi_process(func_name, func, param):
    '''
    多线程处理
    '''
    if 'win' in sys.platform:
        njobs = 3 # 一半
    elif 'linux' in sys.platform:
        njobs = 6
    
    t1 = datetime.now()
#     logger.info('starting func: %s. num: %s'%(func_name, len(param)))
    print 'starting func: %s. njobs: %s, num: %s'%(func_name, njobs, len(param))
    p = Pool(processes = njobs) # 创建5条进程
    result = p.map(func,param)
    p.close() # 关闭进程池，不再接受请求
    p.join() # 等待所有的子进程结束
    
    result_list = [(p, re) for p, re in zip(param, result)]
    
    t2 = datetime.now()
    elapsed_time = '%0.2f'%((t2 - t1).seconds)
#     logger.info('end func: %s. elapsed_time: %s, num_sent: %s'%(func_name, elapsed_time, 
#                                                                 len(result_list)))
    print 'end func: %s. elapsed_time: %s, num_sent: %s'%(func_name, elapsed_time, len(result_list))
    return result_list
                

In [10]:
from sklearn import metrics

def clustering_metrics(labels_pred, labels_true = None, feature = None):
    '''
    聚类算法结果评估
    需要真实标签：
        兰德指数 ARI: 输入参数没有顺序要求，ARI值的范围是[-1,1]，
            负的结果都是较差的，说明标签是独立分布的，相似分布的ARI结果是正的，
            1是最佳结果，说明两种标签的分布完全一致
        互信息 AMI：输入参数没有顺序要求，最好的值为1，最差的值（与labels_true不相关），其结果为非正值
        同质性、完整性、两者的调和平均V-measure：从0到1反应出最差到最优的表现
        Fowlkes-Mallows指数：针对训练集和验证集数据之间求得的查全率和查准率的几何平均值
        
    不需要真实标签：        
        轮廓系数：取值范围是[-1,1]，同类别样本距离越相近不同类别样本距离越远，分数越高。
        Calinski-Harabaz Index：分数值越大则聚类效果越好        
    '''
    
    if labels_true is not None:
        print u'兰德指数 ARI: ', metrics.adjusted_rand_score(labels_true, labels_pred)
        print u'互信息 AMI: ', metrics.adjusted_mutual_info_score(labels_true, labels_pred)
        print u'同质性、完整性、两者的调和平均V-measure: ', metrics.homogeneity_completeness_v_measure(labels_true, labels_pred)
        print u'Fowlkes-Mallows指数 FMI: ', metrics.fowlkes_mallows_score(labels_true, labels_pred)
        
    if feature is not None:
        print u'轮廓系数: ', metrics.silhouette_score(feature, labels_pred, metric='euclidean')
        print u'Calinski-Harabaz Index: ', metrics.calinski_harabaz_score(feature, labels_pred) 

# load data

In [5]:
raw_data = pd.read_excel('news_data_20190401/news_data.xls')
print(raw_data.shape)
raw_data.head()

(7396, 5)


Unnamed: 0,id,title,publishtime,site_name,content
0,29064424,Serie A Roundup: Moise Kean keeps up scoring f...,2019-03-31 13:35:20,印度快报,Moise Kean can’t seem to miss lately. Days aft...
1,29102555,"Motor racing-We will come back stronger, says ...",2019-04-01 04:45:23,每日邮报,"MANAMA, March 31 (Reuters) - Charles Leclerc p..."
2,29087495,Two women reveal how tragedy made them 'really...,2019-04-01 00:15:39,每日邮报,Georgina Brown (pictured) was hit by tragedy w...
3,29099506,Kevin Maher: I promise to cook more often than...,2019-04-01 07:01:00,泰晤士报,"Young couples in love, what’s wrong with you? ..."
4,29067200,Farewell to the Telegraph and its readers afte...,2019-03-31 15:00:00,每日电讯报,Farewell to the Telegraph and its readers afte...


# baseline

In [4]:
from dao.mysql.base_data_view import BaseDataView

import hierarchical_cluster
'''
perform_cluster(is_manual, cluster_type, manual_id, subtopic_id, language_type,
                    corpus, min_sample, save_group_id)
'''

2019-04-02 18:01:48 16776 16196 textcleaner.py [line:37] INFO 'pattern' package not found; tag filters are not available for English
2019-04-02 18:09:33 16776 16196 nltk_multipro_new.py [line:170] INFO start cut_tasks ....
2019-04-02 18:09:33 16776 16196 nltk_multipro_new.py [line:181] INFO end cut_tasks ...., 0s
2019-04-02 18:09:33 16776 16196 nltk_multipro_new.py [line:223] INFO start multi_process ....
2019-04-02 18:11:51 16776 16196 nltk_multipro_new.py [line:238] INFO end multi_process ...., 138s
2019-04-02 18:11:51 16776 16196 hierarchical_cluster.py [line:326] INFO first_cut_results end....
2019-04-02 18:11:52 16776 16196 hierarchical_cluster.py [line:336] INFO first_tfidf_matrix end....
2019-04-02 18:27:57 16776 16196 hierarchical_cluster.py [line:341] INFO first_cluster_results end....
2019-04-02 18:27:57 16776 16196 nltk_multipro_new.py [line:170] INFO start cut_tasks ....
2019-04-02 18:27:57 16776 16196 nltk_multipro_new.py [line:181] INFO end cut_tasks ...., 0s
2019-04-02 1

In [5]:
is_manual = 0
cluster_type = 0
manual_id = 0
subtopic_id = 0
language_type = 1
min_sample = 1
save_group_id = 5

base_data_view = BaseDataView()
dict_data = raw_data.to_dict('records')
corpus = BaseDataView.parse_corpus_records(dict_data, language_type=1)

In [37]:
print raw_data.shape
len(corpus)
# corpus[0]

7302

In [12]:
cluster_results = hierarchical_cluster.perform_cluster(is_manual, cluster_type, manual_id, 
                                                       subtopic_id, language_type,corpus, 
                                                       min_sample, save_group_id)

In [14]:
len(cluster_results)
cluster_results[0]

<entity.cluster_result.MergerClusterResult at 0x301a9748>

## save baseline result

In [38]:
baseline_result = []

for fir_cluster_ind, merger_cluster_result in enumerate(cluster_results):
    fir_topic = merger_cluster_result.topic
    fir_member = merger_cluster_result.member
    fir_member_count = merger_cluster_result.member_count
    for sub_cluster_result in merger_cluster_result.sub_cluster_results:
        member_list = sub_cluster_result.member.split('^A')
        topic = sub_cluster_result.topic        
        member_count = sub_cluster_result.member_count
        site_count = sub_cluster_result.site_count
        cluster_id = sub_cluster_result.cluster_id
        c_id = str(fir_cluster_ind) + '_' + str(cluster_id)
        for member in member_list:
            baseline_result.append([member, topic, c_id, fir_topic, cluster_id, fir_cluster_ind, 
                                    member_count, site_count, fir_member, fir_member_count])
        
baseline_result[0]

['29077036',
 u"Warren: Decision on 2020 up to Biden after woman's claim(accuser/biden/warren)",
 '0_3085',
 u'Ex-U.S. Vice President Biden denies inappropriate conduct over alleged kiss(biden/joe/woman/kiss/act/inappropriate)',
 3085,
 0,
 3,
 3,
 '29077036^A29076426^A29054671^A29103572^A29089361^A29082773^A29086886^A29084555^A29081648^A29087088^A29086922^A29087906^A29089186^A29099413^A29103568^A29084836^A29099059^A29092063^A29083362^A29085250^A29071826^A29100379^A29101552^A29085788^A29100658^A29085775^A29091997^A29081865^A29094944^A29121113^A29081501^A29090051^A29103137^A29117505^A29090770^A29080284^A29081512^A29098701^A29082701^A29083242^A29082429^A29086277^A29084773^A29080298^A29080912^A29066161^A29093998^A29090846^A29086676^A29094440^A29101521^A29092325^A29087122^A29090847^A29094774^A29085927^A29084956^A29090062^A29096712^A29088431^A29093997^A29081605^A29081375^A29085011',
 64]

In [40]:
col = '''member, topic, c_id, fir_topic, cluster_id, fir_cluster_ind, member_count, site_count, fir_member, fir_member_count'''
baseline_result = pd.DataFrame(baseline_result, columns = col.split(', '))
baseline_result.shape

(7302, 10)

In [47]:
raw_data['id'] = raw_data['id'].astype(str)
baseline_result = pd.merge(baseline_result, raw_data, how = 'left', left_on = 'member', right_on = 'id')
baseline_result.shape
# baseline_result.head()
# baseline_result.to_csv('baseline_result_0403.csv', index = False)

In [50]:
import xlsxwriter
bb = pd.ExcelWriter('baseline_result_0403.xlsx',engine='xlsxwriter')
baseline_result.to_excel(bb, sheet_name='Sheet1')
bb.save()

## filtered data

In [11]:
# filtered_data = baseline_result[['id', 'title', 'content']]
filtered_data = pd.read_excel('baseline_result_0403.xlsx')
filtered_data.shape

(7302, 15)

In [12]:
filtered_data.head()

Unnamed: 0,member,topic,c_id,fir_topic,cluster_id,fir_cluster_ind,member_count,site_count,fir_member,fir_member_count,id,title,publishtime,site_name,content
0,29077036,Warren: Decision on 2020 up to Biden after wom...,0_3085,Ex-U.S. Vice President Biden denies inappropri...,3085,0,3,3,29077036^A29076426^A29054671^A29103572^A290893...,64,29077036,"Event organizer: Biden, accuser were never alo...",2019-03-31 19:53:39,国会山报,\nThe organizer of a Nevada campaign rally sa...
1,29076426,Warren: Decision on 2020 up to Biden after wom...,0_3085,Ex-U.S. Vice President Biden denies inappropri...,3085,0,3,3,29077036^A29076426^A29054671^A29103572^A290893...,64,29076426,Warren and Castro back Biden accuser – but don...,2019-03-31 19:23:53,卫报,Lucy Flores says then-VP kissed her at 2014 Ne...
2,29054671,Warren: Decision on 2020 up to Biden after wom...,0_3085,Ex-U.S. Vice President Biden denies inappropri...,3085,0,3,3,29077036^A29076426^A29054671^A29103572^A290893...,64,29054671,Warren: Decision on 2020 up to Biden after wom...,2019-03-31 10:00:13,美国广播公司,Some Democratic presidential candidates are ex...
3,29103572,Biden responds to allegation of unwanted touch...,0_3086,Ex-U.S. Vice President Biden denies inappropri...,3086,0,7,3,29077036^A29076426^A29054671^A29103572^A290893...,64,29103572,Flores: Interaction with Biden 'a violation of...,2019-04-01 09:02:46,国会山报,"Lucy Flores, a former Nevada state assemblyw..."
4,29089361,Biden responds to allegation of unwanted touch...,0_3086,Ex-U.S. Vice President Biden denies inappropri...,3086,0,7,3,29077036^A29076426^A29054671^A29103572^A290893...,64,29089361,Bernie Sanders: 'No Reason' To Doubt Lucy Flor...,2019-04-01 01:33:39,新闻周刊,By Benjamin Fearnow On 3/31/19 at 1:33 PM ED...


# pre

In [13]:
import nltk
from nltk.stem import WordNetLemmatizer
from string import digits
import re

stopwords = {}
stw = open("corpus/stopwords.txt")
for ws in stw:
    ws = ws.replace("\n", "")
    ws = ws.replace("\r", "")
    stopwords[ws] = 1
stw.close()

# stopwords


def clean_text(text):
    # 去网页中的特殊编码字符串
    text = " ".join(text.split())
    word_list = []
    for word in text.split():
        try :
#             print 'word: ', word, chardet.detect(bytes(word))
            word_code = word.encode('raw_unicode_escape').decode('utf-8', "ignore")
#             print 'word_code: ', word_code, chardet.detect(bytes(word_code))
            clear_flag = False
            if "\\u" in word_code:
                clear_flag = True
                clear_num = 6
            elif '\u' in word_code:
                clear_flag = True
                clear_num = 5                
            if clear_flag:
                if len(word_code) >= (word_code.index('\\u') +6):
                    w = word_code[word_code.index('\\u'): word_code.index('\\u') +6]
                else :
                    w = word_code[word_code.index('\\u'): ]
                word_code = word_code.replace(w, ' ')
            word_list.append(word_code) 
        except :
            print '---', word, chardet.detect(bytes(word))
            
    text = " ".join(word_list)
            
    text = text.replace("\n", " ").replace('\r',' ').replace('\r\n',' ').replace('\t', ' ')
    reobj = re.compile('//@(.*?)[:\s]')
    text = reobj.sub("", text)
    reobj = re.compile("@(.*?)[:\s]")
    text = reobj.sub("", text)
    reobj = re.compile(r"\[[^\[\]]*?\]")
    text = reobj.sub("", text)

    return text

def clean_word(s):  
    # 去除标点和特殊字符、数字、汉字
    regex = re.compile(r"[^a-zA-Z]")
    s = regex.sub('', s)
    
    # 去除字符串中的数字 s = 'abc123def456ghi789zero0'
#     remove_digits = str.maketrans('', '', digits)
#     res = s.translate(remove_digits)
    res = s
    return res

In [15]:
def handle_content(content):
    line = ""
#     try :
#         content = str(content.decode('utf-8'))
    content = content.strip()    
    if content != "":       
        # 1 清理字符串
        content = clean_text(content)

        # 2 分句
        sent_tokenize_list = nltk.sent_tokenize(content)

        # 3 清理句子
        clean_sent_list = [clean_text(sent) for sent in sent_tokenize_list]

        # 4 分词 
        # 去掉长度小于3、去掉数字、去掉标点符号/去掉 non-alpha 词
        word_tokenize_list = []
        for sent in clean_sent_list:
#                 word_t_l = filter(lambda x: len(x) > 2, map(clean_word, nltk.word_tokenize(sent)))
            word_list = nltk.word_tokenize(sent)
            word_pos_tag = nltk.pos_tag(word_list)
            word_t_l = []
            for word, pos in word_pos_tag:
                if len(word) > 2:
                    word = clean_word(word)
                    item = ''
                    if pos.startswith('NN'):  # 名词
                        item = word
                    if pos.startswith('VB'):  # 动词
                        item = word
                    if pos.startswith('JJ'):  # 形容词
                        item = word  
                    if len(item)>0:word_t_l.append(item)
            word_tokenize_list += list(word_t_l)

        # 5 清理词
        # 去掉停用词、，小写化
        word_list = [word.lower() for word in word_tokenize_list if word.lower() not in stopwords]

        # 6 词形还原
        wnl = WordNetLemmatizer()
        word_list = [wnl.lemmatize(word) for word in word_list]

        line = " ".join(word_list)
#     except Exception as e:
#         print e, content
#         return line
    return line


In [16]:
def handle_contents(l_contents):
    lines = multi_process('handle_content', handle_content, l_contents)
#     lines = []
#     for line in l_contents:
#         lines.append(handle_content(line))
    return lines    

In [78]:
clean_title[0]
clean_text(a)
handle_content(a)

a = filtered_data['title'].tolist()[0]
print a
chardet.detect(bytes(a))

u'event organizer biden accuser'

## title and content

In [17]:
clean_title = handle_contents(filtered_data['title'].tolist())
print len(clean_title)
clean_title[0]

starting func: handle_content. njobs: 6, num: 7302
end func: handle_content. elapsed_time: 4.00, num_sent: 7302
7302


u'event organizer biden accuser'

In [18]:
clean_content = handle_contents(filtered_data['content'].tolist())
len(clean_content)

starting func: handle_content. njobs: 6, num: 7302
end func: handle_content. elapsed_time: 56.00, num_sent: 7302


7302

In [106]:
clean_content = handle_contents(filtered_data['content'].tolist())
len(clean_content)

7302

In [38]:
b = filtered_data['content'].tolist()[0]
b

u' \nThe organizer of\xa0a\xa0Nevada\xa0campaign rally said late Saturday that   Joe Biden      Joseph (Joe) Robinette Biden      Former Dem politician accuses Biden of \\\'inappropriate\\\' contact      Poll: Americans more likely to say Ocasio-Cortez is \\\'bad\\\' than \\\'good\\\' for Democrats      The Hill\\\'s 12:30 Report: Trump raises stakes with threat to close Mexican border      MORE and the former state assemblywoman who  accused the former vice president of inappropriate contact at the event were never alone together.\n\n\u201cI have thoroughly reviewed photographic documentation from the event, and spoken to nearly every principle in attendance, as well as staff associated with the event. To the best of our recollection, at no time were Lucy Flores and Vice President Biden alone,\u201d Henry Munoz, co-founder of Latino Victory Project, said in a statement.\n    ADVERTISEMENT \nFlores on Friday accused former Biden, who is expected to announce his 2020 presidential campai

In [74]:
clean_content[0]

u'organizer nevada campaign rally late saturday joe biden joseph joe robinette biden dem politician accuses biden inappropriate contact poll american ocasiocortez bad democrat hill report trump raise stake threat close mexican border assemblywoman accused vice president inappropriate contact event reviewed photographic documentation event spoken principle attendance staff event recollection time lucy flores vice president biden henry munoz cofounder latino victory project statement advertisement flores friday accused biden expected presidential campaign coming week inappropriately touching rally running nevada lieutenant governor flores wrote cut biden hand shoulder leaned smell hair kissed head stood stage campaign event brain couldn process happening embarrassed shocked confused wrote flores experienced blatantly inappropriate unnerving munoz statement biden holding event flores waited organization leader advance staff campaign staff moment vice president candidate onstage address su

In [109]:
clean_content[0]

u'organizer nevada campaign rally late saturday joe biden joseph joe robinette biden dem politician accuses biden inappropriate contact poll american ocasiocortez bad democrat hill report trump raise stake threat close mexican border assemblywoman accused vice president inappropriate contact event reviewed photographic documentation event spoken principle attendance staff event recollection time lucy flores vice president biden henry munoz cofounder latino victory project statement advertisement flores friday accused biden expected presidential campaign coming week touching rally running nevada lieutenant governor flores wrote cut biden hand shoulder leaned smell hair kissed head stood stage campaign event brain couldn process happening embarrassed shocked confused wrote flores experienced inappropriate unnerving munoz statement biden holding event flores waited organization leader advance staff campaign staff moment vice president candidate onstage address supporter press stage left s

# feature

## get feature

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

def get_tfidf_matrix(clean_content, flag = 'OneStep'):
    if flag == 'OneStep':
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(clean_content).toarray() 
    elif flag == 'TwoStep':
        vectorizer = CountVectorizer(max_df=0.95, min_df=2)
        transformer = TfidfTransformer() # norm=None 不归一化

        vec_count = vectorizer.fit_transform(corpus)
        tfidf_matrix = transformer.fit_transform(vec_count)  
    return tfidf_matrix.toarray()

In [36]:
clean_content[:1]

[u'organizer nevada campaign rally late saturday joe biden joseph joe robinette biden dem politician accuses biden inappropriate contact poll american ocasiocortez bad democrat hill report trump raise stake threat close mexican border assemblywoman accused vice president inappropriate contact event reviewed photographic documentation event spoken principle attendance staff event recollection time lucy flores vice president biden henry munoz cofounder latino victory project statement advertisement flores friday accused biden expected presidential campaign coming week touching rally running nevada lieutenant governor flores wrote cut biden hand shoulder leaned smell hair kissed head stood stage campaign event brain couldn process happening embarrassed shocked confused wrote flores experienced inappropriate unnerving munoz statement biden holding event flores waited organization leader advance staff campaign staff moment vice president candidate onstage address supporter press stage left 

### tf-idf

In [44]:
vectorizer = TfidfVectorizer()
tfidf_matrix_1 = vectorizer.fit_transform(clean_content).toarray()

print tfidf_matrix_1.shape
tfidf_matrix_1[0][:100]

(7302, 76986)


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [51]:
vectorizer = CountVectorizer(max_df=0.95, min_df=2)
transformer = TfidfTransformer()

vec_count = vectorizer.fit_transform(clean_content)
tfidf_matrix_2 = transformer.fit_transform(vec_count).toarray()

print tfidf_matrix_2.shape
print tfidf_matrix_2[0][:100]

(7302, 38904)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


In [55]:
max(tfidf_matrix_2[1])

0.45507857715012556

## feature selection

In [23]:
from sklearn.decomposition import PCA, TruncatedSVD

def get_decomposition_feature(feature_matrix, flag, n = 1000):
    
    if flag == 'pca':
        pca = PCA(n_components=n, copy=False)
        decomposition_matrix = pca.fit_transform(feature_matrix) 
    elif flag == 'svd':
        pca = TruncatedSVD(n_components=n, algorithm='arpack')
        decomposition_matrix = pca.fit_transform(feature_matrix)
    elif flag == 'agg':
        agglo = cluster.FeatureAgglomeration(n_clusters=n)
        decomposition_matrix = agglo.fit_transform(feature_matrix)
    elif flag == 'tfidf':
        decomposition_matrix = feature_matrix       
        
    return decomposition_matrix

In [24]:
tfidf_matrix = tfidf_matrix_2.toarray()

### PCA

In [25]:
n = 1000
pca = PCA(n_components=n, copy=False)
fea_pca_matrix = pca.fit_transform(tfidf_matrix)
print fea_pca_matrix.shape
fea_pca_matrix[0][:10]

(7302, 1000)

In [42]:
fea_pca_matrix[0][:10]

array([ -2.97236305,   1.52667073,  -0.69822775,  10.07828765,
         0.18059459,  20.34658564,  -6.26308804, -29.12818558,
       -20.57586251,   3.00293467])

### TruncatedSVD

In [122]:
n = 1000
svd = TruncatedSVD(n_components=n, algorithm='arpack')
fea_svd_matrix = pca.fit_transform(tfidf_matrix)
print fea_svd_matrix.shape

(7302, 1000)

### feature agglomeration
- feature agglomeration with Ward hierarchical clustering
- 时间太长，不实用

In [None]:
n = 1000
agglo = cluster.FeatureAgglomeration(n_clusters=n)
fea_agg_matrix = agglo.fit_transform(tfidf_matrix)
fea_agg_matrix.shape

# clustering

In [27]:
def get_labels(feature_matrix,flag, t = None):
    if flag =='sci_h':
        labels = hierarchy_cluster(feature_matrix, t)
    elif flag =='birch':
        brc = cluster.Birch(branching_factor=50, n_clusters=None, 
                            threshold=t,compute_labels=True)
        labels = brc.fit_predict(feature_matrix) 
    elif flag =='dbscan':
        clustering = cluster.DBSCAN(eps=t, min_samples=2).fit(feature_matrix)
        labels = clustering.labels_       
    elif flag =='meanshift':
        clustering = cluster.MeanShift(bandwidth=t).fit(feature_matrix)
        labels = clustering.labels_      
    elif flag =='affinity':
        clustering = AffinityPropagation().fit(feature_matrix)
        labels = clustering.labels_
        
    return labels   

In [28]:
feature_matrix = fea_pca_matrix

## scipy hierarchy

In [28]:
import scipy.cluster.hierarchy as sch

def hierarchy_cluster(tfidf_matrix, t):

    # 1. 层次聚类
    # 生成点与点之间的距离矩阵,这里用的cos距离
    disMat = sch.distance.pdist(tfidf_matrix, 'cosine')

    # 进行层次聚类:
    # Z = sch.linkage(disMat)
    Z = sch.linkage(disMat, method='average')

    # 将层级聚类结果以树状图表示出来并保存为plot_dendrogram.png
    # P=sch.dendrogram(Z)
    # plt.savefig('plot_dendrogram.png')

    # 根据linkage matrix Z得到聚类结果:
    labels = sch.fcluster(Z, t=t, criterion='distance')
    return labels

In [None]:
labels = hierarchy_cluster(feature_matrix, 0.6)
print 'cluster num: ', len(np.unique(labels))
clustering_metrics(labels, labels_true = None, feature = feature_matrix)

In [30]:
for t in [0.6, 0.7, 0.8, 0.9, 1.0]:    
    labels = hierarchy_cluster(feature_matrix, t)
    silhouette_score = metrics.silhouette_score(feature_matrix, labels, metric='cosine')
    print '---- t: %s, cluster num: %s, silhouette_score: %s' % (t, len(np.unique(labels)), silhouette_score)

轮廓系数:  0.01723679201198816
Calinski-Harabaz Index:  5.49469950420941
t: 0.5, silhouette_score: None
轮廓系数:  -0.018624661631729766
Calinski-Harabaz Index:  6.089848575294732
t: 0.6, silhouette_score: None
轮廓系数:  -0.06433405481253106
Calinski-Harabaz Index:  6.7473834739780365
t: 0.7, silhouette_score: None
轮廓系数:  -0.10583192554093637
Calinski-Harabaz Index:  7.751228063124646
t: 0.8, silhouette_score: None
轮廓系数:  -0.11526961064008873
Calinski-Harabaz Index:  8.932294779975003
t: 0.9, silhouette_score: None
轮廓系数:  -0.011609053534844316
Calinski-Harabaz Index:  119.09626862234866
t: 1.0, silhouette_score: None


## sklearn BIRCH

In [None]:
brc = cluster.Birch(branching_factor=50, n_clusters=None, threshold=0.5,compute_labels=True)
labels = brc.fit_predict(feature_matrix) 
print 'cluster num: ', len(np.unique(labels))
clustering_metrics(labels, labels_true = None, feature = feature_matrix)

cluster num:  7257
轮廓系数: 

In [None]:
for t in [0.3, 0.4, 0.5, 0.7, 0.9]:    
    brc = cluster.Birch(branching_factor=50, n_clusters=None, threshold=t,compute_labels=True)
    labels = brc.fit_predict(feature_matrix) 
    silhouette_score = metrics.silhouette_score(feature_matrix, labels, metric='euclidean')
    print '---- t: %s, cluster num: %s, silhouette_score: %s' % (t, len(np.unique(labels)), silhouette_score)

In [64]:
def get_birch_labels(t):
    brc = cluster.Birch(branching_factor=50, n_clusters=None, threshold=t,compute_labels=True)
    labels = brc.fit_predict(feature_matrix)  
    return labels

In [None]:
feature_matrix = tfidf_matrix_2
param = [0.3, 0.4, 0.5, 0.7, 0.9]
result_list = multi_process('get_birch_labels', get_birch_labels, param)

starting func: get_birch_labels. njobs: 6, num: 5


Exception in thread Thread-16:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/usr/lib/python2.7/threading.py", line 754, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 328, in _handle_workers
    pool._maintain_pool()
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 232, in _maintain_pool
    self._repopulate_pool()
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 225, in _repopulate_pool
    w.start()
  File "/usr/lib/python2.7/multiprocessing/process.py", line 130, in start
    self._popen = Popen(self)
  File "/usr/lib/python2.7/multiprocessing/forking.py", line 121, in __init__
    self.pid = os.fork()
OSError: [Errno 12] Cannot allocate memory

Process PoolWorker-30:
Traceback (most recent call last):
  File "/usr/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
    self.run()
  File "/usr/lib/py

In [None]:
for (t, labels) in result_list:     
    silhouette_score = metrics.silhouette_score(feature_matrix, labels, metric='cosine')
    print '---- t: %s, cluster num: %s, silhouette_score: %s' % (t, len(np.unique(labels)), silhouette_score)

## sklearn DBSCAN

In [29]:
clustering = cluster.DBSCAN(eps=1, min_samples=2).fit(feature_matrix)
labels = clustering.labels_
print 'cluster num: ', len(np.unique(labels))
clustering_metrics(labels, labels_true = None, feature = feature_matrix)

cluster num:  44
轮廓系数:  -0.4322634632199418
Calinski-Harabaz Index:  1.753317512942638


In [57]:
def get_dbscan_labels(t):
    clustering = cluster.DBSCAN(eps=t, min_samples=2).fit(feature_matrix)
    labels = clustering.labels_   
    return labels

In [60]:
feature_matrix = tfidf_matrix_2
param = [0.5, 0.7, 1.0, 1.5, 2.0]
result_list = multi_process('get_dbscan_labels', get_dbscan_labels, param)

starting func: get_dbscan_labels. njobs: 6, num: 5
end func: get_dbscan_labels. elapsed_time: 4587.00, num_sent: 5


In [62]:
for (t, labels) in result_list:     
    silhouette_score = metrics.silhouette_score(feature_matrix, labels, metric='cosine')
    print '---- t: %s, cluster num: %s, silhouette_score: %s' % (t, len(np.unique(labels)), silhouette_score)

---- t: 0.5, cluster num: 553, silhouette_score: -0.016488248821948426
---- t: 0.7, cluster num: 653, silhouette_score: 0.03634354577836485
---- t: 1.0, cluster num: 70, silhouette_score: -0.061784057133209315


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [43]:
feature_matrix = tfidf_matrix_2
for t in [0.5, 0.7, 1.0, 1.5, 2.0]:    
    clustering = cluster.DBSCAN(eps=t, min_samples=2).fit(feature_matrix)
    labels = clustering.labels_    
    silhouette_score = metrics.silhouette_score(feature_matrix, labels, metric='euclidean')
    print '---- t: %s, cluster num: %s, silhouette_score: %s' % (t, len(np.unique(labels)), silhouette_score)

---- t: 0.5, cluster num: 36, silhouette_score: -0.3648390771398579
---- t: 0.7, cluster num: 36, silhouette_score: -0.3648390771398579
---- t: 1.0, cluster num: 36, silhouette_score: -0.3648390771398579
---- t: 1.5, cluster num: 36, silhouette_score: -0.3648390771398579
---- t: 2.0, cluster num: 36, silhouette_score: -0.3648390771398579


## sklearn mean-shift

In [None]:
clustering = cluster.MeanShift(bandwidth=2).fit(feature_matrix)
labels = clustering.labels_
print 'cluster num: ', len(np.unique(labels))
clustering_metrics(labels, labels_true = None, feature = feature_matrix)

In [None]:
for t in [0.5, 1, 2, 3]:    
    clustering = cluster.MeanShift(bandwidth=2).fit(feature_matrix)
    labels = clustering.labels_
    silhouette_score = metrics.silhouette_score(feature_matrix, labels, metric='euclidean')
    print '---- t: %s, cluster num: %s, silhouette_score: %s' % (t, len(np.unique(labels)), silhouette_score)

## sklearn Affinity Propagation

In [None]:
clustering = AffinityPropagation().fit(feature_matrix)
labels = clustering.labels_
silhouette_score = metrics.silhouette_score(feature_matrix, labels, metric='euclidean')
print 'cluster num: %s, silhouette_score: %s' % (len(np.unique(labels)), silhouette_score)

# compare and plot

In [None]:
result = []
cluster_dict = {'sci_h':[0.6, 0.7, 0.8, 0.9], 
                'birch':[0.1, 0.3, 0.5, 0.7, 0.9], 
                'dbscan':[0.5, 0.7, 1.0, 1.5, 2.0], 
                'meanshift':[0.5, 1, 2, 3], 
                'affinity':[]}
fea_sel_list = ['pca', 'svd', 'tfidf'] # , 'agg'
tfidf_list = ['OneStep', 'TwoStep']

for tfidf_flag in tfidf_list:
    tfidf_matrix = get_tfidf_matrix(clean_content, flag = tfidf_flag)
    for fea_flag in fea_sel_list:
        decomposition_matrix = get_decomposition_feature(tfidf_matrix, fea_flag, n = 1000)
        for key, value in cluster_dict.items():
            if value:
                for t in value:
                    labels = get_labels(decomposition_matrix,flag, t)
                    silhouette_score = metrics.silhouette_score(decomposition_matrix, labels, metric='euclidean')
                    ret = [tfidf_flag, fea_flag, key, t, silhouette_score, 
                           len(np.unique(labels)), len(labels), labels]
                    result += ret
                    print ret
            else :
                labels = get_labels(decomposition_matrix,flag)
                silhouette_score = metrics.silhouette_score(decomposition_matrix, labels, metric='euclidean')
                ret = [tfidf_flag, fea_flag, key, t, silhouette_score, 
                       len(np.unique(labels)), len(labels), labels]
                result += ret
                print ret
                    