# 基本设置

In [178]:
import jieba
import sys
import re
import time
import string

%matplotlib inline
import numpy as np
import pandas as pd
# import pre_cor
import os
from sqlalchemy import create_engine
from pandas.io import sql

import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn import cluster

## 一些函数

In [None]:
from sklearn import metrics

def clustering_metrics(labels_pred, labels_true = None, feature = None):
    '''
    Print evaluation scores for a clustering result.

    Scores that need ground-truth labels (printed when ``labels_true`` is given):
        Adjusted Rand Index (ARI): argument order does not matter; range [-1, 1].
            Negative values are poor (the labelings look independent), similar
            labelings score positive, and 1 means the two labelings agree exactly.
        Adjusted Mutual Information (AMI): argument order does not matter; the
            best value is 1, and labelings unrelated to ``labels_true`` give
            non-positive values.
        Homogeneity, completeness and their harmonic mean (V-measure): from 0
            (worst) to 1 (best).
        Fowlkes-Mallows index (FMI): geometric mean of precision and recall.

    Scores that need no ground truth (printed when ``feature`` is given):
        Silhouette coefficient: range [-1, 1]; higher when samples are close to
            their own cluster and far from the others.
        Calinski-Harabasz index: larger means better clustering.

    Parameters:
        labels_pred: predicted cluster label for every sample.
        labels_true: optional ground-truth labels; ``None`` skips the supervised scores.
        feature: optional feature matrix the labels were computed from;
            ``None`` skips the unsupervised scores.

    Returns:
        None. All scores are written to standard output.
    '''

    def _report(label, value):
        # A single pre-formatted argument prints the same "label value" line
        # under the Python 2 print statement and the Python 3 print function.
        print(u'%s %s' % (label, value))

    if labels_true is not None:
        _report(u'兰德指数 ARI: ', metrics.adjusted_rand_score(labels_true, labels_pred))
        _report(u'互信息 AMI: ', metrics.adjusted_mutual_info_score(labels_true, labels_pred))
        _report(u'同质性、完整性、两者的调和平均V-measure: ',
                metrics.homogeneity_completeness_v_measure(labels_true, labels_pred))
        _report(u'Fowlkes-Mallows指数 FMI: ', metrics.fowlkes_mallows_score(labels_true, labels_pred))

    if feature is not None:
        _report(u'轮廓系数: ', metrics.silhouette_score(feature, labels_pred, metric='euclidean'))
        # scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score
        # and later removed the old spelling; prefer the new name and fall back
        # to the old one so the cell runs on either side of the rename.
        calinski = getattr(metrics, 'calinski_harabasz_score', None)
        if calinski is None:
            calinski = metrics.calinski_harabaz_score
        _report(u'Calinski-Harabaz Index: ', calinski(feature, labels_pred))

# load data

In [3]:
# Load the raw news export, then show its shape and first rows as a sanity check.
raw_data = pd.read_excel('news_data_20190401/news_data.xls')
print(raw_data.shape)
raw_data.head()

(7396, 5)


Unnamed: 0,id,title,publishtime,site_name,content
0,29064424,Serie A Roundup: Moise Kean keeps up scoring f...,2019-03-31 13:35:20,印度快报,Moise Kean can’t seem to miss lately. Days aft...
1,29102555,"Motor racing-We will come back stronger, says ...",2019-04-01 04:45:23,每日邮报,"MANAMA, March 31 (Reuters) - Charles Leclerc p..."
2,29087495,Two women reveal how tragedy made them 'really...,2019-04-01 00:15:39,每日邮报,Georgina Brown (pictured) was hit by tragedy w...
3,29099506,Kevin Maher: I promise to cook more often than...,2019-04-01 07:01:00,泰晤士报,"Young couples in love, what’s wrong with you? ..."
4,29067200,Farewell to the Telegraph and its readers afte...,2019-03-31 15:00:00,每日电讯报,Farewell to the Telegraph and its readers afte...


# baseline

In [4]:
from dao.mysql.base_data_view import BaseDataView

import hierarchical_cluster
# Reference note kept as a bare string: presumably the positional signature of
# hierarchical_cluster.perform_cluster -- confirm against that module.
'''
perform_cluster(is_manual, cluster_type, manual_id, subtopic_id, language_type,
                    corpus, min_sample, save_group_id)
'''

2019-04-02 18:01:48 16776 16196 textcleaner.py [line:37] INFO 'pattern' package not found; tag filters are not available for English
2019-04-02 18:09:33 16776 16196 nltk_multipro_new.py [line:170] INFO start cut_tasks ....
2019-04-02 18:09:33 16776 16196 nltk_multipro_new.py [line:181] INFO end cut_tasks ...., 0s
2019-04-02 18:09:33 16776 16196 nltk_multipro_new.py [line:223] INFO start multi_process ....
2019-04-02 18:11:51 16776 16196 nltk_multipro_new.py [line:238] INFO end multi_process ...., 138s
2019-04-02 18:11:51 16776 16196 hierarchical_cluster.py [line:326] INFO first_cut_results end....
2019-04-02 18:11:52 16776 16196 hierarchical_cluster.py [line:336] INFO first_tfidf_matrix end....
2019-04-02 18:27:57 16776 16196 hierarchical_cluster.py [line:341] INFO first_cluster_results end....
2019-04-02 18:27:57 16776 16196 nltk_multipro_new.py [line:170] INFO start cut_tasks ....
2019-04-02 18:27:57 16776 16196 nltk_multipro_new.py [line:181] INFO end cut_tasks ...., 0s
2019-04-02 1

In [5]:
# Arguments later passed positionally to hierarchical_cluster.perform_cluster.
# NOTE(review): what each numeric code means is defined by the project modules,
# which are not visible here -- confirm before changing a value.
is_manual = 0
cluster_type = 0
manual_id = 0
subtopic_id = 0
language_type = 1
min_sample = 1
save_group_id = 5

base_data_view = BaseDataView()
# One dict per spreadsheet row.
dict_data = raw_data.to_dict('records')
# Pass the language_type variable instead of repeating the literal 1, so corpus
# parsing and the later perform_cluster call cannot drift apart.
corpus = BaseDataView.parse_corpus_records(dict_data, language_type=language_type)

In [37]:
# Compare the raw row count with the number of parsed corpus records.
# print(...) with one argument matches the form used when the data was loaded
# and gives the same output under Python 2 and Python 3.
print(raw_data.shape)
len(corpus)
# corpus[0]

7302

In [12]:
# Run the project's baseline clustering on the parsed corpus, using the
# parameter values defined in the earlier cell.
cluster_results = hierarchical_cluster.perform_cluster(is_manual, cluster_type, manual_id, 
                                                       subtopic_id, language_type,corpus, 
                                                       min_sample, save_group_id)

In [14]:
# Only the last expression is echoed in the captured output, so the len() value
# on the first line is computed but not displayed.
len(cluster_results)
cluster_results[0]

<entity.cluster_result.MergerClusterResult at 0x301a9748>

## save baseline result

In [38]:
# Flatten the two-level clustering output into one row per member id.
baseline_result = []

for fir_cluster_ind, merger_cluster_result in enumerate(cluster_results):
    # Attributes of the first-level (merged) cluster, repeated on each of its rows.
    fir_topic = merger_cluster_result.topic
    fir_member = merger_cluster_result.member
    fir_member_count = merger_cluster_result.member_count
    for sub_cluster_result in merger_cluster_result.sub_cluster_results:
        # Member ids are stored as one string joined by the literal two-character
        # separator '^A' (see the sample row printed after this cell).
        member_list = sub_cluster_result.member.split('^A')
        topic = sub_cluster_result.topic        
        member_count = sub_cluster_result.member_count
        site_count = sub_cluster_result.site_count
        cluster_id = sub_cluster_result.cluster_id
        # Composite id: "<first-level index>_<sub-cluster id>".
        c_id = str(fir_cluster_ind) + '_' + str(cluster_id)
        for member in member_list:
            baseline_result.append([member, topic, c_id, fir_topic, cluster_id, fir_cluster_ind, 
                                    member_count, site_count, fir_member, fir_member_count])
        
# Show the first flattened row.
baseline_result[0]

['29077036',
 u"Warren: Decision on 2020 up to Biden after woman's claim(accuser/biden/warren)",
 '0_3085',
 u'Ex-U.S. Vice President Biden denies inappropriate conduct over alleged kiss(biden/joe/woman/kiss/act/inappropriate)',
 3085,
 0,
 3,
 3,
 '29077036^A29076426^A29054671^A29103572^A29089361^A29082773^A29086886^A29084555^A29081648^A29087088^A29086922^A29087906^A29089186^A29099413^A29103568^A29084836^A29099059^A29092063^A29083362^A29085250^A29071826^A29100379^A29101552^A29085788^A29100658^A29085775^A29091997^A29081865^A29094944^A29121113^A29081501^A29090051^A29103137^A29117505^A29090770^A29080284^A29081512^A29098701^A29082701^A29083242^A29082429^A29086277^A29084773^A29080298^A29080912^A29066161^A29093998^A29090846^A29086676^A29094440^A29101521^A29092325^A29087122^A29090847^A29094774^A29085927^A29084956^A29090062^A29096712^A29088431^A29093997^A29081605^A29081375^A29085011',
 64]

In [40]:
# Column names, in the same order as the values appended to each flattened row.
col = '''member, topic, c_id, fir_topic, cluster_id, fir_cluster_ind, member_count, site_count, fir_member, fir_member_count'''
# Rebinds baseline_result from a list of rows to a DataFrame.
baseline_result = pd.DataFrame(baseline_result, columns = col.split(', '))
baseline_result.shape

(7302, 10)

In [47]:
# Cast ids to str so they compare equal to the string member ids produced by
# splitting the cluster member field; this modifies raw_data in place.
raw_data['id'] = raw_data['id'].astype(str)
# Left join keeps every clustered member and attaches its article columns.
baseline_result = pd.merge(baseline_result, raw_data, how = 'left', left_on = 'member', right_on = 'id')
baseline_result.shape
# baseline_result.head()
# baseline_result.to_csv('baseline_result_0403.csv', index = False)

In [50]:
import xlsxwriter
# Write the merged baseline result to an Excel workbook. Using the writer as a
# context manager saves and closes the file even if to_excel raises, and avoids
# ExcelWriter.save(), which has been removed from pandas.
with pd.ExcelWriter('baseline_result_0403.xlsx', engine='xlsxwriter') as bb:
    baseline_result.to_excel(bb, sheet_name='Sheet1')

## filtered data

In [51]:
# Keep only the columns the text pipeline needs, in baseline_result row order.
filtered_data = baseline_result[['id', 'title', 'content']]
filtered_data.shape

(7302, 3)

In [152]:
# Preview the first rows of the filtered frame.
filtered_data.head()

Unnamed: 0,id,title,content
0,29077036,"Event organizer: Biden, accuser were never alo...",\nThe organizer of a Nevada campaign rally sa...
1,29076426,Warren and Castro back Biden accuser – but don...,Lucy Flores says then-VP kissed her at 2014 Ne...
2,29054671,Warren: Decision on 2020 up to Biden after wom...,Some Democratic presidential candidates are ex...
3,29103572,Flores: Interaction with Biden 'a violation of...,"Lucy Flores, a former Nevada state assemblyw..."
4,29089361,Bernie Sanders: 'No Reason' To Doubt Lucy Flor...,By Benjamin Fearnow On 3/31/19 at 1:33 PM ED...


# pre

In [55]:
import nltk
from nltk.stem import WordNetLemmatizer
from string import digits
import re

# Stop-word lookup table: one word per line of the file, stored as dict keys so
# membership tests are constant time (the value 1 is only a placeholder).
stopwords = {}
# The with-block closes the file even if reading fails part-way through.
with open("corpus/stopwords.txt") as stw:
    for ws in stw:
        # Drop line terminators so each key is the bare word.
        ws = ws.replace("\n", "").replace("\r", "")
        stopwords[ws] = 1

# stopwords

In [243]:
def handle_content(content):
    """Turn one raw text into a space-separated string of normalized words.

    Pipeline: clean the whole text, split it into sentences, clean each
    sentence, tokenize, keep only the letters of each token, drop tokens
    shorter than three letters, lower-case, drop stop words, lemmatize.

    Parameters:
        content: the raw text. Non-string values are converted with ``str``.
            ``None`` and NaN are treated as missing text.

    Returns:
        The cleaned words joined by single spaces, or ``""`` when the input is
        missing or contains only whitespace.
    """
    # Missing values would otherwise be stringified into the fake tokens
    # "None" / "nan" and leak into the vocabulary.
    # (presumably empty spreadsheet cells arrive as NaN floats -- confirm)
    if content is None or (isinstance(content, float) and content != content):
        return ""

    content = str(content)
    if content.strip() == "":
        return ""

    # 1. Clean the full text, 2. split into sentences, 3. clean each sentence.
    content = clean_sent(content)
    sentences = [clean_sent(sentence) for sentence in nltk.sent_tokenize(content)]

    # 4. Tokenize; keep only the letters of each token and drop the result
    #    when fewer than three letters remain.
    tokens = []
    for sentence in sentences:
        for token in nltk.word_tokenize(sentence):
            token = clean_word(token)
            if len(token) > 2:
                tokens.append(token)

    # 5. Lower-case once per token and drop stop words.
    lowered = (token.lower() for token in tokens)
    words = [word for word in lowered if word not in stopwords]

    # 6. Lemmatize.
    wnl = WordNetLemmatizer()
    return " ".join(wnl.lemmatize(word) for word in words)

def remove_special_symbol(sentence):
    '''Remove special symbols from ``sentence``.

    A character is kept when it is an ASCII letter or digit, or when it is one
    of the whitelisted English / Chinese punctuation marks (including the
    space). Every other character is dropped.

    Parameters:
        sentence: the text to filter.

    Returns:
        The text containing only the kept characters, in their original order.
    '''
    # The double quote is written as an explicit escape: the earlier literal
    # ended at the first '"' pair, which made it two adjacent strings and
    # silently left '"' out of the whitelist.
    normal_symbol_en = ",.!;:?'\"<>()%+#· "
    normal_symbol_ch = "，。！；：“”‘’？《》（）%+、…【】"
    normal_symbol = normal_symbol_en + normal_symbol_ch
    regex = re.compile(u"[a-zA-Z0-9]")
    # join() builds the result in one pass instead of repeated concatenation.
    return ''.join(s for s in sentence if regex.match(s) or s in normal_symbol)

def clean_sent(sent):
    """Strip markup-like noise from a piece of text.

    Steps, in order:
      1. Split on whitespace. In each word, blank out the first ``\\uXXXX``
         escape sequence left by the raw-unicode-escape round trip (every
         occurrence of that same sequence is replaced by a space).
      2. Remove ``//@name:`` and ``@name:`` style mentions.
      3. Remove ``[...]`` bracketed spans.
      4. Map full-width Chinese ``，。！？`` to their ASCII counterparts.
      5. Remove ``//...`` fragments up to the next colon or whitespace.

    Parameters:
        sent: the text to clean.

    Returns:
        The cleaned text.
    """
    escape_marker = '\\u'
    word_list = []
    # split() already collapses every whitespace run, so no separate
    # newline / tab replacement is needed afterwards.
    for word in sent.split():
        try:
            word_code = word.encode('raw_unicode_escape').decode('utf-8', "ignore")
        except UnicodeError:
            # Best effort: report the word that could not be converted and
            # skip it. (The earlier handler printed an undefined name and
            # therefore raised NameError itself.)
            print('--- %s' % (word,))
            continue
        if escape_marker in word_code:
            start = word_code.index(escape_marker)
            # An escape is at most 6 characters; slicing truncates by itself
            # when the word ends earlier.
            w = word_code[start:start + 6]
            word_code = word_code.replace(w, ' ')
        word_list.append(word_code)

    sent = " ".join(word_list)

    # Raw strings keep the patterns unchanged while avoiding invalid-escape
    # warnings for "\s".
    sent = re.sub(r'//@(.*?)[:\s]', "", sent)
    sent = re.sub(r"@(.*?)[:\s]", "", sent)
    sent = re.sub(r"\[[^\[\]]*?\]", "", sent)

    sent = sent.replace("，", ",")
    sent = sent.replace("。", ".")
    sent = sent.replace("！", "!")
    sent = sent.replace("？", "?")
    sent = re.sub(r"//(.*?)[:\s]", "", sent)

    # NOTE(review): the earlier code also called remove_special_symbol(sent)
    # here but discarded its result and returned `sent`. The dead call is
    # dropped; the returned value is unchanged. Confirm whether the filtered
    # text was meant to be returned instead.
    return sent

def clean_word(s):
    """Return ``s`` with everything except ASCII letters removed.

    Punctuation, special characters, digits and Chinese characters are all
    dropped; the remaining letters keep their order and case.
    """
    letters_only = re.sub(r"[^a-zA-Z]", "", s)
    return letters_only

In [259]:
import sys
from datetime import datetime
from multiprocessing.pool import ThreadPool as Pool

def multi_process(func_name, func, param):
    '''
    Apply ``func`` to every item of ``param`` using a thread pool.

    Parameters:
        func_name: label used only in the progress messages.
        func: callable applied to each item.
        param: sequence of items; its length is reported in the start message.

    Returns:
        A list with ``func(item)`` for every item, in input order.
    '''
    if 'win' in sys.platform:
        njobs = 3  # original note: "half"
    elif 'linux' in sys.platform:
        njobs = 6
    else:
        # Any other platform previously left njobs unbound and crashed below;
        # fall back to the smaller worker count.
        njobs = 3

    t1 = datetime.now()
#     logger.info('starting func: %s. num: %s'%(func_name, len(param)))
    print('starting func: %s. njobs: %s, num: %s'%(func_name, njobs, len(param)))
    p = Pool(processes = njobs)  # ThreadPool: njobs worker threads
    try:
        result = p.map(func, param)
    finally:
        # Always release the workers, even when func raises.
        p.close()  # stop accepting new work
        p.join()   # wait for the workers to finish

    result_list = list(result)

    t2 = datetime.now()
    # total_seconds() keeps the sub-second part that .seconds discards.
    elapsed_time = '%0.2f'%((t2 - t1).total_seconds())
#     logger.info('end func: %s. elapsed_time: %s, num_sent: %s'%(func_name, elapsed_time,
#                                                                 len(result_list)))
    print('end func: %s. elapsed_time: %s, num_sent: %s'%(func_name, elapsed_time, len(result_list)))
    return result_list
                

In [261]:
# lines = multi_process('handle_content', handle_content, filtered_data['content'].tolist())
# len(lines)

7302

In [264]:
def handle_contents(l_contents):
    """Clean every text in ``l_contents`` and return the cleaned strings.

    Each item is passed to ``handle_content`` through ``multi_process``; a
    plain sequential loop over the items is the single-threaded alternative.
    """
    return multi_process('handle_content', handle_content, l_contents)

In [265]:
# Clean every article title; the length is echoed as a row-count check.
clean_title = handle_contents(filtered_data['title'].tolist())
len(clean_title)

7302

In [266]:
# Clean every article body; the length is echoed as a row-count check.
clean_content = handle_contents(filtered_data['content'].tolist())
len(clean_content)

7302

In [267]:
# Inspect the first cleaned article body.
clean_content[0]

u'organizer ofanevadacampaign rally late saturday biden joseph robinette biden politician accuses biden inappropriate contact poll american ocasiocortez democrat hill report trump raise stake threat close mexican border assemblywoman accused vice president inappropriate contact event reviewed photographic documentation event spoken principle attendance staff event recollection time lucy flores vice president biden henry munoz cofounder latino victory project statement advertisement flores friday accused biden expected presidential campaign coming week inappropriately touching atthe rally running nevada lieutenant governor flores wrote biden hand shoulder leaned smell hair kissed head stood stage campaign event brain couldn process happening embarrassed shocked confused wrote flores experienced blatantly inappropriate unnerving munoz statement biden hisown holding event flores waited andorganization leader advance staff campaign staff moment vice president candidate onstage address supp

# feature

## get feature

### tf-idf

In [269]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [271]:
# Fit TF-IDF on the cleaned article bodies.
# NOTE(review): .toarray() converts the result to a dense array; at the
# (7302, 79291) shape reported below that is a very large allocation --
# confirm there is enough memory.
vectorizer = TfidfVectorizer()
tf_matrix = vectorizer.fit_transform(clean_content).toarray()

In [272]:
# (documents, vocabulary terms)
tf_matrix.shape

(7302L, 79291L)

## feature selection

### PCA

In [273]:
from sklearn.decomposition import PCA, TruncatedSVD

In [None]:
# Reduce the TF-IDF features to n principal components; tf_matrix is rebound
# to the reduced matrix, so the original features are no longer available.
n = 1000
# NOTE(review): copy=False presumably lets PCA overwrite its input -- confirm.
pca = PCA(n_components=n, copy=False)
tf_matrix = pca.fit_transform(tf_matrix)
tf_matrix.shape

### feature agglomeration
- feature agglomeration with Ward hierarchical clustering

In [None]:
# Alternative reduction: merge features into n groups.
# NOTE(review): this also rebinds tf_matrix, so if the PCA cell ran first it
# operates on the PCA output, not on the raw TF-IDF matrix -- run only one.
n = 1000
agglo = cluster.FeatureAgglomeration(n_clusters=n)
tf_matrix = agglo.fit_transform(tf_matrix)
tf_matrix.shape

# clustering

In [None]:
# Features used by every clustering cell below (same object as tf_matrix).
feature_matrix = tf_matrix

## scipy hierarchy

In [None]:
import scipy.cluster.hierarchy as sch

def hierarchy_cluster(tfidf_matrix, t):
    """Cluster the rows of ``tfidf_matrix`` by average-linkage hierarchical clustering.

    Parameters:
        tfidf_matrix: one feature row per sample.
        t: distance threshold at which the hierarchy is cut.

    Returns:
        The flat cluster label of every row, as produced by ``fcluster``.
    """
    # Condensed pairwise cosine distances between all rows.
    pairwise_cosine = sch.distance.pdist(tfidf_matrix, 'cosine')

    # Build the hierarchy with average linkage. (A dendrogram of this linkage
    # matrix can be drawn with sch.dendrogram for inspection.)
    linkage_matrix = sch.linkage(pairwise_cosine, method='average')

    # Cut the tree at distance t to obtain flat labels.
    return sch.fcluster(linkage_matrix, t=t, criterion='distance')

In [None]:
# NOTE(review): `t` was not defined anywhere in the visible notebook, so this
# cell raised NameError. 0.5 is only a placeholder cut threshold on the
# cosine-distance scale -- tune it before trusting the labels.
t = 0.5
labels = hierarchy_cluster(feature_matrix, t)

In [None]:
# No ground truth is available, so only the feature-based scores are printed.
clustering_metrics(labels, labels_true = None, feature = feature_matrix)

## sklearn BIRCH

In [None]:
# BIRCH clustering of the feature matrix; labels is rebound to its result.
# NOTE(review): n_clusters=None presumably skips the final global clustering
# step -- confirm against the scikit-learn documentation.
brc = cluster.Birch(branching_factor=50, n_clusters=None, threshold=0.5,compute_labels=True)
labels = brc.fit_predict(feature_matrix) 

In [None]:
# Feature-based scores for the BIRCH labels.
clustering_metrics(labels, labels_true = None, feature = feature_matrix)

## sklearn DBSCAN

In [None]:
# DBSCAN clustering of the feature matrix; labels is rebound to its result.
# NOTE(review): eps=3 is on the estimator's default distance scale -- confirm
# it suits these features.
clustering = cluster.DBSCAN(eps=3, min_samples=2).fit(feature_matrix)
labels = clustering.labels_

In [None]:
# Feature-based scores for the DBSCAN labels.
clustering_metrics(labels, labels_true = None, feature = feature_matrix)

## sklearn mean-shift

In [None]:
# Mean-shift clustering with a fixed bandwidth; labels is rebound to its result.
clustering = cluster.MeanShift(bandwidth=2).fit(feature_matrix)
labels = clustering.labels_

In [None]:
# Feature-based scores for the mean-shift labels.
clustering_metrics(labels, labels_true = None, feature = feature_matrix)

## sklearn Affinity Propagation

In [None]:
# Affinity Propagation with default settings. The estimator is reached through
# the imported `cluster` module, like the other estimators in this notebook;
# the bare name AffinityPropagation was never imported and raised NameError.
clustering = cluster.AffinityPropagation().fit(feature_matrix)
labels = clustering.labels_

In [None]:
# Feature-based scores for the Affinity Propagation labels.
clustering_metrics(labels, labels_true = None, feature = feature_matrix)