- 贾承斌
    - 接口和定时聚类程序都试下数据

# 基本设置

In [1]:
import pandas as pd
import sys

import langid # 语种检测
langid.classify("I do not speak english")[0]

'en'

In [2]:
import os
pwd = os.getcwd()
print(pwd)
sys.path.append(pwd)

D:\XH\Python_Project\notebook\tmp_proj\clustering_jiachengbing


In [3]:
class ClusterMessageObj(object):
    """
    聚类实体类
    """

    def __init__(self, messageId=0, messageTitle='', messagePublishtime='', messageContent='', site_name=''):
        self.messageId = messageId
        self.messageTitle = messageTitle
        self.messagePublishtime = messagePublishtime
        self.messageContent = messageContent
        self.site_name = site_name

In [4]:
def parse_corpus_records(records, language_type=None):
    data = []
    for row in records:
        id = row['id']

        title = row['title']
        title = title.encode('utf-8')

        content = row['content']
        content = content.encode('utf-8')

        publish_time = row['publishtime'].encode('utf-8') # .strftime("%Y-%m-%d %H:%M:%S")
        site_name = row['site_name']
        site_name = site_name.encode('utf-8')

        cluster_obj = ClusterMessageObj(id, title, publish_time, content, site_name)

        # 进行语言过滤
        if language_type is not None:
            if BaseDataView.is_valid_language(language_type, content):
                data.append(cluster_obj)
        else:
            data.append(cluster_obj)
    return data

In [5]:
sys.getdefaultencoding()

'utf-8'

In [6]:
from toolkits.nlp.langconv import *

def Traditional2Simplified(sentence):
    '''
    将sentence中的繁体字转为简体字
    :param sentence: 待转换的句子
    :return: 将句子中繁体字转换为简体字之后的句子
    '''
    sentence = Converter('zh-hans').convert(sentence)
    return sentence

In [7]:
from sklearn import metrics

def clustering_metrics(labels_pred, labels_true = None, feature = None):
    '''
    聚类算法结果评估
    需要真实标签：
        兰德指数 ARI: 输入参数没有顺序要求，ARI值的范围是[-1,1]，
            负的结果都是较差的，说明标签是独立分布的，相似分布的ARI结果是正的，
            1是最佳结果，说明两种标签的分布完全一致
        互信息 AMI：输入参数没有顺序要求，最好的值为1，最差的值（与labels_true不相关），其结果为非正值
        同质性、完整性、两者的调和平均V-measure：从0到1反应出最差到最优的表现
        Fowlkes-Mallows指数：针对训练集和验证集数据之间求得的查全率和查准率的几何平均值
        
    不需要真实标签：        
        轮廓系数：取值范围是[-1,1]，同类别样本距离越相近不同类别样本距离越远，分数越高。
        Calinski-Harabaz Index：分数值越大则聚类效果越好        
    '''
    
    if labels_true is not None:
        print(u'兰德指数 ARI: ', metrics.adjusted_rand_score(labels_true, labels_pred))
        print(u'互信息 AMI: ', metrics.adjusted_mutual_info_score(labels_true, labels_pred))
        print(u'同质性、完整性、两者的调和平均V-measure: ', metrics.homogeneity_completeness_v_measure(labels_true, labels_pred))
        print(u'Fowlkes-Mallows指数 FMI: ', metrics.fowlkes_mallows_score(labels_true, labels_pred))
        
    if feature is not None:
        print(u'轮廓系数: ', metrics.silhouette_score(feature, labels_pred, metric='euclidean'))
        print(u'Calinski-Harabaz Index: ', metrics.calinski_harabaz_score(feature, labels_pred) )

In [8]:
import sys
from datetime import datetime
# from multiprocessing.pool import ThreadPool as Pool
from multiprocessing import Pool

def multi_process(func_name, func, param):
    '''
    多线程处理
    '''
    if 'win' in sys.platform:
        njobs = 3 # 一半
    elif 'linux' in sys.platform:
        njobs = 6
    
    t1 = datetime.now()
    print('starting func: %s. njobs: %s, num: %s'%(func_name, njobs, len(param)))
    p = Pool(processes = njobs) # 创建5条进程
    result = p.map(func,param)
    p.close() # 关闭进程池，不再接受请求
    p.join() # 等待所有的子进程结束
    
    result_list = [(p, re) for p, re in zip(param, result)]
    
    t2 = datetime.now()
    elapsed_time = '%0.2f'%((t2 - t1).seconds)
    print('end func: %s. elapsed_time: %s, num_sent: %s'%(func_name, elapsed_time, len(result_list)))
    return result_list
                

# load data

In [99]:
# file_list = ['7_20190507155552.xlsx', '7_20190507161908.xlsx', '7_20190507163150.xlsx']
# data = pd.DataFrame()
# for filename in file_list:
#     file_path = 'raw_data/20190507/' + filename
#     tmp_data = pd.read_excel(file_path) # u'外媒_20190426070934.xlsx'
#     print(filename, data.shape)
#     data = pd.concat([data, tmp_data], axis = 0)

filename = '7_20190508082141.xlsx'
file_path = 'raw_data/20190508/' + filename
data = pd.read_excel(file_path) # u'外媒_20190426070934.xlsx'

print(data.shape)
print(data.columns)
data.iloc[:1, :]

(3011, 15)
Index(['site_id', 'site_name', 'title', 'publishtime', 'url', 'author',
       'click', 'top', 'reply', 'retweet', 'keywords', 'txt_file', 'content',
       'group_id', 'id'],
      dtype='object')


Unnamed: 0,site_id,site_name,title,publishtime,url,author,click,top,reply,retweet,keywords,txt_file,content,group_id,id
0,299507,华尔街日报中文网,隔夜市场回顾,2019-05-08 07:50:00.0,https://cn.wsj.com/articles/隔夜市场回顾-11557271810,,0,0,0,0,cn.wsj.com,,美股收盘重挫，道指跌近500点；美元基本持平；美国国债走高，受贸易紧张局势影响；油价下跌，投...,7,1


## 检测语种

In [100]:
# data['language'] = data['标题'].astype('str').apply(lambda x: langid.classify(x)[0])
data['language'] = data['title'].astype('str').apply(lambda x: langid.classify(x)[0])

In [101]:
print(data.shape)
data.iloc[:1, :]

(3011, 16)


Unnamed: 0,site_id,site_name,title,publishtime,url,author,click,top,reply,retweet,keywords,txt_file,content,group_id,id,language
0,299507,华尔街日报中文网,隔夜市场回顾,2019-05-08 07:50:00.0,https://cn.wsj.com/articles/隔夜市场回顾-11557271810,,0,0,0,0,cn.wsj.com,,美股收盘重挫，道指跌近500点；美元基本持平；美国国债走高，受贸易紧张局势影响；油价下跌，投...,7,1,zh


In [102]:
# 繁体
# data[data['id'] == 7914]

In [103]:
print(data['language'].value_counts().sum())
data['language'].value_counts()

3011


en    1401
zh     819
ja     304
pt     223
ar     122
tl      26
id      16
it      14
fr      13
de      12
es       9
da       9
gl       5
xh       4
ms       4
br       4
nl       3
sw       2
cy       2
la       2
ca       2
se       1
no       1
fi       1
jv       1
sl       1
fa       1
zu       1
nn       1
et       1
af       1
hr       1
oc       1
ur       1
rw       1
pl       1
Name: language, dtype: int64

## 英文数据

In [148]:
# data_en = data[data['language'] == 'en'][['id', '标题', '正文', '发布时间', '来源']]
# data_en.columns = ['id', 'title', 'content', 'publishtime', 'site_name']
data_en = data[data['language'] == 'en'][['id', 'title', 'content', 'publishtime', 'site_name']]
# data_en = data_en.iloc[:1000, :]
print(data_en.shape)
data_en.iloc[:1, :]

(1401, 5)


Unnamed: 0,id,title,content,publishtime,site_name
18,19,Taiwan's National Security Is at Risk,Taiwan’s military must feel pressured on two f...,2019-05-08 04:55:09.0,国家利益杂志


In [149]:
file_path =  'mlUtil_MOE/corpus/corpus_en_' + filename.replace('.xlsx', '.json')
print('file_path: ', file_path)
with open(file_path,'w',encoding='utf-8') as json_file:
        json.dump(data_en.to_dict('records'),json_file,ensure_ascii=False)  

file_path:  mlUtil_MOE/corpus/corpus_en_7_20190508082141.json


In [16]:
# corpus_en = parse_corpus_records(data_en.to_dict('records'), language_type=None)
# dir(corpus_en[0])
# corpus_en[0].messageTitle

In [17]:
# from sklearn.externals import joblib
# import pickle
# file_path =  "mlUtil_MOE/corpus/corpus_en_20190426070934.pkl"
# pickle.dump(corpus_en, file_path)
# pipeline_train = joblib.load( "model/previous_model/cbrc_pipeline_20181108.pkl.z")

In [18]:
# py2 与 py3 兼容
# # 在python3中将数据格式转换成python2可读的数据格式
# with open(file_path, 'rb') as f:
#     w = pickle.load(f)
# pickle.dump(w, open(file_path,"wb"), protocol=2)

## 中文数据

In [132]:
# data_zh = data[data['language'] == 'zh'][['id', '标题', '正文', '发布时间', '来源']]
data_zh = data[data['language'] == 'zh'][['id', 'title', 'content', 'publishtime', 'site_name']]
data_zh.columns = ['id', 'title_raw', 'content_raw', 'publishtime', 'site_name']
data_zh = data_zh.iloc[:1000, :]

# 繁体转简体
data_zh['title'] = data_zh['title_raw'].apply(lambda x: Traditional2Simplified(x))
data_zh['content'] = data_zh['content_raw'].apply(lambda x: Traditional2Simplified(x))
print(data_zh.shape)
data_zh.iloc[:1, :]

(819, 7)


Unnamed: 0,id,title_raw,content_raw,publishtime,site_name,title,content
0,1,隔夜市场回顾,美股收盘重挫，道指跌近500点；美元基本持平；美国国债走高，受贸易紧张局势影响；油价下跌，投...,2019-05-08 07:50:00.0,华尔街日报中文网,隔夜市场回顾,美股收盘重挫，道指跌近500点；美元基本持平；美国国债走高，受贸易紧张局势影响；油价下跌，投...


In [133]:
# data_zh.loc[7914, :]

In [134]:
# file_path =  "mlUtil_MOE/corpus/corpus_zh_20190426070934.json"
file_path =  'mlUtil_MOE/corpus/corpus_zh_' + filename.replace('.xlsx', '.json')
print('file_path: ', file_path)
data_zh.to_excel(file_path.replace('.json', '.xlsx'), index = False)
with open(file_path,'w',encoding='utf-8') as json_file:
        json.dump(data_zh.to_dict('records'),json_file,ensure_ascii=False)  

file_path:  mlUtil_MOE/corpus/corpus_zh_7_20190508082141.json


In [22]:
# corpus_zh = parse_corpus_records(data_zh.to_dict('records'), language_type=None)
# dir(corpus_zh[0])
# corpus_zh[0].messageTitle.decode('utf-8')

In [23]:
# from sklearn.externals import joblib
# joblib.dump(corpus_zh, "mlUtil_MOE/corpus/corpus_en_20190426070934.pkl.z")

# 定时结果解析

## 中文聚类 DBSCAN

In [135]:
# file_path = "mlUtil_MOE/result/result_zh_20190426070934.xlsx"
file_path =  'mlUtil_MOE/result/result_zh_' + filename
print('file_path: ', file_path)
result_zh = pd.read_excel(file_path)
print('data num: ', data_zh.shape[0])
print('data cluster num: ', result_zh['cluster_member_count'].sum())
result_zh = pd.read_excel(file_path)
print(result_zh.shape)
print(result_zh.columns)
result_zh.iloc[:1, :]

file_path:  mlUtil_MOE/result/result_zh_7_20190508082141.xlsx
data num:  819
data cluster num:  819
(708, 17)
Index(['id', 'cluster_id', 'cluster_topic', 'cluster_begin', 'cluster_end',
       'group_id', 'cluster_member', 'language_type', 'cluster_member_count',
       'site_count', 'is_manual', 'manual_id', 'subtopic_id', 'create_time',
       'cluster_type', 'keyword_id', 'order_id'],
      dtype='object')


Unnamed: 0,id,cluster_id,cluster_topic,cluster_begin,cluster_end,group_id,cluster_member,language_type,cluster_member_count,site_count,is_manual,manual_id,subtopic_id,create_time,cluster_type,keyword_id,order_id
0,460918472,232,英业达4月营收392.09亿元 年增2.18%,2019-05-07 19:10:18.0,2019-05-07 23:30:36.0,3,2183^A2333^A2721,0,3,1,0,1,3,2019-05-08 15:31:52,0,16,1


In [136]:
print(result_zh['cluster_member_count'].sum())
result_zh[['cluster_id', 'cluster_member_count']]

819


Unnamed: 0,cluster_id,cluster_member_count
0,232,3
1,279,3
2,142,2
3,222,2
4,252,2
5,542,2
6,15,1
7,35,1
8,54,1
9,55,1


In [137]:
data_zh['cluster_id'] = -1
for index in result_zh.index:
    cluster_member_list = result_zh.loc[index, 'cluster_member'].split('^A')
    cluster_id = result_zh.loc[index, 'cluster_id']
#     cluster_topic = result_zh.loc[index, 'cluster_topic']
    for c_id in cluster_member_list:
        data_zh.loc[data_zh['id'] == int(c_id),'cluster_id'] = cluster_id
#         data_zh.loc[data_zh['id'] == int(c_id),'cluster_topic'] = cluster_topic
        
# data_zh = pd.merge(data_zh, result_zh.loc[:, ['cluster_id', 'cluster_topic', 
#                                               'cluster_member_count','site_count']], 
#                    on = 'cluster_id', how  = 'left')
# file_path = "mlUtil_MOE/result/raw_data_result_zh_20190426070934.xlsx"
file_path =  'mlUtil_MOE/result/raw_data_result_zh_' + filename
print('file_path: ', file_path)
data_zh.to_excel(file_path, index = False)
print(data_zh.shape)
data_zh.iloc[:1, :]

file_path:  mlUtil_MOE/result/raw_data_result_zh_7_20190508082141.xlsx
(819, 8)


Unnamed: 0,id,title_raw,content_raw,publishtime,site_name,title,content,cluster_id
0,1,隔夜市场回顾,美股收盘重挫，道指跌近500点；美元基本持平；美国国债走高，受贸易紧张局势影响；油价下跌，投...,2019-05-08 07:50:00.0,华尔街日报中文网,隔夜市场回顾,美股收盘重挫，道指跌近500点；美元基本持平；美国国债走高，受贸易紧张局势影响；油价下跌，投...,0


In [139]:
combined_zh = pd.merge(data_zh, result_zh.loc[:, ['cluster_id', 'cluster_topic', 
                                              'cluster_member_count','site_count']], 
                   on = 'cluster_id', how  = 'left')

file_path =  'mlUtil_MOE/result/combined_zh_' + filename
combined_zh.to_excel(file_path, index = False)

combined_zh.head()

Unnamed: 0,id,title_raw,content_raw,publishtime,site_name,title,content,cluster_id,cluster_topic,cluster_member_count,site_count
0,1,隔夜市场回顾,美股收盘重挫，道指跌近500点；美元基本持平；美国国债走高，受贸易紧张局势影响；油价下跌，投...,2019-05-08 07:50:00.0,华尔街日报中文网,隔夜市场回顾,美股收盘重挫，道指跌近500点；美元基本持平；美国国债走高，受贸易紧张局势影响；油价下跌，投...,0,隔夜市场回顾,1,1
1,2,TikTok为何让年轻人上瘾？,23岁的霍莉‧格蕾丝(Holly Grace)是田纳西州纳什维尔一名全职护士和业余歌手，当她...,2019-05-07 16:20:00.0,华尔街日报中文网,TikTok为何让年轻人上瘾？,23岁的霍莉‧格蕾丝(Holly Grace)是田纳西州纳什维尔一名全职护士和业馀歌手，当她...,1,TikTok为何让年轻人上瘾？,1,1
2,3,Facebook可将用户群组变成金矿,群组的意义对Facebook (FB)用户来说不言而喻，但对净利润并非如此。改变这点可能打开...,2019-05-08 08:05:00.0,华尔街日报中文网,Facebook可将用户群组变成金矿,群组的意义对Facebook (FB)用户来说不言而喻，但对净利润并非如此。改变这点可能打开...,2,Facebook可将用户群组变成金矿,1,1
3,4,贸易紧张局势加剧拖累美股重挫,美股跌势周二加剧，道琼斯指数下跌逾450点，投资者为美国越来越有可能在本周晚些时候提高对华关...,2019-05-08 07:55:00.0,华尔街日报中文网,贸易紧张局势加剧拖累美股重挫,美股跌势周二加剧，道琼斯指数下跌逾450点，投资者为美国越来越有可能在本周晚些时候提高对华关...,3,贸易紧张局势加剧拖累美股重挫,1,1
4,5,中国同意恢复与美国的贸易谈判,中国将派遣高级贸易特使前往华盛顿，恢复谈判，应对中国政府回避的美方要求，并将处理美国总统特朗...,2019-05-08 07:30:00.0,华尔街日报中文网,中国同意恢复与美国的贸易谈判,中国将派遣高级贸易特使前往华盛顿，恢复谈判，应对中国政府回避的美方要求，并将处理美国总统特朗...,4,中国同意恢复与美国的贸易谈判,1,1


In [131]:
data_zh['cluster_id'].value_counts()#.count()

587    8
255    6
124    5
319    4
279    3
232    3
396    3
225    3
151    3
574    3
243    3
251    3
399    3
315    3
534    3
23     3
633    2
469    2
78     2
66     2
281    2
69     2
156    2
388    2
644    2
646    2
542    2
289    2
382    2
222    2
      ..
443    1
442    1
441    1
440    1
438    1
437    1
436    1
434    1
455    1
458    1
479    1
460    1
478    1
477    1
476    1
475    1
474    1
473    1
472    1
471    1
470    1
468    1
467    1
466    1
465    1
464    1
463    1
462    1
461    1
0      1
Name: cluster_id, Length: 708, dtype: int64

In [38]:
# result_zh.loc[0, 'cluster_member'].split('^A')

In [39]:
# data_zh['cluster_id'] = -1
# data_zh.loc[data_zh['id'] == int('551'),'cluster_id'] = 1
# data_zh[data_zh['id'] == int('551')]

## 英文聚类

In [159]:
# file_path = "mlUtil_MOE/result/result_en_20190426070934.xlsx"
file_path =  'mlUtil_MOE/result/result_en_' + filename
print('file_path: ', file_path)
result_en_1 = pd.read_excel(file_path)
print('data num: ', result_en_1.shape[0])
print('data cluster num: ', result_en_1['cluster_member_count'].sum())
print(result_en_1.shape)
result_en_1.iloc[:1, :]

file_path:  mlUtil_MOE/result/result_en_7_20190508082141.xlsx
data num:  272
data cluster num:  939
(272, 17)


Unnamed: 0,id,cluster_id,cluster_topic,cluster_begin,cluster_end,cluster_member_count,site_count,cluster_type,language_type,group_id,is_manual,manual_id,subtopic_id,create_time,cluster_member,keyword_id,order_id
0,126286511,30,Teachers Can Eat for FREE at Chipotle Today. H...,2019-05-07 16:09:00.0,2019-05-08 03:48:22.0,4,3,1,1,3,0,2,3,2019-05-08 15:38:44,352^A82^A124^A214,16,1


In [152]:
# file_path = "mlUtil_MOE/result/result_en_20190426070934_sub.xlsx"
file_path =  'mlUtil_MOE/result/result_en_' + filename.replace('.xlsx', '_sub.xlsx')
print('file_path: ', file_path)
result_en = pd.read_excel(file_path)
print('data cluster num: ', result_en['cluster_member_count'].sum())
print(result_en.shape)
print(result_en.columns)
result_en.iloc[:1, :]

file_path:  mlUtil_MOE/result/result_en_7_20190508082141_sub.xlsx
data cluster num:  939
(544, 15)
Index(['cluster_id', 'cluster_topic', 'cluster_begin', 'cluster_end',
       'cluster_member', 'cluster_member_count', 'site_count',
       'cluster_result_id', 'cluster_type', 'language_type', 'group_id',
       'is_manual', 'manual_id', 'subtopic_id', 'create_time'],
      dtype='object')


Unnamed: 0,cluster_id,cluster_topic,cluster_begin,cluster_end,cluster_member,cluster_member_count,site_count,cluster_result_id,cluster_type,language_type,group_id,is_manual,manual_id,subtopic_id,create_time
0,49,2020 Dems take to Twitter to celebrate Teacher...,2019-05-08 03:48:22.0,2019-05-08 03:48:22.0,352,1,1,126286511,1,1,3,0,2,3,2019-05-08 15:38:44


In [None]:
for index in result_en.index:
    cluster_member_list = result_en.loc[index, 'cluster_member'].split('^A')
    for c_id in cluster_member_list:

In [157]:
data_en['cluster_id'] = -1
data_en['sub_cluster_id'] = -1
for index in result_en.index:
    cluster_member_list = result_en.loc[index, 'cluster_member'].split('^A')
    cluster_id = result_en.loc[index, 'cluster_result_id']
    sub_cluster_id = result_en.loc[index, 'cluster_id']
    sub_cluster_topic = result_en.loc[index, 'cluster_topic']
    sub_cluster_member_count = result_en.loc[index, 'cluster_member_count']
    sub_site_count = result_en.loc[index, 'site_count']
    for c_id in cluster_member_list:
        data_en.loc[data_en['id'] == int(c_id),'cluster_id'] = cluster_id        
        data_en.loc[data_en['id'] == int(c_id),'sub_cluster_id'] = sub_cluster_id
        data_en.loc[data_en['id'] == int(c_id),'sub_cluster_topic'] = sub_cluster_topic
        data_en.loc[data_en['id'] == int(c_id),'sub_cluster_member_count'] = sub_cluster_member_count
        data_en.loc[data_en['id'] == int(c_id),'sub_site_count'] = sub_site_count
        
data_en = data_en[data_en['cluster_id'] != -1]
# data_en = pd.merge(data_en, result_en_1.loc[:, ['id', 'cluster_topic']], how = 'left', 
#                                        left_on = 'cluster_id', right_on = 'id')
# file_path = "mlUtil_MOE/result/raw_data_result_en_20190426070934.xlsx"
file_path =  'mlUtil_MOE/result/raw_data_result_en_' + filename
print('file_path: ', file_path)
data_en.to_excel(file_path, index = False)
print(data_en.shape)
data_en.iloc[:1, :]

file_path:  mlUtil_MOE/result/raw_data_result_en_7_20190508082141.xlsx
(924, 10)


Unnamed: 0,id,title,content,publishtime,site_name,cluster_id,sub_cluster_id,sub_cluster_member_count,sub_site_count,sub_cluster_topic
18,19,Taiwan's National Security Is at Risk,Taiwan’s military must feel pressured on two f...,2019-05-08 04:55:09.0,国家利益杂志,932226311,492,2.0,2.0,Will Xi Jinping Blockade Taiwan?(taiwan/blocka...


In [162]:
combined_en = pd.merge(data_en, result_en_1.loc[:, ['id', 'cluster_topic', 
                                              'cluster_member_count','site_count']], 
                   left_on = 'cluster_id', right_on = 'id', how  = 'left')

file_path =  'mlUtil_MOE/result/combined_en_' + filename
combined_en.to_excel(file_path, index = False)

combined_en.head()

Unnamed: 0,id_x,title,content,publishtime,site_name,cluster_id,sub_cluster_id,sub_cluster_member_count,sub_site_count,sub_cluster_topic,id_y,cluster_topic,cluster_member_count,site_count
0,19,Taiwan's National Security Is at Risk,Taiwan’s military must feel pressured on two f...,2019-05-08 04:55:09.0,国家利益杂志,932226311,492,2.0,2.0,Will Xi Jinping Blockade Taiwan?(taiwan/blocka...,932226311,Will Xi Jinping Blockade Taiwan?(taiwan/jinpin...,2,2
1,20,"McConnell Says 'Case Closed,' But is It?",The two-word Republican mantra has changed fro...,2019-05-08 05:33:46.0,国家利益杂志,1844525250,432,8.0,5.0,Schumer: McConnell trying to 'whitewash' Muell...,1844525250,Schumer: McConnell trying to 'whitewash' Muell...,9,5
2,21,Could 1989 Have Led to Democracy in China?,There are two great “what if” questions in mod...,2019-05-08 05:35:04.0,国家利益杂志,1857878505,485,2.0,2.0,Could 1989 Have Led to Democracy in China?(chi...,1857878505,Could 1989 Have Led to Democracy in China?(chi...,4,4
3,22,Diplomacy Is the Best Way Forward with Iran,Editor’s Note: This statement was released tod...,2019-05-08 05:49:50.0,国家利益杂志,90215590,408,4.0,4.0,Iran may partially revive nuclear program(iran...,90215590,Secretary of State Pompeo Makes Surprise Trip ...,12,6
4,23,This Fighter Jet Terrorized North Korea: Check...,"By 1954, the superior swept-wing F-84F Thunder...",2019-05-07 22:48:50.0,国家利益杂志,1834530930,101,1.0,1.0,This Fighter Jet Terrorized North Korea: Check...,1834530930,This Fighter Jet Terrorized North Korea: Check...,2,1


In [83]:
print(data_en['cluster_id'].value_counts().count())
data_en['sub_cluster_id'].value_counts().count()
# data_en['cluster_id'].value_counts()

272


544

# 聚类分析

## 中文

In [40]:
clustering_data = data_zh[['id', 'title', 'content']]
clustering_data['title_content'] = clustering_data['title'] + '。' + clustering_data['content']
print(clustering_data.shape)
clustering_data.iloc[:1, :]

(1000, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,id,title,content,title_content
0,1,Facebook曾开白名单，一些公司可获得有关用户好友的额外信息,根据法院文件、公司官员和知情人士，Facebook Inc. (FB)和选定的一些公司签署了...,Facebook曾开白名单，一些公司可获得有关用户好友的额外信息。根据法院文件、公司官员和知...


### 预处理

In [41]:
from toolkits.nlp import pre_text

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.165 seconds.
Prefix dict has been built succesfully.


In [42]:
title_content_list = pre_text.handle_contents(clustering_data['title_content'].tolist())
print(len(title_content_list))
title_content_list[:2]

1000


['曾开 白名单 公司 获得 用户 好友 额外 信息 法院 文件 公司 官员 知情 人士 选定 公司 签署 订制 数据 分享 协议 协议 公司 年 之后 很长 一段时间 仍能 取得 用户 记录 社交 媒体 巨头 年 表示 不再 开发商 提供 信息 知情 人士 透露 内部 称做 白名单 未 披露 协议 公司 取得 用户 好友 额外 信息 人士 称 信息 包括 手机号码 一种 称做 好友 关联 指标 用于 衡量 用户 好友 之间 亲密 程度 订制 协议 独立 上周 披露 至少 家 设备 制造商 达成 数据 分享 协议 几名 议员 监管 机构 表示 需 进一步 调查 设备 制造商 协议 知情 人士 称 白名单 协议 理由 具有 价值 广告 客户 合作伙伴 签署 公司 包括 加拿大 皇家 银行 日产 汽车 协议 显示 特殊 数据 准入 权给 之前 所知 更 广泛 公司 引发 更 疑问 取得 数十亿 用户 数据 取得 数据 理由 国会 正 要求 数据 转移 负 责任 一名 发言人 表示 公司 开发商 签署 少量 协议 主要 提升 用户 体验 测试 新 功能 帮助 特定 合作伙伴 结束 之前 数据 分享 项目 发言人 表示 所有人 关闭 之后 少数 公司 取得 用户 朋友 数据 他称 获 展延 权限 长 达 数周 数月 至少 部分 公司 逾 一年 取得 数据 ',
 '隔夜 市场 回顾 美联储 重申 利率 保持 耐心 导致 美股 收跌 美元 走强 美国 国债 收益率 走高 黄金 下跌 美国 油价 收跌 美国 原油 库存 增加 铜价 下跌 以下 隔夜 各大 市场 收盘 综述 股票市场 美国股市 周三 收盘 下跌 此前 美联储 维持 利率 不变 重申 会 保持 耐心 通胀 近期 疲弱 道琼斯 指数 收盘 下跌 点至点 跌幅 去年 纪录 高点 跌 标普 指数 下跌 点至点 跌幅 纳斯达克 综合 指数 下跌 点至点 跌幅 欧洲 斯托克 指数 下跌 不到 交投 依旧 清淡 外汇市场 美元 周三 走高 美联储 主席 鲍威尔 称 最近 通胀 压力 疲软 可能 暂时 华尔街日报 美元 指数 上涨 指数 去年 月 最高 水平 附近 徘徊 投资者 关注 焦点 欧洲 经济 增长 放缓 迹象 美联储 官员 周三 维持 利率 不变 指出 第一季度 主要 经济 活动 放缓 美元 最初 应声 继续 下跌 鲍

In [43]:
id_list = clustering_data['id'].tolist()
print(len(id_list))
id_list[:2]

1000


[1, 2]

### featrue 

#### tfidf

In [44]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [45]:
vectorizer = CountVectorizer(max_df=0.95, min_df=2)
transformer = TfidfTransformer() # norm=None 不归一化

vec_count = vectorizer.fit_transform(title_content_list)
tfidf_matrix = transformer.fit_transform(vec_count)  

In [46]:
tfidf_matrix.shape

(1000, 9529)

#### N-gram

### DBSCAN

In [47]:
from sklearn import cluster

In [48]:
feature_matrix = tfidf_matrix.toarray()

In [49]:
clustering = cluster.DBSCAN(eps=1, min_samples=2).fit(feature_matrix)
labels = clustering.labels_
print('data num: ', feature_matrix.shape[0])
print('cluster num: ', len(np.unique(labels)))
clustering_metrics(labels, labels_true = None, feature = feature_matrix)

data num:  1000
cluster num:  170
轮廓系数:  0.2444755345626969
Calinski-Harabaz Index:  4.0066027212767565


In [50]:
pd.Series(labels).value_counts()

-1      401
 18      24
 10      21
 26      17
 163     16
 55      15
 52      15
 154     13
 51       9
 62       7
 56       7
 69       7
 49       7
 16       7
 15       6
 122      6
 80       6
 35       6
 91       6
 165      5
 64       5
 108      5
 40       5
 4        5
 158      5
 46       5
 54       4
 8        4
 88       4
 160      4
       ... 
 87       2
 101      2
 100      2
 99       2
 98       2
 97       2
 93       2
 74       2
 106      2
 72       2
 117      2
 131      2
 94       2
 127      2
 125      2
 124      2
 119      2
 118      2
 58       2
 60       2
 71       2
 116      2
 114      2
 113      2
 65       2
 111      2
 67       2
 110      2
 109      2
 84       2
Length: 170, dtype: int64

In [51]:
clustering = cluster.DBSCAN(eps=1, min_samples=1).fit(feature_matrix)
labels = clustering.labels_
print('data num: ', feature_matrix.shape[0])
print('cluster num: ', len(np.unique(labels)))
clustering_metrics(labels, labels_true = None, feature = feature_matrix)

data num:  1000
cluster num:  570
轮廓系数:  0.2685670252865887
Calinski-Harabaz Index:  4.457234919957061


In [52]:
pd.Series(labels).value_counts()

49     24
32     21
80     17
443    16
150    15
147    15
383    13
146     9
151     7
46      7
159     7
143     7
172     7
239     6
42      6
185     6
197     6
126     6
396     5
218     5
139     5
20      5
132     5
473     5
161     5
38      4
262     4
48      4
145     4
158     4
       ..
355     1
356     1
357     1
358     1
359     1
360     1
361     1
362     1
363     1
364     1
365     1
366     1
367     1
368     1
369     1
370     1
371     1
372     1
373     1
374     1
375     1
377     1
378     1
379     1
380     1
381     1
382     1
384     1
385     1
0       1
Length: 570, dtype: int64

#### 参数优化

In [53]:
import datetime

In [54]:
eps_list = [round(0.5 + 0.1*a, 1) for a in range(1, 10)] # [0.5, 0.7, 1.0, 1.5, 2.0]
min_list = [1, 2]
index = 0

result_list = []
for m in min_list:
    for eps in eps_list:
        index += 1
        silhouette_score = 0
        print('-------------  index: %s; min_s: %s; eps: %s'%(index, m, eps))
        starttime = datetime.datetime.now()
        clustering = cluster.DBSCAN(eps=eps, min_samples=m).fit(feature_matrix)
        endtime = datetime.datetime.now()
        elapse_time = (endtime - starttime).seconds
        print('elapse_time: %s s'%(elapse_time))
        labels = clustering.labels_
        print('data num: ', feature_matrix.shape[0])
        print('cluster num: ', len(np.unique(labels)))
        if len(np.unique(labels)) > 1:
            clustering_metrics(labels, labels_true = None, feature = feature_matrix)
            silhouette_score = metrics.silhouette_score(feature_matrix, labels, metric='euclidean')
            
        result_list.append([index, m, eps, elapse_time, feature_matrix.shape[0], 
                            len(np.unique(labels)), silhouette_score])

-------------  index: 1; min_s: 1; eps: 0.6
elapse_time: 14 s
data num:  1000
cluster num:  797
轮廓系数:  0.29928072503559006
Calinski-Harabaz Index:  40.10177612026728
-------------  index: 2; min_s: 1; eps: 0.7
elapse_time: 19 s
data num:  1000
cluster num:  781
轮廓系数:  0.30271698821147014
Calinski-Harabaz Index:  24.857388995254393
-------------  index: 3; min_s: 1; eps: 0.8
elapse_time: 26 s
data num:  1000
cluster num:  742
轮廓系数:  0.3099784381993435
Calinski-Harabaz Index:  13.519989104178558
-------------  index: 4; min_s: 1; eps: 0.9
elapse_time: 31 s
data num:  1000
cluster num:  668
轮廓系数:  0.3156745263788334
Calinski-Harabaz Index:  7.268661711796795
-------------  index: 5; min_s: 1; eps: 1.0
elapse_time: 41 s
data num:  1000
cluster num:  570
轮廓系数:  0.2685670252865887
Calinski-Harabaz Index:  4.457234919957061
-------------  index: 6; min_s: 1; eps: 1.1
elapse_time: 41 s
data num:  1000
cluster num:  419
轮廓系数:  0.16333324995210663
Calinski-Harabaz Index:  2.4237023582889257
----

In [66]:
[round(0.5 + 0.1*a, 1) for a in range(1, 10)]

[0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4]

In [65]:
round(0.3002, 1)

0.3

#### combined result

In [55]:
data_zh.shape

(1000, 8)

In [60]:
clustering = cluster.DBSCAN(eps=0.9, min_samples=1).fit(feature_matrix)
labels = clustering.labels_
print('data num: ', feature_matrix.shape[0])
print('cluster num: ', len(np.unique(labels)))
clustering_metrics(labels, labels_true = None, feature = feature_matrix)

data num:  1000
cluster num:  668
轮廓系数:  0.3156745263788334
Calinski-Harabaz Index:  7.268661711796795


In [61]:
data_zh['label'] = labels
file_path =  'mlUtil_MOE/result/new_raw_data_result_zh_' + filename
print('file_path: ', file_path)
data_zh.to_excel(file_path, index = False)
print(data_zh.shape)
data_zh.iloc[:1, :]

file_path:  mlUtil_MOE/result/new_raw_data_result_zh_7_20190507161908.xlsx
(1000, 9)


Unnamed: 0,id,title_raw,content_raw,publishtime,site_name,title,content,cluster_id,label
0,1,Facebook曾开白名单，一些公司可获得有关用户好友的额外信息,根据法院文件、公司官员和知情人士，Facebook Inc. (FB)和选定的一些公司签署了...,2019-05-07 10:16:00.0,华尔街日报中文网,Facebook曾开白名单，一些公司可获得有关用户好友的额外信息,根据法院文件、公司官员和知情人士，Facebook Inc. (FB)和选定的一些公司签署了...,-1,0
