# 本文件说明
- 数据库里导出数据，本地模型、线上模型测试

# 基本设置

In [1]:
import numpy as np
import pandas as pd

import os

import requests,json
from sklearn.externals import joblib

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
from toolkits.setup.date_time import get_day_list
from toolkits.setup import specific_func

from toolkits.nlp import pre_cor_circ
from toolkits.nlp import pre_cor_cbrc

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.927 seconds.
Prefix dict has been built succesfully.


## 一些函数

In [3]:
def get_server_res_yjh(data, url, col_name):
    '''
    服务器接口测试程序
    传入 dict, 传出 DataFrame
    '''
    # data = {'record':[{'id':0,'title':'ss','content':'zzz'},]}
    # data = {"record":marked_human_data.iloc[:5,:3].to_dict(orient = 'records')}
    # url "http://47.93.77.19:10000/correlation_negative"
    headers={'content-type':'application/json'}
    result = requests.post(url,
                      data = json.dumps(data),
                      headers=headers, allow_redirects=True)
    # print(result.text)
    json_data = json.loads(result.text)
    parse_data = []
    elapsed_time = json_data['elapsed_time']
    for i in range(len(json_data['docs'])):
        parse_data.append([json_data['docs'][i]['id'],
                          json_data['docs'][i][col_name]])
    parse_data = pd.DataFrame(parse_data, columns = ['id', col_name])    
    return parse_data , elapsed_time

In [4]:
def get_serve_data_yjh(day_list, sql_one_day, url, col_name, save_filename):    
    chunksize = 1000
    for day_select in day_list:
        print('-- day_select: ', day_select)
        mysql_data = pd.read_sql(eval(sql_one_day), engine, chunksize= chunksize)
        num = 1
        combined_data = pd.DataFrame()
        for tmp_data in mysql_data:  
            print('---- loop num: ', num, 'tmp_data: ', tmp_data.shape)
            data = {"record":tmp_data.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
            parse_data = get_server_res_yjh(data, url, col_name)

            parse_data.columns = ['id', 'predict_label']
            
            parse_data['label'] = ''
            combined_tmp = pd.merge(parse_data, tmp_data, on = 'id', how = 'inner')
            combined_data = pd.concat([combined_tmp, combined_data])

        combined_data['predict_label'] = combined_data['predict_label'].apply(lambda x:class_name_dict[x])
        combined_data['group_id'] = combined_data['group_id'].apply(lambda x:group_dict[str(x)])
        combined_data.to_excel(eval(save_filename), index = False)
        print(combined_data['predict_label'].value_counts())

In [5]:
def get_server_res(data, url, col_name):
    '''
    服务器接口测试程序
    传入 dict, 传出 DataFrame
    '''
    # data = {'record':[{'id':0,'title':'ss','content':'zzz'},]}
    # data = {"record":marked_human_data.iloc[:5,:3].to_dict(orient = 'records')}
    # url "http://47.93.77.19:10000/correlation_negative"
    headers={'content-type':'application/json'}
    result = requests.post(url,
                      data = json.dumps(data),
                      headers=headers, allow_redirects=True)
    # print(result.text)
    json_data = json.loads(result.text)
    parse_data = []
    elapsed_time = json_data['elapsed_time']
    for i in range(len(json_data['docs'])):
        parse_data.append([json_data['docs'][i]['id'],
                          json_data['docs'][i][col_name]])
    parse_data = pd.DataFrame(parse_data, columns = ['id', col_name])    
    return parse_data, elapsed_time

In [6]:
def get_serve_data(day_list, sql_one_day, url, col_name):
    combined_data = pd.DataFrame()
    for day_select in day_list:
        print('-- day_select: ', day_select)
        mysql_data = pd.read_sql(eval(sql_one_day), engine)
        print('去空值前：', mysql_data.shape)
        mysql_data = mysql_data.drop_duplicates(subset = ['title', 'content'])
        print('去空值后：', mysql_data.shape)
        data = {"record":mysql_data.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
        
        parse_data, elapsed_time = get_server_res(data, url)
        print('elapsed_time: ', elapsed_time)
        
        parse_data.columns = ['id', 'predict_label']
        parse_data['predict_label'] = parse_data['predict_label'].apply(lambda x:class_name_dict[x])
        parse_data['label'] = ''
        combined_cor = pd.merge(parse_data, mysql_data, on = 'id', how = 'inner')
        combined_data = pd.concat([combined_data, combined_cor], axis = 0)

        print(combined_cor['predict_label'].value_counts())
    return combined_data

# 基本信息

In [7]:
label_dic={'补录':0,'监管':1,'行业':2,'产品销售':3,'资本市场':4,'公司内部管理':5,
           '消费服务':6,'其他相关报道':7,'噪音':8,'交通':9,'环保':10}
class_name_dict = {v: k for k, v in label_dic.items()}
class_name_dict

{0: '补录',
 1: '监管',
 2: '行业',
 3: '产品销售',
 4: '资本市场',
 5: '公司内部管理',
 6: '消费服务',
 7: '其他相关报道',
 8: '噪音',
 9: '交通',
 10: '环保'}

In [8]:
group = '1-新闻，2-论坛，3-博客，4-微博，5-纸媒，6-视频，7-外媒，8-广播，9-电视，11-微信，13-新闻客户端，15-推特'
group_dict = dict([x.split('-') for x in group.split('，')])
group_dict

{'1': '新闻',
 '11': '微信',
 '13': '新闻客户端',
 '15': '推特',
 '2': '论坛',
 '3': '博客',
 '4': '微博',
 '5': '纸媒',
 '6': '视频',
 '7': '外媒',
 '8': '广播',
 '9': '电视'}

In [9]:
proj_dic={'银监会':1,'保监会':2,'中国人寿':3,'建行北分':4,'中国人保':5}
proj_name_dict = {v: k for k, v in proj_dic.items()}
proj_name_dict

{1: '银监会', 2: '保监会', 3: '中国人寿', 4: '建行北分', 5: '中国人保'}

In [10]:
gather_type_dic={'系统采集':0,'补录':1,'校正':2,'导入数据':3,'其它':4}
gather_type_name_dict = {v: k for k, v in gather_type_dic.items()}
gather_type_name_dict

{0: '系统采集', 1: '补录', 2: '校正', 3: '导入数据', 4: '其它'}

In [11]:
file_path = 'cbirc_result\pom.json'

with open(file_path,'r',encoding='utf-8-sig') as json_file:
    cbrc_data = json.load(json_file)  
    
# cbrc_data = pd.DataFrame.from_dict(json_data['record'], orient='index' ) 
# cbrc_data.shape

In [12]:
# from langconv import *
from toolkits.nlp.langconv import *

def Traditional2Simplified(sentence):
    '''
    将sentence中的繁体字转为简体字
    :param sentence: 待转换的句子
    :return: 将句子中繁体字转换为简体字之后的句子
    '''
    sentence = Converter('zh-hans').convert(sentence)
    return sentence


In [13]:
index = 481
# data = {"record":[cbrc_data['record'][index],]}
# url = "http://47.93.77.19:6001/judge_correlation_yjh"
data = {"record":[{'id':'1', 
                   'title': Traditional2Simplified(data['record'][0]['title']),  
                   'content': Traditional2Simplified(data['record'][0]['content'])},]}
url = "http://192.168.0.104:8100/judge_correlation_yjh"
col_name = 'sec'

parse_data , elapsed_time = get_server_res_yjh(data, url, col_name)
print(index, '  耗时：%s s'%(elapsed_time))

NameError: name 'data' is not defined

In [None]:
data['record'][0]['content']

In [None]:
{"record":[cbrc_data['record'][152:155],]}

In [None]:
{"record":[cbrc_data['record'][154],]}

In [None]:
# data = {"record":combined_data.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
import time
for index in range(len(cbrc_data['record'])):
#     data = {"record":[cbrc_data['record'][index],]}
#     url = "http://47.93.77.19:6001/judge_correlation_yjh"
    data = {"record":[{'id':cbrc_data['record'][index]['id'], 
                       'title': Traditional2Simplified(cbrc_data['record'][index]['title']),  
                       'content': Traditional2Simplified(cbrc_data['record'][index]['content'])},]}
    url = "http://192.168.0.104:8100/judge_correlation_yjh"
    col_name = 'sec'
    
    parse_data , elapsed_time = get_server_res_yjh(data, url, col_name)
    print(index, '  耗时：%s s'%(elapsed_time))

In [None]:
import time
for index in range(len(cbrc_data['record'])):
    data = {"record":[cbrc_data['record'][index],]}
#     url = "http://47.93.77.19:6001/judge_correlation_yjh"
    url = "http://192.168.0.104:8100/judge_correlation_yjh"
    col_name = 'sec'
    
    parse_data , elapsed_time = get_server_res_yjh(data, url, col_name)
    print(index, '  耗时：%s s'%(elapsed_time))

In [None]:
data = {"record":cbrc_data['record']}
#     url = "http://47.93.77.19:6001/judge_correlation_yjh"
url = "http://192.168.0.104:8100/judge_correlation_yjh"
col_name = 'sec'

parse_data , elapsed_time = get_server_res_yjh(data, url, col_name)
print(index, '  耗时：%s s'%(elapsed_time))

In [None]:
parse_data

In [None]:
# data = {"record":combined_data.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
import time
for index in range(len(cbrc_data['record'])):
    data = {"record":[cbrc_data['record'][index],]}
#     url = "http://47.93.77.19:6001/judge_correlation_yjh"
    url = "http://192.168.0.104:8100/judge_correlation_yjh"
    col_name = 'sec'
    
    parse_data , elapsed_time = get_server_res_yjh(data, url, col_name)
    print(index, '  耗时：%s s'%(elapsed_time))

In [None]:
data = {"record":[cbrc_data['record'][0], ]}
data

In [None]:
cbrc_data['record'][0]
len(cbrc_data['record'])

# 保险业--旧

In [43]:
engine = specific_func.get_engine('circ')

## mysql 数据

### 八分类

In [42]:
# day_select = '2018-09-09'
day_list = get_day_list('2018-09-01', '2018-12-03')
print(day_list)

['2018-09-02', '2018-09-03', '2018-09-04', '2018-09-05', '2018-09-06', '2018-09-07', '2018-09-08', '2018-09-09', '2018-09-10', '2018-09-11', '2018-09-12', '2018-09-13', '2018-09-14', '2018-09-15', '2018-09-16', '2018-09-17', '2018-09-18', '2018-09-19', '2018-09-20', '2018-09-21', '2018-09-22', '2018-09-23', '2018-09-24', '2018-09-25', '2018-09-26', '2018-09-27', '2018-09-28', '2018-09-29', '2018-09-30', '2018-10-01', '2018-10-02', '2018-10-03', '2018-10-04', '2018-10-05', '2018-10-06', '2018-10-07', '2018-10-08', '2018-10-09', '2018-10-10', '2018-10-11', '2018-10-12', '2018-10-13', '2018-10-14', '2018-10-15', '2018-10-16', '2018-10-17', '2018-10-18', '2018-10-19', '2018-10-20', '2018-10-21', '2018-10-22', '2018-10-23', '2018-10-24', '2018-10-25', '2018-10-26', '2018-10-27', '2018-10-28', '2018-10-29', '2018-10-30', '2018-10-31', '2018-11-01', '2018-11-02', '2018-11-03', '2018-11-04', '2018-11-05', '2018-11-06', '2018-11-07', '2018-11-08', '2018-11-09', '2018-11-10', '2018-11-11', '2018

#### 获取数据--系统采集

In [28]:
gather_types = '采集'

for day_select in day_list:
    print('-- day_select: ', day_select)

    # 相关数据
    sql_one_day = "select t1.id, t1.group_id,t1.classify as predict_label,\
                        t1.title,t2.center as content, t1.publishtime as publishtime \
                        from wise_web_docinfo t1, wise_web_docinfo_center t2 \
                            where t1.id=t2.doc_id \
                                  and  date_format(t1.publishtime, '%%Y-%%m-%%d') = '{0}' \
                                  and t1.gather_type = 0 \
                                  group by t1.titlehash".format(day_select) # 
    # # titlehash 去重后
    circ_cor = pd.read_sql(sql_one_day, engine)
    print('circ_cor: ', circ_cor.shape  )
    
    # 不相关数据
    sql_one_day = "select t1.id, t1.group_id,t1.title,t2.center as content, t1.publishtime as publishtime \
                        from wise_web_docinfo_uncorr t1, wise_web_docinfo_center_uncurr t2 \
                            where t1.id=t2.doc_id \
                                  and t1.publishtime >= '{0} 8:00:00' \
                              and t1.publishtime <= '{0} 14:00:00'".format(day_select)
    # 一段时间
    circ_uncor = pd.read_sql(sql_one_day, engine)
    circ_uncor.insert(2, 'predict_label', 8) # 噪音
    print('circ_uncor: ', circ_uncor.shape)

    circ_data = pd.concat([circ_cor, circ_uncor], axis = 0)
    print('去重前：', circ_data.shape)
    circ_data = circ_data.drop_duplicates(subset = 'title')
    print('去重后：', circ_data.shape)  
    circ_data = circ_data.dropna(subset = ['content'], axis = 0)
    print('去空值后：', circ_data.shape)  

    circ_data['predict_label'] = circ_data['predict_label'].apply(lambda x:class_name_dict[x])
    circ_data['group_id'] = circ_data['group_id'].apply(lambda x:group_dict[str(x)])
    circ_data.insert(3, 'label', '')
    fea_filename = 'circ_result_class/result/%s_circ_class_predict_mysql_%s.xlsx'%(gather_types, day_select)
    circ_data.to_excel(fea_filename, index = False)
    print(circ_data.shape)
    print(circ_data['predict_label'].value_counts())

-- day_select:  2018-11-22
circ_cor:  (8462, 6)
circ_uncor:  (25505, 6)
去重前： (33967, 6)
去重后： (22798, 6)
去空值后： (22798, 6)
(22798, 7)
噪音        14336
资本市场       2472
消费服务       1247
监管         1043
产品销售       1030
行业         1003
其他相关报道      929
公司内部管理      736
补录            2
Name: predict_label, dtype: int64
-- day_select:  2018-11-23
circ_cor:  (9350, 6)
circ_uncor:  (26999, 6)
去重前： (36349, 6)
去重后： (23964, 6)
去空值后： (23964, 6)
(23964, 7)
噪音        14614
资本市场       3130
监管         1256
消费服务       1113
其他相关报道     1038
行业         1021
产品销售        962
公司内部管理      825
补录            5
Name: predict_label, dtype: int64


#### 合并 & 保存

In [29]:
combined_data = pd.DataFrame()
for day_select in day_list:
    file_name = 'circ_result_class/result/%s_circ_class_predict_mysql_%s.xlsx'%(gather_types, day_select)
    if os.path.isfile(file_name):
        print(file_name)
        tmp_data = pd.read_excel(file_name)
        combined_data = pd.concat([combined_data, tmp_data], axis = 0)

if gather_types != '补录':
    combined_data = combined_data[combined_data['predict_label'] != '补录']
print(combined_data.shape)  
print('去重前：', combined_data.shape)
combined_data = combined_data.drop_duplicates(subset = 'title')
print('去重后：', combined_data.shape)  
combined_data = combined_data.dropna(subset = ['content'], axis = 0)
print('去空值后：', combined_data.shape)  

print(combined_data['predict_label'].value_counts())
combined_data.head()

(46755, 7)
去重前： (46755, 7)
去重后： (45047, 7)
去空值后： (45003, 7)
噪音        28088
资本市场       5411
消费服务       2261
监管         2164
其他相关报道     1889
行业         1886
产品销售       1833
公司内部管理     1471
Name: predict_label, dtype: int64


Unnamed: 0,id,group_id,predict_label,label,title,content,publishtime
0,12252939,微信,消费服务,,交通事故发生后，受害人治疗其他疾病能否得到赔偿？,【新疆巴州律师魏娜为您提供专业的法律咨询，联系电话：138 9900 7303】 前言 ...,2018-11-22 09:35:37
1,12256481,新闻,其他相关报道,,西藏银保监局筹备组关于核准中国农业发展银行西藏自治区分行熊壮任职资格的批复,西藏银保监局筹备组关于核准中国农业发展银行西藏自治区分行熊壮任职资格的批复 藏银保监（筹）〔...,2018-11-22 00:00:00
2,12252533,新闻,监管,,银行“理财子公司”来了,银行“理财子公司”来了 经济日报·中国经济网记者 郭子源 设立理财子公司开展资管业...,2018-11-22 08:54:00
3,12250479,微信,产品销售,,和优秀的人在一起，有多重要？,- 欢迎关注 - 知乎上有个话题，你的舍友能上进到什么地步？ 底下近千条的答案，都在述说着许...,2018-11-22 08:02:48
4,12248558,新闻,其他相关报道,,复星保德信人寿总裁储良荣获2018年度中国保险业“教育培训年度贡献者”称号,(图片)(图片)11月21日，由中国保险行业协会主办的2018年度中国保险业“教育培训年度贡...,2018-11-22 06:15:48


In [30]:
fea_filename = 'circ_result_class/result/%s_circ_class_predict_mysql_20181126(1122-1123).xlsx'%gather_types
# sel_col = ['行业','资本市场', '消费服务', '公司内部管理', '监管']
# sel_col = ['其他相关报道','行业',  '公司内部管理', '监管']
sel_col = combined_data['predict_label'].unique().tolist()
sel_data = combined_data[combined_data['predict_label'].isin(sel_col)]
print(sel_data['predict_label'].value_counts())
with pd.ExcelWriter(fea_filename) as writer:
    for label in sel_data['predict_label'].unique():
        tmp_data = sel_data[sel_data['predict_label'] == label]
        if gather_types == '补录':
            N = tmp_data.shape[0]
        else :
            if tmp_data.shape[0] > 100:
                N = 100
            else :
                N = tmp_data.shape[0]
    #         if label == '公司内部管理': 
    #             N = 200
        tmp_data.sample(n = N, axis = 0, random_state=3).to_excel(writer,label, index = False)
    
    writer.save()

噪音        28088
资本市场       5411
消费服务       2261
监管         2164
其他相关报道     1889
行业         1886
产品销售       1833
公司内部管理     1471
Name: predict_label, dtype: int64


### 倾向性

In [None]:
# day_select = '2018-09-09'
day_list = get_day_list('2018-11-05', '2018-11-08')
print(day_list)

#### 获取数据

In [None]:
for day_select in day_list:
    print('-- day_select: ', day_select)

    # 相关数据
    sql_one_day = "select t1.id, t1.group_id,t1.classify as predict_label, t1.tendency,\
                        t1.title,t2.center as content, t1.publishtime as publishtime \
                        from wise_web_docinfo t1, wise_web_docinfo_center t2 \
                            where t1.id=t2.doc_id \
                                  and  date_format(t1.publishtime, '%%Y-%%m-%%d') = '{0}' \
                                  and t1.gather_type = 0 \
                                  group by t1.titlehash".format(day_select) # 
    # # titlehash 去重后
    circ_cor = pd.read_sql(sql_one_day, engine)
    print('circ_cor: ', circ_cor.shape  )

    circ_data = circ_cor
    print('去重前：', circ_data.shape)
    circ_data = circ_data.drop_duplicates(subset = 'title')
    print('去重后：', circ_data.shape)  
    circ_data = circ_data.dropna(subset = ['content'], axis = 0)
    print('去空值后：', circ_data.shape)  

    circ_data['predict_label'] = circ_data['predict_label'].apply(lambda x:class_name_dict[x])
    circ_data['group_id'] = circ_data['group_id'].apply(lambda x:group_dict[str(x)])
    circ_data.insert(4, 'label', '')
    fea_filename = 'circ_result_tendency/result/circ_tendency_predict_mysql_%s.xlsx'%day_select
    circ_data.to_excel(fea_filename, index = False)
    print(circ_data.shape)
    
    print(circ_data.pivot_table(index = ['tendency'], columns = ['predict_label'], 
                          values = 'title', aggfunc=len, 
                          fill_value=0, margins=True))

#### 合并 & 保存

In [None]:
combined_data = pd.DataFrame()
for day_select in day_list:
    tmp_data = pd.read_excel('circ_result_tendency/result/circ_tendency_predict_mysql_%s.xlsx'%day_select)
    combined_data = pd.concat([combined_data, tmp_data], axis = 0)

combined_data = combined_data[combined_data['predict_label'] != '补录']
print(combined_data.shape)  
print('去重前：', combined_data.shape)
combined_data = combined_data.drop_duplicates(subset = 'title')
print('去重后：', combined_data.shape)  
combined_data = combined_data.dropna(subset = ['content'], axis = 0)
print('去空值后：', combined_data.shape)  

print(combined_data['tendency'].value_counts())
combined_data.pivot_table(index = ['tendency', 'group_id'], 
                                columns = ['predict_label'], 
                                values = 'title', aggfunc=len, 
                                fill_value=0, margins=True)
# combined_data.head()

In [None]:
fea_filename = 'circ_result_tendency/result/circ_tendency_predict_mysql_20181112(1106-1108).xlsx'
# sel_col = combined_data['predict_label'].unique().tolist()
# sel_data = combined_data[combined_data['tendency'].isin(sel_col)]
print(combined_data['tendency'].value_counts())

N = 200 # 每类 N 条数据
class_n = int(combined_data['predict_label'].unique().shape[0])
n = int(N / class_n) + 20

print('正负各 %s 条，共 %s 类， 每类各 %s 条'%(N, class_n, n))
with pd.ExcelWriter(fea_filename) as writer:
    for tendency in combined_data['tendency'].unique():
        tmp_data = pd.DataFrame()
        sel_data = combined_data[combined_data['tendency'] == tendency]        
        for predict_label in combined_data['predict_label'].unique():
            label_data = sel_data[sel_data['predict_label'] == predict_label]
            if label_data.shape[0] > n:
                sel_label_data = label_data.sample(n = n, axis = 0, random_state=3)
            else :
                sel_label_data = label_data
            tmp_data = pd.concat([tmp_data, sel_label_data], axis = 0)        
            print('tendency: %s, predict_label: %s, size: %s'%(tendency, predict_label, tmp_data.shape))
        
        if tmp_data.shape[0] > N:
            t_n = N
        else :
            t_n = tmp_data.shape[0]
        
        tmp_data = tmp_data.sample(n = N, axis = 0, random_state=3)
        tmp_data.to_excel(writer,str(tendency), index = False)
        print(tmp_data.pivot_table(index = ['tendency'], 
                                    columns = ['predict_label'], 
                                    values = 'title', aggfunc=len, 
                                    fill_value=0, margins=True))    
    writer.save()

### 补录数据

In [58]:
# 人工补录
sql_human_additional = "select t1.id, t1.group_id, date_format(t1.publishtime,'%%Y-%%m-%%d') as publishtime,  \
                            t1.gather_type, t1.tendency,t1.classify as mysql_label, \
                            t1.title, t2.center as content\
                            from wise_web_docinfo t1, wise_web_docinfo_center t2 \
                                where (date_format(publishtime, '%%Y-%%m-%%d') >= '{0}' and \
                                      date_format(publishtime, '%%Y-%%m-%%d') <= '{1}') and \
                                      t1.id = t2.doc_id and \
                                      t1.gather_type in (1,3) \
                            group by t1.titlehash".format('2018-09-16', '2018-12-03') 

human_additional = pd.read_sql(sql_human_additional, engine)
human_additional['group_id'] = human_additional['group_id'].apply(lambda x:group_dict[str(x)])
print('title 去重前：', human_additional.shape)
human_additional = human_additional.drop_duplicates(subset = 'title')
print('title 去重后：', human_additional.shape)  
human_additional = human_additional.drop_duplicates(subset = ['content'])
print('content 去重后：', human_additional.shape)  
human_additional = human_additional.dropna(subset = ['title'], axis = 0)
print('title 去空值后：', human_additional.shape) 
human_additional.head()

title 去重前： (573, 8)
title 去重后： (573, 8)
content 去重后： (573, 8)
title 去空值后： (573, 8)


Unnamed: 0,id,group_id,publishtime,gather_type,tendency,mysql_label,title,content
0,10933233,微信,2018-09-20,1,0,0,外资股东全搜罗：隐身中资险企，财险、寿险市场份额双双接近10%,(图片)\n\n说到外资在国内保险市场的表现，很多人首先就会想到市场份额低这一点。确实，自...
1,10910821,微信,2018-09-18,1,0,0,从“天鸽”到“山竹”，保险业用这些方法更好的“管住”风险,(图片)\n\n台风“山竹”在广东西部沿海过境已经3天，得益于国家及地方政府相关部门的及早...
2,11483454,微信,2018-10-20,1,-1,0,突发，上市公司举报“平安养老风控总监伪造公章夺控制权”,"\n\t\t\t\t\t 关于<font color=""#FF0000"">平安养老保险</..."
3,11666987,微信,2018-10-26,1,-1,0,上半年持续亏损 董事长遭逮捕华安保险精达股份玩起“二人转” | 保险,"\n\t\t\t\t\t 特华投资与<font color=""#FF0000"">华安保险<..."
4,11765888,纸媒,2018-10-31,1,0,0,2018年10月31日--视点--业内专家分析如何保障重疾险消费者权益,"\n\t\t\t\t\t \n\t\t\t\t\t今年6月,中国<font color=""..."


In [59]:
human_additional['title'] = human_additional['title'].astype(str) 
human_additional['content'] = human_additional['content'].astype(str)
data = {"record":human_additional.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
url = "http://47.93.77.19:10000/judge_correlation_i"
col_name = 'cor'
parse_data, elapsed_time = get_server_res(data, url, col_name)
parse_data.columns = ['id', 'predict_label']
human_additional = pd.merge(human_additional, parse_data, on = 'id', how = 'left')
human_additional['predict_label'] = human_additional['predict_label'].apply(lambda x:class_name_dict[x])
human_additional.insert(6, 'label', '')
print(human_additional['predict_label'].value_counts())
human_additional.head()

行业        180
监管        109
噪音        106
公司内部管理    102
资本市场       44
消费服务       20
产品销售       12
Name: predict_label, dtype: int64


Unnamed: 0,id,group_id,publishtime,gather_type,tendency,mysql_label,label,title,content,predict_label
0,10933233,微信,2018-09-20,1,0,0,,外资股东全搜罗：隐身中资险企，财险、寿险市场份额双双接近10%,(图片)\n\n说到外资在国内保险市场的表现，很多人首先就会想到市场份额低这一点。确实，自...,行业
1,10910821,微信,2018-09-18,1,0,0,,从“天鸽”到“山竹”，保险业用这些方法更好的“管住”风险,(图片)\n\n台风“山竹”在广东西部沿海过境已经3天，得益于国家及地方政府相关部门的及早...,行业
2,11483454,微信,2018-10-20,1,-1,0,,突发，上市公司举报“平安养老风控总监伪造公章夺控制权”,"\n\t\t\t\t\t 关于<font color=""#FF0000"">平安养老保险</...",资本市场
3,11666987,微信,2018-10-26,1,-1,0,,上半年持续亏损 董事长遭逮捕华安保险精达股份玩起“二人转” | 保险,"\n\t\t\t\t\t 特华投资与<font color=""#FF0000"">华安保险<...",公司内部管理
4,11765888,纸媒,2018-10-31,1,0,0,,2018年10月31日--视点--业内专家分析如何保障重疾险消费者权益,"\n\t\t\t\t\t \n\t\t\t\t\t今年6月,中国<font color=""...",噪音


In [60]:
fea_filename = 'circ_result_class/result/补录_保监会（旧）_class_predict_mysql_20181203(0917-1203).xlsx'
print(fea_filename)

sel_col = ['噪音', '消费服务', '公司内部管理', '监管',
           '行业', '资本市场', '其他相关报道','产品销售','交通','环保']
sel_data = human_additional[human_additional['predict_label'].isin(sel_col)]
print(sel_data['predict_label'].value_counts())
print()

c_data = pd.DataFrame()
with pd.ExcelWriter(fea_filename) as writer:
    for label in sel_data['predict_label'].unique():
        tmp_data = sel_data[sel_data['predict_label'] == label]
        N = tmp_data.shape[0]            
        save_data = tmp_data.sample(n = N, axis = 0, random_state=42)
        save_data.to_excel(writer,label, index = False)
        c_data = pd.concat([c_data, save_data], axis = 0)
    print(c_data.pivot_table(index = ['group_id'], 
                                columns = ['predict_label'], 
                                values = 'title', aggfunc=len, 
                                fill_value=0, margins=True))      
    writer.save()

circ_result_class/result/补录_保监会（旧）_class_predict_mysql_20181203(0917-1203).xlsx
行业        180
监管        109
噪音        106
公司内部管理    102
资本市场       44
消费服务       20
产品销售       12
Name: predict_label, dtype: int64

predict_label  产品销售  公司内部管理   噪音  消费服务   监管   行业  资本市场  All
group_id                                                   
微信                3      23   27     2   26   69    11  161
微博                0       0    1     0    0    0     0    1
新闻                9      72   64    15   65   87    31  343
新闻客户端             0       4    3     0    7    5     1   20
纸媒                0       3    3     1   11   18     1   37
视频                0       0    8     1    0    1     0   10
论坛                0       0    0     1    0    0     0    1
All              12     102  106    20  109  180    44  573


In [61]:
fea_filename = 'circ_result_tendency/result/补录_保监会（旧）_tendency_predict_mysql_20181203(0917-1203).xlsx'
print(fea_filename)
print(human_additional['tendency'].value_counts())

with pd.ExcelWriter(fea_filename) as writer:
    for tendency in human_additional['tendency'].unique():
        sel_data = human_additional[human_additional['tendency'] == tendency]    
        t_n = sel_data.shape[0]        
        tmp_data = sel_data.sample(n = t_n, axis = 0, random_state=3)
        tmp_data.to_excel(writer,str(tendency), index = False)           
    writer.save()

circ_result_tendency/result/补录_保监会（旧）_tendency_predict_mysql_20181203(0917-1203).xlsx
 0    381
-1    192
Name: tendency, dtype: int64


## 本地模型

### 八分类

In [None]:
from sklearn.externals import joblib
pipeline_old = joblib.load( "model/circ_8classifier_1015.pkl.z")

In [None]:
combined_data['title_content'] = combined_data['title'].astype(str) + '。' + combined_data['content'].astype(str)
title_content = pre_cor_circ.handle_contents(combined_data['title_content'].tolist())
print(len(title_content))

In [None]:
local_label = pipeline_old.predict(title_content)
local_proba = pipeline_old.predict_proba(title_content)

combined_data['local_label'] = local_label
combined_data['local_proba'] = local_proba.max(axis = 1)
combined_data['local_label'] = combined_data['local_label'].apply(lambda x:class_name_dict[x])
print(combined_data.shape)
combined_data.iloc[:2, :]

#### 线上线下一致性: mysql 与 local

In [None]:
combined_data['R_W'] = combined_data.apply(lambda x: 'Right' if x['local_label'] == x['predict_label'] else 'Wrong', axis = 1)
print(combined_data[combined_data['R_W'] == 'Right'].shape[0]/combined_data.shape[0])
print(combined_data['R_W'].value_counts())
combined_data[combined_data['R_W'] == 'Wrong'].pivot_table(index = ['local_label'], columns = ['predict_label'], 
                                                            aggfunc = [len], values = ['id'], 
                                                            fill_value = 0, margins = True)

#### 线上线下一致性: online 与 local

In [None]:
combined_data['id'] = range(combined_data.shape[0])
combined_data['title'] = combined_data['title'].astype(str) 
combined_data['content'] = combined_data['content'].astype(str)
data = {"record":combined_data.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
url = "http://47.93.77.19:10000/judge_correlation_i"
col_name = 'cor'
parse_data, elapsed_time = get_server_res(data, url, col_name)
parse_data.columns = ['id', 'online_label']
parse_data.head()

In [None]:
combined_data = pd.merge(combined_data, parse_data, on  = 'id')
print(combined_data.shape)
combined_data['O_R_W'] = combined_data.apply(lambda x: 'Right' if x['local_label'] == x['online_label'] else 'Wrong', axis = 1)
print(combined_data[combined_data['O_R_W'] == 'Right'].shape[0]/combined_data.shape[0])
print(combined_data['O_R_W'].value_counts())
combined_data[combined_data['O_R_W'] == 'Wrong'].pivot_table(index = ['local_label'], columns = ['online_label'], 
                                                            aggfunc = [len], values = ['urlhash'], 
                                                            fill_value = 0, margins = True)

# 银行业--旧

In [14]:
engine = specific_func.get_engine('cbrc')

In [15]:
# day_select = '2018-09-09'
day_list = get_day_list('2018-11-05', '2018-11-08')
print(day_list)

['2018-11-06', '2018-11-07', '2018-11-08']


## mysql 数据

### 八分类数据

#### 获取数据

In [None]:
# sql_circ_cor_one_day = "select t1.id, t1.publishtime, t1.title,t2.text as content \
#                             from elint_web_docinfo t1, wise_web_docinfo_text t2 \
#                                 where t1.id = t2.doc_id \
#                                   and date_format(t1.publishtime, '%%Y-%%m-%%d') = '{0}'".format('2018-08-07')
# # 实际
# circ_cor = pd.read_sql(sql_circ_cor_one_day, engine)
# print(circ_cor.shape)
# circ_cor.head()

In [None]:
for day_select in day_list:
    print('-- day_select: ', day_select)
    
    # 获取八分类
    sql_one_day = "select t2.urlhash, t1.traffic_id, t2.title as title_1\
                        from wise_web_classify_traffic_docinfo t1, wise_web_docinfo_basic t2 \
                            where t1.base_id=t2.id \
                                  and date_format(t2.publishtime, '%%Y-%%m-%%d') = '{0}' ".format(day_select)
    cbrc_flag = pd.read_sql(sql_one_day, engine)
    print('cbrc_flag：', cbrc_flag.shape)
    
    # 相关数据
    sql_one_day = "select t1.urlhash, t1.title,t2.text as content, t1.group_id, t1.publishtime as publishtime \
                        from elint_web_docinfo t1, wise_web_docinfo_text t2 \
                            where t1.id=t2.doc_id \
                                  and t1.publishtime >= '{0} 08:00:00' \
                                  and t1.publishtime <= '{0} 14:00:00' \
                                group by t1.titlehash".format(day_select)
    # titlehash 去重后
    cbrc_cor = pd.read_sql(sql_one_day, engine) 
    print('cbrc_cor：', cbrc_cor.shape)
    
    # 不相关数据
    sql_cbrc_uncor = "select urlhash, title, content, group_id, publishtime \
                            from wise_web_docinfo_uncor \
                            where date_format(publishtime, '%%Y-%%m-%%d') = '{0}'".format(day_select)
    cbrc_uncor = pd.read_sql(sql_cbrc_uncor, engine)  
    print('cbrc_uncor：', cbrc_uncor.shape)

    cbrc_data = pd.concat([cbrc_cor, cbrc_uncor], axis = 0)
    print('去重前：', cbrc_data.shape)
    cbrc_data = cbrc_data.drop_duplicates(subset = 'title')
    print('去重后：', cbrc_data.shape)  
    cbrc_data = cbrc_data.dropna(subset = ['content'], axis = 0)
    print('去空值后：', cbrc_data.shape)  

    cbrc_combined = pd.merge(cbrc_flag, cbrc_data, how = 'inner', on = 'urlhash')
    cbrc_combined['predict_label'] = cbrc_combined['traffic_id'].apply(lambda x:class_name_dict[x])
    cbrc_combined['group_id'] = cbrc_combined['group_id'].apply(lambda x:group_dict[str(x)])
    cbrc_combined['label'] = ''
    cbrc_combined = cbrc_combined[['urlhash', 'predict_label', 'label', 'title', 'content', 'group_id', 'publishtime']]
    fea_filename = 'cbrc_result_class/result/cbrc_class_predict_mysql_%s.xlsx'%day_select
    cbrc_combined.to_excel(fea_filename, index = False)
    print(cbrc_combined.shape)
    print(cbrc_combined['predict_label'].value_counts())
    

#### 合并 & 保存

In [None]:
combined_data = pd.DataFrame()
for day_select in day_list:
    tmp_data = pd.read_excel('cbrc_result_class/result/cbrc_class_predict_mysql_%s.xlsx'%day_select)
    combined_data = pd.concat([combined_data, tmp_data], axis = 0)

combined_data = combined_data[combined_data['predict_label'] != '补录']
print(combined_data.shape)  
print('去重前：', combined_data.shape)
combined_data = combined_data.drop_duplicates(subset = 'title')
print('去重后：', combined_data.shape)  
combined_data = combined_data.dropna(subset = ['content'], axis = 0)
print('去空值后：', combined_data.shape)  

print(combined_data['predict_label'].value_counts())
combined_data.iloc[:2, :]

In [None]:
fea_filename = 'cbrc_result_class/result/cbrc_class_predict_mysql_20181112(1106-1110).xlsx'
# sel_col = ['噪音', '消费服务', '公司内部管理', '监管',
#            '行业', '资本市场', '其他相关报道','产品销售']
sel_col = ['公司内部管理', '监管', '行业', '产品销售']
# sel_col = combined_data['predict_label'].unique().tolist()
sel_data = combined_data[combined_data['predict_label'].isin(sel_col)]
print(sel_data['predict_label'].value_counts())
with pd.ExcelWriter(fea_filename) as writer:
    for label in sel_data['predict_label'].unique():
        tmp_data = sel_data[sel_data['predict_label'] == label]
        if tmp_data.shape[0] > 300:
            N = 300
        else :
            N = tmp_data.shape[0]
        tmp_data.sample(n = N, axis = 0, random_state=42).to_excel(writer,label, index = False)
    
    writer.save()

### 倾向性

#### 获取数据

In [None]:
for day_select in day_list:
    print('-- day_select: ', day_select)
    
    # 获取八分类
    sql_one_day = "select t2.urlhash, t1.traffic_id, t2.title as title_1\
                        from wise_web_classify_traffic_docinfo t1, wise_web_docinfo_basic t2 \
                            where t1.base_id=t2.id \
                                  and date_format(t2.publishtime, '%%Y-%%m-%%d') = '{0}' ".format(day_select)
    cbrc_flag = pd.read_sql(sql_one_day, engine)
    print('cbrc_flag：', cbrc_flag.shape)
    
    # 相关数据
    sql_one_day = "select t1.urlhash, t1.title,t2.text as content, t1.group_id, \
                            t1.sen as tendency, t1.publishtime as publishtime \
                        from elint_web_docinfo t1, wise_web_docinfo_text t2 \
                            where t1.id=t2.doc_id \
                                  and t1.publishtime >= '{0} 08:00:00' \
                                  and t1.publishtime <= '{0} 14:00:00' \
                                group by t1.titlehash".format(day_select)
    # titlehash 去重后
    cbrc_cor = pd.read_sql(sql_one_day, engine) 
    print('cbrc_cor：', cbrc_cor.shape)

    cbrc_data = cbrc_cor
    print('去重前：', cbrc_data.shape)
    cbrc_data = cbrc_data.drop_duplicates(subset = 'title')
    print('去重后：', cbrc_data.shape)  
    cbrc_data = cbrc_data.dropna(subset = ['content'], axis = 0)
    print('去空值后：', cbrc_data.shape)  

    cbrc_combined = pd.merge(cbrc_flag, cbrc_data, how = 'inner', on = 'urlhash')
    cbrc_combined['predict_label'] = cbrc_combined['traffic_id'].apply(lambda x:class_name_dict[x])
    cbrc_combined['group_id'] = cbrc_combined['group_id'].apply(lambda x:group_dict[str(x)])
    cbrc_combined['label'] = ''
    cbrc_combined = cbrc_combined[['urlhash', 'group_id', 'predict_label', 'tendency', 
                                   'label', 'title', 'content', 'publishtime']]
    fea_filename = 'cbrc_result_tendency/result/cbrc_tendency_predict_mysql_%s.xlsx'%day_select
    cbrc_combined.to_excel(fea_filename, index = False)
    print(cbrc_combined.shape)
    print(cbrc_combined.pivot_table(index = ['tendency'], columns = ['predict_label'], 
                          values = 'title', aggfunc=len, 
                          fill_value=0, margins=True))
    

#### 合并 & 保存

In [None]:
combined_data = pd.DataFrame()
for day_select in day_list:
    tmp_data = pd.read_excel('cbrc_result_tendency/result/cbrc_tendency_predict_mysql_%s.xlsx'%day_select)
    combined_data = pd.concat([combined_data, tmp_data], axis = 0)

combined_data = combined_data[combined_data['predict_label'] != '补录']
sel_col = [ '消费服务', '公司内部管理', '监管','行业']
combined_data = combined_data[combined_data['predict_label'].isin(sel_col)]
print(combined_data.shape)  
print('去重前：', combined_data.shape)
combined_data = combined_data.drop_duplicates(subset = 'title')
print('去重后：', combined_data.shape)  
combined_data = combined_data.dropna(subset = ['content'], axis = 0)
print('去空值后：', combined_data.shape)  

print(combined_data['tendency'].value_counts())
combined_data.pivot_table(index = ['tendency', 'group_id'], 
                                columns = ['predict_label'], 
                                values = 'title', aggfunc=len, 
                                fill_value=0, margins=True)
# combined_data.iloc[:2, :]

In [None]:
fea_filename = 'cbrc_result_tendency/result/cbrc_tendency_predict_mysql_20181112(1106-1108).xlsx'
print(combined_data['tendency'].value_counts())

N = 200 # 每类 N 条数据
class_n = int(combined_data['predict_label'].unique().shape[0])
n = int(N / class_n) + 200

print('正负各 %s 条，共 %s 类， 每类各 %s 条'%(N, class_n, n))
with pd.ExcelWriter(fea_filename) as writer:
    for tendency in combined_data['tendency'].unique():
        tmp_data = pd.DataFrame()
        sel_data = combined_data[combined_data['tendency'] == tendency]        
        for predict_label in combined_data['predict_label'].unique():
            label_data = sel_data[sel_data['predict_label'] == predict_label]
            if label_data.shape[0] > n:
                sel_label_data = label_data.sample(n = n, axis = 0, random_state=3)
            else :
                sel_label_data = label_data
            tmp_data = pd.concat([tmp_data, sel_label_data], axis = 0)        
            print('tendency: %s, predict_label: %s, size: %s'%(tendency, predict_label, tmp_data.shape))
        
        if tmp_data.shape[0] > N:
            t_n = N
        else :
            t_n = tmp_data.shape[0]
        
        tmp_data = tmp_data.sample(n = N, axis = 0, random_state=3)
        tmp_data.to_excel(writer,str(tendency), index = False)
        print(tmp_data.pivot_table(index = ['tendency'], 
                                    columns = ['predict_label'], 
                                    values = 'title', aggfunc=len, 
                                    fill_value=0, margins=True))    
    writer.save()

### 补录数据

In [20]:
# 人工补录
sql_one_day = "select t1.id, t1.group_id, t1.publishtime as publishtime, t1.gather_type, \
                    t1.sen as tendency, t1.title,t2.text as content \
                    from elint_web_docinfo t1, wise_web_docinfo_text t2 \
                        where t1.id=t2.doc_id \
                              and t1.publishtime >= '{0} 00:00:00' \
                              and t1.publishtime <= '{1} 23:59:59' \
                              and t1.gather_type in (1,3) \
                            group by t1.titlehash".format('2018-09-01', '2018-12-03')
# titlehash 去重后
human_additional = pd.read_sql(sql_one_day, engine) 
human_additional['group_id'] = human_additional['group_id'].apply(lambda x:group_dict[str(x)])
print('title 去重前：', human_additional.shape)
human_additional = human_additional.drop_duplicates(subset = 'title')
print('title 去重后：', human_additional.shape)  
human_additional = human_additional.drop_duplicates(subset = ['content'])
print('content 去重后：', human_additional.shape)  
human_additional = human_additional.dropna(subset = ['title'], axis = 0)
print('title 去空值后：', human_additional.shape) 
human_additional.head()

title 去重前： (3334, 7)
title 去重后： (3334, 7)
content 去重后： (3334, 7)
title 去空值后： (3334, 7)


Unnamed: 0,id,group_id,publishtime,gather_type,tendency,title,content
0,267289612,微博,2018-10-31 12:11:44,1,-1,24号没叫取了28号给我 26号后直接跳到了32号 @中国工商银行 麻烦请加强下底层工作人员...,24号没叫取了28号给我\n26号后直接跳到了32号\n@中国工商银行 麻烦请加强下底层工作...
1,266932412,微博,2018-10-30 11:46:06,1,-1,#魔法万圣节# 在建行庆祝万圣节 各种暂停服务 没人的情况下等待二十分钟 后边的一个叔叔等待...,#魔法万圣节# 在建行庆祝万圣节 各种暂停服务 没人的情况下等待二十分钟 后边的一个叔叔等待...
2,249135456,微博,2018-09-28 10:31:29,1,-1,工商银行的服务真是稀烂！不是财大气粗吗？从工作人员的业务熟悉度到业务平台系统都是渣渣！,工商银行的服务真是稀烂！不是财大气粗吗？从工作人员的业务熟悉度到业务平台系统都是渣渣！
3,263284185,微信,2018-10-17 00:00:00,1,-1,老牌长租公寓也摊事儿！雷军等明星投资人加持却也难逃一劫，年内已有5家爆雷,(图片)\n\n(图片)\n\n长租公寓又“摊上事儿”了。\n\n上海老牌长租公寓 寓见...
4,269162185,微博,2018-11-06 20:42:00,1,-1,不得不表扬下@招商银行信用卡 了，前几天在...,不得不表扬下\n@招商银行信用卡\n 了，前几天在苏格兰玩儿弄丢了卡，发现可以直接打开手机...


In [21]:
human_additional['title'] = human_additional['title'].astype(str) 
human_additional['content'] = human_additional['content'].astype(str)
data = {"record":human_additional.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
url = "http://47.93.77.19:6001/judge_correlation_yjh"
col_name = 'sec'
parse_data, elapsed_time = get_server_res(data, url, col_name)
parse_data.columns = ['id', 'predict_label']
human_additional = pd.merge(human_additional, parse_data, on = 'id', how = 'left')
human_additional['predict_label'] = human_additional['predict_label'].apply(lambda x:class_name_dict[x])
human_additional.insert(5, 'label', '')
print(human_additional['predict_label'].value_counts())
human_additional.head()

消费服务      2366
行业         290
噪音         252
资本市场       131
监管         103
公司内部管理     101
产品销售        80
其他相关报道      11
Name: predict_label, dtype: int64


Unnamed: 0,id,group_id,publishtime,gather_type,tendency,label,title,content,predict_label
0,267289612,微博,2018-10-31 12:11:44,1,-1,,24号没叫取了28号给我 26号后直接跳到了32号 @中国工商银行 麻烦请加强下底层工作人员...,24号没叫取了28号给我\n26号后直接跳到了32号\n@中国工商银行 麻烦请加强下底层工作...,消费服务
1,266932412,微博,2018-10-30 11:46:06,1,-1,,#魔法万圣节# 在建行庆祝万圣节 各种暂停服务 没人的情况下等待二十分钟 后边的一个叔叔等待...,#魔法万圣节# 在建行庆祝万圣节 各种暂停服务 没人的情况下等待二十分钟 后边的一个叔叔等待...,消费服务
2,249135456,微博,2018-09-28 10:31:29,1,-1,,工商银行的服务真是稀烂！不是财大气粗吗？从工作人员的业务熟悉度到业务平台系统都是渣渣！,工商银行的服务真是稀烂！不是财大气粗吗？从工作人员的业务熟悉度到业务平台系统都是渣渣！,消费服务
3,263284185,微信,2018-10-17 00:00:00,1,-1,,老牌长租公寓也摊事儿！雷军等明星投资人加持却也难逃一劫，年内已有5家爆雷,(图片)\n\n(图片)\n\n长租公寓又“摊上事儿”了。\n\n上海老牌长租公寓 寓见...,行业
4,269162185,微博,2018-11-06 20:42:00,1,-1,,不得不表扬下@招商银行信用卡 了，前几天在...,不得不表扬下\n@招商银行信用卡\n 了，前几天在苏格兰玩儿弄丢了卡，发现可以直接打开手机...,消费服务


In [22]:
fea_filename = 'cbrc_result_class/result/补录_银监会（旧）_class_predict_mysql_20181203(0901-1203).xlsx'
print(fea_filename)

sel_col = ['噪音', '消费服务', '公司内部管理', '监管',
           '行业', '资本市场', '其他相关报道','产品销售','交通','环保']
sel_data = human_additional[human_additional['predict_label'].isin(sel_col)]
print(sel_data['predict_label'].value_counts())
print()

c_data = pd.DataFrame()
with pd.ExcelWriter(fea_filename) as writer:
    for label in sel_data['predict_label'].unique():
        tmp_data = sel_data[sel_data['predict_label'] == label]
        N = tmp_data.shape[0]            
        save_data = tmp_data.sample(n = N, axis = 0, random_state=42)
        save_data.to_excel(writer,label, index = False)
        c_data = pd.concat([c_data, save_data], axis = 0)
    print(c_data.pivot_table(index = ['group_id'], 
                                columns = ['predict_label'], 
                                values = 'title', aggfunc=len, 
                                fill_value=0, margins=True))      
    writer.save()

cbrc_result_class/result/补录_银监会（旧）_class_predict_mysql_20181203(0901-1203).xlsx
消费服务      2366
行业         290
噪音         252
资本市场       131
监管         103
公司内部管理     101
产品销售        80
其他相关报道      11
Name: predict_label, dtype: int64

predict_label  产品销售  公司内部管理  其他相关报道   噪音  消费服务   监管   行业  资本市场   All
group_id                                                            
博客                0       0       0    0     1    0    0     0     1
微信                0       3       0    2     5    4   12     6    32
微博               76       3       6  153  2282    4    9     4  2537
新闻                2      86       4   64    23   61  165    63   468
新闻客户端             0       2       0    2     0    4    1     2    11
纸媒                0       5       1   25     7   30  103    56   227
论坛                2       2       0    6    48    0    0     0    58
All              80     101      11  252  2366  103  290   131  3334


In [23]:
fea_filename = 'cbrc_result_tendency/result/补录_银监会（旧）_tendency_predict_mysql_20181204(0901-1203).xlsx'
print(fea_filename)
print(human_additional['tendency'].value_counts())

with pd.ExcelWriter(fea_filename) as writer:
    for tendency in human_additional['tendency'].unique():
        sel_data = human_additional[human_additional['tendency'] == tendency]    
        t_n = sel_data.shape[0]        
        tmp_data = sel_data.sample(n = t_n, axis = 0, random_state=3)
        tmp_data.to_excel(writer,str(tendency), index = False)           
    writer.save()

cbrc_result_tendency/result/补录_银监会（旧）_tendency_predict_mysql_20181204(0901-1203).xlsx
-1    2892
 0     345
 1      97
Name: tendency, dtype: int64


## 本地模型

### 八分类

In [None]:
from sklearn.externals import joblib
pipeline_old = joblib.load( "model/cbrc_8classifier_1015.pkl.z")

In [None]:
combined_data['title_content'] = combined_data['title'].astype(str) + '。' + combined_data['content'].astype(str)
title_content = pre_cor_cbrc.handle_contents(combined_data['title_content'].tolist())
print(len(title_content))

local_label = pipeline_old.predict(title_content)
local_proba = pipeline_old.predict_proba(title_content)

In [None]:
combined_data['local_label'] = local_label
combined_data['local_proba'] = local_proba.max(axis = 1)
combined_data['local_label'] = combined_data['local_label'].apply(lambda x:class_name_dict[x])
print(combined_data.shape)
combined_data.iloc[:2, :]

#### 线上线下一致性: mysql 与 local

In [None]:
combined_data['R_W'] = combined_data.apply(lambda x: 'Right' if x['local_label'] == x['predict_label'] else 'Wrong', axis = 1)
print(combined_data[combined_data['R_W'] == 'Right'].shape[0]/combined_data.shape[0])
print(combined_data['R_W'].value_counts())
combined_data[combined_data['R_W'] == 'Wrong'].pivot_table(index = ['local_label'], columns = ['predict_label'], 
                                                            aggfunc = [len], values = ['urlhash'], 
                                                            fill_value = 0, margins = True)

#### 线上线下一致性: online 与 local

In [None]:
combined_data['id'] = range(combined_data.shape[0])
combined_data['title'] = combined_data['title'].astype(str) 
combined_data['content'] = combined_data['content'].astype(str)

In [None]:
data = {"record":combined_data.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
url = "http://47.93.77.19:6001/judge_correlation_yjh"
col_name = 'sec'
parse_data = get_server_res_yjh(data, url, col_name)
parse_data.columns = ['id', 'online_label']
parse_data.head()

In [None]:
combined_data = pd.merge(combined_data, parse_data, on  = 'id')
print(combined_data.shape)
combined_data['online_label'] = combined_data['online_label'].apply(lambda x:class_name_dict[x])
combined_data['O_R_W'] = combined_data.apply(lambda x: 'Right' if x['local_label'] == x['online_label'] else 'Wrong', axis = 1)
print(combined_data[combined_data['O_R_W'] == 'Right'].shape[0]/combined_data.shape[0])
print(combined_data['O_R_W'].value_counts())
combined_data[combined_data['O_R_W'] == 'Wrong'].pivot_table(index = ['local_label'], columns = ['online_label'], 
                                                            aggfunc = [len], values = ['urlhash'], 
                                                            fill_value = 0, margins = True)

#### 线上线下一致性: online 与 mysql

In [None]:
combined_data['O_R_W'] = combined_data.apply(lambda x: 'Right' if x['predict_label'] == x['online_label'] else 'Wrong', axis = 1)
print(combined_data[combined_data['O_R_W'] == 'Right'].shape[0]/combined_data.shape[0])
print(combined_data['O_R_W'].value_counts())
combined_data[combined_data['O_R_W'] == 'Wrong'].pivot_table(index = ['predict_label'], columns = ['online_label'], 
                                                            aggfunc = [len], values = ['urlhash'], 
                                                            fill_value = 0, margins = True)

### 倾向性

In [None]:
from sklearn.externals import joblib
pipeline_old = joblib.load( "model/cbrc_8classifier_1015.pkl.z")

In [None]:
combined_data['title_content'] = combined_data['title'].astype(str) + '。' + combined_data['content'].astype(str)
title_content = pre_cor_cbrc.handle_contents(combined_data['title_content'].tolist())
print(len(title_content))

local_label = pipeline_old.predict(title_content)
local_proba = pipeline_old.predict_proba(title_content)

In [None]:
combined_data['local_label'] = local_label
combined_data['local_proba'] = local_proba.max(axis = 1)
combined_data['local_label'] = combined_data['local_label'].apply(lambda x:class_name_dict[x])
print(combined_data.shape)
combined_data.iloc[:2, :]

#### 线上线下一致性: mysql 与 local

In [None]:
combined_data['R_W'] = combined_data.apply(lambda x: 'Right' if x['local_label'] == x['predict_label'] else 'Wrong', axis = 1)
print(combined_data[combined_data['R_W'] == 'Right'].shape[0]/combined_data.shape[0])
print(combined_data['R_W'].value_counts())
combined_data[combined_data['R_W'] == 'Wrong'].pivot_table(index = ['local_label'], columns = ['predict_label'], 
                                                            aggfunc = [len], values = ['urlhash'], 
                                                            fill_value = 0, margins = True)

#### 线上线下一致性: online 与 local

In [None]:
combined_data['id'] = range(combined_data.shape[0])
combined_data['title'] = combined_data['title'].astype(str) 
combined_data['content'] = combined_data['content'].astype(str)

In [None]:
data = {"record":combined_data.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
url = "http://47.93.77.19:6001/judge_correlation_yjh"
col_name = 'sec'
parse_data = get_server_res_yjh(data, url, col_name)
parse_data.columns = ['id', 'online_label']
parse_data.head()

In [None]:
combined_data = pd.merge(combined_data, parse_data, on  = 'id')
print(combined_data.shape)
combined_data['online_label'] = combined_data['online_label'].apply(lambda x:class_name_dict[x])
combined_data['O_R_W'] = combined_data.apply(lambda x: 'Right' if x['local_label'] == x['online_label'] else 'Wrong', axis = 1)
print(combined_data[combined_data['O_R_W'] == 'Right'].shape[0]/combined_data.shape[0])
print(combined_data['O_R_W'].value_counts())
combined_data[combined_data['O_R_W'] == 'Wrong'].pivot_table(index = ['local_label'], columns = ['online_label'], 
                                                            aggfunc = [len], values = ['urlhash'], 
                                                            fill_value = 0, margins = True)

#### 线上线下一致性: online 与 mysql

In [None]:
combined_data['O_R_W'] = combined_data.apply(lambda x: 'Right' if x['predict_label'] == x['online_label'] else 'Wrong', axis = 1)
print(combined_data[combined_data['O_R_W'] == 'Right'].shape[0]/combined_data.shape[0])
print(combined_data['O_R_W'].value_counts())
combined_data[combined_data['O_R_W'] == 'Wrong'].pivot_table(index = ['predict_label'], columns = ['online_label'], 
                                                            aggfunc = [len], values = ['urlhash'], 
                                                            fill_value = 0, margins = True)

# 银行业与保险业--新

## mysql 数据

In [14]:
engine = specific_func.get_engine('cbirc')

In [32]:
# day_select = '2018-09-09'
day_list = get_day_list('2018-09-01', '2018-12-03')
print(day_list)

['2018-09-02', '2018-09-03', '2018-09-04', '2018-09-05', '2018-09-06', '2018-09-07', '2018-09-08', '2018-09-09', '2018-09-10', '2018-09-11', '2018-09-12', '2018-09-13', '2018-09-14', '2018-09-15', '2018-09-16', '2018-09-17', '2018-09-18', '2018-09-19', '2018-09-20', '2018-09-21', '2018-09-22', '2018-09-23', '2018-09-24', '2018-09-25', '2018-09-26', '2018-09-27', '2018-09-28', '2018-09-29', '2018-09-30', '2018-10-01', '2018-10-02', '2018-10-03', '2018-10-04', '2018-10-05', '2018-10-06', '2018-10-07', '2018-10-08', '2018-10-09', '2018-10-10', '2018-10-11', '2018-10-12', '2018-10-13', '2018-10-14', '2018-10-15', '2018-10-16', '2018-10-17', '2018-10-18', '2018-10-19', '2018-10-20', '2018-10-21', '2018-10-22', '2018-10-23', '2018-10-24', '2018-10-25', '2018-10-26', '2018-10-27', '2018-10-28', '2018-10-29', '2018-10-30', '2018-10-31', '2018-11-01', '2018-11-02', '2018-11-03', '2018-11-04', '2018-11-05', '2018-11-06', '2018-11-07', '2018-11-08', '2018-11-09', '2018-11-10', '2018-11-11', '2018

### 获取数据

#### 系统采集
- gather_type 0-系统采集

In [28]:
types = 5
gather_types = '采集'
print("获取 %s 数据..."%(proj_name_dict[types]))
for day_select in day_list:
    print()
    print('-- day_select: ', day_select)
    
    # 获取八分类
    if types in [1,2]:
        sql_label = '''
        SELECT 
            t1.type, t1.urlhash, t3.title, t3.group_id, t3.publishtime,
            t1.traffic_id, t2.sen as tendency, t2.gather_type
        FROM
            cbrc_circ.db_classify_traffic_docinfo t1
                LEFT JOIN
            cbrc_circ.db_docinfo_trade t2 ON t1.urlhash = t2.urlhash
                LEFT JOIN
            cbrc_circ.db_docinfo t3 ON t2.urlhash = t3.urlhash
        WHERE
            t3.publishtime >= '{0} 07:00:00'
                AND t3.publishtime <= '{0} 16:00:00'
                and t1.type = {1}
                and t1.type = t2.type
                and t2.gather_type = 0
        group by t3.titlehash
        '''.format(day_select, types)
    elif types in [3,4,5]:
        sql_label = '''
        SELECT 
            t1.id, t1.type, t1.urlhash, t3.title, t3.group_id, t3.publishtime,
            t1.traffic_id, t2.sen as tendency, t2.gather_type
        FROM
            cbrc_circ.db_classify_traffic_docinfo t1
                LEFT JOIN
            cbrc_circ.db_docinfo_trade t2 ON t1.urlhash = t2.urlhash
                LEFT JOIN
            cbrc_circ.db_docinfo t3 ON t2.urlhash = t3.urlhash
        WHERE
            t3.publishtime >= '{0} 00:00:00'
                AND t3.publishtime <= '{0} 23:59:59'
                and t1.type = {1}
                and t1.type = t2.type
                and t2.gather_type = 0
        group by t3.titlehash
        '''.format(day_select, types)        

    cbirc_label = pd.read_sql(sql_label, engine)
    cbirc_label['predict_label'] = cbirc_label['traffic_id'].apply(lambda x:class_name_dict[x])
    cbirc_label['group_id'] = cbirc_label['group_id'].apply(lambda x:group_dict[str(x)])
    cbirc_label['type'] = cbirc_label['type'].apply(lambda x:proj_name_dict[x])
    cbirc_label['gather_type'] = cbirc_label['gather_type'].apply(lambda x:gather_type_name_dict[x])
    print('label 去重前：', cbirc_label.shape)
    cbirc_label = cbirc_label.drop_duplicates(subset = 'title')
    print('label 去重后：', cbirc_label.shape)  
    cbirc_label = cbirc_label.dropna(subset = ['title'], axis = 0)
    print('label 去空值后：', cbirc_label.shape)  
    
    if cbirc_label['urlhash'].shape[0] != 0:
        # 获取 content
        url_l = cbirc_label['urlhash'].tolist()
        if cbirc_label['urlhash'].shape[0] == 1:
            url_l.append(url_l[0])
        url_list = tuple(url_l)
        sql_content = '''
        SELECT 
            t1.urlhash, t1.text as content
        FROM
            cbrc_circ.db_docinfo_text t1
        WHERE
            t1.urlhash in {0}
        '''.format(url_list)

        cbirc_content = pd.read_sql(sql_content, engine)
        print('content 去重前：', cbirc_content.shape)
        cbirc_content = cbirc_content.drop_duplicates(subset = 'content')
        print('content 去重后：', cbirc_content.shape)  
        cbirc_content = cbirc_content.dropna(subset = ['content'], axis = 0)
        print('content 去空值后：', cbirc_content.shape)  

        cbirc_combined = pd.merge(cbirc_label, cbirc_content, on = 'urlhash', how = 'inner')
        print(cbirc_combined.shape)
        print(cbirc_combined.pivot_table(index = ['tendency', 'type'], columns = ['predict_label'], 
                                    aggfunc = [len], values = ['title'], 
                                    fill_value = 0, margins = True))    
        cbirc_combined['label'] = ''
        cbirc_combined = cbirc_combined[['id', 'gather_type', 'type', 'urlhash', 'predict_label', 'label', 'title', 
                                         'content', 'group_id', 'publishtime', 'tendency']]
    #     fea_filename = 'cbirc_result/class/result/cbirc_class_predict_mysql_%s.xlsx'%day_select
        fea_filename = 'cbirc_result/class/result/cbirc_class_predict_%s_types(%s)_%s.xlsx'%(gather_types, types, day_select)
        cbirc_combined.to_excel(fea_filename, index = False)        
    

获取 中国人保 数据...

-- day_select:  2018-11-21
label 去重前： (25169, 10)
label 去重后： (25169, 10)
label 去空值后： (25169, 10)
content 去重前： (25152, 2)
content 去重后： (24705, 2)
content 去空值后： (24705, 2)
(24705, 11)
                len                                                        
              title                                                        
predict_label    交通 产品销售 公司内部管理 其他相关报道     噪音 消费服务  环保  监管   行业 资本市场    All
tendency type                                                              
-1       中国人保  2037    3     46     14   5614   43   2   7   71  166   8003
0        中国人保  2982   77    101    164  12715  129  22  48  215  249  16702
All            5019   80    147    178  18329  172  24  55  286  415  24705

-- day_select:  2018-11-22
label 去重前： (9139, 10)
label 去重后： (9139, 10)
label 去空值后： (9139, 10)
content 去重前： (9139, 2)
content 去重后： (9067, 2)
content 去空值后： (9067, 2)
(9067, 11)
                len                                                      
              title  

In [18]:
# combined_data[combined_data['predict_label'] == '交通'].to_excel('建行北分—交通.xlsx')
cbirc_combined[cbirc_combined['predict_label'] == '交通']

Unnamed: 0,id,gather_type,type,urlhash,predict_label,label,title,content,group_id,publishtime,tendency


#### 补录
- gather_type 1-人工补录
- gather_type 3-导入数据

In [39]:
types = 5
gather_types = '补录'
print("获取 %s 数据..."%(proj_name_dict[types]))
for day_select in day_list:
    print('-- day_select: ', day_select)
    
    # 获取八分类
    if types in [1,2]:
        sql_label = '''
        SELECT 
            t1.type, t1.urlhash, t3.title, t3.group_id, t3.publishtime,
            t1.traffic_id, t2.sen as tendency, t2.gather_type
        FROM
            cbrc_circ.db_classify_traffic_docinfo t1
                LEFT JOIN
            cbrc_circ.db_docinfo_trade t2 ON t1.urlhash = t2.urlhash
                LEFT JOIN
            cbrc_circ.db_docinfo t3 ON t2.urlhash = t3.urlhash
        WHERE
            t3.publishtime >= '{0} 00:00:00'
                AND t3.publishtime <= '{0} 23:59:59'
                and t1.type = {1}
                and t1.type = t2.type
                and t2.gather_type in (1,3)
        group by t3.titlehash
        '''.format(day_select, types)
    elif types in [3,4,5]:
        sql_label = '''
        SELECT 
            t1.type, t1.urlhash, t3.title, t3.group_id, t3.publishtime,
            t1.traffic_id, t2.sen as tendency, t2.gather_type
        FROM
            cbrc_circ.db_classify_traffic_docinfo t1
                LEFT JOIN
            cbrc_circ.db_docinfo_trade t2 ON t1.urlhash = t2.urlhash
                LEFT JOIN
            cbrc_circ.db_docinfo t3 ON t2.urlhash = t3.urlhash
        WHERE
            t3.publishtime >= '{0} 00:00:00'
                AND t3.publishtime <= '{0} 23:59:59'
                and t1.type = {1}
                and t1.type = t2.type
                and t2.gather_type in (1,3)
        group by t3.titlehash
        '''.format(day_select, types)        

    cbirc_label = pd.read_sql(sql_label, engine)
    cbirc_label['predict_label'] = cbirc_label['traffic_id'].apply(lambda x:class_name_dict[x])
    cbirc_label['group_id'] = cbirc_label['group_id'].apply(lambda x:group_dict[str(x)])
    cbirc_label['type'] = cbirc_label['type'].apply(lambda x:proj_name_dict[x])
    cbirc_label['gather_type'] = cbirc_label['gather_type'].apply(lambda x:gather_type_name_dict[x])
    print('label 去重前：', cbirc_label.shape)
    cbirc_label = cbirc_label.drop_duplicates(subset = 'title')
    print('label 去重后：', cbirc_label.shape)  
    cbirc_label = cbirc_label.dropna(subset = ['title'], axis = 0)
    print('label 去空值后：', cbirc_label.shape)  
        
    if cbirc_label['urlhash'].shape[0] != 0:
        # 获取 content
        url_l = cbirc_label['urlhash'].tolist()
        if cbirc_label['urlhash'].shape[0] == 1:
            url_l.append(url_l[0])
        url_list = tuple(url_l)
        sql_content = '''
        SELECT 
            t1.urlhash, t1.text as content
        FROM
            cbrc_circ.db_docinfo_text t1
        WHERE
            t1.urlhash in {0}
        '''.format(url_list)
        cbirc_content = pd.read_sql(sql_content, engine)
        print('content 去重前：', cbirc_content.shape)
        cbirc_content = cbirc_content.drop_duplicates(subset = 'content')
        print('content 去重后：', cbirc_content.shape)  
        cbirc_content = cbirc_content.dropna(subset = ['content'], axis = 0)
        print('content 去空值后：', cbirc_content.shape)  

        cbirc_combined = pd.merge(cbirc_label, cbirc_content, on = 'urlhash', how = 'inner')
        print(cbirc_combined.shape)
        print(cbirc_combined.pivot_table(index = ['tendency', 'type'], columns = ['predict_label'], 
                                    aggfunc = [len], values = ['title'], 
                                    fill_value = 0, margins = True))    
        cbirc_combined['label'] = ''
        cbirc_combined = cbirc_combined[['gather_type', 'type', 'urlhash', 'predict_label', 'label', 'title', 
                                         'content', 'group_id', 'publishtime', 'tendency']]
    #     fea_filename = 'cbirc_result/class/result/cbirc_class_predict_mysql_%s.xlsx'%day_select
        fea_filename = 'cbirc_result/class/result/cbirc_class_predict_%s_types(%s)_%s.xlsx'%(gather_types, types, day_select)
        cbirc_combined.to_excel(fea_filename, index = False)
    

获取 中国人保 数据...
-- day_select:  2018-09-02
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-03
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-04
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-05
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-06
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-07
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-08
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-09
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-10
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-11
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-12
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-13
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
--

content 去重前： (8, 2)
content 去重后： (8, 2)
content 去空值后： (8, 2)
(8, 10)
                len                 
              title                 
predict_label    交通 公司内部管理 环保 行业 All
tendency type                       
-1       中国人保     0      0  4  1   5
0        中国人保     1      1  0  1   3
All               1      1  4  2   8
-- day_select:  2018-11-24
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-11-25
label 去重前： (2, 9)
label 去重后： (2, 9)
label 去空值后： (2, 9)
content 去重前： (2, 2)
content 去重后： (2, 2)
content 去空值后： (2, 2)
(2, 10)
                len       
              title       
predict_label    交通 环保 All
tendency type             
-1       中国人保     0  1   1
0        中国人保     1  0   1
All               1  1   2
-- day_select:  2018-11-26
label 去重前： (2, 9)
label 去重后： (2, 9)
label 去空值后： (2, 9)
content 去重前： (2, 2)
content 去重后： (2, 2)
content 去空值后： (2, 2)
(2, 10)
                len       
              title       
predict_label    交通 行业 All
tendency type     

### 八分类数据

#### 合并 & 保存

In [30]:
# types = 3
# gather_types = '采集'

In [40]:
combined_data = pd.DataFrame()
for day_select in day_list:
#     tmp_data = pd.read_excel('cbirc_result/class/result/cbirc_class_predict_mysql_%s.xlsx'%day_select)
    file_name = 'cbirc_result/class/result/cbirc_class_predict_%s_types(%s)_%s.xlsx'%(gather_types, types, day_select)
    if os.path.isfile(file_name):
        print(file_name)
        tmp_data = pd.read_excel(file_name)
        combined_data = pd.concat([combined_data, tmp_data], axis = 0)

combined_data = combined_data[combined_data['predict_label'] != '补录']
print(combined_data.shape)
print(combined_data.pivot_table(index = ['type'], columns = ['predict_label'], 
                            aggfunc = [len], values = ['title'], 
                            fill_value = 0, margins = True)) 
print()
# {1: '银监会', 2: '保监会', 3: '中国人寿', 4: '建行北分', 5: '中国人保'}
# types = ['银监会', '建行北分']
# types = [ '中国人寿', '中国人保'] # '保监会',
# combined_data = combined_data[combined_data['type'].isin(types)]
# print(combined_data.shape)  
print(combined_data['predict_label'].value_counts())
combined_data.iloc[:2, :]

cbirc_result/class/result/cbirc_class_predict_补录_types(5)_2018-11-18.xlsx
cbirc_result/class/result/cbirc_class_predict_补录_types(5)_2018-11-19.xlsx
cbirc_result/class/result/cbirc_class_predict_补录_types(5)_2018-11-20.xlsx
cbirc_result/class/result/cbirc_class_predict_补录_types(5)_2018-11-21.xlsx
cbirc_result/class/result/cbirc_class_predict_补录_types(5)_2018-11-22.xlsx
cbirc_result/class/result/cbirc_class_predict_补录_types(5)_2018-11-23.xlsx
cbirc_result/class/result/cbirc_class_predict_补录_types(5)_2018-11-24.xlsx
cbirc_result/class/result/cbirc_class_predict_补录_types(5)_2018-11-25.xlsx
cbirc_result/class/result/cbirc_class_predict_补录_types(5)_2018-11-26.xlsx
cbirc_result/class/result/cbirc_class_predict_补录_types(5)_2018-11-27.xlsx
cbirc_result/class/result/cbirc_class_predict_补录_types(5)_2018-11-28.xlsx
cbirc_result/class/result/cbirc_class_predict_补录_types(5)_2018-11-29.xlsx
cbirc_result/class/result/cbirc_class_predict_补录_types(5)_2018-11-30.xlsx
cbirc_result/class/result/cbirc_class_

Unnamed: 0,gather_type,type,urlhash,predict_label,label,title,content,group_id,publishtime,tendency
0,补录,中国人保,-1451164327749029888,环保,,11月18日21时40分新疆和田地区于田县发生3.0级地震,据中国地震台网测定，北京时间2018年11月18日21时40分在新疆和田地区于田县（北纬36...,新闻,2018-11-18 21:58:50,-1
0,补录,中国人保,-6173404825726358528,监管,,周亮：民营经济离场论调极其错误 银保监会对国有和民营经济一视同仁,11月19日，在第九届财新峰会上，中国银行保险监督管理委员会副主席周亮表示，改革开放40年中...,新闻,2018-11-19 11:09:19,0


In [141]:
# combined_data = combined_data[['type', 'urlhash', 'local_label', 'label', 'title', 'content']]
# combined_data.rename(columns = {'local_label':'predict_label'}, inplace = True)
# combined_data.head()

In [41]:
fea_filename = 'cbirc_result/class/result/%s_%s_class_predict_mysql_20181203(0902-1203).xlsx'%(gather_types, proj_name_dict[types])
print(fea_filename)
sel_col = ['噪音', '消费服务', '公司内部管理', '监管',
           '行业', '资本市场', '其他相关报道','产品销售','交通','环保']
# sel_col = ['公司内部管理', '监管', '行业', '产品销售']
# sel_col = combined_data['predict_label'].unique().tolist()
sel_data = combined_data[combined_data['predict_label'].isin(sel_col)]
print(sel_data['predict_label'].value_counts())
print()
c_data = pd.DataFrame()
with pd.ExcelWriter(fea_filename) as writer:
    for label in sel_data['predict_label'].unique():
        tmp_data = sel_data[sel_data['predict_label'] == label]
        if gather_types == '补录':
            N = tmp_data.shape[0]
        else :
            if tmp_data.shape[0] > 100:
                N = 50
            else :
                N = tmp_data.shape[0]
            if label in ['交通',]: # '环保'
                N =  200 # tmp_data.shape[0]
            
        save_data = tmp_data.sample(n = N, axis = 0, random_state=42)
        save_data.to_excel(writer,label, index = False)
        c_data = pd.concat([c_data, save_data], axis = 0)
    print(c_data.pivot_table(index = ['type'], 
                                columns = ['predict_label'], 
                                values = 'title', aggfunc=len, 
                                fill_value=0, margins=True))      
    writer.save()

cbirc_result/class/result/补录_中国人保_class_predict_mysql_20181203(0902-1203).xlsx
环保        21
行业        17
噪音        12
监管        12
交通         8
产品销售       6
公司内部管理     4
消费服务       4
其他相关报道     3
资本市场       2
Name: predict_label, dtype: int64

predict_label  交通  产品销售  公司内部管理  其他相关报道  噪音  消费服务  环保  监管  行业  资本市场  All
type                                                                    
中国人保            8     6       4       3  12     4  21  12  17     2   89
All             8     6       4       3  12     4  21  12  17     2   89


### 倾向性

#### 合并 & 保存

In [31]:
combined_data = pd.DataFrame()
for day_select in day_list:
#     tmp_data = pd.read_excel('cbirc_result/class/result/cbirc_class_predict_mysql_%s.xlsx'%day_select)
    file_name = 'cbirc_result/class/result/cbirc_class_predict_%s_types(%s)_%s.xlsx'%(gather_types, types, day_select)
    if os.path.isfile(file_name):
        print(file_name)
        tmp_data = pd.read_excel(file_name)
        combined_data = pd.concat([combined_data, tmp_data], axis = 0)
        
# combined_data = combined_data[combined_data['predict_label'] != '补录']
print(combined_data.shape)
print(combined_data.pivot_table(index = ['type'], columns = ['tendency'], 
                            aggfunc = [len], values = ['title'], 
                            fill_value = 0, margins = True)) 
# print()
# {1: '银监会', 2: '保监会', 3: '中国人寿', 4: '建行北分', 5: '中国人保'}
# types = ['银监会', '建行北分']
# types = [ '中国人保']# '保监会', '中国人寿',
# combined_data = combined_data[combined_data['type'].isin(types)]
# print(combined_data.shape)  

print(combined_data['tendency'].value_counts())
combined_data.iloc[:2, :]

cbirc_result/class/result/cbirc_class_predict_采集_types(5)_2018-11-21.xlsx
cbirc_result/class/result/cbirc_class_predict_采集_types(5)_2018-11-22.xlsx
cbirc_result/class/result/cbirc_class_predict_采集_types(5)_2018-11-23.xlsx
cbirc_result/class/result/cbirc_class_predict_采集_types(5)_2018-11-24.xlsx
cbirc_result/class/result/cbirc_class_predict_采集_types(5)_2018-11-25.xlsx
(51061, 11)
            len              
          title              
tendency     -1      0    All
type                         
中国人保      18017  33044  51057
All       18016  33041  51057
 0    33044
-1    18017
Name: tendency, dtype: int64


Unnamed: 0,id,gather_type,type,urlhash,predict_label,label,title,content,group_id,publishtime,tendency
0,9196833,系统采集,中国人保,7090836087712654336,噪音,,[看，我们是被执行人]执行通知书、报告财产令、执行告知书(11月12-14日)立案案件,根据《中华人民共和国民事诉讼法》第二百四十条“执行员接到申请执行书或者移交执行书，应当向被执...,微信,2018-11-21 00:01:14,0
1,9237521,系统采集,中国人保,4322856188218469888,噪音,,【青 · 榜样】披星戴月风雨兼程，勠力同心攻坚克难,厉！害！啦！ 在向家坝电厂2018年度防汛总结暨2018-2019年度岁修动员会上，电气维修...,微信,2018-11-21 04:55:37,0


In [32]:
fea_filename = 'cbirc_result/tendency/result/%s_%s_tendency_predict_mysql_20181126(1121-1125).xlsx'%(gather_types, 
                                                                                                     proj_name_dict[types])
print(fea_filename)
print(combined_data['tendency'].value_counts())

N = 300 # 每类 N 条数据
class_n = int(combined_data['predict_label'].unique().shape[0])
n = int(N / class_n) + 200

print('正负各 %s 条，共 %s 类， 每类各 %s 条'%(N, class_n, n))
with pd.ExcelWriter(fea_filename) as writer:
    for tendency in combined_data['tendency'].unique():
        tmp_data = pd.DataFrame()
        sel_data = combined_data[combined_data['tendency'] == tendency]        
        for predict_label in combined_data['predict_label'].unique():
            label_data = sel_data[sel_data['predict_label'] == predict_label]
            if label_data.shape[0] > n:
                sel_label_data = label_data.sample(n = n, axis = 0, random_state=3)
            else :
                sel_label_data = label_data
            tmp_data = pd.concat([tmp_data, sel_label_data], axis = 0)        
            print('tendency: %s, predict_label: %s, size: %s'%(tendency, predict_label, tmp_data.shape))

        if gather_types == '补录':
            t_n = tmp_data.shape[0]
        else :            
            if tmp_data.shape[0] > N:
                t_n = N
            else :
                t_n = tmp_data.shape[0]
        
        tmp_data = tmp_data.sample(n = t_n, axis = 0, random_state=3)
        tmp_data.to_excel(writer,str(tendency), index = False)        
        print(tmp_data.pivot_table(index = ['tendency', 'type'], 
                                    columns = ['predict_label'], 
                                    values = 'title', aggfunc=len, 
                                    fill_value=0, margins=True))    
        print()
    writer.save()

cbirc_result/tendency/result/采集_中国人保_tendency_predict_mysql_20181126(1121-1125).xlsx
 0    33044
-1    18017
Name: tendency, dtype: int64
正负各 300 条，共 10 类， 每类各 230 条
tendency: 0, predict_label: 噪音, size: (230, 11)
tendency: 0, predict_label: 交通, size: (460, 11)
tendency: 0, predict_label: 行业, size: (690, 11)
tendency: 0, predict_label: 资本市场, size: (920, 11)
tendency: 0, predict_label: 公司内部管理, size: (1150, 11)
tendency: 0, predict_label: 监管, size: (1248, 11)
tendency: 0, predict_label: 其他相关报道, size: (1478, 11)
tendency: 0, predict_label: 产品销售, size: (1706, 11)
tendency: 0, predict_label: 消费服务, size: (1936, 11)
tendency: 0, predict_label: 环保, size: (2004, 11)
predict_label  交通  产品销售  公司内部管理  其他相关报道  噪音  消费服务  环保  监管  行业  资本市场  All
tendency type                                                           
0        中国人保  27    35      29      31  45    36  14  16  28    39  300
All            27    35      29      31  45    36  14  16  28    39  300

tendency: -1, predict_label: 噪音, size: (2

## 本地模型

### 八分类

#### CBRC

In [75]:
from sklearn.externals import joblib
pipeline_old = joblib.load( "model/cbrc_8classifier_1015.pkl.z")

url = 'http://47.93.183.157:6001/judge_correlation_b'
col_name = 'cor'
types = 1

combined_data['title_content'] = combined_data['title'].astype(str) + '。' + combined_data['content'].astype(str)
title_content = pre_cor_cbrc.handle_contents(combined_data['title_content'].tolist())
print(len(title_content))

#### CIRC

In [92]:
from sklearn.externals import joblib
pipeline_old = joblib.load( "model/circ_8classifier_1113.pkl.z")
# pipeline_old = joblib.load( "model/circ_picc_10classifier_1118.pkl.z")

url = 'http://47.93.183.157:10000/judge_correlation_i'
col_name = 'cor'
types = 5

combined_data['title_content'] = combined_data['title'].astype(str) + '。' + combined_data['content'].astype(str)
title_content = pre_cor_circ.handle_contents(combined_data['title_content'].tolist())
print(len(title_content))

43


#### pre

In [93]:
local_label = pipeline_old.predict(title_content)
local_proba = pipeline_old.predict_proba(title_content)

In [94]:
combined_data['local_label'] = local_label
combined_data['local_proba'] = local_proba.max(axis = 1)
combined_data['local_label'] = combined_data['local_label'].apply(lambda x:class_name_dict[x])
print(combined_data.shape)
combined_data.iloc[:2, :]

(43, 13)


Unnamed: 0,gather_type,type,urlhash,predict_label,label,title,content,group_id,publishtime,tendency,title_content,local_label,local_proba
0,补录,中国人保,-1451164327749029888,环保,,11月18日21时40分新疆和田地区于田县发生3.0级地震,据中国地震台网测定，北京时间2018年11月18日21时40分在新疆和田地区于田县（北纬36...,新闻,2018-11-18 21:58:50,-1,11月18日21时40分新疆和田地区于田县发生3.0级地震。据中国地震台网测定，北京时间20...,噪音,0.605084
0,补录,中国人保,-6173404825726358528,监管,,周亮：民营经济离场论调极其错误 银保监会对国有和民营经济一视同仁,11月19日，在第九届财新峰会上，中国银行保险监督管理委员会副主席周亮表示，改革开放40年中...,新闻,2018-11-19 11:09:19,0,周亮：民营经济离场论调极其错误 银保监会对国有和民营经济一视同仁。11月19日，在第九届财新...,监管,0.915518


In [95]:
combined_data['local_label'].value_counts()

噪音        13
行业        12
公司内部管理     5
消费服务       5
其他相关报道     4
监管         3
资本市场       1
Name: local_label, dtype: int64

#### 线上线下一致性: mysql 与 local

In [96]:
combined_data['R_W'] = combined_data.apply(lambda x: 'Right' if x['local_label'] == x['predict_label'] else 'Wrong', axis = 1)
print(combined_data[combined_data['R_W'] == 'Right'].shape[0]/combined_data.shape[0])
print(combined_data['R_W'].value_counts())
combined_data[combined_data['R_W'] == 'Wrong'].pivot_table(index = ['local_label'], columns = ['predict_label'], 
                                                            aggfunc = [len], values = ['urlhash'], 
                                                            fill_value = 0, margins = True)

0.46511627906976744
Wrong    23
Right    20
Name: R_W, dtype: int64


Unnamed: 0_level_0,len,len,len,len,len,len,len
Unnamed: 0_level_1,urlhash,urlhash,urlhash,urlhash,urlhash,urlhash,urlhash
predict_label,交通,产品销售,公司内部管理,环保,监管,行业,All
local_label,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3
公司内部管理,1,1,0,0,0,1,3
其他相关报道,1,1,0,0,0,0,2
噪音,1,0,0,9,0,1,11
消费服务,1,0,1,1,0,0,3
行业,0,0,0,0,4,0,4
All,4,2,1,10,4,2,23


#### 线上线下一致性: online 与 local

In [97]:
combined_data['id'] = range(combined_data.shape[0])
combined_data['title'] = combined_data['title'].astype(str) 
combined_data['content'] = combined_data['content'].astype(str)

In [102]:
data = {"types":types,"record":combined_data.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
# url = "http://47.93.77.19:6001/judge_correlation_yjh"
# col_name = 'sec'

# parse_data = get_server_res_yjh(data, url, col_name)
parse_data, elapsed_time = get_server_res(data, url, col_name)
print('elapsed_time: ', elapsed_time)
parse_data.columns = ['id', 'online_label']
parse_data.head()

elapsed_time:  0.00


Unnamed: 0,id,online_label
0,0,8
1,1,1
2,2,2
3,3,7
4,4,6


In [103]:
combined_data = pd.merge(combined_data, parse_data, on  = 'id')
print(combined_data.shape)
combined_data['online_label'] = combined_data['online_label'].apply(lambda x:class_name_dict[x])
combined_data['O_R_W'] = combined_data.apply(lambda x: 'Right' if x['local_label'] == x['online_label'] else 'Wrong', axis = 1)
print(combined_data[combined_data['O_R_W'] == 'Right'].shape[0]/combined_data.shape[0])
print(combined_data['O_R_W'].value_counts())
combined_data[combined_data['O_R_W'] == 'Wrong'].pivot_table(index = ['local_label'], columns = ['online_label'], 
                                                            aggfunc = [len], values = ['urlhash'], 
                                                            fill_value = 0, margins = True)

(43, 16)
0.7674418604651163
Right    33
Wrong    10
Name: O_R_W, dtype: int64


Unnamed: 0_level_0,len,len,len,len,len,len
Unnamed: 0_level_1,urlhash,urlhash,urlhash,urlhash,urlhash,urlhash
online_label,交通,公司内部管理,噪音,监管,行业,All
local_label,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
公司内部管理,0,0,2,1,0,3
其他相关报道,0,0,1,0,0,1
噪音,2,0,0,0,1,3
消费服务,1,1,0,0,0,2
行业,0,0,0,1,0,1
All,3,1,3,2,1,10


#### 线上线下一致性: online 与 mysql

In [104]:
combined_data['O_R_W'] = combined_data.apply(lambda x: 'Right' if x['predict_label'] == x['online_label'] else 'Wrong', axis = 1)
print(combined_data[combined_data['O_R_W'] == 'Right'].shape[0]/combined_data.shape[0])
print(combined_data['O_R_W'].value_counts())
# combined_data[combined_data['O_R_W'] == 'Wrong'].pivot_table(index = ['predict_label'], 
#                                                              columns = ['online_label'], 
#                                                              aggfunc = [len], values = ['urlhash'], 
#                                                              fill_value = 0, margins = True)

0.5348837209302325
Right    23
Wrong    20
Name: O_R_W, dtype: int64


### 倾向性

In [110]:
combined_data.iloc[:2, :]

Unnamed: 0,type,urlhash,predict_label,label,title,content,group_id,publishtime,tendency,title_content
10724,中国人保,4823901446525329408,噪音,,因祸得福！CDR基金惊现大动作,渐渐被市场遗忘CDR基金，最近又曝出新动向。\n\n11月7日晚间，中国人保A股IPO网下初...,新闻客户端,2018-11-11 12:28:47,-1,因祸得福！CDR基金惊现大动作。渐渐被市场遗忘CDR基金，最近又曝出新动向。\n\n11月7...
4469,中国人保,2875199957075597824,噪音,,【招聘】中国人民财产保险股份有限公司珠海市分公司2019年校招,一 公司简介 （一）中国人民财产保险股份有限公司简介 中国人民财产保险股份有限公司（PICC...,微信,2018-11-13 13:29:47,0,【招聘】中国人民财产保险股份有限公司珠海市分公司2019年校招。一 公司简介 （一）中国人民...


#### CBRC

In [None]:
from sklearn.externals import joblib
pipeline_old = joblib.load( "model/cbrc_tendency_pipeline_20181114.pkl.z")

url = 'http://47.93.183.157:6001/tendency_analysis_b'
col_name = 'tendency'
types = 1

combined_data['title_content'] = combined_data['title'].astype(str) + '。' + combined_data['content'].astype(str)
title_content = pre_cor_cbrc.handle_contents(combined_data['title_content'].tolist())
print(len(title_content))

#### CIRC

In [118]:
from sklearn.externals import joblib
pipeline_old = joblib.load( "model/circ_chapter_tendency_1113.pkl.z")

url = 'http://47.93.183.157:10000/tendency_analysis_i'
col_name = 'tendency'
types = 5

combined_data['title_content'] = combined_data['title'].astype(str) + '。' + combined_data['content'].astype(str)
title_content = pre_cor_circ.handle_contents(combined_data['title_content'].tolist())
print(len(title_content))

43


#### pre

In [119]:
local_label = pipeline_old.predict(title_content)
local_proba = pipeline_old.predict_proba(title_content)

In [120]:
combined_data['local_label'] = local_label
combined_data['local_proba'] = local_proba.max(axis = 1)
# combined_data['local_label'] = combined_data['local_label'].apply(lambda x:class_name_dict[x])
print(combined_data.shape)
combined_data.iloc[:2, :]

(43, 13)


Unnamed: 0,gather_type,type,urlhash,predict_label,label,title,content,group_id,publishtime,tendency,title_content,local_label,local_proba
0,补录,中国人保,-1451164327749029888,环保,,11月18日21时40分新疆和田地区于田县发生3.0级地震,据中国地震台网测定，北京时间2018年11月18日21时40分在新疆和田地区于田县（北纬36...,新闻,2018-11-18 21:58:50,-1,11月18日21时40分新疆和田地区于田县发生3.0级地震。据中国地震台网测定，北京时间20...,-1,1.0
0,补录,中国人保,-6173404825726358528,监管,,周亮：民营经济离场论调极其错误 银保监会对国有和民营经济一视同仁,11月19日，在第九届财新峰会上，中国银行保险监督管理委员会副主席周亮表示，改革开放40年中...,新闻,2018-11-19 11:09:19,0,周亮：民营经济离场论调极其错误 银保监会对国有和民营经济一视同仁。11月19日，在第九届财新...,0,1.0


#### 线上线下一致性: mysql 与 local

In [121]:
combined_data['R_W'] = combined_data.apply(lambda x: 'Right' if x['local_label'] == x['tendency'] else 'Wrong', axis = 1)
print(combined_data[combined_data['R_W'] == 'Right'].shape[0]/combined_data.shape[0])
print(combined_data['R_W'].value_counts())
combined_data[combined_data['R_W'] == 'Wrong'].pivot_table(index = ['local_label'], columns = ['tendency'], 
                                                            aggfunc = [len], values = ['urlhash'], 
                                                            fill_value = 0, margins = True)

0.813953488372093
Right    35
Wrong     8
Name: R_W, dtype: int64


Unnamed: 0_level_0,len,len
Unnamed: 0_level_1,urlhash,urlhash
tendency,0,All
local_label,Unnamed: 1_level_3,Unnamed: 2_level_3
-1,8,8
All,8,8


#### 线上线下一致性: online 与 local

In [122]:
combined_data['id'] = range(combined_data.shape[0])
combined_data['title'] = combined_data['title'].astype(str) 
combined_data['content'] = combined_data['content'].astype(str)

In [123]:
data = {"types":types, "record":combined_data.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
# url = "http://47.93.77.19:6001/judge_correlation_yjh"
# col_name = 'sec'

# parse_data = get_server_res_yjh(data, url, col_name)
parse_data, elapsed_time = get_server_res(data, url, col_name)
print('elapsed_time: ', elapsed_time)
parse_data.columns = ['id', 'online_label']
parse_data.head()

elapsed_time:  2.00


Unnamed: 0,id,online_label
0,0,-1
1,1,0
2,2,-1
3,3,0
4,4,0


In [124]:
# combined_data.head()
# combined_data.iloc[:2, :]

In [125]:
combined_data = pd.merge(combined_data, parse_data, on  = 'id')
print(combined_data.shape)
# combined_data['online_label'] = combined_data['online_label'].apply(lambda x:class_name_dict[x])
combined_data['O_R_W'] = combined_data.apply(lambda x: 'Right' if x['local_label'] == x['online_label'] else 'Wrong', axis = 1)
print(combined_data[combined_data['O_R_W'] == 'Right'].shape[0]/combined_data.shape[0])
print(combined_data['O_R_W'].value_counts())
combined_data[combined_data['O_R_W'] == 'Wrong'].pivot_table(index = ['local_label'], columns = ['online_label'], 
                                                            aggfunc = [len], values = ['urlhash'], 
                                                            fill_value = 0, margins = True)

(43, 16)
0.9069767441860465
Right    39
Wrong     4
Name: O_R_W, dtype: int64


Unnamed: 0_level_0,len,len
Unnamed: 0_level_1,urlhash,urlhash
online_label,0,All
local_label,Unnamed: 1_level_3,Unnamed: 2_level_3
-1,4,4
All,4,4


#### 线上线下一致性: online 与 mysql

In [126]:
combined_data['O_R_W'] = combined_data.apply(lambda x: 'Right' if x['tendency'] == x['online_label'] else 'Wrong', axis = 1)
print(combined_data[combined_data['O_R_W'] == 'Right'].shape[0]/combined_data.shape[0])
print(combined_data['O_R_W'].value_counts())
# combined_data[combined_data['O_R_W'] == 'Wrong'].pivot_table(index = ['predict_label'], columns = ['online_label'], 
#                                                             aggfunc = [len], values = ['urlhash'], 
#                                                             fill_value = 0, margins = True)

0.9069767441860465
Right    39
Wrong     4
Name: O_R_W, dtype: int64


# 保存本文件

In [None]:
if 0:
    import datetime as dt
    
    def output_HTML(read_file, output_file):
        from nbconvert import HTMLExporter
        import codecs
        import nbformat
        exporter = HTMLExporter()
        # read_file is '.ipynb', output_file is '.html'
        output_notebook = nbformat.read(read_file, as_version=4)
        output, resources = exporter.from_notebook_node(output_notebook)
        codecs.open(output_file, 'w', encoding='utf-8').write(output)

    html_file_folder = 'html_files'
    if not os.path.exists(html_file_folder):
        os.makedirs(html_file_folder)

    today = dt.datetime.now().strftime('%Y%m%d')
    current_file = 'circ_cor_model_2_train.ipynb'
    output_file = 'html_files\%s_%s.html'%(os.path.splitext(current_file)[0], today)
    output_HTML(current_file, output_file)