# 本文件说明
- 数据库里导出数据，本地模型、线上模型测试

# 基本设置

In [1]:
import numpy as np
import pandas as pd

import os

import requests,json
from sklearn.externals import joblib

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
from toolkits.setup.date_time import get_day_list
from toolkits.setup import specific_func

from toolkits.nlp import pre_cor_circ
from toolkits.nlp import pre_cor_cbrc

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.186 seconds.
Prefix dict has been built succesfully.


In [3]:
%load_ext autoreload
%autoreload 2

## 一些函数

In [4]:
def get_server_res_yjh(data, url, col_name):
    '''
    POST a batch of records to a prediction endpoint and tabulate the reply.

    Parameters
    ----------
    data : dict
        JSON-serialisable request payload, e.g.
        {'record': [{'id': 0, 'title': 'ss', 'content': 'zzz'}]}.
    url : str
        Full URL of the endpoint.
    col_name : str
        Key read from every entry of the response's 'docs' list; it is also
        the name of the second column of the returned frame.

    Returns
    -------
    (pandas.DataFrame, object)
        A frame with columns ['id', col_name], one row per entry of 'docs',
        and the 'elapsed_time' value reported in the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    KeyError
        If the response lacks 'elapsed_time' or 'docs', or a doc lacks
        'id' or col_name.
    '''
    headers = {'content-type': 'application/json'}
    result = requests.post(url,
                           data=json.dumps(data),
                           headers=headers, allow_redirects=True)
    # Fail with a clear HTTP error instead of trying to decode an error page as JSON.
    result.raise_for_status()
    json_data = json.loads(result.text)
    elapsed_time = json_data['elapsed_time']
    rows = [[doc['id'], doc[col_name]] for doc in json_data['docs']]
    parse_data = pd.DataFrame(rows, columns=['id', col_name])
    return parse_data, elapsed_time

In [5]:
def get_serve_data_yjh(day_list, sql_one_day, url, col_name, save_filename):
    '''
    For each day, read rows from the database in chunks, label them through a
    prediction endpoint and save that day's labelled rows to an Excel file.

    Relies on notebook-level names: engine (database connection),
    get_server_res_yjh, class_name_dict and group_dict.

    Parameters
    ----------
    day_list : iterable
        Days to process; each value is bound to `day_select` in turn.
    sql_one_day : str
        Source text of a Python expression that is eval-ed once per day to
        obtain the SQL query; it may refer to `day_select`.
    url : str
        Endpoint passed on to get_server_res_yjh.
    col_name : str
        Response key that holds the predicted label.
    save_filename : str
        Source text of a Python expression that is eval-ed once per day to
        obtain the output Excel path; it may refer to `day_select`.

    Returns
    -------
    None
    '''
    chunksize = 1000
    for day_select in day_list:
        print('-- day_select: ', day_select)
        # NOTE(review): eval() runs arbitrary code, so only pass trusted,
        # notebook-authored expressions. The loop variable must stay named
        # `day_select` because those expressions refer to it.
        mysql_data = pd.read_sql(eval(sql_one_day), engine, chunksize=chunksize)
        day_chunks = []
        for num, tmp_data in enumerate(mysql_data, start=1):
            print('---- loop num: ', num, 'tmp_data: ', tmp_data.shape)
            data = {"record": tmp_data.loc[:, ['id', 'title', 'content']].to_dict(orient='records')}
            # The helper returns (frame, elapsed_time); unpack it so the frame
            # can be relabelled and merged.
            parse_data, elapsed_time = get_server_res_yjh(data, url, col_name)

            parse_data.columns = ['id', 'predict_label']
            parse_data['label'] = ''
            day_chunks.append(pd.merge(parse_data, tmp_data, on='id', how='inner'))

        if not day_chunks:
            # Nothing was returned for this day; there is nothing to label or save.
            print('---- no rows for day: ', day_select)
            continue

        # Latest chunk first, one concat per day instead of one per chunk.
        combined_data = pd.concat(day_chunks[::-1])
        combined_data['predict_label'] = combined_data['predict_label'].apply(lambda x: class_name_dict[x])
        combined_data['group_id'] = combined_data['group_id'].apply(lambda x: group_dict[str(x)])
        combined_data.to_excel(eval(save_filename), index=False)
        print(combined_data['predict_label'].value_counts())

In [6]:
def get_server_res(data, url, col_name):
    '''
    Call a prediction endpoint with a JSON payload and return its per-document
    results as a DataFrame.

    Parameters
    ----------
    data : dict
        JSON-serialisable request payload, e.g.
        {'record': [{'id': 0, 'title': 'ss', 'content': 'zzz'}]}.
    url : str
        Full URL of the endpoint.
    col_name : str
        Key to extract from each entry of the response's 'docs' list; used as
        the name of the second output column as well.

    Returns
    -------
    (pandas.DataFrame, object)
        Frame with columns ['id', col_name] (one row per doc) and the
        'elapsed_time' value taken from the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    KeyError
        If an expected key is missing from the response.
    '''
    response = requests.post(url,
                             data=json.dumps(data),
                             headers={'content-type': 'application/json'},
                             allow_redirects=True)
    # Surface HTTP failures directly rather than as a JSON decoding error.
    response.raise_for_status()
    json_data = json.loads(response.text)
    elapsed_time = json_data['elapsed_time']
    parse_data = pd.DataFrame(
        [[doc['id'], doc[col_name]] for doc in json_data['docs']],
        columns=['id', col_name],
    )
    return parse_data, elapsed_time

In [7]:
def get_serve_data(day_list, sql_one_day, url, col_name):
    '''
    For each day, read rows from the database, label them through a prediction
    endpoint and return all labelled rows in one DataFrame.

    Relies on notebook-level names: engine (database connection),
    get_server_res and class_name_dict.

    Parameters
    ----------
    day_list : iterable
        Days to process; each value is bound to `day_select` in turn.
    sql_one_day : str
        Source text of a Python expression that is eval-ed once per day to
        obtain the SQL query; it may refer to `day_select`.
    url : str
        Endpoint passed on to get_server_res.
    col_name : str
        Response key that holds the predicted label.

    Returns
    -------
    pandas.DataFrame
        Rows of every day stacked in day order, with 'predict_label' mapped
        to its class name and an empty 'label' column; an empty frame when
        day_list is empty.
    '''
    daily_frames = []
    for day_select in day_list:
        print('-- day_select: ', day_select)
        # NOTE(review): eval() runs arbitrary code, so only pass trusted
        # expressions. The loop variable must stay named `day_select`
        # because the expression refers to it.
        mysql_data = pd.read_sql(eval(sql_one_day), engine)
        # This step removes duplicate (title, content) rows, so the log says
        # "dedup" rather than "drop nulls".
        print('去重前：', mysql_data.shape)
        mysql_data = mysql_data.drop_duplicates(subset=['title', 'content'])
        print('去重后：', mysql_data.shape)
        data = {"record": mysql_data.loc[:, ['id', 'title', 'content']].to_dict(orient='records')}

        # col_name is a required argument of get_server_res.
        parse_data, elapsed_time = get_server_res(data, url, col_name)
        print('elapsed_time: ', elapsed_time)

        parse_data.columns = ['id', 'predict_label']
        parse_data['predict_label'] = parse_data['predict_label'].apply(lambda x: class_name_dict[x])
        parse_data['label'] = ''
        combined_cor = pd.merge(parse_data, mysql_data, on='id', how='inner')
        daily_frames.append(combined_cor)

        print(combined_cor['predict_label'].value_counts())

    # One concat at the end instead of re-copying the accumulated frame every day.
    if not daily_frames:
        return pd.DataFrame()
    return pd.concat(daily_frames, axis=0)

# 基本信息

In [8]:
# Category name -> numeric class id (ids follow the order of the names).
label_dic = dict(zip(
    ['补录', '监管', '行业', '产品销售', '资本市场', '公司内部管理',
     '消费服务', '其他相关报道', '噪音', '交通', '环保'],
    range(11),
))
# Reverse lookup: numeric class id -> category name.
class_name_dict = dict((class_id, name) for name, class_id in label_dic.items())
class_name_dict

{0: '补录',
 1: '监管',
 2: '行业',
 3: '产品销售',
 4: '资本市场',
 5: '公司内部管理',
 6: '消费服务',
 7: '其他相关报道',
 8: '噪音',
 9: '交通',
 10: '环保'}

In [9]:
# "<id>-<name>" pairs, separated by full-width commas.
group = '1-新闻，2-论坛，3-博客，4-微博，5-纸媒，6-视频，7-外媒，8-广播，9-电视，11-微信，13-新闻客户端，15-推特'
# Lookup from the id (kept as a string) to the media-group name.
group_dict = {code: name for code, name in (pair.split('-') for pair in group.split('，'))}
group_dict

{'1': '新闻',
 '11': '微信',
 '13': '新闻客户端',
 '15': '推特',
 '2': '论坛',
 '3': '博客',
 '4': '微博',
 '5': '纸媒',
 '6': '视频',
 '7': '外媒',
 '8': '广播',
 '9': '电视'}

In [10]:
# Project name -> numeric project id.
proj_dic = {
    '银监会': 1,
    '保监会': 2,
    '中国人寿': 3,
    '建行北分': 4,
    '中国人保': 5,
    '安徽银监局': 6,
}
# Reverse lookup: numeric project id -> project name.
proj_name_dict = dict(zip(proj_dic.values(), proj_dic.keys()))
proj_name_dict

{1: '银监会', 2: '保监会', 3: '中国人寿', 4: '建行北分', 5: '中国人保', 6: '安徽银监局'}

In [11]:
# Gather-type name -> numeric code.
gather_type_dic = {
    '系统采集': 0,
    '补录': 1,
    '校正': 2,
    '导入数据': 3,
    '其它': 4,
}
# Reverse lookup: numeric code -> gather-type name.
gather_type_name_dict = {gather_type_dic[name]: name for name in gather_type_dic}
gather_type_name_dict

{0: '系统采集', 1: '补录', 2: '校正', 3: '导入数据', 4: '其它'}

In [12]:
# Build the path with os.path.join so it works on any OS and does not depend
# on a literal backslash ('\p' is not a valid escape sequence).
file_path = os.path.join('cbirc_result', 'pom.json')

# 'utf-8-sig' drops a leading UTF-8 byte-order mark if the file has one.
with open(file_path, 'r', encoding='utf-8-sig') as json_file:
    cbrc_data = json.load(json_file)

# cbrc_data = pd.DataFrame.from_dict(json_data['record'], orient='index' ) 
# cbrc_data.shape

FileNotFoundError: [Errno 2] No such file or directory: 'cbirc_result\\pom.json'

In [None]:
# from langconv import *
from toolkits.nlp.langconv import *

def Traditional2Simplified(sentence):
    '''
    Convert the Traditional Chinese characters in a sentence to Simplified Chinese.
    :param sentence: the sentence to convert
    :return: the sentence as returned by the 'zh-hans' Converter
    '''
    to_simplified = Converter('zh-hans')
    return to_simplified.convert(sentence)


In [None]:
# Send one record (converted to Simplified Chinese) to the test server.
index = 481
# Build the payload directly from cbrc_data so this cell does not depend on a
# `data` variable left over from an earlier run.
data = {"record": [{'id': '1',
                    'title': Traditional2Simplified(cbrc_data['record'][index]['title']),
                    'content': Traditional2Simplified(cbrc_data['record'][index]['content'])}]}
# url = "http://47.93.77.19:6001/judge_correlation_yjh"
url = "http://192.168.0.104:8100/judge_correlation_yjh"
col_name = 'sec'

parse_data, elapsed_time = get_server_res_yjh(data, url, col_name)
print(index, '  耗时：%s s'%(elapsed_time))

In [None]:
# Show the content text of the first record in the current request payload.
data['record'][0]['content']

In [None]:
# Display a payload built from records 152..154.
# NOTE(review): the slice is already a list, so wrapping it in [...] nests the
# records one level deeper than the single-record payloads used in the other
# cells -- confirm this is intended.
{"record":[cbrc_data['record'][152:155],]}

In [None]:
# Display a single-record payload for record 154.
{"record":[cbrc_data['record'][154],]}

In [None]:
# data = {"record":combined_data.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
import time
for index in range(len(cbrc_data['record'])):
#     data = {"record":[cbrc_data['record'][index],]}
#     url = "http://47.93.77.19:6001/judge_correlation_yjh"
    data = {"record":[{'id':cbrc_data['record'][index]['id'], 
                       'title': Traditional2Simplified(cbrc_data['record'][index]['title']),  
                       'content': Traditional2Simplified(cbrc_data['record'][index]['content'])},]}
    url = "http://192.168.0.104:8100/judge_correlation_yjh"
    col_name = 'sec'
    
    parse_data , elapsed_time = get_server_res_yjh(data, url, col_name)
    print(index, '  耗时：%s s'%(elapsed_time))

In [None]:
# Send every record to the server one at a time, unmodified, and print the
# elapsed time reported per request.
import time
for index in range(len(cbrc_data['record'])):
    data = {"record":[cbrc_data['record'][index],]}
#     url = "http://47.93.77.19:6001/judge_correlation_yjh"
    url = "http://192.168.0.104:8100/judge_correlation_yjh"
    col_name = 'sec'
    
    # parse_data is overwritten on every iteration; only the last reply is kept.
    parse_data , elapsed_time = get_server_res_yjh(data, url, col_name)
    print(index, '  耗时：%s s'%(elapsed_time))

In [None]:
# Send all records to the server in a single request.
data = {"record":cbrc_data['record']}
#     url = "http://47.93.77.19:6001/judge_correlation_yjh"
url = "http://192.168.0.104:8100/judge_correlation_yjh"
col_name = 'sec'

parse_data , elapsed_time = get_server_res_yjh(data, url, col_name)
# NOTE(review): `index` is not set in this cell; the value printed is whatever
# an earlier cell left behind -- confirm whether the record count was meant.
print(index, '  耗时：%s s'%(elapsed_time))

In [None]:
# Display the frame returned by the most recent server call.
parse_data

In [None]:
# Send every record to the server one at a time, unmodified, and print the
# elapsed time reported per request (same loop as an earlier cell).
# data = {"record":combined_data.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
import time
for index in range(len(cbrc_data['record'])):
    data = {"record":[cbrc_data['record'][index],]}
#     url = "http://47.93.77.19:6001/judge_correlation_yjh"
    url = "http://192.168.0.104:8100/judge_correlation_yjh"
    col_name = 'sec'
    
    # parse_data is overwritten on every iteration; only the last reply is kept.
    parse_data , elapsed_time = get_server_res_yjh(data, url, col_name)
    print(index, '  耗时：%s s'%(elapsed_time))

In [None]:
# Build and display a one-record payload from the first record.
data = {"record":[cbrc_data['record'][0], ]}
data

In [None]:
# Look at the first record and count the loaded records.
# Only the last expression of a cell is displayed, so just the count shows up.
cbrc_data['record'][0]
len(cbrc_data['record'])

# 保险业--旧

In [13]:
# Database handle for the 'circ' project; the pd.read_sql calls in later cells use it.
# NOTE(review): presumably a SQLAlchemy engine -- confirm in toolkits.setup.specific_func.
engine = specific_func.get_engine('circ')

## mysql 数据

### 八分类

In [13]:
# Days to export for the category (8-class) check.
# Judging by the recorded output, the returned list starts the day after the
# first argument and includes the second one.
# day_select = '2018-09-09'
day_list = get_day_list('2019-03-26', '2019-03-31')
print(day_list)

['2019-03-27', '2019-03-28', '2019-03-29', '2019-03-30', '2019-03-31']


#### 获取数据--系统采集

In [17]:
# Export one Excel file per day with the stored category of system-collected
# documents plus a time window of uncorrelated documents labelled as noise.
# Label used in the output file names ('采集' = collected).
gather_types = '采集'

for day_select in day_list:
    print('-- day_select: ', day_select)

    # Correlated documents published on this day with gather_type 0
    # ('系统采集' in gather_type_dic), one row per titlehash.
    sql_one_day = "select t1.id, t1.group_id,t1.classify as predict_label,\
                        t1.title,t2.center as content, t1.publishtime as publishtime \
                        from wise_web_docinfo t1, wise_web_docinfo_center t2 \
                            where t1.id=t2.doc_id \
                                  and  date_format(t1.publishtime, '%%Y-%%m-%%d') = '{0}' \
                                  and t1.gather_type = 0 \
                                  group by t1.titlehash".format(day_select) # 
    # # after de-duplication by titlehash
    circ_cor = pd.read_sql(sql_one_day, engine)
    print('circ_cor: ', circ_cor.shape  )
    
    # Uncorrelated documents
    sql_one_day = "select t1.id, t1.group_id,t1.title,t2.center as content, t1.publishtime as publishtime \
                        from wise_web_docinfo_uncorr t1, wise_web_docinfo_center_uncurr t2 \
                            where t1.id=t2.doc_id \
                                  and t1.publishtime >= '{0} 8:00:00' \
                              and t1.publishtime <= '{0} 14:00:00'".format(day_select)
    # only a time window of the day (08:00-14:00)
    circ_uncor = pd.read_sql(sql_one_day, engine)
    circ_uncor.insert(2, 'predict_label', 8) # 8 is the '噪音' (noise) class in label_dic
    print('circ_uncor: ', circ_uncor.shape)

    # Stack both sets, then drop repeated titles and rows without content.
    circ_data = pd.concat([circ_cor, circ_uncor], axis = 0)
    print('去重前：', circ_data.shape)
    circ_data = circ_data.drop_duplicates(subset = 'title')
    print('去重后：', circ_data.shape)  
    circ_data = circ_data.dropna(subset = ['content'], axis = 0)
    print('去空值后：', circ_data.shape)  

    # Replace numeric ids by readable names and add an empty column for manual labels.
    circ_data['predict_label'] = circ_data['predict_label'].apply(lambda x:class_name_dict[x])
    circ_data['group_id'] = circ_data['group_id'].apply(lambda x:group_dict[str(x)])
    circ_data.insert(3, 'label', '')
    fea_filename = 'result/circ_result_class/result/%s_circ_class_predict_mysql_%s.xlsx'%(gather_types, 
                                                                                          day_select)
    circ_data.to_excel(fea_filename, index = False)
    print(circ_data.shape)
    print(circ_data['predict_label'].value_counts())

-- day_select:  2019-03-02
circ_cor:  (5202, 6)
circ_uncor:  (12811, 6)
去重前： (18013, 6)
去重后： (13402, 6)
去空值后： (13402, 6)
(13402, 7)
噪音        8200
资本市场      2173
监管         817
消费服务       726
产品销售       535
行业         340
公司内部管理     321
其他相关报道     290
Name: predict_label, dtype: int64
-- day_select:  2019-03-03
circ_cor:  (4003, 6)
circ_uncor:  (7950, 6)
去重前： (11953, 6)
去重后： (8422, 6)
去空值后： (8422, 6)
(8422, 7)
噪音        4421
资本市场      1830
监管         511
消费服务       488
产品销售       411
行业         320
公司内部管理     245
其他相关报道     196
Name: predict_label, dtype: int64
-- day_select:  2019-03-04
circ_cor:  (9473, 6)
circ_uncor:  (18880, 6)
去重前： (28353, 6)
去重后： (20752, 6)
去空值后： (20752, 6)
(20752, 7)
噪音        11321
资本市场       4502
监管         1399
消费服务        952
产品销售        738
行业          715
公司内部管理      588
其他相关报道      537
Name: predict_label, dtype: int64
-- day_select:  2019-03-05
circ_cor:  (9010, 6)
circ_uncor:  (23014, 6)
去重前： (32024, 6)
去重后： (22100, 6)
去空值后： (22100, 6)
(22100, 7)
噪音    

#### 合并 & 保存

In [20]:
# Read the per-day Excel files back and stack them into one frame.
combined_data = pd.DataFrame()
for day_select in day_list:
    file_name = 'result/circ_result_class/result/%s_circ_class_predict_mysql_%s.xlsx'%(gather_types, day_select)
    # Days without an exported file are skipped silently.
    if os.path.isfile(file_name):
        print(file_name)
        tmp_data = pd.read_excel(file_name)
        combined_data = pd.concat([combined_data, tmp_data], axis = 0)

# Unless the '补录' export itself is being processed, drop rows whose label is '补录'.
if gather_types != '补录':
    combined_data = combined_data[combined_data['predict_label'] != '补录']
print(combined_data.shape)  
# Drop repeated titles across days, then rows without content.
print('去重前：', combined_data.shape)
combined_data = combined_data.drop_duplicates(subset = 'title')
print('去重后：', combined_data.shape)  
combined_data = combined_data.dropna(subset = ['content'], axis = 0)
print('去空值后：', combined_data.shape)  

print(combined_data['predict_label'].value_counts())
combined_data.head()

result/circ_result_class/result/采集_circ_class_predict_mysql_2019-03-02.xlsx
result/circ_result_class/result/采集_circ_class_predict_mysql_2019-03-03.xlsx
result/circ_result_class/result/采集_circ_class_predict_mysql_2019-03-04.xlsx
result/circ_result_class/result/采集_circ_class_predict_mysql_2019-03-05.xlsx
(64676, 7)
去重前： (64676, 7)
去重后： (61048, 7)
去空值后： (61008, 7)
噪音        35276
资本市场      11490
监管         4597
消费服务       2733
产品销售       2056
行业         1876
公司内部管理     1508
其他相关报道     1472
Name: predict_label, dtype: int64


Unnamed: 0,id,group_id,predict_label,label,title,content,publishtime
0,14246095,新闻,资本市场,,中国人寿(601628)融资融券信息(03-01),中国人寿(601628)融资融券信息(03-01)2019年03月02日 07:39来源： ...,2019-03-02 07:39:01
1,14245525,微信,资本市场,,指数再次逼近3000点，下周一能否突破,点击“Top股市之路” 订阅，获取更多股市逻辑！ 昨天复盘看了很久，觉得大金融今天是很有可能...,2019-03-02 06:29:54
2,14243343,微信,产品销售,,香港保险六问详解之二：如何鉴别保障类产品之重疾险,相信有过购买保险念头的朋友都清楚，保险合同条款晦涩难懂，经常给人一团乱麻的感觉，让人各种理不...,2019-03-02 01:24:46
3,14287687,新闻,资本市场,,欢迎境外长期资金进入国内市场,"3月1日早间,MSCI(明晟公司)表示,将扩大中国A股在MSCI全球基准指数中的纳入因子...",2019-03-02 00:00:00
4,14248264,新闻,公司内部管理,,华夏保险衡水中支营销渠道召开三超一启动会,2月25日，华夏保险衡水中支营销渠道在职场会议室隆重召开“盛世定乾坤 决胜三超一”启动会，长...,2019-03-02 09:31:41


In [21]:
# Write a sample of every predicted category to its own sheet of one workbook.
fea_filename = 'result/circ_result_class/result/%s_circ_class_predict_mysql_20190306(0302-0305).xlsx'%gather_types
# sel_col = ['行业','资本市场', '消费服务', '公司内部管理', '监管']
# sel_col = ['其他相关报道','行业',  '公司内部管理', '监管']
sel_col = combined_data['predict_label'].unique().tolist()
sel_data = combined_data[combined_data['predict_label'].isin(sel_col)]
print(sel_data['predict_label'].value_counts())
with pd.ExcelWriter(fea_filename) as writer:
    for label in sel_data['predict_label'].unique():
        tmp_data = sel_data[sel_data['predict_label'] == label]
        if gather_types == '补录':
            # '补录' exports keep every row (shuffled).
            N = tmp_data.shape[0]
        else:
            # Otherwise keep at most 200 rows per category.
            N = min(200, tmp_data.shape[0])
    #         if label == '公司内部管理': 
    #             N = 200
        tmp_data.sample(n = N, axis = 0, random_state=3).to_excel(writer, sheet_name = label, index = False)
    # No explicit writer.save(): leaving the with-block saves and closes the
    # workbook, and ExcelWriter.save() no longer exists in pandas 2.x.

噪音        35276
资本市场      11490
监管         4597
消费服务       2733
产品销售       2056
行业         1876
公司内部管理     1508
其他相关报道     1472
Name: predict_label, dtype: int64


### 倾向性

In [15]:
# Days to export for the tendency (sentiment) check.
# Judging by the recorded output, the returned list starts the day after the
# first argument and includes the second one.
# day_select = '2018-09-09'
day_list = get_day_list('2019-02-22', '2019-02-25')
print(day_list)

['2019-02-23', '2019-02-24', '2019-02-25']


#### 获取数据

In [16]:
# Export one Excel file per day with the stored category and tendency of
# system-collected documents, and print a tendency x category count table.
for day_select in day_list:
    print('-- day_select: ', day_select)

    # Correlated documents published on this day with gather_type 0
    # ('系统采集' in gather_type_dic), one row per titlehash.
    sql_one_day = "select t1.id, t1.group_id,t1.classify as predict_label, t1.tendency,\
                        t1.title,t2.center as content, t1.publishtime as publishtime \
                        from wise_web_docinfo t1, wise_web_docinfo_center t2 \
                            where t1.id=t2.doc_id \
                                  and  date_format(t1.publishtime, '%%Y-%%m-%%d') = '{0}' \
                                  and t1.gather_type = 0 \
                                  group by t1.titlehash".format(day_select) # 
    # # after de-duplication by titlehash
    circ_cor = pd.read_sql(sql_one_day, engine)
    print('circ_cor: ', circ_cor.shape  )

    # Drop repeated titles, then rows without content.
    circ_data = circ_cor
    print('去重前：', circ_data.shape)
    circ_data = circ_data.drop_duplicates(subset = 'title')
    print('去重后：', circ_data.shape)  
    circ_data = circ_data.dropna(subset = ['content'], axis = 0)
    print('去空值后：', circ_data.shape)  

    # Replace numeric ids by readable names and add an empty column for manual labels.
    circ_data['predict_label'] = circ_data['predict_label'].apply(lambda x:class_name_dict[x])
    circ_data['group_id'] = circ_data['group_id'].apply(lambda x:group_dict[str(x)])
    circ_data.insert(4, 'label', '')
    fea_filename = 'result/circ_result_tendency/result/circ_tendency_predict_mysql_%s.xlsx'%day_select
    circ_data.to_excel(fea_filename, index = False)
    print(circ_data.shape)
    
    # Row counts per (tendency, category), with totals in the 'All' margins.
    print(circ_data.pivot_table(index = ['tendency'], columns = ['predict_label'], 
                          values = 'title', aggfunc=len, 
                          fill_value=0, margins=True))

-- day_select:  2019-03-27
circ_cor:  (8050, 7)
去重前： (8050, 7)
去重后： (8050, 7)
去空值后： (8050, 7)
(8050, 8)
predict_label  产品销售  公司内部管理  其他相关报道  消费服务   监管   行业  资本市场   All
tendency                                                       
-1               15     111       1   259   65   91   217   759
0               831     893     700  1106  793  709  2259  7291
All             846    1004     701  1365  858  800  2476  8050
-- day_select:  2019-03-28
circ_cor:  (7964, 7)
去重前： (7964, 7)
去重后： (7964, 7)
去空值后： (7964, 7)
(7964, 8)
predict_label  产品销售  公司内部管理  其他相关报道  消费服务   监管   行业  资本市场   All
tendency                                                       
-1               18     133       2   265   40   59   230   747
0               709     956     659  1133  755  798  2207  7217
All             727    1089     661  1398  795  857  2437  7964
-- day_select:  2019-03-29
circ_cor:  (8445, 7)
去重前： (8445, 7)
去重后： (8445, 7)
去空值后： (8445, 7)
(8445, 8)
predict_label  产品销售  公司内部管理  其他相关报道  消费服务   监管  

#### 合并 & 保存

In [14]:
# Read the per-day tendency files back and stack them into one frame.
# Every day in day_list must have a file; a missing one makes read_excel fail.
combined_data = pd.DataFrame()
for day_select in day_list:
    tmp_data = pd.read_excel('result/circ_result_tendency/result/circ_tendency_predict_mysql_%s.xlsx'%day_select)
    combined_data = pd.concat([combined_data, tmp_data], axis = 0)

# Drop rows labelled '补录', repeated titles across days, and rows without content.
combined_data = combined_data[combined_data['predict_label'] != '补录']
print(combined_data.shape)  
print('去重前：', combined_data.shape)
combined_data = combined_data.drop_duplicates(subset = 'title')
print('去重后：', combined_data.shape)  
combined_data = combined_data.dropna(subset = ['content'], axis = 0)
print('去空值后：', combined_data.shape)  

print(combined_data['tendency'].value_counts())
# Row counts per (tendency, media group) and category, with 'All' margins.
combined_data.pivot_table(index = ['tendency', 'group_id'], 
                                columns = ['predict_label'], 
                                values = 'title', aggfunc=len, 
                                fill_value=0, margins=True)
# combined_data.head()
# combined_data.head()

(30360, 8)
去重前： (30360, 8)
去重后： (27912, 8)
去空值后： (27905, 8)
 0    25287
-1     2618
Name: tendency, dtype: int64


Unnamed: 0_level_0,predict_label,产品销售,公司内部管理,其他相关报道,消费服务,监管,行业,资本市场,All
tendency,group_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-1,博客,0,0,0,2,0,0,5,7
-1,外媒,0,0,0,1,0,0,1,2
-1,微信,1,29,0,199,17,43,114,403
-1,微博,37,73,2,290,57,16,68,543
-1,新闻,10,271,3,417,67,154,591,1513
-1,新闻客户端,2,31,0,23,3,10,53,122
-1,纸媒,0,0,0,12,2,1,5,20
-1,论坛,0,2,0,5,1,0,0,8
0,博客,12,1,2,5,0,0,100,120
0,外媒,4,15,6,8,3,5,20,61


In [18]:
# combined_data_sel = combined_data # [combined_data['predict_label'].isin(['交通', '环保'])]
# combined_data_sel.to_excel('circ_result_tendency/circ_tendency_data_20181224(1217-1219).xlsx', index = False)
# combined_data_sel['tendency'].value_counts()

In [15]:
# Write a category-balanced sample of each tendency value to its own sheet.
fea_filename = 'result/circ_result_tendency/result/circ_tendency_predict_mysql_20190401(0327-0331).xlsx'
# sel_col = combined_data['predict_label'].unique().tolist()
# sel_data = combined_data[combined_data['tendency'].isin(sel_col)]
print(combined_data['tendency'].value_counts())

N = 400 # rows to keep per tendency value (one sheet each)
class_n = int(combined_data['predict_label'].unique().shape[0])
# Per-category cap before the final down-sample: an even share of N plus 100.
n = int(N / class_n) + 100

print('正负各 %s 条，共 %s 类， 每类各 %s 条'%(N, class_n, n))
with pd.ExcelWriter(fea_filename) as writer:
    for tendency in combined_data['tendency'].unique():
        tmp_data = pd.DataFrame()
        sel_data = combined_data[combined_data['tendency'] == tendency]    
        print(sel_data.pivot_table(index = ['tendency'], 
                            columns = ['predict_label'], 
                            values = 'title', aggfunc=len, 
                            fill_value=0, margins=True))  
        for predict_label in combined_data['predict_label'].unique():
            label_data = sel_data[sel_data['predict_label'] == predict_label]
            # Take at most n rows of this category.
            if label_data.shape[0] > n:
                sel_label_data = label_data.sample(n = n, axis = 0, random_state=3)
            else :
                sel_label_data = label_data
            tmp_data = pd.concat([tmp_data, sel_label_data], axis = 0)        
            print('tendency: %s, predict_label: %s, size: %s'%(tendency, predict_label, tmp_data.shape))
        
        # Never ask for more rows than were collected: sample() raises when n
        # exceeds the population, e.g. for a rare tendency value.
        t_n = min(N, tmp_data.shape[0])
        
        tmp_data = tmp_data.sample(n = t_n, axis = 0, random_state=3)
        tmp_data.to_excel(writer, sheet_name = str(tendency), index = False)
        print(tmp_data.pivot_table(index = ['tendency'], 
                                    columns = ['predict_label'], 
                                    values = 'title', aggfunc=len, 
                                    fill_value=0, margins=True))    
    # No explicit writer.save(): leaving the with-block saves and closes the
    # workbook, and ExcelWriter.save() no longer exists in pandas 2.x.

 0    25287
-1     2618
Name: tendency, dtype: int64
正负各 400 条，共 7 类， 每类各 157 条
predict_label  产品销售  公司内部管理  其他相关报道  消费服务    监管    行业  资本市场    All
tendency                                                          
0              2775    2778    2235  3782  2196  2323  9198  25287
All            2775    2778    2235  3782  2196  2323  9198  25287
tendency: 0, predict_label: 公司内部管理, size: (157, 8)
tendency: 0, predict_label: 资本市场, size: (314, 8)
tendency: 0, predict_label: 消费服务, size: (471, 8)
tendency: 0, predict_label: 其他相关报道, size: (628, 8)
tendency: 0, predict_label: 产品销售, size: (785, 8)
tendency: 0, predict_label: 监管, size: (942, 8)
tendency: 0, predict_label: 行业, size: (1099, 8)
predict_label  产品销售  公司内部管理  其他相关报道  消费服务  监管  行业  资本市场  All
tendency                                                    
0                61      65      56    55  63  47    53  400
All              61      65      56    55  63  47    53  400
predict_label  产品销售  公司内部管理  其他相关报道  消费服务   监管   行业  资本市场   All


### 补录数据

In [58]:
# 人工补录
sql_human_additional = "select t1.id, t1.group_id, date_format(t1.publishtime,'%%Y-%%m-%%d') as publishtime,  \
                            t1.gather_type, t1.tendency,t1.classify as mysql_label, \
                            t1.title, t2.center as content\
                            from wise_web_docinfo t1, wise_web_docinfo_center t2 \
                                where (date_format(publishtime, '%%Y-%%m-%%d') >= '{0}' and \
                                      date_format(publishtime, '%%Y-%%m-%%d') <= '{1}') and \
                                      t1.id = t2.doc_id and \
                                      t1.gather_type in (1,3) \
                            group by t1.titlehash".format('2018-09-16', '2018-12-03') 

human_additional = pd.read_sql(sql_human_additional, engine)
human_additional['group_id'] = human_additional['group_id'].apply(lambda x:group_dict[str(x)])
print('title 去重前：', human_additional.shape)
human_additional = human_additional.drop_duplicates(subset = 'title')
print('title 去重后：', human_additional.shape)  
human_additional = human_additional.drop_duplicates(subset = ['content'])
print('content 去重后：', human_additional.shape)  
human_additional = human_additional.dropna(subset = ['title'], axis = 0)
print('title 去空值后：', human_additional.shape) 
human_additional.head()

title 去重前： (573, 8)
title 去重后： (573, 8)
content 去重后： (573, 8)
title 去空值后： (573, 8)


Unnamed: 0,id,group_id,publishtime,gather_type,tendency,mysql_label,title,content
0,10933233,微信,2018-09-20,1,0,0,外资股东全搜罗：隐身中资险企，财险、寿险市场份额双双接近10%,(图片)\n\n说到外资在国内保险市场的表现，很多人首先就会想到市场份额低这一点。确实，自...
1,10910821,微信,2018-09-18,1,0,0,从“天鸽”到“山竹”，保险业用这些方法更好的“管住”风险,(图片)\n\n台风“山竹”在广东西部沿海过境已经3天，得益于国家及地方政府相关部门的及早...
2,11483454,微信,2018-10-20,1,-1,0,突发，上市公司举报“平安养老风控总监伪造公章夺控制权”,"\n\t\t\t\t\t 关于<font color=""#FF0000"">平安养老保险</..."
3,11666987,微信,2018-10-26,1,-1,0,上半年持续亏损 董事长遭逮捕华安保险精达股份玩起“二人转” | 保险,"\n\t\t\t\t\t 特华投资与<font color=""#FF0000"">华安保险<..."
4,11765888,纸媒,2018-10-31,1,0,0,2018年10月31日--视点--业内专家分析如何保障重疾险消费者权益,"\n\t\t\t\t\t \n\t\t\t\t\t今年6月,中国<font color=""..."


In [59]:
# Label the manually added records through the online endpoint and attach the
# predicted category name to each row.
human_additional['title'] = human_additional['title'].astype(str) 
human_additional['content'] = human_additional['content'].astype(str)
data = {"record":human_additional.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
url = "http://47.93.77.19:10000/judge_correlation_i"
col_name = 'cor'
parse_data, elapsed_time = get_server_res(data, url, col_name)
parse_data.columns = ['id', 'predict_label']
# Left merge keeps every record; a record missing from the reply would get NaN
# here and make the class_name_dict lookup below raise KeyError.
human_additional = pd.merge(human_additional, parse_data, on = 'id', how = 'left')
human_additional['predict_label'] = human_additional['predict_label'].apply(lambda x:class_name_dict[x])
# Empty column for manual labels.
human_additional.insert(6, 'label', '')
print(human_additional['predict_label'].value_counts())
human_additional.head()

行业        180
监管        109
噪音        106
公司内部管理    102
资本市场       44
消费服务       20
产品销售       12
Name: predict_label, dtype: int64


Unnamed: 0,id,group_id,publishtime,gather_type,tendency,mysql_label,label,title,content,predict_label
0,10933233,微信,2018-09-20,1,0,0,,外资股东全搜罗：隐身中资险企，财险、寿险市场份额双双接近10%,(图片)\n\n说到外资在国内保险市场的表现，很多人首先就会想到市场份额低这一点。确实，自...,行业
1,10910821,微信,2018-09-18,1,0,0,,从“天鸽”到“山竹”，保险业用这些方法更好的“管住”风险,(图片)\n\n台风“山竹”在广东西部沿海过境已经3天，得益于国家及地方政府相关部门的及早...,行业
2,11483454,微信,2018-10-20,1,-1,0,,突发，上市公司举报“平安养老风控总监伪造公章夺控制权”,"\n\t\t\t\t\t 关于<font color=""#FF0000"">平安养老保险</...",资本市场
3,11666987,微信,2018-10-26,1,-1,0,,上半年持续亏损 董事长遭逮捕华安保险精达股份玩起“二人转” | 保险,"\n\t\t\t\t\t 特华投资与<font color=""#FF0000"">华安保险<...",公司内部管理
4,11765888,纸媒,2018-10-31,1,0,0,,2018年10月31日--视点--业内专家分析如何保障重疾险消费者权益,"\n\t\t\t\t\t \n\t\t\t\t\t今年6月,中国<font color=""...",噪音


In [60]:
# Write the manually added records to one workbook, one sheet per predicted
# category, and print a media-group x category count table.
fea_filename = 'circ_result_class/result/补录_保监会（旧）_class_predict_mysql_20181203(0917-1203).xlsx'
print(fea_filename)

sel_col = ['噪音', '消费服务', '公司内部管理', '监管',
           '行业', '资本市场', '其他相关报道','产品销售','交通','环保']
sel_data = human_additional[human_additional['predict_label'].isin(sel_col)]
print(sel_data['predict_label'].value_counts())
print()

c_data = pd.DataFrame()
with pd.ExcelWriter(fea_filename) as writer:
    for label in sel_data['predict_label'].unique():
        tmp_data = sel_data[sel_data['predict_label'] == label]
        # Sampling all N rows keeps every row and only shuffles their order.
        N = tmp_data.shape[0]
        save_data = tmp_data.sample(n = N, axis = 0, random_state=42)
        save_data.to_excel(writer, sheet_name = label, index = False)
        c_data = pd.concat([c_data, save_data], axis = 0)
    print(c_data.pivot_table(index = ['group_id'], 
                                columns = ['predict_label'], 
                                values = 'title', aggfunc=len, 
                                fill_value=0, margins=True))      
    # No explicit writer.save(): leaving the with-block saves and closes the
    # workbook, and ExcelWriter.save() no longer exists in pandas 2.x.

circ_result_class/result/补录_保监会（旧）_class_predict_mysql_20181203(0917-1203).xlsx
行业        180
监管        109
噪音        106
公司内部管理    102
资本市场       44
消费服务       20
产品销售       12
Name: predict_label, dtype: int64

predict_label  产品销售  公司内部管理   噪音  消费服务   监管   行业  资本市场  All
group_id                                                   
微信                3      23   27     2   26   69    11  161
微博                0       0    1     0    0    0     0    1
新闻                9      72   64    15   65   87    31  343
新闻客户端             0       4    3     0    7    5     1   20
纸媒                0       3    3     1   11   18     1   37
视频                0       0    8     1    0    1     0   10
论坛                0       0    0     1    0    0     0    1
All              12     102  106    20  109  180    44  573


In [61]:
# Write the manually added records to one workbook, one sheet per tendency value.
fea_filename = 'circ_result_tendency/result/补录_保监会（旧）_tendency_predict_mysql_20181203(0917-1203).xlsx'
print(fea_filename)
print(human_additional['tendency'].value_counts())

with pd.ExcelWriter(fea_filename) as writer:
    for tendency in human_additional['tendency'].unique():
        sel_data = human_additional[human_additional['tendency'] == tendency]    
        # Sampling all t_n rows keeps every row and only shuffles their order.
        t_n = sel_data.shape[0]        
        tmp_data = sel_data.sample(n = t_n, axis = 0, random_state=3)
        tmp_data.to_excel(writer, sheet_name = str(tendency), index = False)
    # No explicit writer.save(): leaving the with-block saves and closes the
    # workbook, and ExcelWriter.save() no longer exists in pandas 2.x.

circ_result_tendency/result/补录_保监会（旧）_tendency_predict_mysql_20181203(0917-1203).xlsx
 0    381
-1    192
Name: tendency, dtype: int64


## 本地模型

### 八分类

In [None]:
# Load the serialized pipeline used for the local 8-class predictions.
# joblib is already imported in the notebook's first cell, so the duplicate
# mid-notebook import was dropped.
pipeline_old = joblib.load("model/circ_8classifier_1015.pkl.z")

In [None]:
# Concatenate title and content (both coerced to str) with a Chinese full stop
# in between, then hand the texts to the CIRC preprocessing helper.
combined_data['title_content'] = (
    combined_data['title'].astype(str)
    .add('。')
    .add(combined_data['content'].astype(str))
)
title_content = pre_cor_circ.handle_contents(combined_data['title_content'].tolist())
print(len(title_content))

In [None]:
# Predict with the local pipeline: hard labels plus per-class probabilities.
local_label = pipeline_old.predict(title_content)
local_proba = pipeline_old.predict_proba(title_content)

# Keep the readable class name (via class_name_dict) and the highest class
# probability for every row.
combined_data['local_label'] = [class_name_dict[class_id] for class_id in local_label]
combined_data['local_proba'] = local_proba.max(axis=1)
print(combined_data.shape)
combined_data.iloc[:2, :]

#### 线上线下一致性: mysql 与 local

In [None]:
# Flag each row 'Right' when the local label equals the label stored in MySQL
# (predict_label), otherwise 'Wrong'; then report the agreement rate, the
# counts, and a confusion table of the disagreements.
combined_data['R_W'] = np.where(
    combined_data['local_label'] == combined_data['predict_label'], 'Right', 'Wrong')
print(len(combined_data[combined_data['R_W'] == 'Right']) / len(combined_data))
print(combined_data['R_W'].value_counts())
combined_data[combined_data['R_W'] == 'Wrong'].pivot_table(
    values=['id'], index=['local_label'], columns=['predict_label'],
    aggfunc=[len], fill_value=0, margins=True)

#### 线上线下一致性: online 与 local

In [None]:
# Give every row a positional id (used to join the service response back) and
# make sure the two text fields are plain strings before serialising.
combined_data['id'] = range(len(combined_data))
for text_col in ('title', 'content'):
    combined_data[text_col] = combined_data[text_col].astype(str)

# Payload: one {'id', 'title', 'content'} record per row, posted to the service.
url = "http://47.93.77.19:10000/judge_correlation_i"
col_name = 'cor'
data = {"record": combined_data[['id', 'title', 'content']].to_dict(orient='records')}
parse_data, elapsed_time = get_server_res(data, url, col_name)
parse_data.columns = ['id', 'online_label']
parse_data.head()

In [None]:
# Attach the online labels by id, then compare them with the local labels.
combined_data = combined_data.merge(parse_data, on='id')
print(combined_data.shape)
combined_data['O_R_W'] = np.where(
    combined_data['local_label'] == combined_data['online_label'], 'Right', 'Wrong')
print(len(combined_data[combined_data['O_R_W'] == 'Right']) / len(combined_data))
print(combined_data['O_R_W'].value_counts())
combined_data[combined_data['O_R_W'] == 'Wrong'].pivot_table(
    values=['urlhash'], index=['local_label'], columns=['online_label'],
    aggfunc=[len], fill_value=0, margins=True)

# 银行业--旧

In [19]:
# Engine/connection object for the 'cbrc' source; the cells below pass it to
# pd.read_sql.
engine = specific_func.get_engine('cbrc')

In [15]:
# Dates iterated over by the per-day loops in this section.
# NOTE(review): whether the end date is inclusive depends on get_day_list — confirm.
# day_select = '2018-09-09'
day_list = get_day_list('2019-02-22', '2019-02-25')
print(day_list)

['2019-02-17']


## mysql 数据

### 八分类数据

#### 获取数据

In [None]:
# sql_circ_cor_one_day = "select t1.id, t1.publishtime, t1.title,t2.text as content \
#                             from elint_web_docinfo t1, wise_web_docinfo_text t2 \
#                                 where t1.id = t2.doc_id \
#                                   and date_format(t1.publishtime, '%%Y-%%m-%%d') = '{0}'".format('2018-08-07')
# # 实际
# circ_cor = pd.read_sql(sql_circ_cor_one_day, engine)
# print(circ_cor.shape)
# circ_cor.head()

In [26]:
# For every day in day_list: pull the stored class predictions and the article
# texts from MySQL, join them on urlhash and write one Excel file per day.
for day_select in day_list:
    print('-- day_select: ', day_select)
    
    # Class predictions (traffic_id) per document published on this day.
    sql_one_day = "select t2.urlhash, t1.traffic_id, t2.title as title_1\
                        from wise_web_classify_traffic_docinfo t1, wise_web_docinfo_basic t2 \
                            where t1.base_id=t2.id \
                                  and date_format(t2.publishtime, '%%Y-%%m-%%d') = '{0}' ".format(day_select)
    cbrc_flag = pd.read_sql(sql_one_day, engine)
    print('cbrc_flag：', cbrc_flag.shape)
    
    # Relevant documents; note this query only covers 08:00-14:00 of the day,
    # unlike the whole-day filter used by the other two queries.
    sql_one_day = "select t1.urlhash, t1.title,t2.text as content, t1.group_id, t1.publishtime as publishtime \
                        from elint_web_docinfo t1, wise_web_docinfo_text t2 \
                            where t1.id=t2.doc_id \
                                  and t1.publishtime >= '{0} 08:00:00' \
                                  and t1.publishtime <= '{0} 14:00:00' \
                                group by t1.titlehash".format(day_select)
    # The GROUP BY above leaves one row per titlehash.
    cbrc_cor = pd.read_sql(sql_one_day, engine) 
    print('cbrc_cor：', cbrc_cor.shape)
    
    # Documents from the "uncor" (not relevant) table for the same day.
    sql_cbrc_uncor = "select urlhash, title, content, group_id, publishtime \
                            from wise_web_docinfo_uncor \
                            where date_format(publishtime, '%%Y-%%m-%%d') = '{0}'".format(day_select)
    cbrc_uncor = pd.read_sql(sql_cbrc_uncor, engine)  
    print('cbrc_uncor：', cbrc_uncor.shape)

    # Stack both sources, keep the first row per title and drop rows without content.
    cbrc_data = pd.concat([cbrc_cor, cbrc_uncor], axis = 0)
    print('去重前：', cbrc_data.shape)
    cbrc_data = cbrc_data.drop_duplicates(subset = 'title')
    print('去重后：', cbrc_data.shape)  
    cbrc_data = cbrc_data.dropna(subset = ['content'], axis = 0)
    print('去空值后：', cbrc_data.shape)  

    # Inner join: only documents that have a stored prediction survive.
    cbrc_combined = pd.merge(cbrc_flag, cbrc_data, how = 'inner', on = 'urlhash')
    # Translate numeric ids into readable names; a missing key raises KeyError.
    cbrc_combined['predict_label'] = cbrc_combined['traffic_id'].apply(lambda x:class_name_dict[x])
    cbrc_combined['group_id'] = cbrc_combined['group_id'].apply(lambda x:group_dict[str(x)])
    # Empty column written to the export (presumably for manual labelling — confirm).
    cbrc_combined['label'] = ''
    cbrc_combined = cbrc_combined[['urlhash', 'predict_label', 'label', 'title', 'content', 'group_id', 'publishtime']]
    fea_filename = 'result/cbrc_result_class/result/cbrc_class_predict_mysql_%s.xlsx'%day_select
    cbrc_combined.to_excel(fea_filename, index = False)
    print(cbrc_combined.shape)
    print(cbrc_combined['predict_label'].value_counts())    

-- day_select:  2019-03-02
cbrc_flag： (78896, 3)
cbrc_cor： (2671, 5)
cbrc_uncor： (5004, 5)
去重前： (7675, 5)
去重后： (6354, 5)
去空值后： (6354, 5)
(5778, 7)
噪音        2588
消费服务      1623
行业         564
资本市场       364
监管         325
公司内部管理     158
产品销售        79
其他相关报道      77
Name: predict_label, dtype: int64
-- day_select:  2019-03-03
cbrc_flag： (68232, 3)
cbrc_cor： (1596, 5)
cbrc_uncor： (5005, 5)
去重前： (6601, 5)
去重后： (5020, 5)
去空值后： (5020, 5)
(4643, 7)
噪音        2607
消费服务      1036
资本市场       330
行业         305
监管         188
公司内部管理      66
产品销售        59
其他相关报道      52
Name: predict_label, dtype: int64
-- day_select:  2019-03-04
cbrc_flag： (109419, 3)
cbrc_cor： (4238, 5)
cbrc_uncor： (5003, 5)
去重前： (9241, 5)
去重后： (8153, 5)
去空值后： (8153, 5)
(7401, 7)
噪音        2652
消费服务      2068
行业        1044
资本市场       605
监管         547
公司内部管理     301
其他相关报道      96
产品销售        88
Name: predict_label, dtype: int64
-- day_select:  2019-03-05
cbrc_flag： (100698, 3)
cbrc_cor： (4509, 5)
cbrc_uncor： (5006, 5)
去重前：

#### 合并 & 保存

In [32]:
# Read the per-day exports and stack them with a single concat; concatenating
# onto the accumulated frame inside the loop re-copies all earlier rows on
# every iteration.
daily_frames = []
for day_select in day_list:
    tmp_data = pd.read_excel('result/cbrc_result_class/result/cbrc_class_predict_mysql_%s.xlsx'%day_select)
    daily_frames.append(tmp_data)
combined_data = pd.concat(daily_frames, axis = 0)

# Drop rows labelled '补录', keep the first row per title, require content.
combined_data = combined_data[combined_data['predict_label'] != '补录']
print(combined_data.shape)
print('去重前：', combined_data.shape)
combined_data = combined_data.drop_duplicates(subset = 'title')
print('去重后：', combined_data.shape)
combined_data = combined_data.dropna(subset = ['content'], axis = 0)
print('去空值后：', combined_data.shape)

print(combined_data['predict_label'].value_counts())
combined_data.iloc[:2, :]

(25461, 7)
去重前： (25461, 7)
去重后： (24336, 7)
去空值后： (24297, 7)
噪音        10134
消费服务       6163
行业         3019
资本市场       1798
监管         1779
公司内部管理      742
其他相关报道      347
产品销售        315
Name: predict_label, dtype: int64


Unnamed: 0,urlhash,predict_label,label,title,content,group_id,publishtime
0,7835383079258223616,资本市场,,600595:中孚实业:中德证券有限责任公司关于河南中孚实业股份有限公司控股子公司为林州市立...,600595:中孚实业:中德证券有限责任公司关于河南中孚实业股份有限公司控股子公司为林州市...,新闻,2019-03-02
1,8837754838192826368,噪音,,格林美:独立董事候选人声明(吴树阶),格林美股份有限公司\n 独立董事候选人声明\n 声明人吴树阶，作为格林美股份有限公司第五届...,新闻,2019-03-02


In [33]:
fea_filename = 'result/cbrc_result_class/result/cbrc_class_predict_mysql_20190306(0302-0305).xlsx'
# Alternative label subsets (disabled):
# sel_col = ['噪音', '消费服务', '公司内部管理', '监管',
#            '行业', '资本市场', '其他相关报道','产品销售']
# sel_col = ['公司内部管理', '监管', '行业', '产品销售']
sel_col = combined_data['predict_label'].unique().tolist()
sel_data = combined_data[combined_data['predict_label'].isin(sel_col)]
print(sel_data['predict_label'].value_counts())
# One sheet per predicted label with at most 200 randomly sampled rows each.
# The workbook is saved when the with-block exits, so the explicit
# writer.save() (a second, redundant write) was removed.
with pd.ExcelWriter(fea_filename) as writer:
    for label in sel_data['predict_label'].unique():
        tmp_data = sel_data[sel_data['predict_label'] == label]
        N = min(200, tmp_data.shape[0])
        tmp_data.sample(n = N, axis = 0, random_state=42).to_excel(writer, sheet_name = label, index = False)

噪音        10134
消费服务       6163
行业         3019
资本市场       1798
监管         1779
公司内部管理      742
其他相关报道      347
产品销售        315
Name: predict_label, dtype: int64


### 倾向性

#### 获取数据

In [20]:
# For every day in day_list: pull the stored class predictions and the
# relevant documents (including their tendency value) from MySQL, join them on
# urlhash and write one Excel file per day.
for day_select in day_list:
    print('-- day_select: ', day_select)
    
    # Class predictions (traffic_id) per document published on this day.
    sql_one_day = "select t2.urlhash, t1.traffic_id, t2.title as title_1\
                        from wise_web_classify_traffic_docinfo t1, wise_web_docinfo_basic t2 \
                            where t1.base_id=t2.id \
                                  and date_format(t2.publishtime, '%%Y-%%m-%%d') = '{0}' ".format(day_select)
    cbrc_flag = pd.read_sql(sql_one_day, engine)
    print('cbrc_flag：', cbrc_flag.shape)
    
    # Relevant documents with their stored tendency (column `sen`); only the
    # 08:00-14:00 window of the day is queried.
    sql_one_day = "select t1.id, t1.urlhash, t1.title,t2.text as content, t1.group_id, \
                            t1.sen as tendency, t1.publishtime as publishtime \
                        from elint_web_docinfo t1, wise_web_docinfo_text t2 \
                            where t1.id=t2.doc_id \
                                  and t1.publishtime >= '{0} 08:00:00' \
                                  and t1.publishtime <= '{0} 14:00:00' \
                                group by t1.titlehash".format(day_select)
    # The GROUP BY above leaves one row per titlehash.
    cbrc_cor = pd.read_sql(sql_one_day, engine) 
    print('cbrc_cor：', cbrc_cor.shape)

    # Keep the first row per title and drop rows without content.
    cbrc_data = cbrc_cor
    print('去重前：', cbrc_data.shape)
    cbrc_data = cbrc_data.drop_duplicates(subset = 'title')
    print('去重后：', cbrc_data.shape)  
    cbrc_data = cbrc_data.dropna(subset = ['content'], axis = 0)
    print('去空值后：', cbrc_data.shape)  

    # Inner join: only documents that have a stored prediction survive.
    cbrc_combined = pd.merge(cbrc_flag, cbrc_data, how = 'inner', on = 'urlhash')
    # Translate numeric ids into readable names; a missing key raises KeyError.
    cbrc_combined['predict_label'] = cbrc_combined['traffic_id'].apply(lambda x:class_name_dict[x])
    cbrc_combined['group_id'] = cbrc_combined['group_id'].apply(lambda x:group_dict[str(x)])
    # Empty column written to the export (presumably for manual labelling — confirm).
    cbrc_combined['label'] = ''
    cbrc_combined = cbrc_combined[['id', 'urlhash', 'group_id', 'predict_label', 'tendency', 
                                   'label', 'title', 'content', 'publishtime']]
    fea_filename = 'result/cbrc_result_tendency/result/cbrc_tendency_predict_mysql_%s.xlsx'%day_select
    cbrc_combined.to_excel(fea_filename, index = False)
    print(cbrc_combined.shape)
    # Row counts per tendency x predicted class, with totals.
    print(cbrc_combined.pivot_table(index = ['tendency'], columns = ['predict_label'], 
                          values = 'title', aggfunc=len, 
                          fill_value=0, margins=True))

-- day_select:  2019-03-27
cbrc_flag： (132263, 3)
cbrc_cor： (6985, 7)
去重前： (6985, 7)
去重后： (6985, 7)
去空值后： (6985, 7)
(6741, 9)
predict_label  公司内部管理  其他相关报道  噪音  消费服务   监管   行业  资本市场   All
tendency                                                     
-1                 61       0   2  4116   36   85     0  4300
0                 363       2   4   868  357  830    17  2441
All               424       2   6  4984  393  915    17  6741
-- day_select:  2019-03-28
cbrc_flag： (113241, 3)
cbrc_cor： (5044, 7)
去重前： (5044, 7)
去重后： (5044, 7)
去空值后： (5044, 7)
(4814, 9)
predict_label  公司内部管理  其他相关报道  噪音  消费服务   监管   行业  资本市场   All
tendency                                                     
-1                 58       0   0  2078   63   59     1  2259
0                 379       4   8   860  411  879    13  2554
1                   0       0   0     0    1    0     0     1
All               437       4   8  2938  475  938    14  4814
-- day_select:  2019-03-29
cbrc_flag： (116575, 3)
cbrc_cor： (4966,

#### 合并 & 保存

In [16]:
# Read the per-day tendency exports and stack them with a single concat;
# concatenating onto the accumulated frame inside the loop re-copies all
# earlier rows on every iteration.
daily_frames = []
for day_select in day_list:
    tmp_data = pd.read_excel('result/cbrc_result_tendency/result/cbrc_tendency_predict_mysql_%s.xlsx'%day_select)
    daily_frames.append(tmp_data)
combined_data = pd.concat(daily_frames, axis = 0)

# Drop rows labelled '补录' and keep only tendency values -1 and 0.
combined_data = combined_data[combined_data['predict_label'] != '补录']
combined_data = combined_data[combined_data['tendency'].isin([-1, 0]) ]
# Alternative label subsets (disabled):
# sel_col = [ '消费服务', '公司内部管理', '监管','行业']
# sel_col = ['噪音', '消费服务', '公司内部管理', '监管',
#            '行业', '资本市场', '其他相关报道','产品销售']
# sel_col = ['公司内部管理', '监管', '行业', '产品销售']
sel_col = combined_data['predict_label'].unique().tolist()

combined_data = combined_data[combined_data['predict_label'].isin(sel_col)]
print(combined_data.shape)
print('去重前：', combined_data.shape)
combined_data = combined_data.drop_duplicates(subset = 'title')
print('去重后：', combined_data.shape)
combined_data = combined_data.dropna(subset = ['content'], axis = 0)
print('去空值后：', combined_data.shape)

print(combined_data['tendency'].value_counts())
# Row counts per (tendency, media group) x predicted class, with totals.
combined_data.pivot_table(index = ['tendency', 'group_id'],
                                columns = ['predict_label'],
                                values = 'title', aggfunc=len,
                                fill_value=0, margins=True)
# combined_data.iloc[:2, :]

(19481, 9)
去重前： (19481, 9)
去重后： (18886, 9)
去空值后： (18880, 9)
-1    10359
 0     8521
Name: tendency, dtype: int64


Unnamed: 0_level_0,predict_label,产品销售,公司内部管理,其他相关报道,噪音,消费服务,监管,行业,资本市场,All
tendency,group_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
-1,博客,0,0,0,0,7,0,0,0,7
-1,外媒,0,0,0,0,1,0,0,0,1
-1,微信,0,23,0,0,366,26,50,0,465
-1,微博,0,10,0,0,7646,80,1,0,7737
-1,新闻,0,154,0,3,1543,62,171,3,1936
-1,新闻客户端,0,17,0,0,88,7,20,1,133
-1,纸媒,0,4,0,0,31,6,8,0,49
-1,论坛,0,2,0,0,25,0,4,0,31
0,博客,0,0,0,0,3,1,4,0,8
0,外媒,0,4,0,0,10,3,11,0,28


In [19]:
# combined_data_sel = combined_data # [combined_data['predict_label'].isin(['交通', '环保'])]
# combined_data_sel.to_excel('result/cbrc_result_tendency/cbrc_tendency_data_20190306(0302-0305).xlsx', 
#                            index = False)
# combined_data_sel['tendency'].value_counts()

-1    1052
 0     768
Name: tendency, dtype: int64

In [17]:
fea_filename = 'result/cbrc_result_tendency/result/cbrc_tendency_predict_mysql_20190401(0327-0331).xlsx'
print(combined_data['tendency'].value_counts())

N = 400 # rows written per tendency sheet
class_n = int(combined_data['predict_label'].unique().shape[0])
n = int(N / class_n) + 200 # per-class cap applied before the final down-sampling to N

print('正负各 %s 条，共 %s 类， 每类各 %s 条'%(N, class_n, n))
# The workbook is saved when the with-block exits, so the explicit
# writer.save() (a second, redundant write) was removed.
with pd.ExcelWriter(fea_filename) as writer:
    for tendency in combined_data['tendency'].unique():
        tmp_data = pd.DataFrame()
        sel_data = combined_data[combined_data['tendency'] == tendency]
        # Collect at most n rows of every predicted class for this tendency.
        for predict_label in combined_data['predict_label'].unique():
            label_data = sel_data[sel_data['predict_label'] == predict_label]
            if label_data.shape[0] > n:
                sel_label_data = label_data.sample(n = n, axis = 0, random_state=3)
            else :
                sel_label_data = label_data
            tmp_data = pd.concat([tmp_data, sel_label_data], axis = 0)
            print('tendency: %s, predict_label: %s, size: %s'%(tendency, predict_label, tmp_data.shape))

        # Fix: the original computed t_n but then sampled n = N, which raises
        # ValueError whenever fewer than N rows were collected. Sample at most
        # as many rows as are available.
        t_n = min(N, tmp_data.shape[0])
        tmp_data = tmp_data.sample(n = t_n, axis = 0, random_state=3)
        tmp_data.to_excel(writer, sheet_name = str(tendency), index = False)
        print(tmp_data.pivot_table(index = ['tendency'],
                                    columns = ['predict_label'],
                                    values = 'title', aggfunc=len,
                                    fill_value=0, margins=True))

-1    10359
 0     8521
Name: tendency, dtype: int64
正负各 400 条，共 8 类， 每类各 250 条
tendency: 0, predict_label: 监管, size: (250, 9)
tendency: 0, predict_label: 行业, size: (500, 9)
tendency: 0, predict_label: 公司内部管理, size: (750, 9)
tendency: 0, predict_label: 消费服务, size: (1000, 9)
tendency: 0, predict_label: 噪音, size: (1017, 9)
tendency: 0, predict_label: 资本市场, size: (1062, 9)
tendency: 0, predict_label: 其他相关报道, size: (1069, 9)
tendency: 0, predict_label: 产品销售, size: (1070, 9)
predict_label  公司内部管理  其他相关报道  噪音  消费服务   监管  行业  资本市场  All
tendency                                                   
0                  95       1   4    93  110  84    13  400
All                95       1   4    93  110  84    13  400
tendency: -1, predict_label: 监管, size: (181, 9)
tendency: -1, predict_label: 行业, size: (431, 9)
tendency: -1, predict_label: 公司内部管理, size: (641, 9)
tendency: -1, predict_label: 消费服务, size: (891, 9)
tendency: -1, predict_label: 噪音, size: (894, 9)
tendency: -1, predict_label: 资本市场, size

### 补录数据

In [20]:
# 人工补录
# Manually added records: documents with gather_type in (1, 3) published
# between 2018-09-01 and 2018-12-03.
# NOTE(review): the meaning of gather_type codes 1 and 3 is not visible here — confirm.
sql_one_day = "select t1.id, t1.group_id, t1.publishtime as publishtime, t1.gather_type, \
                    t1.sen as tendency, t1.title,t2.text as content \
                    from elint_web_docinfo t1, wise_web_docinfo_text t2 \
                        where t1.id=t2.doc_id \
                              and t1.publishtime >= '{0} 00:00:00' \
                              and t1.publishtime <= '{1} 23:59:59' \
                              and t1.gather_type in (1,3) \
                            group by t1.titlehash".format('2018-09-01', '2018-12-03')
# The GROUP BY above leaves one row per titlehash.
human_additional = pd.read_sql(sql_one_day, engine) 
# Translate the numeric group id into its readable name; a missing key raises KeyError.
human_additional['group_id'] = human_additional['group_id'].apply(lambda x:group_dict[str(x)])
# De-duplicate by title, then by content, then drop rows without a title,
# printing the shape after every step.
print('title 去重前：', human_additional.shape)
human_additional = human_additional.drop_duplicates(subset = 'title')
print('title 去重后：', human_additional.shape)  
human_additional = human_additional.drop_duplicates(subset = ['content'])
print('content 去重后：', human_additional.shape)  
human_additional = human_additional.dropna(subset = ['title'], axis = 0)
print('title 去空值后：', human_additional.shape) 
human_additional.head()

title 去重前： (3334, 7)
title 去重后： (3334, 7)
content 去重后： (3334, 7)
title 去空值后： (3334, 7)


Unnamed: 0,id,group_id,publishtime,gather_type,tendency,title,content
0,267289612,微博,2018-10-31 12:11:44,1,-1,24号没叫取了28号给我 26号后直接跳到了32号 @中国工商银行 麻烦请加强下底层工作人员...,24号没叫取了28号给我\n26号后直接跳到了32号\n@中国工商银行 麻烦请加强下底层工作...
1,266932412,微博,2018-10-30 11:46:06,1,-1,#魔法万圣节# 在建行庆祝万圣节 各种暂停服务 没人的情况下等待二十分钟 后边的一个叔叔等待...,#魔法万圣节# 在建行庆祝万圣节 各种暂停服务 没人的情况下等待二十分钟 后边的一个叔叔等待...
2,249135456,微博,2018-09-28 10:31:29,1,-1,工商银行的服务真是稀烂！不是财大气粗吗？从工作人员的业务熟悉度到业务平台系统都是渣渣！,工商银行的服务真是稀烂！不是财大气粗吗？从工作人员的业务熟悉度到业务平台系统都是渣渣！
3,263284185,微信,2018-10-17 00:00:00,1,-1,老牌长租公寓也摊事儿！雷军等明星投资人加持却也难逃一劫，年内已有5家爆雷,(图片)\n\n(图片)\n\n长租公寓又“摊上事儿”了。\n\n上海老牌长租公寓 寓见...
4,269162185,微博,2018-11-06 20:42:00,1,-1,不得不表扬下@招商银行信用卡 了，前几天在...,不得不表扬下\n@招商银行信用卡\n 了，前几天在苏格兰玩儿弄丢了卡，发现可以直接打开手机...


In [21]:
# Make sure both text fields are plain strings before serialising them.
for text_col in ('title', 'content'):
    human_additional[text_col] = human_additional[text_col].astype(str)

# Post one {'id', 'title', 'content'} record per row to the classification
# service and keep its answer as predict_label.
url = "http://47.93.77.19:6001/judge_correlation_yjh"
col_name = 'sec'
data = {"record": human_additional[['id', 'title', 'content']].to_dict(orient='records')}
parse_data, elapsed_time = get_server_res(data, url, col_name)
parse_data.columns = ['id', 'predict_label']

# Left join keeps every input row; class ids are then mapped to readable names
# and an empty 'label' column is inserted at position 5.
human_additional = human_additional.merge(parse_data, on='id', how='left')
human_additional['predict_label'] = [
    class_name_dict[class_id] for class_id in human_additional['predict_label']
]
human_additional.insert(5, 'label', '')
print(human_additional['predict_label'].value_counts())
human_additional.head()

消费服务      2366
行业         290
噪音         252
资本市场       131
监管         103
公司内部管理     101
产品销售        80
其他相关报道      11
Name: predict_label, dtype: int64


Unnamed: 0,id,group_id,publishtime,gather_type,tendency,label,title,content,predict_label
0,267289612,微博,2018-10-31 12:11:44,1,-1,,24号没叫取了28号给我 26号后直接跳到了32号 @中国工商银行 麻烦请加强下底层工作人员...,24号没叫取了28号给我\n26号后直接跳到了32号\n@中国工商银行 麻烦请加强下底层工作...,消费服务
1,266932412,微博,2018-10-30 11:46:06,1,-1,,#魔法万圣节# 在建行庆祝万圣节 各种暂停服务 没人的情况下等待二十分钟 后边的一个叔叔等待...,#魔法万圣节# 在建行庆祝万圣节 各种暂停服务 没人的情况下等待二十分钟 后边的一个叔叔等待...,消费服务
2,249135456,微博,2018-09-28 10:31:29,1,-1,,工商银行的服务真是稀烂！不是财大气粗吗？从工作人员的业务熟悉度到业务平台系统都是渣渣！,工商银行的服务真是稀烂！不是财大气粗吗？从工作人员的业务熟悉度到业务平台系统都是渣渣！,消费服务
3,263284185,微信,2018-10-17 00:00:00,1,-1,,老牌长租公寓也摊事儿！雷军等明星投资人加持却也难逃一劫，年内已有5家爆雷,(图片)\n\n(图片)\n\n长租公寓又“摊上事儿”了。\n\n上海老牌长租公寓 寓见...,行业
4,269162185,微博,2018-11-06 20:42:00,1,-1,,不得不表扬下@招商银行信用卡 了，前几天在...,不得不表扬下\n@招商银行信用卡\n 了，前几天在苏格兰玩儿弄丢了卡，发现可以直接打开手机...,消费服务


In [24]:
# Write the rows of human_additional to an Excel workbook, one sheet per
# predicted class, and print a media-group x class count table.
fea_filename = 'cbrc_result_class/result/补录_银监会（旧）_class_predict_mysql_20181203(0901-1203).xlsx'
print(fea_filename)

sel_col = ['噪音', '消费服务', '公司内部管理', '监管',
           '行业', '资本市场', '其他相关报道','产品销售','交通','环保']
sel_data = human_additional[human_additional['predict_label'].isin(sel_col)]
print(sel_data['predict_label'].value_counts())
print()

c_data = pd.DataFrame()
# The workbook is saved when the with-block exits, so the explicit
# writer.save() (a second, redundant write) was removed.
with pd.ExcelWriter(fea_filename) as writer:
    for label in sel_data['predict_label'].unique():
        tmp_data = sel_data[sel_data['predict_label'] == label]
        N = tmp_data.shape[0]
        # Sampling every row without replacement is a reproducible shuffle.
        save_data = tmp_data.sample(n = N, axis = 0, random_state=42)
        save_data.to_excel(writer, sheet_name = label, index = False)
        c_data = pd.concat([c_data, save_data], axis = 0)
    print(c_data.pivot_table(index = ['group_id'],
                                columns = ['predict_label'],
                                values = 'title', aggfunc=len,
                                fill_value=0, margins=True))

cbrc_result_class/result/补录_银监会（旧）_class_predict_mysql_20181203(0901-1203).xlsx
消费服务      2366
行业         290
噪音         252
资本市场       131
监管         103
公司内部管理     101
产品销售        80
其他相关报道      11
Name: predict_label, dtype: int64

predict_label  产品销售  公司内部管理  其他相关报道   噪音  消费服务   监管   行业  资本市场   All
group_id                                                            
博客                0       0       0    0     1    0    0     0     1
微信                0       3       0    2     5    4   12     6    32
微博               76       3       6  153  2282    4    9     4  2537
新闻                2      86       4   64    23   61  165    63   468
新闻客户端             0       2       0    2     0    4    1     2    11
纸媒                0       5       1   25     7   30  103    56   227
论坛                2       2       0    6    48    0    0     0    58
All              80     101      11  252  2366  103  290   131  3334


In [25]:
# Write the rows of human_additional to an Excel workbook, one sheet per
# distinct tendency value.
fea_filename = 'cbrc_result_tendency/result/补录_银监会（旧）_tendency_predict_mysql_20181204(0901-1203).xlsx'
print(fea_filename)
print(human_additional['tendency'].value_counts())

# The ExcelWriter context manager saves and closes the workbook on exit, so an
# explicit writer.save() inside the block is redundant (it wrote the file twice
# and the method no longer exists in recent pandas releases).
with pd.ExcelWriter(fea_filename) as writer:
    for tendency in human_additional['tendency'].unique():
        sel_data = human_additional[human_additional['tendency'] == tendency]
        t_n = sel_data.shape[0]
        # Sampling every row without replacement is a reproducible shuffle.
        tmp_data = sel_data.sample(n = t_n, axis = 0, random_state=3)
        tmp_data.to_excel(writer, sheet_name = str(tendency), index = False)

cbrc_result_tendency/result/补录_银监会（旧）_tendency_predict_mysql_20181204(0901-1203).xlsx
-1    2892
 0     345
 1      97
Name: tendency, dtype: int64


## 本地模型

### 八分类

In [None]:
# Load the serialized pipeline used for the local 8-class predictions.
# joblib is already imported in the notebook's first cell, so the duplicate
# mid-notebook import was dropped.
pipeline_old = joblib.load("model/cbrc_8classifier_1015.pkl.z")

In [None]:
# Concatenate title and content (both coerced to str) with a Chinese full stop
# in between, then hand the texts to the CBRC preprocessing helper.
combined_data['title_content'] = (
    combined_data['title'].astype(str)
    .add('。')
    .add(combined_data['content'].astype(str))
)
title_content = pre_cor_cbrc.handle_contents(combined_data['title_content'].tolist())
print(len(title_content))

# Predict with the local pipeline: hard labels plus per-class probabilities.
local_label = pipeline_old.predict(title_content)
local_proba = pipeline_old.predict_proba(title_content)

In [None]:
# Keep the readable class name (via class_name_dict) and the highest class
# probability for every row.
combined_data['local_label'] = [class_name_dict[class_id] for class_id in local_label]
combined_data['local_proba'] = local_proba.max(axis=1)
print(combined_data.shape)
combined_data.iloc[:2, :]

#### 线上线下一致性: mysql 与 local

In [None]:
# Flag each row 'Right' when the local label equals the label stored in MySQL
# (predict_label), otherwise 'Wrong'; then report the agreement rate, the
# counts, and a confusion table of the disagreements.
combined_data['R_W'] = np.where(
    combined_data['local_label'] == combined_data['predict_label'], 'Right', 'Wrong')
print(len(combined_data[combined_data['R_W'] == 'Right']) / len(combined_data))
print(combined_data['R_W'].value_counts())
combined_data[combined_data['R_W'] == 'Wrong'].pivot_table(
    values=['urlhash'], index=['local_label'], columns=['predict_label'],
    aggfunc=[len], fill_value=0, margins=True)

#### 线上线下一致性: online 与 local

In [None]:
# Give every row a positional id (used to join the service response back) and
# make sure the two text fields are plain strings before serialising.
combined_data['id'] = range(len(combined_data))
for text_col in ('title', 'content'):
    combined_data[text_col] = combined_data[text_col].astype(str)

In [None]:
# Post one {'id', 'title', 'content'} record per row to the classification service.
data = {"record":combined_data.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
url = "http://47.93.77.19:6001/judge_correlation_yjh"
col_name = 'sec'
# Fix: get_server_res_yjh returns a (DataFrame, elapsed_time) tuple; the
# original bound the whole tuple to parse_data, so the column rename below
# failed. Unpack both values.
parse_data, elapsed_time = get_server_res_yjh(data, url, col_name)
parse_data.columns = ['id', 'online_label']
parse_data.head()

In [None]:
# Attach the online labels by id, map them to readable class names, then
# compare them with the local labels.
combined_data = combined_data.merge(parse_data, on='id')
print(combined_data.shape)
combined_data['online_label'] = [
    class_name_dict[class_id] for class_id in combined_data['online_label']
]
combined_data['O_R_W'] = np.where(
    combined_data['local_label'] == combined_data['online_label'], 'Right', 'Wrong')
print(len(combined_data[combined_data['O_R_W'] == 'Right']) / len(combined_data))
print(combined_data['O_R_W'].value_counts())
combined_data[combined_data['O_R_W'] == 'Wrong'].pivot_table(
    values=['urlhash'], index=['local_label'], columns=['online_label'],
    aggfunc=[len], fill_value=0, margins=True)

#### 线上线下一致性: online 与 mysql

In [None]:
# Compare the label stored in MySQL (predict_label) with the online service's
# label; this overwrites the O_R_W column with the new comparison.
combined_data['O_R_W'] = np.where(
    combined_data['predict_label'] == combined_data['online_label'], 'Right', 'Wrong')
print(len(combined_data[combined_data['O_R_W'] == 'Right']) / len(combined_data))
print(combined_data['O_R_W'].value_counts())
combined_data[combined_data['O_R_W'] == 'Wrong'].pivot_table(
    values=['urlhash'], index=['predict_label'], columns=['online_label'],
    aggfunc=[len], fill_value=0, margins=True)

### 倾向性

In [None]:
# Load the previously trained classification pipeline from disk.
# NOTE(review): joblib is already imported in the notebook's first cell, so
# this import is redundant; joblib.load unpickles the file, so only load
# model files from a trusted source.
from sklearn.externals import joblib
pipeline_old = joblib.load( "model/cbrc_8classifier_1015.pkl.z")

In [None]:
# Build the model input: title and body joined by a Chinese full stop, then
# run through the project's text preprocessing.
title_text = combined_data['title'].astype(str)
body_text = combined_data['content'].astype(str)
combined_data['title_content'] = title_text + '。' + body_text
title_content = pre_cor_cbrc.handle_contents(combined_data['title_content'].tolist())
print(len(title_content))

# Predicted class ids and the per-class probabilities from the local pipeline.
local_label = pipeline_old.predict(title_content)
local_proba = pipeline_old.predict_proba(title_content)

In [None]:
# Store the local prediction (as a class name) and the winning probability.
combined_data['local_label'] = local_label
combined_data['local_proba'] = local_proba.max(axis = 1)
combined_data['local_label'] = combined_data['local_label'].map(lambda x: class_name_dict[x])
print(combined_data.shape)
combined_data.head(2)

#### 线上线下一致性: mysql 与 local

In [None]:
# Row-wise agreement between the local model and the label stored in MySQL.
mysql_local_match = combined_data['local_label'] == combined_data['predict_label']
combined_data['R_W'] = np.where(mysql_local_match, 'Right', 'Wrong')
print(int(mysql_local_match.sum())/combined_data.shape[0])
print(combined_data['R_W'].value_counts())

# Confusion table restricted to the disagreeing rows.
combined_data[~mysql_local_match].pivot_table(index = ['local_label'], columns = ['predict_label'],
                                              aggfunc = [len], values = ['urlhash'],
                                              fill_value = 0, margins = True)

#### 线上线下一致性: online 与 local

In [None]:
# Number the rows sequentially and coerce both text columns to plain strings;
# the following cell sends exactly these three columns to the online service.
combined_data['id'] = range(len(combined_data))
for text_col in ('title', 'content'):
    combined_data[text_col] = combined_data[text_col].astype(str)

In [None]:
# Request payload: one {'id', 'title', 'content'} record per row.
data = {"record":combined_data.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
url = "http://47.93.77.19:6001/judge_correlation_yjh"
col_name = 'sec'
# get_server_res_yjh returns a (DataFrame, elapsed_time) pair. Unpack it:
# binding the whole tuple to parse_data makes the `.columns` assignment fail.
parse_data, elapsed_time = get_server_res_yjh(data, url, col_name)
print('elapsed_time: ', elapsed_time)
parse_data.columns = ['id', 'online_label']
parse_data.head()

In [None]:
# Attach the online predictions by request id and turn class ids into names.
combined_data = pd.merge(combined_data, parse_data, on  = 'id')
print(combined_data.shape)
combined_data['online_label'] = combined_data['online_label'].map(lambda x: class_name_dict[x])

# Row-wise agreement between the local model and the online service.
online_local_match = combined_data['local_label'] == combined_data['online_label']
combined_data['O_R_W'] = np.where(online_local_match, 'Right', 'Wrong')
print(int(online_local_match.sum())/combined_data.shape[0])
print(combined_data['O_R_W'].value_counts())

# Confusion table restricted to the disagreeing rows.
combined_data[~online_local_match].pivot_table(index = ['local_label'], columns = ['online_label'],
                                               aggfunc = [len], values = ['urlhash'],
                                               fill_value = 0, margins = True)

#### 线上线下一致性: online 与 mysql

In [None]:
# Row-wise agreement between the label stored in MySQL and the online service.
online_mysql_match = combined_data['predict_label'] == combined_data['online_label']
combined_data['O_R_W'] = np.where(online_mysql_match, 'Right', 'Wrong')
print(int(online_mysql_match.sum())/combined_data.shape[0])
print(combined_data['O_R_W'].value_counts())

# Confusion table restricted to the disagreeing rows.
combined_data[~online_mysql_match].pivot_table(index = ['predict_label'], columns = ['online_label'],
                                               aggfunc = [len], values = ['urlhash'],
                                               fill_value = 0, margins = True)

# 银行业与保险业--新

## mysql 数据

In [18]:
# Connection handle for the 'cbirc' database, obtained from the project helper;
# it is the `engine` passed to pd.read_sql by the query cells.
engine = specific_func.get_engine('cbirc')

In [26]:
# Rebinds `engine` to the 'ahyjj' database. Whichever engine cell ran last
# decides which database the later pd.read_sql calls hit.
engine = specific_func.get_engine('ahyjj')

In [20]:
# Days to export; the fetch cells run one label query per entry.
# NOTE(review): the recorded output starts at 2019-01-02, so get_day_list
# appears to exclude the start date — confirm against its implementation.
# day_select = '2018-09-09'
day_list = get_day_list('2019-01-01', '2019-03-25')
print(day_list)

['2019-01-02', '2019-01-03', '2019-01-04', '2019-01-05', '2019-01-06', '2019-01-07', '2019-01-08', '2019-01-09', '2019-01-10', '2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14', '2019-01-15', '2019-01-16', '2019-01-17', '2019-01-18', '2019-01-19', '2019-01-20', '2019-01-21', '2019-01-22', '2019-01-23', '2019-01-24', '2019-01-25', '2019-01-26', '2019-01-27', '2019-01-28', '2019-01-29', '2019-01-30', '2019-01-31', '2019-02-01', '2019-02-02', '2019-02-03', '2019-02-04', '2019-02-05', '2019-02-06', '2019-02-07', '2019-02-08', '2019-02-09', '2019-02-10', '2019-02-11', '2019-02-12', '2019-02-13', '2019-02-14', '2019-02-15', '2019-02-16', '2019-02-17', '2019-02-18', '2019-02-19', '2019-02-20', '2019-02-21', '2019-02-22', '2019-02-23', '2019-02-24', '2019-02-25', '2019-02-26', '2019-02-27', '2019-02-28', '2019-03-01', '2019-03-02', '2019-03-03', '2019-03-04', '2019-03-05', '2019-03-06', '2019-03-07', '2019-03-08', '2019-03-09', '2019-03-10', '2019-03-11', '2019-03-12', '2019-03-13', '2019

In [104]:
# ahyyj_data = pd.read_excel('cbrc_data_class/安徽银监局数据_20181221.xlsx')
# print(ahyyj_data.shape)
# # ahyyj_data.iloc[:1,:]

# url_list = tuple(ahyyj_data['url'].tolist())
# sql_label = '''
#         SELECT 
#             t3.id, t3.title, t3.group_id, t3.publishtime, t3.url, t3.urlhash
#         FROM
#             ahyjj.db_docinfo t3
#         WHERE
#             t3.url in {0}
#         '''.format(url_list)

# ahyjj_label = pd.read_sql(sql_label, engine)
# # ahyjj_label['predict_label_mysql'] = ahyjj_label['traffic_id'].apply(lambda x:class_name_dict[x])
# ahyjj_label['group_id'] = ahyjj_label['group_id'].apply(lambda x:group_dict[str(x)])
# # ahyjj_label['type'] = ahyjj_label['type'].apply(lambda x:proj_name_dict[6])
# print(ahyjj_label.shape)
# # ahyjj_label.iloc[:1,:]

# url_l = ahyjj_label['urlhash'].tolist()
# url_list = tuple(url_l)
# sql_content = '''
# SELECT 
#     t1.urlhash, t1.text as content
# FROM
#     ahyjj.db_docinfo_text t1
# WHERE
#     t1.urlhash in {0}
# '''.format(url_list)

# ahyyj_content = pd.read_sql(sql_content, engine)
# ahyyj_combined = pd.merge(ahyjj_label, ahyyj_content, on = 'urlhash', how = 'inner')
# print(ahyyj_combined.shape)  
# ahyyj_combined['label'] = ''
# ahyyj_combined = ahyyj_combined[['id', 'label', 'title', 'content', 'group_id', 'publishtime', 'url']]
# ahyyj_combined['title'] = ahyyj_combined['title'].astype(str)
# ahyyj_combined['content'] = ahyyj_combined['content'].astype(str)
# # ahyyj_combined.iloc[:1,:]

# data = {"types":'1',"record":ahyyj_combined.loc[:,['id', 'title' ,'content']].to_dict(orient = 'records')}
# url = "http://39.105.162.87:6001/judge_correlation_b"
# col_name = 'cor'

# # parse_data = get_server_res_yjh(data, url, col_name)
# parse_data, elapsed_time = get_server_res(data, url, col_name)
# print('elapsed_time: ', elapsed_time)
# parse_data.columns = ['id', 'predict_label']
# print(parse_data.shape)
# # parse_data.head()

# ahyyj_combined = pd.merge(parse_data, ahyyj_combined, on = 'id')
# ahyyj_combined['predict_label'] = ahyyj_combined['predict_label'].apply(lambda x:class_name_dict[x])
# print(ahyyj_combined.shape)
# # ahyyj_combined.iloc[:1,:]

# ahyyj_data = pd.merge(ahyyj_data, ahyyj_combined, on = 'url', how = 'left')
# writer = pd.ExcelWriter('cbrc_data_class/安徽银监局数据_content_20181221.xlsx',
#                         engine='xlsxwriter',
#                         options={'strings_to_urls': False})
# ahyyj_data.to_excel(writer, sheet_name='Sheet1', index = False)
# writer.save()
# print(ahyyj_data.shape)
# ahyyj_data.iloc[:1,:]

### 获取数据

#### 系统采集
- gather_type 0-系统采集

In [27]:
# Export system-gathered documents (gather_type = 0) for one project, one
# Excel file per day in day_list.
#   types        - project id, a key of proj_name_dict
#   gather_types - tag embedded in the output file name
# For every day: query the labels, translate ids to names, drop duplicate and
# missing titles, fetch the article text, merge, print a summary pivot and save.
types = 3
gather_types = '采集'
proj_name = proj_name_dict[types]
print("获取 %s 数据..."%(proj_name))
# Project 6 is queried with t1.type = 1; every other project uses its own id.
if proj_name == proj_name_dict[6]: 
    sel_type = 1
else :
    sel_type = types
for day_select in day_list:
    print()
    print('-- day_select: ', day_select)

    # Fetch the eight-class labels.
    # Projects 1, 2 and 6 are limited to 07:00-16:00 of the day; projects
    # 3, 4 and 5 (branch below) cover the whole day.
    if types in [1,2, 6]:
        sql_label = '''
        SELECT 
            t1.id, t1.type, t1.urlhash, t3.title, t3.group_id, t3.publishtime,
            t1.traffic_id, t2.sen as tendency, t2.gather_type
        FROM
            db_classify_traffic_docinfo t1
                LEFT JOIN
            db_docinfo_trade t2 ON t1.urlhash = t2.urlhash
                LEFT JOIN
            db_docinfo t3 ON t2.urlhash = t3.urlhash
        WHERE
            t3.publishtime >= '{0} 07:00:00'
                AND t3.publishtime <= '{0} 16:00:00'
                and t1.type = {1}
                and t1.type = t2.type
                and t2.gather_type = 0
        group by t3.titlehash
        '''.format(day_select, sel_type)
    elif types in [3,4,5]:
        sql_label = '''
        SELECT 
            t1.id, t1.type, t1.urlhash, t3.title, t3.group_id, t3.publishtime,
            t1.traffic_id, t2.sen as tendency, t2.gather_type
        FROM
            db_classify_traffic_docinfo t1
                LEFT JOIN
            db_docinfo_trade t2 ON t1.urlhash = t2.urlhash
                LEFT JOIN
            db_docinfo t3 ON t2.urlhash = t3.urlhash
        WHERE
            t3.publishtime >= '{0} 00:00:00'
                AND t3.publishtime <= '{0} 23:59:59'
                and t1.type = {1}
                and t1.type = t2.type
                and t2.gather_type = 0
        group by t3.titlehash
        '''.format(day_select, sel_type)        

    cbirc_label = pd.read_sql(sql_label, engine)
    # Replace numeric ids with their display names.
    cbirc_label['predict_label'] = cbirc_label['traffic_id'].apply(lambda x:class_name_dict[x])
    cbirc_label['group_id'] = cbirc_label['group_id'].apply(lambda x:group_dict[str(x)])
    cbirc_label['type'] = cbirc_label['type'].apply(lambda x:proj_name_dict[x])
    
    # Project 6 was queried as type 1, so restore its own name in the output.
    if proj_name == proj_name_dict[6]: cbirc_label['type'] = proj_name_dict[6]
    
    cbirc_label['gather_type'] = cbirc_label['gather_type'].apply(lambda x:gather_type_name_dict[x])
    print('label 去重前：', cbirc_label.shape)
    cbirc_label = cbirc_label.drop_duplicates(subset = 'title')
    print('label 去重后：', cbirc_label.shape)  
    cbirc_label = cbirc_label.dropna(subset = ['title'], axis = 0)
    print('label 去空值后：', cbirc_label.shape)  
    
    if cbirc_label['urlhash'].shape[0] != 0:
        # Fetch the article content.
        url_l = cbirc_label['urlhash'].tolist()
        # A one-element tuple formats as "(x,)", which is not valid inside an
        # SQL IN clause; duplicating the element yields "(x, x)" instead.
        if cbirc_label['urlhash'].shape[0] == 1:
            url_l.append(url_l[0])
        url_list = tuple(url_l)
        sql_content = '''
        SELECT 
            t1.urlhash, t1.text as content
        FROM
            db_docinfo_text t1
        WHERE
            t1.urlhash in {0}
        '''.format(url_list)

        cbirc_content = pd.read_sql(sql_content, engine)
        print('content 去重前：', cbirc_content.shape)
        cbirc_content = cbirc_content.drop_duplicates(subset = 'content')
        print('content 去重后：', cbirc_content.shape)  
        cbirc_content = cbirc_content.dropna(subset = ['content'], axis = 0)
        print('content 去空值后：', cbirc_content.shape)  

        # Inner join: labels whose text was dropped as duplicate/empty disappear.
        cbirc_combined = pd.merge(cbirc_label, cbirc_content, on = 'urlhash', how = 'inner')
        print(cbirc_combined.shape)
        print(cbirc_combined.pivot_table(index = ['tendency', 'type'], columns = ['predict_label'], 
                                    aggfunc = [len], values = ['title'], 
                                    fill_value = 0, margins = True))    
        # Empty column written to the export alongside predict_label.
        cbirc_combined['label'] = ''
        cbirc_combined = cbirc_combined[['id', 'gather_type', 'type', 'urlhash', 'predict_label', 'label', 'title', 
                                         'content', 'group_id', 'publishtime', 'tendency']]
    #     fea_filename = 'cbirc_result/class/result/cbirc_class_predict_mysql_%s.xlsx'%day_select
        fea_filename = 'result/cbirc_result/class/result/cbirc_class_predict_%s_types(%s)_%s.xlsx'%(gather_types, types, day_select)
        cbirc_combined.to_excel(fea_filename, index = False) 
        print('   已生成：', fea_filename)    

获取 中国人寿 数据...

-- day_select:  2019-01-02
label 去重前： (40, 10)
label 去重后： (40, 10)
label 去空值后： (40, 10)
content 去重前： (40, 2)
content 去重后： (40, 2)
content 去空值后： (40, 2)
(40, 11)
                len                                 
              title                                 
predict_label  产品销售 公司内部管理 其他相关报道  噪音 监管 行业 资本市场 All
tendency type                                       
-1       中国人寿     0      0      0   0  0  2    0   2
0        中国人寿     1      5      1  20  1  7    3  38
All               1      5      1  20  1  9    3  40
   已生成： result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-02.xlsx

-- day_select:  2019-01-03
label 去重前： (47, 10)
label 去重后： (47, 10)
label 去空值后： (47, 10)
content 去重前： (47, 2)
content 去重后： (47, 2)
content 去空值后： (47, 2)
(47, 11)
                 len                       
               title                       
predict_label 公司内部管理 其他相关报道  噪音 行业 资本市场 All
tendency type                              
-1       中国人寿      0      

   已生成： result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-15.xlsx

-- day_select:  2019-01-16
label 去重前： (2942, 10)
label 去重后： (2941, 10)
label 去空值后： (2941, 10)
content 去重前： (2941, 2)
content 去重后： (2761, 2)
content 去空值后： (2761, 2)
(2761, 11)
                len                                             
              title                                             
predict_label  产品销售 公司内部管理 其他相关报道    噪音 消费服务  监管   行业 资本市场   All
tendency type                                                   
-1       中国人寿     0     10      0     0    2   0   14    0    26
0        中国人寿    43     68     31  2206   55  24  137  171  2735
All              43     78     31  2206   57  24  151  171  2761
   已生成： result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-16.xlsx

-- day_select:  2019-01-17
label 去重前： (3853, 10)
label 去重后： (3853, 10)
label 去空值后： (3853, 10)
content 去重前： (3853, 2)
content 去重后： (3344, 2)
content 去空值后： (3344, 2)
(3344, 11)
               

   已生成： result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-27.xlsx

-- day_select:  2019-01-28
label 去重前： (1324, 10)
label 去重后： (1324, 10)
label 去空值后： (1324, 10)
content 去重前： (1324, 2)
content 去重后： (1308, 2)
content 去空值后： (1308, 2)
(1308, 11)
                len                                           
              title                                           
predict_label  产品销售 公司内部管理 其他相关报道   噪音 消费服务  监管  行业 资本市场   All
tendency type                                                 
-1       中国人寿     0      7      0    0    3  10   8    0    28
0        中国人寿    10     19     27  959   15  79  79   92  1280
All              10     26     27  959   18  89  87   92  1308
   已生成： result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-28.xlsx

-- day_select:  2019-01-29
label 去重前： (3356, 10)
label 去重后： (3356, 10)
label 去空值后： (3356, 10)
content 去重前： (3356, 2)
content 去重后： (3308, 2)
content 去空值后： (3308, 2)
(3308, 11)
                len          

   已生成： result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-02-09.xlsx

-- day_select:  2019-02-10
label 去重前： (363, 10)
label 去重后： (362, 10)
label 去空值后： (362, 10)
content 去重前： (362, 2)
content 去重后： (359, 2)
content 去空值后： (359, 2)
(359, 11)
                 len                                  
               title                                  
predict_label 公司内部管理 其他相关报道   噪音 消费服务 监管  行业 资本市场  All
tendency type                                         
-1       中国人寿      1      0    0    0  0   3    0    4
0        中国人寿      8      1  311    5  8  15    7  355
All                9      1  311    5  8  18    7  359
   已生成： result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-02-10.xlsx

-- day_select:  2019-02-11
label 去重前： (923, 10)
label 去重后： (923, 10)
label 去空值后： (923, 10)
content 去重前： (923, 2)
content 去重后： (898, 2)
content 去空值后： (898, 2)
(898, 11)
                len                                          
              title                  

label 去重前： (714, 10)
label 去重后： (714, 10)
label 去空值后： (714, 10)
content 去重前： (714, 2)
content 去重后： (707, 2)
content 去空值后： (707, 2)
(707, 11)
                len                                          
              title                                          
predict_label  产品销售 公司内部管理 其他相关报道   噪音 消费服务  监管  行业 资本市场  All
tendency type                                                
-1       中国人寿     0      2      0    0    0   1  13    0   16
0        中国人寿     4     18      4  579    4  10  42   30  691
All               4     20      4  579    4  11  55   30  707
   已生成： result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-02-22.xlsx

-- day_select:  2019-02-23
label 去重前： (627, 10)
label 去重后： (627, 10)
label 去空值后： (627, 10)
content 去重前： (627, 2)
content 去重后： (626, 2)
content 去空值后： (626, 2)
(626, 11)
                len                                         
              title                                         
predict_label  产品销售 公司内部管理 其他相关报道   噪音 消费服务 监

content 去重前： (938, 2)
content 去重后： (898, 2)
content 去空值后： (898, 2)
(898, 11)
                len                                          
              title                                          
predict_label  产品销售 公司内部管理 其他相关报道   噪音 消费服务  监管  行业 资本市场  All
tendency type                                                
-1       中国人寿     0      2      0    0    0   1   8    0   11
0        中国人寿     8     11      4  759    9  15  44   37  887
All               8     13      4  759    9  16  52   37  898
   已生成： result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-03-06.xlsx

-- day_select:  2019-03-07
label 去重前： (764, 10)
label 去重后： (764, 10)
label 去空值后： (764, 10)
content 去重前： (764, 2)
content 去重后： (759, 2)
content 去空值后： (759, 2)
(759, 11)
                len                                          
              title                                          
predict_label  产品销售 公司内部管理 其他相关报道   噪音 消费服务  监管  行业 资本市场  All
tendency type                                

content 去重前： (1174, 2)
content 去重后： (1145, 2)
content 去空值后： (1145, 2)
(1145, 11)
                len                                          
              title                                          
predict_label  产品销售 公司内部管理 其他相关报道   噪音 消费服务 监管  行业 资本市场   All
tendency type                                                
-1       中国人寿     0      1      0    0    3  1  11    0    16
0        中国人寿    13     11      8  960   18  6  71   42  1129
All              13     12      8  960   21  7  82   42  1145
   已生成： result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-03-18.xlsx

-- day_select:  2019-03-19
label 去重前： (1064, 10)
label 去重后： (1064, 10)
label 去空值后： (1064, 10)
content 去重前： (1064, 2)
content 去重后： (1048, 2)
content 去空值后： (1048, 2)
(1048, 11)
                len                                          
              title                                          
predict_label  产品销售 公司内部管理 其他相关报道   噪音 消费服务 监管  行业 资本市场   All
tendency type                     

In [42]:
# combined_data[combined_data['predict_label'] == '交通'].to_excel('建行北分—交通.xlsx')
# cbirc_combined[cbirc_combined['predict_label'] == '交通']

Unnamed: 0,id,gather_type,type,urlhash,predict_label,label,title,content,group_id,publishtime,tendency


#### 补录
- gather_type 1-人工补录
- gather_type 3-导入数据

In [39]:
# Export manually added / imported documents (gather_type 1 or 3) for one
# project, one Excel file per day in day_list.
#   types        - project id, a key of proj_name_dict
#   gather_types - tag embedded in the output file name
types = 5
gather_types = '补录'
# Resolve the project name from `types` in this cell. Relying on a `proj_name`
# left behind by an earlier cell would test the wrong project below.
proj_name = proj_name_dict[types]
print("获取 %s 数据..."%(proj_name))
# Project 6 is queried with t1.type = 1; every other project uses its own id.
sel_type = 1 if proj_name == proj_name_dict[6] else types

# Label query, identical for every project: {0} is the day, {1} the type id.
sql_label_template = '''
        SELECT 
            t1.type, t1.urlhash, t3.title, t3.group_id, t3.publishtime,
            t1.traffic_id, t2.sen as tendency, t2.gather_type
        FROM
            db_classify_traffic_docinfo t1
                LEFT JOIN
            db_docinfo_trade t2 ON t1.urlhash = t2.urlhash
                LEFT JOIN
            db_docinfo t3 ON t2.urlhash = t3.urlhash
        WHERE
            t3.publishtime >= '{0} 00:00:00'
                AND t3.publishtime <= '{0} 23:59:59'
                and t1.type = {1}
                and t1.type = t2.type
                and t2.gather_type in (1,3)
        group by t3.titlehash
        '''

for day_select in day_list:
    print('-- day_select: ', day_select)

    # Fetch the eight-class labels for this day.
    sql_label = sql_label_template.format(day_select, sel_type)
    cbirc_label = pd.read_sql(sql_label, engine)

    # Replace numeric ids with their display names.
    cbirc_label['predict_label'] = cbirc_label['traffic_id'].apply(lambda x:class_name_dict[x])
    cbirc_label['group_id'] = cbirc_label['group_id'].apply(lambda x:group_dict[str(x)])
    cbirc_label['type'] = cbirc_label['type'].apply(lambda x:proj_name_dict[x])
    cbirc_label['gather_type'] = cbirc_label['gather_type'].apply(lambda x:gather_type_name_dict[x])

    # Project 6 was queried as type 1, so restore its own name in the output.
    if proj_name == proj_name_dict[6]:
        cbirc_label['type'] = proj_name_dict[6]

    print('label 去重前：', cbirc_label.shape)
    cbirc_label = cbirc_label.drop_duplicates(subset = 'title')
    print('label 去重后：', cbirc_label.shape)
    cbirc_label = cbirc_label.dropna(subset = ['title'], axis = 0)
    print('label 去空值后：', cbirc_label.shape)

    # Nothing to export for this day.
    if cbirc_label.shape[0] == 0:
        continue

    # Fetch the article content.
    url_l = cbirc_label['urlhash'].tolist()
    # A one-element tuple formats as "(x,)", which is not valid inside an
    # SQL IN clause; duplicating the element yields "(x, x)" instead.
    if len(url_l) == 1:
        url_l.append(url_l[0])
    url_list = tuple(url_l)
    sql_content = '''
        SELECT 
            t1.urlhash, t1.text as content
        FROM
            db_docinfo_text t1
        WHERE
            t1.urlhash in {0}
        '''.format(url_list)
    cbirc_content = pd.read_sql(sql_content, engine)
    print('content 去重前：', cbirc_content.shape)
    cbirc_content = cbirc_content.drop_duplicates(subset = 'content')
    print('content 去重后：', cbirc_content.shape)
    cbirc_content = cbirc_content.dropna(subset = ['content'], axis = 0)
    print('content 去空值后：', cbirc_content.shape)

    # Inner join: labels whose text was dropped as duplicate/empty disappear.
    cbirc_combined = pd.merge(cbirc_label, cbirc_content, on = 'urlhash', how = 'inner')
    print(cbirc_combined.shape)
    print(cbirc_combined.pivot_table(index = ['tendency', 'type'], columns = ['predict_label'],
                                aggfunc = [len], values = ['title'],
                                fill_value = 0, margins = True))
    # Empty column written to the export alongside predict_label.
    cbirc_combined['label'] = ''
    cbirc_combined = cbirc_combined[['gather_type', 'type', 'urlhash', 'predict_label', 'label', 'title',
                                     'content', 'group_id', 'publishtime', 'tendency']]
    fea_filename = 'result/cbirc_result/class/result/cbirc_class_predict_%s_types(%s)_%s.xlsx'%(gather_types, types, day_select)
    cbirc_combined.to_excel(fea_filename, index = False)
    

获取 中国人保 数据...
-- day_select:  2018-09-02
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-03
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-04
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-05
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-06
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-07
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-08
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-09
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-10
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-11
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-12
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-09-13
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
--

content 去重前： (8, 2)
content 去重后： (8, 2)
content 去空值后： (8, 2)
(8, 10)
                len                 
              title                 
predict_label    交通 公司内部管理 环保 行业 All
tendency type                       
-1       中国人保     0      0  4  1   5
0        中国人保     1      1  0  1   3
All               1      1  4  2   8
-- day_select:  2018-11-24
label 去重前： (0, 9)
label 去重后： (0, 9)
label 去空值后： (0, 9)
-- day_select:  2018-11-25
label 去重前： (2, 9)
label 去重后： (2, 9)
label 去空值后： (2, 9)
content 去重前： (2, 2)
content 去重后： (2, 2)
content 去空值后： (2, 2)
(2, 10)
                len       
              title       
predict_label    交通 环保 All
tendency type             
-1       中国人保     0  1   1
0        中国人保     1  0   1
All               1  1   2
-- day_select:  2018-11-26
label 去重前： (2, 9)
label 去重后： (2, 9)
label 去空值后： (2, 9)
content 去重前： (2, 2)
content 去重后： (2, 2)
content 去空值后： (2, 2)
(2, 10)
                len       
              title       
predict_label    交通 行业 All
tendency type     

### 八分类数据

#### 合并 & 保存

In [28]:
# Select which per-day exports the following cells read back: project id and
# gather tag, both of which are embedded in the export file names.
types =3
gather_types = '采集'

In [29]:
# Read back every per-day export that exists for the current
# (gather_types, types) pair and stack them into a single frame.
daily_path_pattern = 'result/cbirc_result/class/result/cbirc_class_predict_%s_types(%s)_%s.xlsx'
combined_data = pd.DataFrame()
for day_select in day_list:
    file_name = daily_path_pattern%(gather_types, types, day_select)
    if not os.path.isfile(file_name):
        continue
    print(file_name)
    tmp_data = pd.read_excel(file_name)
    combined_data = pd.concat([combined_data, tmp_data], axis = 0)

# Discard rows whose predicted label is '补录'.
combined_data = combined_data.loc[combined_data['predict_label'] != '补录']
print(combined_data.shape)
label_by_project = combined_data.pivot_table(index = ['type'], columns = ['predict_label'],
                                             aggfunc = [len], values = ['title'],
                                             fill_value = 0, margins = True)
print(label_by_project)
print()
# Project ids: {1: '银监会', 2: '保监会', 3: '中国人寿', 4: '建行北分', 5: '中国人保'}
# To keep only some projects, filter here with combined_data['type'].isin([...]).
print(combined_data['predict_label'].value_counts())
combined_data.head(2)

result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-02.xlsx
result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-03.xlsx
result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-04.xlsx
result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-05.xlsx
result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-06.xlsx
result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-07.xlsx
result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-08.xlsx
result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-09.xlsx
result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-10.xlsx
result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-11.xlsx
result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-12.xlsx
result/cbirc_result/class/result/cbirc_class_predict_采集_types(3)_2019-01-13.xlsx
result/cbirc_result/class/re

Unnamed: 0,id,gather_type,type,urlhash,predict_label,label,title,content,group_id,publishtime,tendency
0,19998464,系统采集,中国人寿,-2117016323927309056,资本市场,,（上接D11版）上海华培动力科技股份有限公司首次公开发行股票网下初步配售结果及网上中签结果公...,（上接D11版）\n\n976 南方基金管理股份有限公司 南方安心...,纸媒,2019-01-02 04:12:22,0
1,17103448,系统采集,中国人寿,-7917605445913194496,噪音,,2018年齐鲁金融之星名单正式发布 济宁有她们,山东省人民政府办公厅关于公布2018年齐鲁金融之星名单的通知\n鲁政办字〔2018〕253号...,微信,2019-01-02 17:47:11,0


In [114]:
# combined_data_sel = combined_data[combined_data['predict_label'].isin(['交通', '环保'])]
# combined_data_sel.to_excel('picc_result_class/picc_data_20181220(1210-1219).xlsx', index = False)
# combined_data_sel['predict_label'].value_counts()

In [141]:
# combined_data = combined_data[['type', 'urlhash', 'local_label', 'label', 'title', 'content']]
# combined_data.rename(columns = {'local_label':'predict_label'}, inplace = True)
# combined_data.head()

In [30]:
# Write a workbook with one sheet per predicted label, each holding a
# reproducible random sample of that label's rows, then print how the sampled
# rows spread over projects and publication days.
fea_filename = 'result/cbirc_result/class/result/%s_%s_class_predict_mysql_20190401(0102-0331).xlsx'%(gather_types, 
                                                                                                      proj_name_dict[types])
print(fea_filename)
# sel_col = ['噪音', '消费服务', '公司内部管理', '监管',
#            '行业', '资本市场', '其他相关报道','产品销售','交通','环保']
# sel_col = ['公司内部管理', '监管', '行业', '产品销售']
sel_col = combined_data['predict_label'].unique().tolist()
sel_data = combined_data[combined_data['predict_label'].isin(sel_col)]
print(sel_data['predict_label'].value_counts())
print()

# Upper bound on rows sampled per label for system-gathered data.
max_rows_per_label = 200
sampled_frames = []
# The context manager saves and closes the workbook on exit, so no explicit
# writer.save() is needed (that method no longer exists in pandas 2.x).
with pd.ExcelWriter(fea_filename) as writer:
    for label in sel_data['predict_label'].unique():
        tmp_data = sel_data[sel_data['predict_label'] == label]
        if gather_types == '补录':
            # Manual entries: keep every row.
            N = tmp_data.shape[0]
        else :
            # Never ask for more rows than exist: sampling without
            # replacement raises when n exceeds the population.
            N = min(tmp_data.shape[0], max_rows_per_label)

        save_data = tmp_data.sample(n = N, axis = 0, random_state=42)
        save_data.to_excel(writer, sheet_name = label, index = False)
        sampled_frames.append(save_data)

c_data = pd.concat(sampled_frames, axis = 0) if sampled_frames else pd.DataFrame()
cc_data = c_data.copy()
# Keep only the date part (YYYY-MM-DD) of the timestamp for the summary.
cc_data['publishtime'] = cc_data['publishtime'].astype(str).apply(lambda x:x[:10])
print(cc_data.pivot_table(index = ['type', 'publishtime'], 
                            columns = ['predict_label'], 
                            values = 'title', aggfunc=len, 
                            fill_value=0, margins=True))

result/cbirc_result/class/result/采集_中国人寿_class_predict_mysql_20190401(0102-0331).xlsx
噪音        73941
资本市场       5290
行业         4901
公司内部管理     1510
消费服务       1365
监管         1270
产品销售        942
其他相关报道      708
Name: predict_label, dtype: int64

predict_label     产品销售  公司内部管理  其他相关报道   噪音  消费服务   监管   行业  资本市场   All
type publishtime                                                       
中国人寿 2019-01-02      0       0       0    0     0    0    1     0     1
     2019-01-03      0       0       0    0     0    0    1     0     1
     2019-01-04      0       0       1    0     0    0    1     0     2
     2019-01-05      0       0       2    0     0    0    0     0     2
     2019-01-07      0       0       0    0     0    0    1     1     2
     2019-01-08      0       1       0    0     0    0    1     1     3
     2019-01-09      0       0       1    1     0    0    1     1     4
     2019-01-10      0       1       0    1     0    0    0     1     3
     2019-01-11      2       0 

### 倾向性

#### 合并 & 保存

In [116]:
# Load every per-day class-prediction workbook that exists for the current
# gather_types / types, stack them, and keep only the labels relevant for
# the tendency check.
daily_frames = []
for day_select in day_list:
    # Alternative input naming (kept for reference):
    #     tmp_data = pd.read_excel('cbirc_result/class/result/cbirc_class_predict_mysql_%s.xlsx'%day_select)
    file_name = 'cbirc_result/class/result/cbirc_class_predict_%s_types(%s)_%s.xlsx'%(gather_types, types, day_select)
    if os.path.isfile(file_name):
        print(file_name)
        tmp_data = pd.read_excel(file_name)
        daily_frames.append(tmp_data)

# Concatenate once; falls back to an empty frame when no file was found
combined_data = pd.concat(daily_frames, axis = 0) if daily_frames else pd.DataFrame()

# Tendency x label counts before filtering
print(combined_data.shape)
print(combined_data.pivot_table(index = ['predict_label'], columns = ['tendency'], 
                            aggfunc = [len], values = ['title'], 
                            fill_value = 0, margins = True).T) 
print()

if types in [4,5]:
    # types 4 and 5: keep everything except the '噪音' label
    combined_data = combined_data[combined_data['predict_label'] != '噪音']
else :
    # other types: keep only the four listed labels
    sel_col = ['公司内部管理', '监管', '行业', '消费服务']
    combined_data = combined_data[combined_data['predict_label'].isin(sel_col)]

# Same table after filtering
print(combined_data.shape)  
print(combined_data.pivot_table(index = ['predict_label'], columns = ['tendency'], 
                            aggfunc = [len], values = ['title'], 
                            fill_value = 0, margins = True).T) 

print(combined_data['tendency'].value_counts())
combined_data.iloc[:2, :]

cbirc_result/class/result/cbirc_class_predict_采集_types(6)_2018-11-02.xlsx
cbirc_result/class/result/cbirc_class_predict_采集_types(6)_2018-11-03.xlsx
cbirc_result/class/result/cbirc_class_predict_采集_types(6)_2018-11-04.xlsx
cbirc_result/class/result/cbirc_class_predict_采集_types(6)_2018-11-05.xlsx
cbirc_result/class/result/cbirc_class_predict_采集_types(6)_2018-11-06.xlsx
cbirc_result/class/result/cbirc_class_predict_采集_types(6)_2018-11-07.xlsx
cbirc_result/class/result/cbirc_class_predict_采集_types(6)_2018-11-08.xlsx
cbirc_result/class/result/cbirc_class_predict_采集_types(6)_2018-11-09.xlsx
cbirc_result/class/result/cbirc_class_predict_采集_types(6)_2018-11-10.xlsx
cbirc_result/class/result/cbirc_class_predict_采集_types(6)_2018-11-11.xlsx
cbirc_result/class/result/cbirc_class_predict_采集_types(6)_2018-11-12.xlsx
cbirc_result/class/result/cbirc_class_predict_采集_types(6)_2018-11-13.xlsx
cbirc_result/class/result/cbirc_class_predict_采集_types(6)_2018-11-14.xlsx
cbirc_result/class/result/cbirc_class_

Unnamed: 0,id,gather_type,type,urlhash,predict_label,label,title,content,group_id,publishtime,tendency
11,5835886,系统采集,安徽银监局,-9157588078424358912,消费服务,,商河国寿财险重阳节送爱心,信托公司\n\r信托公司\n\rA-安徽国元信托有限责任公司\n\rA-安信信托股份有限公司...,新闻,2018-11-02 08:44:37,0
15,5830308,系统采集,安徽银监局,-7847497080215177216,公司内部管理,,中信银行合肥财富广场支行因违规吸收存款被罚25万,\n中国青年网北京11月2日电\n 据中国银保监会网站消息，中国银保监会网站公布了安徽银...,新闻,2018-11-02 10:53:00,-1


In [119]:
# Export a tendency sample to one Excel workbook, one sheet per tendency
# value, drawing roughly evenly from every predicted label.
fea_filename = 'cbirc_result/tendency/result/%s_%s_tendency_predict_mysql_20181221(1102-1220).xlsx'%(gather_types, 
                                                                                                     proj_name_dict[types])
print(fea_filename)
print(combined_data['tendency'].value_counts())

N = 1000 # target number of rows per tendency value
class_n = int(combined_data['predict_label'].unique().shape[0])
# Per-label quota: even share of N plus 100 spare rows
n = int(N / class_n) + 100

print('正负各 %s 条，共 %s 类， 每类各 %s 条'%(N, class_n, n))
with pd.ExcelWriter(fea_filename) as writer:
    for tendency in combined_data['tendency'].unique():
        print('-------------------------------------------------------')
        tmp_data = pd.DataFrame()
        sel_data = combined_data[combined_data['tendency'] == tendency]        
        for predict_label in combined_data['predict_label'].unique():
            label_data = sel_data[sel_data['predict_label'] == predict_label]
            # Labels with more than n rows are sampled down; smaller ones are taken as-is
            if label_data.shape[0] > n:
                sel_label_data = label_data.sample(n = n, axis = 0, random_state=3)
            else :
                sel_label_data = label_data
            tmp_data = pd.concat([tmp_data, sel_label_data], axis = 0)        
            # The printed size is the running total for this tendency, not this label's count
            print('tendency: %s, predict_label: %s, size: %s'%(tendency, predict_label, tmp_data.shape))

        if gather_types == '补录':
            # '补录' data is exported in full (sample() only shuffles it)
            t_n = tmp_data.shape[0]
        else :
            t_n = min(tmp_data.shape[0], N)

        tmp_data = tmp_data.sample(n = t_n, axis = 0, random_state=3)
        tmp_data.to_excel(writer, sheet_name = str(tendency), index = False)
        print(tmp_data.pivot_table(index = ['tendency', 'type'], 
                                    columns = ['predict_label'], 
                                    values = 'title', aggfunc=len, 
                                    fill_value=0, margins=True))    
    # No explicit writer.save(): leaving the with-block writes the workbook.

cbirc_result/tendency/result/采集_安徽银监局_tendency_predict_mysql_20181221(1102-1220).xlsx
 0    3162
-1     980
Name: tendency, dtype: int64
正负各 1000 条，共 4 类， 每类各 350 条
-------------------------------------------------------
tendency: 0, predict_label: 消费服务, size: (336, 11)
tendency: 0, predict_label: 公司内部管理, size: (686, 11)
tendency: 0, predict_label: 行业, size: (1036, 11)
tendency: 0, predict_label: 监管, size: (1386, 11)
predict_label   公司内部管理  消费服务   监管   行业   All
tendency type                               
0        安徽银监局     247   239  259  255  1000
All                247   239  259  255  1000
-------------------------------------------------------
tendency: -1, predict_label: 消费服务, size: (350, 11)
tendency: -1, predict_label: 公司内部管理, size: (565, 11)
tendency: -1, predict_label: 行业, size: (834, 11)
tendency: -1, predict_label: 监管, size: (978, 11)
predict_label   公司内部管理  消费服务   监管   行业  All
tendency type                              
-1       安徽银监局     215   350  144  269  978
All      

## 本地模型

### 八分类

#### CBRC

In [75]:
# CBRC eight-class setup: load the local pipeline, choose the online
# endpoint, and build the preprocessed model input.
from sklearn.externals import joblib

pipeline_old = joblib.load("model/cbrc_8classifier_1015.pkl.z")

# Endpoint, response field and `types` value used by the online-request cells
url, col_name, types = 'http://47.93.183.157:6001/judge_correlation_b', 'cor', 1

# Model input text is "<title>。<content>"
title_text = combined_data['title'].astype(str)
body_text = combined_data['content'].astype(str)
combined_data['title_content'] = title_text + '。' + body_text
title_content = pre_cor_cbrc.handle_contents(combined_data['title_content'].tolist())
print(len(title_content))

#### CIRC

In [92]:
# CIRC eight-class setup: load the local pipeline, choose the online
# endpoint, and build the preprocessed model input.
from sklearn.externals import joblib

pipeline_old = joblib.load("model/circ_8classifier_1113.pkl.z")
# Alternative model file:
# pipeline_old = joblib.load( "model/circ_picc_10classifier_1118.pkl.z")

# Endpoint, response field and `types` value used by the online-request cells
url, col_name, types = 'http://47.93.183.157:10000/judge_correlation_i', 'cor', 5

# Model input text is "<title>。<content>"
title_text = combined_data['title'].astype(str)
body_text = combined_data['content'].astype(str)
combined_data['title_content'] = title_text + '。' + body_text
title_content = pre_cor_circ.handle_contents(combined_data['title_content'].tolist())
print(len(title_content))

43


#### pre

In [93]:
# Run the loaded local pipeline: predicted class first, then class probabilities
local_label, local_proba = (pipeline_old.predict(title_content),
                            pipeline_old.predict_proba(title_content))

In [94]:
# Attach the local prediction: raw class code, top probability, then the
# class code translated to its name through class_name_dict.
combined_data['local_label'] = local_label
combined_data['local_proba'] = local_proba.max(axis = 1)
combined_data['local_label'] = [class_name_dict[code] for code in combined_data['local_label']]
print(combined_data.shape)
combined_data.head(2)

(43, 13)


Unnamed: 0,gather_type,type,urlhash,predict_label,label,title,content,group_id,publishtime,tendency,title_content,local_label,local_proba
0,补录,中国人保,-1451164327749029888,环保,,11月18日21时40分新疆和田地区于田县发生3.0级地震,据中国地震台网测定，北京时间2018年11月18日21时40分在新疆和田地区于田县（北纬36...,新闻,2018-11-18 21:58:50,-1,11月18日21时40分新疆和田地区于田县发生3.0级地震。据中国地震台网测定，北京时间20...,噪音,0.605084
0,补录,中国人保,-6173404825726358528,监管,,周亮：民营经济离场论调极其错误 银保监会对国有和民营经济一视同仁,11月19日，在第九届财新峰会上，中国银行保险监督管理委员会副主席周亮表示，改革开放40年中...,新闻,2018-11-19 11:09:19,0,周亮：民营经济离场论调极其错误 银保监会对国有和民营经济一视同仁。11月19日，在第九届财新...,监管,0.915518


In [95]:
# Distribution of the labels assigned by the local model
combined_data.loc[:, 'local_label'].value_counts()

噪音        13
行业        12
公司内部管理     5
消费服务       5
其他相关报道     4
监管         3
资本市场       1
Name: local_label, dtype: int64

#### 线上线下一致性: mysql 与 local

In [96]:
# Agreement between the local model (local_label) and the predict_label column
combined_data['R_W'] = [
    'Right' if local == stored else 'Wrong'
    for local, stored in zip(combined_data['local_label'], combined_data['predict_label'])
]
is_right = combined_data['R_W'] == 'Right'
# Share of rows where both labels agree
print(int(is_right.sum()) / combined_data.shape[0])
print(combined_data['R_W'].value_counts())
# Cross-table of the disagreeing rows only
combined_data[~is_right].pivot_table(index = ['local_label'], columns = ['predict_label'],
                                     aggfunc = [len], values = ['urlhash'],
                                     fill_value = 0, margins = True)

0.46511627906976744
Wrong    23
Right    20
Name: R_W, dtype: int64


Unnamed: 0_level_0,len,len,len,len,len,len,len
Unnamed: 0_level_1,urlhash,urlhash,urlhash,urlhash,urlhash,urlhash,urlhash
predict_label,交通,产品销售,公司内部管理,环保,监管,行业,All
local_label,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3
公司内部管理,1,1,0,0,0,1,3
其他相关报道,1,1,0,0,0,0,2
噪音,1,0,0,9,0,1,11
消费服务,1,0,1,1,0,0,3
行业,0,0,0,0,4,0,4
All,4,2,1,10,4,2,23


#### 线上线下一致性: online 与 local

In [97]:
# Give every row a sequential request id and force the text fields to str
combined_data['id'] = range(len(combined_data))
for text_col in ('title', 'content'):
    combined_data[text_col] = combined_data[text_col].astype(str)

In [32]:
# Load the 广发银行 workbook (replacing combined_data), send it to the online
# classifier and merge the returned label back in by id.
combined_data = pd.read_excel('广发银行数据.xlsx')

url, col_name, types = 'http://47.93.183.157:10000/judge_correlation_i', 'cor', 2

request_records = combined_data[['id', 'title', 'content']].to_dict(orient = 'records')
data = {"types": types, "record": request_records}

parse_data, elapsed_time = get_server_res(data, url, col_name)
print('elapsed_time: ', elapsed_time)
# The service's result column is stored as predict_label
parse_data.columns = ['id', 'predict_label']
combined_data = combined_data.merge(parse_data, on = 'id')
parse_data.head()

elapsed_time:  5.00


Unnamed: 0,id,predict_label
0,15238562,1
1,9567425,8
2,16364349,8
3,18131378,1
4,17741383,5


In [36]:
# Code-to-name mapping of predict_label is disabled here:
# combined_data['predict_label'] = combined_data['predict_label'].apply(lambda x:class_name_dict[x])
export_path = '广发银行数据_20181204.xlsx'
combined_data.to_excel(export_path)
print(combined_data.shape)
combined_data.head(5)

(129, 9)


Unnamed: 0,id,node_id,tendency,type,group_id,publishtime,title,content,predict_label
0,15238562,1187,-1,3,1,2018-03-21 21:53:58,金融领域风险该戳的脓包要戳 否则有道德风险,金融领域也有一些违法违规行为或者规避风险的行为在兴风作浪。最近我们监管部门主动出手、果...,监管
1,9567425,1187,-1,3,1,2018-08-04 08:40:24,强监管之风继续吹 7月银监系统罚没金额超8000万,03:12 \n\n进入下半年以来，银行业乱象的整治仍在进行。本报记者...,噪音
2,16364349,1187,-1,3,1,2018-08-02 09:44:43,东莞市发展和改革局行政处罚决定书（东发改价监处〔2018〕21号）,当事人：\n\r东莞市臻品表业有限公司\n\r地\n\r址：\n\r东莞市黄江镇胜前岗村环城...,噪音
3,18131378,1187,0,3,1,2018-12-03 00:00:00,银行严监管边际改善,随着金融去杠杆进入尾声，严监管政策基本出尽，“降\n利率\n”阶段对\n银行\n负债成本...,监管
4,17741383,1187,0,3,1,2018-11-27 13:36:00,涉嫌多项违规 信美人寿“相互保”今日下架,来源标题：\n涉嫌多项违规 信美人寿“相互保”今日下架\n\r上证报记者最新获悉，监管部门已...,公司内部管理


In [102]:
# Send id/title/content to the endpoint configured in `url` and keep the
# returned column as online_label.
request_records = combined_data[['id', 'title', 'content']].to_dict(orient = 'records')
data = {"types": types, "record": request_records}

# Alternative endpoint / helper (kept for reference):
# url = "http://47.93.77.19:6001/judge_correlation_yjh"
# col_name = 'sec'
# parse_data = get_server_res_yjh(data, url, col_name)

parse_data, elapsed_time = get_server_res(data, url, col_name)
print('elapsed_time: ', elapsed_time)
parse_data.columns = ['id', 'online_label']
parse_data.head()

elapsed_time:  0.00


Unnamed: 0,id,online_label
0,0,8
1,1,1
2,2,2
3,3,7
4,4,6


In [103]:
# Merge the online result into combined_data and compare it with the local label.
# Drop any online_label left by an earlier run first; otherwise the merge
# yields online_label_x / online_label_y and the lookups below fail.
combined_data = combined_data.drop(columns = ['online_label'], errors = 'ignore')
combined_data = pd.merge(combined_data, parse_data, on  = 'id')
print(combined_data.shape)
# Translate the returned class code to its name through class_name_dict
combined_data['online_label'] = combined_data['online_label'].apply(lambda x:class_name_dict[x])
combined_data['O_R_W'] = combined_data.apply(lambda x: 'Right' if x['local_label'] == x['online_label'] else 'Wrong', axis = 1)
# Share of rows where local and online labels agree
print(combined_data[combined_data['O_R_W'] == 'Right'].shape[0]/combined_data.shape[0])
print(combined_data['O_R_W'].value_counts())
# Cross-table of the disagreeing rows only
combined_data[combined_data['O_R_W'] == 'Wrong'].pivot_table(index = ['local_label'], columns = ['online_label'], 
                                                            aggfunc = [len], values = ['urlhash'], 
                                                            fill_value = 0, margins = True)

(43, 16)
0.7674418604651163
Right    33
Wrong    10
Name: O_R_W, dtype: int64


Unnamed: 0_level_0,len,len,len,len,len,len
Unnamed: 0_level_1,urlhash,urlhash,urlhash,urlhash,urlhash,urlhash
online_label,交通,公司内部管理,噪音,监管,行业,All
local_label,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
公司内部管理,0,0,2,1,0,3
其他相关报道,0,0,1,0,0,1
噪音,2,0,0,0,1,3
消费服务,1,1,0,0,0,2
行业,0,0,0,1,0,1
All,3,1,3,2,1,10


#### 线上线下一致性: online 与 mysql

In [104]:
# Agreement between the online label and the predict_label column
# (overwrites the O_R_W column).
combined_data['O_R_W'] = [
    'Right' if stored == online else 'Wrong'
    for stored, online in zip(combined_data['predict_label'], combined_data['online_label'])
]
is_right = combined_data['O_R_W'] == 'Right'
print(int(is_right.sum()) / combined_data.shape[0])
print(combined_data['O_R_W'].value_counts())
# Optional cross-table of the disagreeing rows:
# combined_data[combined_data['O_R_W'] == 'Wrong'].pivot_table(index = ['predict_label'], 
#                                                              columns = ['online_label'], 
#                                                              aggfunc = [len], values = ['urlhash'], 
#                                                              fill_value = 0, margins = True)

0.5348837209302325
Right    23
Wrong    20
Name: O_R_W, dtype: int64


### 倾向性

In [110]:
# Peek at the first two rows before the tendency checks
combined_data.head(2)

Unnamed: 0,type,urlhash,predict_label,label,title,content,group_id,publishtime,tendency,title_content
10724,中国人保,4823901446525329408,噪音,,因祸得福！CDR基金惊现大动作,渐渐被市场遗忘CDR基金，最近又曝出新动向。\n\n11月7日晚间，中国人保A股IPO网下初...,新闻客户端,2018-11-11 12:28:47,-1,因祸得福！CDR基金惊现大动作。渐渐被市场遗忘CDR基金，最近又曝出新动向。\n\n11月7...
4469,中国人保,2875199957075597824,噪音,,【招聘】中国人民财产保险股份有限公司珠海市分公司2019年校招,一 公司简介 （一）中国人民财产保险股份有限公司简介 中国人民财产保险股份有限公司（PICC...,微信,2018-11-13 13:29:47,0,【招聘】中国人民财产保险股份有限公司珠海市分公司2019年校招。一 公司简介 （一）中国人民...


#### CBRC

In [None]:
# CBRC tendency setup: load the local pipeline, choose the online endpoint,
# and build the preprocessed model input.
from sklearn.externals import joblib

pipeline_old = joblib.load("model/cbrc_tendency_pipeline_20181114.pkl.z")

# Endpoint, response field and `types` value used by the online-request cells
url, col_name, types = 'http://47.93.183.157:6001/tendency_analysis_b', 'tendency', 1

# Model input text is "<title>。<content>"
title_text = combined_data['title'].astype(str)
body_text = combined_data['content'].astype(str)
combined_data['title_content'] = title_text + '。' + body_text
title_content = pre_cor_cbrc.handle_contents(combined_data['title_content'].tolist())
print(len(title_content))

#### CIRC

In [118]:
# CIRC tendency setup: load the local pipeline, choose the online endpoint,
# and build the preprocessed model input.
from sklearn.externals import joblib

pipeline_old = joblib.load("model/circ_chapter_tendency_1113.pkl.z")

# Endpoint, response field and `types` value used by the online-request cells
url, col_name, types = 'http://47.93.183.157:10000/tendency_analysis_i', 'tendency', 5

# Model input text is "<title>。<content>"
title_text = combined_data['title'].astype(str)
body_text = combined_data['content'].astype(str)
combined_data['title_content'] = title_text + '。' + body_text
title_content = pre_cor_circ.handle_contents(combined_data['title_content'].tolist())
print(len(title_content))

43


#### pre

In [119]:
# Run the loaded local pipeline: predicted value first, then class probabilities
local_label, local_proba = (pipeline_old.predict(title_content),
                            pipeline_old.predict_proba(title_content))

In [120]:
# Attach the local prediction and its top probability. The prediction is
# kept as returned (no class_name_dict mapping in this section).
for column, values in (('local_label', local_label),
                       ('local_proba', local_proba.max(axis = 1))):
    combined_data[column] = values
# combined_data['local_label'] = combined_data['local_label'].apply(lambda x:class_name_dict[x])
print(combined_data.shape)
combined_data.head(2)

(43, 13)


Unnamed: 0,gather_type,type,urlhash,predict_label,label,title,content,group_id,publishtime,tendency,title_content,local_label,local_proba
0,补录,中国人保,-1451164327749029888,环保,,11月18日21时40分新疆和田地区于田县发生3.0级地震,据中国地震台网测定，北京时间2018年11月18日21时40分在新疆和田地区于田县（北纬36...,新闻,2018-11-18 21:58:50,-1,11月18日21时40分新疆和田地区于田县发生3.0级地震。据中国地震台网测定，北京时间20...,-1,1.0
0,补录,中国人保,-6173404825726358528,监管,,周亮：民营经济离场论调极其错误 银保监会对国有和民营经济一视同仁,11月19日，在第九届财新峰会上，中国银行保险监督管理委员会副主席周亮表示，改革开放40年中...,新闻,2018-11-19 11:09:19,0,周亮：民营经济离场论调极其错误 银保监会对国有和民营经济一视同仁。11月19日，在第九届财新...,0,1.0


#### 线上线下一致性: mysql 与 local

In [121]:
# Agreement between the local model (local_label) and the tendency column
combined_data['R_W'] = [
    'Right' if local == stored else 'Wrong'
    for local, stored in zip(combined_data['local_label'], combined_data['tendency'])
]
is_right = combined_data['R_W'] == 'Right'
# Share of rows where both values agree
print(int(is_right.sum()) / combined_data.shape[0])
print(combined_data['R_W'].value_counts())
# Cross-table of the disagreeing rows only
combined_data[~is_right].pivot_table(index = ['local_label'], columns = ['tendency'],
                                     aggfunc = [len], values = ['urlhash'],
                                     fill_value = 0, margins = True)

0.813953488372093
Right    35
Wrong     8
Name: R_W, dtype: int64


Unnamed: 0_level_0,len,len
Unnamed: 0_level_1,urlhash,urlhash
tendency,0,All
local_label,Unnamed: 1_level_3,Unnamed: 2_level_3
-1,8,8
All,8,8


#### 线上线下一致性: online 与 local

In [122]:
# Give every row a sequential request id and force the text fields to str
combined_data['id'] = range(len(combined_data))
for text_col in ('title', 'content'):
    combined_data[text_col] = combined_data[text_col].astype(str)

In [123]:
# Send id/title/content to the endpoint configured in `url` and keep the
# returned column as online_label.
request_records = combined_data[['id', 'title', 'content']].to_dict(orient = 'records')
data = {"types": types, "record": request_records}

# Alternative endpoint / helper (kept for reference):
# url = "http://47.93.77.19:6001/judge_correlation_yjh"
# col_name = 'sec'
# parse_data = get_server_res_yjh(data, url, col_name)

parse_data, elapsed_time = get_server_res(data, url, col_name)
print('elapsed_time: ', elapsed_time)
parse_data.columns = ['id', 'online_label']
parse_data.head()

elapsed_time:  2.00


Unnamed: 0,id,online_label
0,0,-1
1,1,0
2,2,-1
3,3,0
4,4,0


In [124]:
# combined_data.head()
# combined_data.iloc[:2, :]

In [125]:
# Merge the online result into combined_data and compare it with the local value.
# Drop any online_label left by an earlier run first; otherwise the merge
# yields online_label_x / online_label_y and the lookups below fail.
combined_data = combined_data.drop(columns = ['online_label'], errors = 'ignore')
combined_data = pd.merge(combined_data, parse_data, on  = 'id')
print(combined_data.shape)
# The value is compared as returned; the class_name_dict mapping stays disabled here
# combined_data['online_label'] = combined_data['online_label'].apply(lambda x:class_name_dict[x])
combined_data['O_R_W'] = combined_data.apply(lambda x: 'Right' if x['local_label'] == x['online_label'] else 'Wrong', axis = 1)
# Share of rows where local and online values agree
print(combined_data[combined_data['O_R_W'] == 'Right'].shape[0]/combined_data.shape[0])
print(combined_data['O_R_W'].value_counts())
# Cross-table of the disagreeing rows only
combined_data[combined_data['O_R_W'] == 'Wrong'].pivot_table(index = ['local_label'], columns = ['online_label'], 
                                                            aggfunc = [len], values = ['urlhash'], 
                                                            fill_value = 0, margins = True)

(43, 16)
0.9069767441860465
Right    39
Wrong     4
Name: O_R_W, dtype: int64


Unnamed: 0_level_0,len,len
Unnamed: 0_level_1,urlhash,urlhash
online_label,0,All
local_label,Unnamed: 1_level_3,Unnamed: 2_level_3
-1,4,4
All,4,4


#### 线上线下一致性: online 与 mysql

In [126]:
# Agreement between the online value and the tendency column
# (overwrites the O_R_W column).
combined_data['O_R_W'] = [
    'Right' if stored == online else 'Wrong'
    for stored, online in zip(combined_data['tendency'], combined_data['online_label'])
]
is_right = combined_data['O_R_W'] == 'Right'
print(int(is_right.sum()) / combined_data.shape[0])
print(combined_data['O_R_W'].value_counts())
# Optional cross-table of the disagreeing rows:
# combined_data[combined_data['O_R_W'] == 'Wrong'].pivot_table(index = ['predict_label'], columns = ['online_label'], 
#                                                             aggfunc = [len], values = ['urlhash'], 
#                                                             fill_value = 0, margins = True)

0.9069767441860465
Right    39
Wrong     4
Name: O_R_W, dtype: int64


# 保存本文件

In [None]:
# Disabled by default: flip the guard to 1 to export this notebook as HTML.
if 0:
    import datetime as dt
    
    def output_HTML(read_file, output_file):
        '''
        Convert a notebook file to HTML.
        read_file: path of the '.ipynb' to read
        output_file: path of the '.html' to write (UTF-8)
        '''
        from nbconvert import HTMLExporter
        import codecs
        import nbformat
        exporter = HTMLExporter()
        output_notebook = nbformat.read(read_file, as_version=4)
        output, resources = exporter.from_notebook_node(output_notebook)
        # The with-block closes the handle so the HTML is flushed to disk
        with codecs.open(output_file, 'w', encoding='utf-8') as html_file:
            html_file.write(output)

    html_file_folder = 'html_files'
    if not os.path.exists(html_file_folder):
        os.makedirs(html_file_folder)

    today = dt.datetime.now().strftime('%Y%m%d')
    # NOTE(review): confirm this is the name of the notebook being exported
    current_file = 'circ_cor_model_2_train.ipynb'
    # os.path.join avoids the invalid '\%' escape and works on any OS
    output_file = os.path.join(html_file_folder,
                               '%s_%s.html'%(os.path.splitext(current_file)[0], today))
    output_HTML(current_file, output_file)