- 中国人寿及其同业数据，补充八分类和倾向性结果
> - 七月及以前数据：db_docinfo_backup、db_docinfo_text_backup
> - 七月之后数据：db_docinfo、db_docinfo_text（下方代码实际查询的表为 db_docinfo；此处原记为 db_docinfo_trade，待确认）

# 基本设置

In [1]:
import numpy as np
import pandas as pd
import os
import datetime

import requests,json
from sklearn.externals import joblib

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
from toolkits.setup.date_time import get_day_list
from toolkits.setup import specific_func
# NOTE(review): project-local helper; presumably configures pandas/plot display
# for Chinese text — confirm in toolkits.setup.specific_func.
specific_func.set_ch_pd()

# 连接数据库

In [3]:
# Database handle for the 'cbirc' source; it is the connection argument passed to
# every pd.read_sql_query call in this notebook.
engine = specific_func.get_engine('cbirc')

In [34]:
# Eight-category label name -> numeric label id (ids run 1..8 in the order listed).
label_dic = dict(zip(
    ['监管', '行业', '产品销售', '资本市场', '公司内部管理', '消费服务', '其他相关报道', '噪音'],
    range(1, 9),
))
# Reverse lookup: numeric label id -> label name.
class_name_dict = dict((label_id, name) for name, label_id in label_dic.items())
class_name_dict

{1: '监管',
 2: '行业',
 3: '产品销售',
 4: '资本市场',
 5: '公司内部管理',
 6: '消费服务',
 7: '其他相关报道',
 8: '噪音'}

# 七月及以前数据

In [4]:
# Host:port of the model service that the request URLs below are built from.
ip_port = '47.93.183.157:10000'

# All requests to the service are sent as JSON.
headers = {'content-type': 'application/json'}

# One endpoint per model: correlation, tendency, early warning.
url_cor, url_tend, url_warn = [
    template % ip_port
    for template in (
        "http://%s/judge_correlation_i",
        "http://%s/tendency_analysis_i",
        "http://%s/early_warning_i",
    )
]

# Workbooks covering July and earlier.
file_list_1 = [
    'raw/人寿 7月.xlsx',
    'raw/同业  七月.xlsx',
]

In [None]:
# For each July-and-earlier workbook: find rows missing the eight-category label
# ('八大分险类型') or the article tendency ('文章倾向性'), fetch title/content for
# those ids from the backup tables, score them with the correlation and tendency
# services, fill the results back in BY ID, and write the completed workbook to
# result/.
for filename in file_list_1:
    print(filename, '  ----------------')
    raw_data = pd.read_excel(filename)
    label_cols = ['八大分险类型', '文章倾向性']
    is_missing = raw_data[label_cols].isnull().any(axis=1)
    # .copy(): the fill-in below assigns columns on data_null, which must be an
    # independent frame rather than a slice of raw_data.
    data_null = raw_data[is_missing].copy()
    data_full = raw_data[~is_missing]
    print('总量：', raw_data.shape)
    print('缺失值数量：', data_null.shape)
    print('无缺失值数量：', data_full.shape)

    id_list = data_null['id'].unique().tolist()

    chunksize = 2000
    chunk_results = []
    if id_list:
        # Render the IN (...) list by hand. Formatting a Python tuple produces
        # "(1,)" for a single id and "()" for none, both of which are invalid SQL.
        # NOTE(review): ids come from the workbook and are interpolated as-is;
        # switch to bound parameters if the input is ever untrusted.
        id_sql = '({0})'.format(', '.join(repr(doc_id) for doc_id in id_list))

        sql_count = ("select count(t1.id) "
                     "from db_docinfo_backup t1 "
                     "where t1.id in {0}").format(id_sql)
        count = pd.read_sql_query(sql_count, engine)
        # Ceiling division; only used for the progress message.
        loop = -(-int(count.iloc[0, 0]) // chunksize)

        sql_cbirc = ("select t1.id, t1.title,t2.text as content "
                     "from db_docinfo_backup t1, db_docinfo_text_backup t2 "
                     "where t1.urlhash = t2.urlhash "
                     "and t1.id in {0}").format(id_sql)

        chunks = pd.read_sql_query(sql_cbirc, engine, chunksize=chunksize)
        for i, tmp_data in enumerate(chunks, start=1):
            print('--  一、共 %s次循环，第 %s 次获取数据，开始...'%(loop, i))
            print(tmp_data.shape)

            # Same request body for both services: (id, title, content) records.
            payload = {"types":3, "record":tmp_data.iloc[:,[0, 1, 2]].to_dict(orient = 'records')}
            body = json.dumps(payload)

            # Correlation (eight-category) model
            result = requests.post(url_cor, data=body,
                                   headers=headers, allow_redirects=True)
            json_data = json.loads(result.text)
            cor_elapsed_time = json_data['elapsed_time']
            print('cor elapsed_time: ', cor_elapsed_time, '    tmp_data: ',tmp_data.shape)
            cor_list = pd.DataFrame([[j['cor'], j['id']] for j in json_data['docs']],
                                    columns=['八大分险类型', 'id'])

            # Tendency model
            result = requests.post(url_tend, data=body,
                                   headers=headers, allow_redirects=True)
            json_data = json.loads(result.text)
            tend_elapsed_time = json_data['elapsed_time']
            print('tend elapsed_time: ', tend_elapsed_time, '    tmp_data: ',tmp_data.shape)
            tendency_list = pd.DataFrame([[j['tendency'], j['id']] for j in json_data['docs']],
                                         columns=['文章倾向性', 'id'])

            chunk_results.append(pd.merge(cor_list, tendency_list, on='id', how='inner'))

    if chunk_results:
        combine_data = pd.concat(chunk_results, axis=0, ignore_index=True)
    else:
        combine_data = pd.DataFrame(columns=['八大分险类型', 'id', '文章倾向性'])
    print('combine_data: ', combine_data.shape)
    print('data_null: ', data_null.shape)

    if not combine_data.empty:
        # Fill by id. DataFrame.update aligns on the row index, which here is
        # unrelated to 'id' (chunk-local 0..n-1 vs. the workbook's row labels), so
        # it would attach predictions to the wrong rows and overwrite 'id' itself.
        predicted = combine_data.drop_duplicates(subset='id').set_index('id')
        for col in label_cols:
            # Prefer the model's value; keep an existing value when the model
            # returned nothing for that id.
            data_null[col] = data_null['id'].map(predicted[col]).combine_first(data_null[col])

    data_null_still = data_null[data_null[label_cols].isnull().any(axis=1)]
    print('data_null_still: ', data_null_still.shape)

    update_data = pd.concat([data_null, data_full], axis=0)
    print('update_data: ', update_data.shape)

    # Write to result/<file name>, matching the post-July loop; the raw/ prefix is
    # dropped so the output does not require a result/raw/ directory.
    out_path = os.path.join('result', os.path.basename(filename))
    # The context manager saves and closes the workbook even if to_excel raises.
    with pd.ExcelWriter(out_path,
                        engine='xlsxwriter',
                        options={'strings_to_urls': False}) as writer:
        update_data.to_excel(writer, sheet_name='Sheet1', index=False)

raw/人寿 7月.xlsx   ----------------
总量： (8053, 12)
缺失值数量： (8028, 12)
无缺失值数量： (25, 12)
--  一、共 1次循环，第 1 次获取数据，开始...
(2000, 3)
cor elapsed_time:  39.00     tmp_data:  (2000, 3)


# 七月之后数据

In [5]:
# Workbooks covering August through October.
file_list_2 = [
    'raw/人寿  8-10月.xlsx',
    'raw/同业  8-10月.xlsx',
]

In [30]:
# For each post-July workbook: find rows missing the eight-category label
# ('八大分险类型') or the article tendency ('文章倾向性'), fetch title/content for
# those ids in batches of `chunksize`, score each batch with the correlation and
# tendency services, merge the results back by id, and write the completed
# workbook to result/.
for filename in file_list_2:
    print(filename, '  ----------------')
    raw_data = pd.read_excel(filename)
    label_cols = ['八大分险类型', '文章倾向性']
    is_missing = raw_data[label_cols].isnull().any(axis=1)
    data_null = raw_data[is_missing]
    data_full = raw_data[~is_missing]
    print('总量：', raw_data.shape)
    print('缺失值数量：', data_null.shape)
    print('无缺失值数量：', data_full.shape)

    id_list = tuple(data_null['id'].unique().tolist())
    print('id_list: ', len(id_list))

    chunksize = 100
    # Ceiling division. int(n / chunksize) + 1 adds an empty trailing batch whenever
    # n is an exact multiple of chunksize, which renders as the invalid SQL "in ()".
    loop = -(-len(id_list) // chunksize)
    # Accumulated incrementally (not collected and concatenated at the end) so the
    # partial results stay inspectable if the run is interrupted.
    title_content_com = pd.DataFrame()
    for i in range(loop):
        start, stop = i * chunksize, (i + 1) * chunksize
        print('id_list_sel: ', start, stop)
        id_list_sel = id_list[start:stop]
        # Render the IN (...) list by hand: formatting a 1-element tuple gives
        # "(x,)", which is invalid SQL.
        id_sql = '({0})'.format(', '.join(repr(doc_id) for doc_id in id_list_sel))

        sql_count = ("select count(t1.id) "
                     "from db_docinfo t1 "
                     "where t1.id in {0}").format(id_sql)
        count = pd.read_sql_query(sql_count, engine)
        print('count: ', list(count.values)[0])

        sql_title = ("select t1.id, t1.title "
                     "from db_docinfo t1 "
                     "where t1.id in {0}").format(id_sql)

        sql_content = ("select t1.id, t2.text as content "
                       "from db_docinfo t1, db_docinfo_text t2 "
                       "where t1.urlhash = t2.urlhash "
                       "and t1.id in {0}").format(id_sql)

        title_id = pd.read_sql_query(sql_title, engine)
        content_id = pd.read_sql_query(sql_content, engine)
        # Left join: keep every title even when no content row matches.
        title_content = pd.merge(title_id, content_id, on='id', how='left')

        # Same request body for both services: (id, title, content) records.
        payload = {"types":3, "record":title_content.iloc[:,[0, 1, 2]].to_dict(orient = 'records')}
        body = json.dumps(payload)

        # Correlation (eight-category) model
        result = requests.post(url_cor, data=body,
                               headers=headers, allow_redirects=True)
        json_data = json.loads(result.text)
        cor_elapsed_time = json_data['elapsed_time']
        print('cor elapsed_time: ', cor_elapsed_time)
        cor_list = pd.DataFrame([[j['cor'], j['id']] for j in json_data['docs']],
                                columns=['八大分险类型', 'id'])

        # Tendency model: try the whole batch first; if that fails, retry one
        # record at a time so a single bad record does not lose the batch.
        try:
            result = requests.post(url_tend, data=body,
                                   headers=headers, allow_redirects=True)
            json_data = json.loads(result.text)
            tend_elapsed_time = json_data['elapsed_time']
            print('tend elapsed_time: ', tend_elapsed_time)
            tendency_list = [[j['tendency'], j['id']] for j in json_data['docs']]
        except Exception as e:
            print('error: ', e)
            tendency_list = []
            for record in payload['record']:
                data_sel = {"types":3, "record":[record]}
                try:
                    result = requests.post(url_tend, data=json.dumps(data_sel),
                                           headers=headers, allow_redirects=True)
                    json_data = json.loads(result.text)
                    doc = json_data['docs'][0]
                    tendency_list.append([doc['tendency'], doc['id']])
                except Exception as e1:
                    print('error again...    ', e1)
                    print(record)
                    # NOTE(review): best-effort placeholder; 0 is indistinguishable
                    # from a tendency of 0 returned by the service.
                    tendency_list.append([0, record['id']])

        tendency_list = pd.DataFrame(tendency_list, columns=['文章倾向性', 'id'])

        cor_tend = pd.merge(cor_list, tendency_list, on='id', how='inner')
        title_content_com = pd.concat([title_content_com, cor_tend], axis=0)
        print('    %s  title_id: '%i, title_id.shape)
        print('    %s  content_id: '%i, content_id.shape)
        print('    %s  title_content: '%i, title_content.shape)
        print('    %s  title_content_com: '%i, title_content_com.shape)

    print('data_null: ', data_null.shape)
    if not title_content_com.empty:
        # One prediction per id: duplicate ids (possible when the urlhash join
        # matches more than one text row) would multiply rows in the merge below.
        title_content_com = title_content_com.drop_duplicates(subset='id')
        title_content_com.index = range(title_content_com.shape[0])
        data_null = data_null.drop(label_cols, axis=1)
        data_null = pd.merge(data_null, title_content_com, on='id', how='left')
    # Row count should equal the 'data_null' row count printed just above.
    print('combined_data: ', data_null.shape)
    data_null_still = data_null[data_null[label_cols].isnull().any(axis=1)]
    print('data_null_still: ', data_null_still.shape)

    update_data = pd.concat([data_null, data_full], axis=0)
    print('update_data: ', update_data.shape)

    # The context manager saves and closes the workbook even if to_excel raises.
    with pd.ExcelWriter('result/{0}'.format(filename.split('/')[1]),
                        engine='xlsxwriter',
                        options={'strings_to_urls': False}) as writer:
        update_data.to_excel(writer, sheet_name='Sheet1', index=False)

raw/人寿  8-10月.xlsx   ----------------
总量： (16316, 12)
缺失值数量： (8598, 12)
无缺失值数量： (7718, 12)
id_list:  7472
id_list_sel:  0 100
count:  [100]
cor elapsed_time:  1.00
tend elapsed_time:  10.00
    0  title_id:  (100, 2)
    0  content_id:  (100, 2)
    0  title_content:  (100, 3)
    0  title_content_com:  (100, 3)
id_list_sel:  100 200
count:  [100]
cor elapsed_time:  1.00
error:  Expecting value: line 1 column 1 (char 0)
error again...     Expecting value: line 1 column 1 (char 0)
{'id': 63620278, 'title': '反洗钱不到位 中国人寿被处罚70万元', 'content': nan}
    1  title_id:  (100, 2)
    1  content_id:  (99, 2)
    1  title_content:  (100, 3)
    1  title_content_com:  (200, 3)
id_list_sel:  200 300
count:  [100]
cor elapsed_time:  2.00
tend elapsed_time:  15.00
    2  title_id:  (100, 2)
    2  content_id:  (100, 2)
    2  title_content:  (100, 3)
    2  title_content_com:  (300, 3)
id_list_sel:  300 400
count:  [100]
cor elapsed_time:  3.00
tend elapsed_time:  15.00
    3  title_id:  (100, 2)
    3

cor elapsed_time:  3.00
tend elapsed_time:  18.00
    37  title_id:  (100, 2)
    37  content_id:  (100, 2)
    37  title_content:  (100, 3)
    37  title_content_com:  (3800, 3)
id_list_sel:  3800 3900
count:  [100]
cor elapsed_time:  4.00
tend elapsed_time:  32.00
    38  title_id:  (100, 2)
    38  content_id:  (100, 2)
    38  title_content:  (100, 3)
    38  title_content_com:  (3900, 3)
id_list_sel:  3900 4000
count:  [100]
cor elapsed_time:  4.00
tend elapsed_time:  49.00
    39  title_id:  (100, 2)
    39  content_id:  (100, 2)
    39  title_content:  (100, 3)
    39  title_content_com:  (4000, 3)
id_list_sel:  4000 4100
count:  [100]
cor elapsed_time:  4.00
tend elapsed_time:  28.00
    40  title_id:  (100, 2)
    40  content_id:  (100, 2)
    40  title_content:  (100, 3)
    40  title_content_com:  (4100, 3)
id_list_sel:  4100 4200
count:  [100]
cor elapsed_time:  3.00
tend elapsed_time:  26.00
    41  title_id:  (100, 2)
    41  content_id:  (100, 2)
    41  title_content:  

raw/同业  8-10月.xlsx   ----------------
总量： (69244, 12)
缺失值数量： (16560, 12)
无缺失值数量： (52684, 12)
id_list:  6070
id_list_sel:  0 100
count:  [100]
cor elapsed_time:  5.00
tend elapsed_time:  27.00
    0  title_id:  (100, 2)
    0  content_id:  (100, 2)
    0  title_content:  (100, 3)
    0  title_content_com:  (100, 3)
id_list_sel:  100 200
count:  [100]
cor elapsed_time:  3.00
tend elapsed_time:  23.00
    1  title_id:  (100, 2)
    1  content_id:  (100, 2)
    1  title_content:  (100, 3)
    1  title_content_com:  (200, 3)
id_list_sel:  200 300
count:  [100]
cor elapsed_time:  3.00
tend elapsed_time:  75.00
    2  title_id:  (100, 2)
    2  content_id:  (100, 2)
    2  title_content:  (100, 3)
    2  title_content_com:  (300, 3)
id_list_sel:  300 400
count:  [100]
cor elapsed_time:  3.00
tend elapsed_time:  23.00
    3  title_id:  (100, 2)
    3  content_id:  (100, 2)
    3  title_content:  (100, 3)
    3  title_content_com:  (400, 3)
id_list_sel:  400 500
count:  [100]
cor elapsed_time: 

KeyboardInterrupt: 

In [31]:
# Peek at the model results accumulated so far; relies on `title_content_com`
# left in the kernel by the post-July loop cell.
title_content_com.head()

Unnamed: 0,八大分险类型,id,文章倾向性
0,2,62673426,0
1,2,62822330,0
2,2,62848651,0
3,2,62934961,0
4,8,63068523,0


In [32]:
# Peek at the rows still missing labels; relies on `data_null` left in the kernel
# by the post-July loop cell.
data_null.head()

Unnamed: 0,标题,url,网站,八大分险类型,发布时间,渠道,转发量,文章倾向性,机构,机构倾向性,风险等级,id
3,权重股有护盘力量出现,http://blog.sina.com.cn/s/blog_9c1ddfd40102xjqr.html,新浪博客,,2018-07-19 11:29:36,博客渠道,3.0,,平安,非负,相关,107986117
4,权重股有护盘力量出现,http://blog.sina.cn/dpool/blog/s/blog_9c1ddfd40102xjqr.html?vt=4&cid=95643,手机新浪网,,2018-07-19 11:29:00,博客渠道,3.0,,平安,非负,相关,107986116
5,超跌股表现活跃,http://blog.sina.com.cn/s/blog_9c1ddfd40102xjrq.html,新浪博客,,2018-07-19 15:04:36,博客渠道,3.0,,平安,非负,相关,107969961
6,权重股有护盘力量出现,http://blog.sina.com.cn/s/blog_9c1ddfd40102xjqr.html?tj=fina,新浪网,,2018-07-19 11:29:36,博客渠道,3.0,,平安,非负,相关,107974691
7,超跌股表现活跃,http://blog.sina.com.cn/s/blog_9c1ddfd40102xjrq.html?tj=fina,新浪博客,,2018-07-19 15:04:36,博客渠道,3.0,,平安,非负,相关,107990118


In [None]:
    # Repeats the tail of the post-July loop on the variables left in the kernel
    # (`title_content_com`, `data_null`, `data_full`) — apparently to finish a file
    # by hand after the loop was interrupted.
    title_content_com.index = range(title_content_com.shape[0])
    data_null = data_null.drop(['八大分险类型', '文章倾向性'], axis = 1)
    print('data_null: ', data_null.shape)
    data_null = pd.merge(data_null, title_content_com, on = 'id', how = 'left')
    print('combined_data: ', data_null.shape)
    data_null_still = data_null[data_null['八大分险类型'].isnull() | data_null['文章倾向性'].isnull()]
    print('data_null_still: ', data_null_still.shape)

    update_data = pd.concat([data_null, data_full], axis = 0)
    # Translate numeric category ids to names. Series.map leaves NaN for missing or
    # unmapped values, where indexing class_name_dict directly raises KeyError
    # (rows the left merge could not fill are NaN). The identical assignment that
    # followed in the draft was redundant and has been removed.
    # TODO(review): the target column name is empty in the draft — choose the
    # intended name before this frame is written out.
    update_data[''] = update_data['八大分险类型'].map(class_name_dict)
    print('update_data: ', update_data.shape)

In [33]:
    # Write the completed frame to result/<file name>. `filename` and `update_data`
    # come from earlier cells. The context manager saves and closes the workbook
    # even if to_excel raises, so no explicit save() call is needed.
    with pd.ExcelWriter('result/{0}'.format(filename.split('/')[1]),
                        engine='xlsxwriter',
                        options={'strings_to_urls': False}) as writer:
        update_data.to_excel(writer, sheet_name='Sheet1', index = False)