- 中国人寿及其同业数据，补充八分类和倾向性结果
> - 七月及以前数据：db_docinfo_backup、db_docinfo_text_backup
> - 七月之后数据：db_docinfo_trade、db_docinfo_text

# 基本设置

In [1]:
import numpy as np
import pandas as pd
import os
import datetime

import requests,json
from sklearn.externals import joblib

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
from toolkits.setup.date_time import get_day_list
from toolkits.setup import specific_func
# Project-local setup helper, called once for its side effects.
# NOTE(review): the name suggests it configures Chinese-text display for
# pandas output — confirm against toolkits.setup.specific_func.
specific_func.set_ch_pd()

# 连接数据库

In [3]:
# Connection object obtained from the project helper; later cells pass it as
# the connection argument to pd.read_sql_query.
# NOTE(review): 'cbirc' is presumably a named connection profile — verify in
# toolkits.setup.specific_func.
engine = specific_func.get_engine('cbirc')

In [4]:
# Category name -> numeric code used in the '八大分险类型' column.
label_dic = {
    '监管': 1,
    '行业': 2,
    '产品销售': 3,
    '资本市场': 4,
    '公司内部管理': 5,
    '消费服务': 6,
    '其他相关报道': 7,
    '噪音': 8,
}
# Reverse lookup (numeric code -> category name), in the same order.
class_name_dict = dict(zip(label_dic.values(), label_dic.keys()))
class_name_dict

{1: '监管',
 2: '行业',
 3: '产品销售',
 4: '资本市场',
 5: '公司内部管理',
 6: '消费服务',
 7: '其他相关报道',
 8: '噪音'}

# 七月及以前数据

In [5]:
# Host and port shared by all model-service endpoints.
ip_port = '47.93.183.157:10000'

# Requests are posted with JSON bodies.
headers = {'content-type': 'application/json'}

# Endpoint URLs: correlation, tendency and early-warning, in that order.
url_cor, url_tend, url_warn = (
    template % ip_port
    for template in (
        "http://%s/judge_correlation_i",
        "http://%s/tendency_analysis_i",
        "http://%s/early_warning_i",
    )
)

# 七月之后数据

In [6]:
# Load the workbook, split it into rows that still lack a category or tendency
# label and rows that are complete, then fetch title and text from the
# database for the ids of the incomplete rows.
filename = 'raw/同业  8-10月.xlsx'
print(filename, '  ----------------')
data = pd.read_excel(filename)

# A row counts as missing when either label column is null.
missing_mask = data['八大分险类型'].isnull() | data['文章倾向性'].isnull()
data_null = data[missing_mask]
data_full = data[~missing_mask]
print('总量：', data.shape)
print('缺失值数量：', data_null.shape)
print('无缺失值数量：', data_full.shape)

id_list = tuple(data_null['id'].unique().tolist())
print('id_list: ', len(id_list))

# Render the IN (...) list explicitly instead of relying on str(tuple):
# a one-element tuple prints as "(x,)" and an empty one as "()", both of
# which are invalid SQL. For two or more ids the text is identical to the
# tuple's own repr. An empty list becomes "(NULL)", which matches no row.
# NOTE: ids are interpolated into the statement text, not bound as
# parameters; they come from the workbook loaded above.
id_clause = '({0})'.format(', '.join(map(repr, id_list)) or 'NULL')

sql_count = "select count(t1.id) \
                    from db_docinfo t1 \
                        where t1.id in {0}".format(id_clause)
count = pd.read_sql_query(sql_count, engine)
print('count: ', list(count.values)[0])

sql_title = "select t1.id, t1.title \
                    from db_docinfo t1 \
                        where t1.id in {0}".format(id_clause)

sql_content = "select t1.id, t2.text as content \
                    from db_docinfo t1, db_docinfo_text t2 \
                        where t1.urlhash = t2.urlhash \
                            and t1.id in {0}".format(id_clause)

title_id = pd.read_sql_query(sql_title, engine)
content_id = pd.read_sql_query(sql_content, engine)
# Left merge keeps every title row even when no text row matched its urlhash.
title_content = pd.merge(title_id, content_id, on='id', how='left')
print('title_content: ', title_content.shape)

raw/同业  8-10月.xlsx   ----------------
总量： (69244, 12)
缺失值数量： (16560, 12)
无缺失值数量： (52684, 12)
id_list:  6070
count:  [6070]
title_content:  (6070, 3)


NameError: name 'i' is not defined

In [7]:
# Send title/content records to the correlation and tendency services in
# chunks, merge the two predictions per id, and write each chunk's result to
# res/cor_tend_<chunk>.xlsx so an interrupted run can be resumed.
chunksize = 100
loop = int(len(id_list) / chunksize) + 1
title_content_com = pd.DataFrame()

# Manual resume controls for this run: chunks below `first_chunk` are skipped
# (their result files are read back from disk by a later cell), and chunks in
# `skip_tendency_chunks` bypass the tendency service and get tendency 0.
# NOTE(review): the reason chunk 58 is excluded is not recorded in this
# notebook — confirm before reusing these values.
first_chunk = 57
skip_tendency_chunks = {58}

for i in range(loop):
    start, stop = i * chunksize, (i + 1) * chunksize
    print('id_list_sel: ', start, stop)

    if i < first_chunk:
        continue

    # Build the request only for chunks that are actually sent. The payload
    # gets its own name so the DataFrame `data` loaded earlier is not clobbered.
    records = title_content.iloc[start:stop, [0, 1, 2]].to_dict(orient='records')
    payload = {"types": 3, "record": records}
    body = json.dumps(payload)

    # Correlation model (fills the '八大分险类型' column).
    result = requests.post(url_cor, data=body,
                           headers=headers, allow_redirects=True)
    json_data = json.loads(result.text)
    cor_elapsed_time = json_data['elapsed_time']
    print('cor elapsed_time: ', cor_elapsed_time)
    cor_list = pd.DataFrame([[doc['cor'], doc['id']] for doc in json_data['docs']],
                            columns=['八大分险类型', 'id'])

    # Tendency model (fills the '文章倾向性' column).
    if i in skip_tendency_chunks:
        tendency_list = [[0, rec['id']] for rec in records]
    else:
        try:
            result = requests.post(url_tend, data=body,
                                   headers=headers, allow_redirects=True)
            json_data = json.loads(result.text)
            tendency_elapsed_time = json_data['elapsed_time']
            tend_elapsed_time = tendency_elapsed_time
            print('tend elapsed_time: ', tend_elapsed_time)
            tendency_list = [[doc['tendency'], doc['id']] for doc in json_data['docs']]
        except Exception as e:
            # Deliberate best-effort: if the batch call fails for any reason,
            # retry record by record so one bad record does not lose the chunk.
            print('error: ', e)
            tendency_list = []
            for rec in records:
                try:
                    result = requests.post(url_tend,
                                           data=json.dumps({"types": 3, "record": [rec]}),
                                           headers=headers, allow_redirects=True)
                    doc = json.loads(result.text)['docs'][0]
                    tendency_list.append([doc['tendency'], doc['id']])
                except Exception as e1:
                    # A record that still fails is logged and given tendency 0.
                    print('error again...    ', e1)
                    print(rec)
                    tendency_list.append([0, rec['id']])

    tendency_list = pd.DataFrame(tendency_list, columns=['文章倾向性', 'id'])

    # Keep only ids for which both models produced a result.
    cor_tend = pd.merge(cor_list, tendency_list, on='id', how='inner')
    title_content_com = pd.concat([title_content_com, cor_tend], axis=0)
    print('    %s  title_id: ' % i, title_id.shape)
    print('    %s  content_id: ' % i, content_id.shape)
    print('    %s  title_content: ' % i, title_content.shape)
    print('    %s  title_content_com: ' % i, title_content_com.shape)
    cor_tend.to_excel('res/cor_tend_%s.xlsx' % i, index=False)

id_list_sel:  0 100
id_list_sel:  100 200
id_list_sel:  200 300
id_list_sel:  300 400
id_list_sel:  400 500
id_list_sel:  500 600
id_list_sel:  600 700
id_list_sel:  700 800
id_list_sel:  800 900
id_list_sel:  900 1000
id_list_sel:  1000 1100
id_list_sel:  1100 1200
id_list_sel:  1200 1300
id_list_sel:  1300 1400
id_list_sel:  1400 1500
id_list_sel:  1500 1600
id_list_sel:  1600 1700
id_list_sel:  1700 1800
id_list_sel:  1800 1900
id_list_sel:  1900 2000
id_list_sel:  2000 2100
id_list_sel:  2100 2200
id_list_sel:  2200 2300
id_list_sel:  2300 2400
id_list_sel:  2400 2500
id_list_sel:  2500 2600
id_list_sel:  2600 2700
id_list_sel:  2700 2800
id_list_sel:  2800 2900
id_list_sel:  2900 3000
id_list_sel:  3000 3100
id_list_sel:  3100 3200
id_list_sel:  3200 3300
id_list_sel:  3300 3400
id_list_sel:  3400 3500
id_list_sel:  3500 3600
id_list_sel:  3600 3700
id_list_sel:  3700 3800
id_list_sel:  3800 3900
id_list_sel:  3900 4000
id_list_sel:  4000 4100
id_list_sel:  4100 4200
id_list_sel: 

In [9]:
# Rebuild the combined prediction table from the per-chunk result files,
# printing the cumulative shape after each file is appended.
chunksize = 100
loop = len(id_list) // chunksize + 1
chunk_files = ['res/cor_tend_%s.xlsx' % idx for idx in range(loop)]

title_content_com = pd.DataFrame()
for i, chunk_path in enumerate(chunk_files):
    cor_tend = pd.read_excel(chunk_path)
    title_content_com = pd.concat([title_content_com, cor_tend], axis=0)
    print('    %s  title_content_com: ' % i, title_content_com.shape)

print('title_content_com: ', title_content_com.shape)

    0  title_content_com:  (100, 3)
    1  title_content_com:  (200, 3)
    2  title_content_com:  (300, 3)
    3  title_content_com:  (400, 3)
    4  title_content_com:  (500, 3)
    5  title_content_com:  (600, 3)
    6  title_content_com:  (700, 3)
    7  title_content_com:  (800, 3)
    8  title_content_com:  (900, 3)
    9  title_content_com:  (1000, 3)
    10  title_content_com:  (1100, 3)
    11  title_content_com:  (1200, 3)
    12  title_content_com:  (1300, 3)
    13  title_content_com:  (1400, 3)
    14  title_content_com:  (1500, 3)
    15  title_content_com:  (1600, 3)
    16  title_content_com:  (1700, 3)
    17  title_content_com:  (1800, 3)
    18  title_content_com:  (1900, 3)
    19  title_content_com:  (2000, 3)
    20  title_content_com:  (2100, 3)
    21  title_content_com:  (2200, 3)
    22  title_content_com:  (2300, 3)
    23  title_content_com:  (2400, 3)
    24  title_content_com:  (2500, 3)
    25  title_content_com:  (2600, 3)
    26  title_content_com:  (27

In [None]:
# Attach the model predictions to the rows that lacked labels, convert the
# numeric predictions to their text labels, recombine with the rows that were
# already complete, and write the result workbook.
title_content_com.index = range(title_content_com.shape[0])
data_null = data_null.drop(['八大分险类型', '文章倾向性'], axis=1)
print('data_null: ', data_null.shape)
data_null = pd.merge(data_null, title_content_com, on='id', how='left')
print('combined_data: ', data_null.shape)
data_null_still = data_null[data_null['八大分险类型'].isnull() | data_null['文章倾向性'].isnull()]
print('data_null_still: ', data_null_still.shape)

# na_action='ignore' leaves rows without a prediction as NaN. Previously a NaN
# category raised KeyError, and a NaN tendency would have been labelled '负面'.
# An unknown non-null category code still raises KeyError.
data_null['八大分险类型'] = data_null['八大分险类型'].map(
    lambda code: class_name_dict[code], na_action='ignore')
data_null['文章倾向性'] = data_null['文章倾向性'].map(
    lambda x: '非负' if x == 0 else '负面', na_action='ignore')

update_data = pd.concat([data_null, data_full], axis=0)
print('update_data: ', update_data.shape)

# The context manager saves and closes the workbook even if to_excel raises.
# basename() also handles input paths nested more than one directory deep.
# NOTE(review): newer pandas releases replace the `options` keyword with
# `engine_kwargs={'options': {...}}`; it is kept here to match the pandas
# version this notebook was written for — confirm before upgrading.
with pd.ExcelWriter('result/{0}'.format(os.path.basename(filename)),
                    engine='xlsxwriter',
                    options={'strings_to_urls': False}) as writer:
    update_data.to_excel(writer, sheet_name='Sheet1', index=False)