# 基本设置

In [1]:
import datetime
import os

from sqlalchemy import create_engine
from pandas.io import sql
import pymysql

import pandas as pd

In [2]:
def set_ch():
    '''
    功能：设定绘图时显示中文
    '''	
    from pylab import mpl
    mpl.rcParams['font.sans-serif'] = ['FangSong'] # 指定默认字体
    mpl.rcParams['axes.unicode_minus'] = False   # 解决保存图像是负号'-'显示为方块的问题
set_ch()

In [17]:
def get_server_res(data, url):
    '''
    服务器接口测试程序
    传入 dict, 传出 DataFrame
    '''
    # data = {'record':[{'id':0,'title':'ss','content':'zzz'},]}
    # data = {"record":marked_human_data.iloc[:5,:3].to_dict(orient = 'records')}
    # url "http://47.93.77.19:10000/correlation_negative"
    headers={'content-type':'application/json'}
    result = requests.post(url,
                      data = json.dumps(data),
                      headers=headers, allow_redirects=True)
    # print(result.text)
    json_data = json.loads(result.text)
    parse_data = []
    elapsed_time = json_data['elapsed_time']
    for i in range(len(json_data['docs'])):
        parse_data.append([json_data['docs'][i]['id'],
                          json_data['docs'][i]['jaccard'],
                          json_data['docs'][i]['repeated'],
                          json_data['docs'][i]['repeated_id']])
    parse_data = pd.DataFrame(parse_data, columns = ['id', 'jaccard','repeated','repeated_id'])    
    return parse_data, elapsed_time

## 类别和来源

In [7]:
label_dic={'补录':0,'监管':1,'行业':2,'产品销售':3,'资本市场':4,'公司内部管理':5,'消费服务':6,'其他相关报道':7,'噪音':8}
class_name_dict = {v: k for k, v in label_dic.items()}
class_name_dict

{0: '补录',
 1: '监管',
 2: '行业',
 3: '产品销售',
 4: '资本市场',
 5: '公司内部管理',
 6: '消费服务',
 7: '其他相关报道',
 8: '噪音'}

In [8]:
group = '1-新闻，2-论坛，3-博客，4-微博，5-纸媒，6-视频，7-外媒，11-微信，13-新闻客户端，15-推特'
group_dict = dict([x.split('-') for x in group.split('，')])
group_dict

{'1': '新闻',
 '11': '微信',
 '13': '新闻客户端',
 '15': '推特',
 '2': '论坛',
 '3': '博客',
 '4': '微博',
 '5': '纸媒',
 '6': '视频',
 '7': '外媒'}

# 获取数据
- wise_web_docinfo （业务表）
- wise_web_docinfo_center  相关数据表对应对应正文

In [4]:
try :
    DB_CON_STR = 'mysql+pymysql://wisedb:Wi$eWeb123@10.80.88.73:5718/pom?charset=utf8'  
    engine = create_engine(DB_CON_STR, echo=False) 
    sql.execute('show databases', engine)
except :
    DB_CON_STR = 'mysql+pymysql://wisedb:Wi$eWeb123@47.95.148.133:5718/pom?charset=utf8'  
    engine = create_engine(DB_CON_STR, echo=False) 
    sql.execute('show databases', engine) 

In [5]:
day_select = '2018-08-24'
limit_num = 100

In [16]:
sql_one_day = "select t1.group_id,t1.classify,\
                    t1.id, t1.title,t2.center as content, t1.publishtime as publishtime \
                    from wise_web_docinfo t1, wise_web_docinfo_center t2 \
                        where t1.id=t2.doc_id \
                              and date_format(t1.publishtime, '%%Y-%%m-%%d') = '{0}' \
                              group by t1.titlehash \
                              limit {1}".format(day_select, limit_num)

# titlehash 去重后
circ_cor = pd.read_sql(sql_one_day, engine)
circ_cor['group_id'] = circ_cor['group_id'].apply(lambda x: group_dict[str(x)])
circ_cor['classify'] = circ_cor['classify'].apply(lambda x:class_name_dict[x])
circ_cor['publishtime'] = circ_cor['publishtime'].apply(lambda x: x.strftime("%Y-%m-%d %H-%M-%S"))
print(circ_cor.shape  )
circ_cor.head()

(100, 6)


Unnamed: 0,group_id,classify,id,title,content,publishtime
0,新闻,资本市场,10311602,8月24日A股全天行业、概念资金动向,8月24日A股全天行业、概念资金动向来源:抓取2018/08/24 18:12:35 ...,2018-08-24 18-12-35
1,新闻,资本市场,10313535,定期财报标题：NEWTREEGROUP：有关截至二零一八年三月三十一日止年度之年度业绩公布及...,扫一扫，慧博手机终端下载！ (图片)(图片)(图片)中文研报 |--宏观经济 |--投资...,2018-08-24 18-06-00
2,微博,消费服务,10348821,发布了头条文章：《核保理赔人员如何对待先天性疾病被保险人？》 http://t.cn/Rk...,发布了头条文章：《核保理赔人员如何对待先天性疾病被保险人？》 http://t.cn/Rk...,2018-08-24 13-16-55
3,新闻,监管,10300401,银行业重磅！中资银行和AMC外资持股比例限制取消,银行业重磅！中资银行和AMC外资持股比例限制取消(图片)2018-08-24来源：券商中国摘...,2018-08-24 00-00-00
4,微信,产品销售,10329008,【邮·保障】邮政简易险，小保单，大保障！,没有人希望出现意外，但是风险的确无处不在 没有人害怕生活，没有人害怕过日子。 因为我们有手有...,2018-08-24 08-37-48


# 测试函数

In [13]:
import line_profiler
import sys

import requests,json

In [18]:
prof = line_profiler.LineProfiler(get_server_res)
prof.enable()  # 开始性能分析

data = {"record":circ_cor.loc[:,['id', 'title' ,'content', 'publishtime']].to_dict(orient = 'records')}
url = "http://192.168.0.104:11000/decide_similarity_i"
# url = "http://47.93.77.19:10000/decide_similarity_i"
parse_data, elapsed_time = get_server_res(data, url)

prof.disable()  # 停止性能分析
prof.print_stats(sys.stdout)

Timer unit: 3.3108e-07 s

Total time: 55.796 s
File: <ipython-input-17-22e2e9867877>
Function: get_server_res at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def get_server_res(data, url):
     2                                               '''
     3                                               服务器接口测试程序
     4                                               传入 dict, 传出 DataFrame
     5                                               '''
     6                                               # data = {'record':[{'id':0,'title':'ss','content':'zzz'},]}
     7                                               # data = {"record":marked_human_data.iloc[:5,:3].to_dict(orient = 'records')}
     8                                               # url "http://47.93.77.19:10000/correlation_negative"
     9         1          9.0      9.0      0.0      headers={'content-type':'application/json'}
    10         1          7.0      