- 保监会 相关性模型 1 预处理

# 基本设置

In [1]:
import jieba
import sys
import re
import time
import string

%matplotlib inline
import numpy as np
import pandas as pd
import pre
import os
from sqlalchemy import create_engine
from pandas.io import sql

import warnings
warnings.filterwarnings('ignore')

In [2]:
def set_ch():
    '''
    Configure matplotlib so that Chinese text renders correctly in plots.

    Sets the default sans-serif font to FangSong and disables the Unicode
    minus glyph, which otherwise shows up as an empty box for negative
    numbers in saved figures.
    '''
    # Import matplotlib directly rather than through the discouraged
    # ``pylab`` namespace; ``pylab.mpl`` is this very module.
    import matplotlib as mpl
    mpl.rcParams['font.sans-serif'] = ['FangSong']  # default font family
    mpl.rcParams['axes.unicode_minus'] = False  # keep '-' from rendering as a box
set_ch()

# 导入数据

In [4]:
folder = '20180808' # sub-folder under data/ that holds the input workbook

In [12]:
# Relative path of the workbook inside the selected data folder.
file_name = '{0}/mfa_cor_english_2.xlsx'.format(folder)
print(file_name)

# Read the sheet and give its three columns fixed names.
cor_1 = pd.read_excel('data/%s'%file_name)
cor_1.columns = ['title', 'sensitivity', 'content']

# Add the combined text column and collapse the label to binary:
# anything that is not exactly 1 becomes 0.
cor_1 = cor_1.assign(
    title_content=cor_1['title'] + '. ' + cor_1['content'],
    sensitivity=cor_1['sensitivity'].apply(lambda x: x if x == 1 else 0),
)

# Report the frame size before and after dropping rows with a repeated title.
print('去重前：', cor_1.shape)
cor_1 = cor_1.drop_duplicates(subset=['title'])
print('去重后：', cor_1.shape)
cor_1.head()

20180808/mfa_cor_english_2.xlsx
去重前： (6000, 4)
去重后： (5807, 4)


Unnamed: 0,title,sensitivity,content,title_content
0,"Despite US Warning, Turkey Has No Plans to Aba...",0,"Recai Berber, member of Turkish parliament fro...","Despite US Warning, Turkey Has No Plans to Aba..."
1,Moscow denounces ’anti-Russian’ US nuclear policy,0,MOSCOW (AFP) - Moscow on Saturday (Feb 3) deno...,Moscow denounces ’anti-Russian’ US nuclear pol...
2,Iran’s Rouhani raps US nuclear expansion plan ...,0,Dubai: Iran accused the United States on Sunda...,Iran’s Rouhani raps US nuclear expansion plan ...
3,Iranian President Hassan Rouhani raps new US n...,0,Iran accused the United States on Sunday of th...,Iranian President Hassan Rouhani raps new US n...
4,Iran’s Rouhani raps new U.S. nuclear plan as t...,0,DUBAI (Reuters) - Iran accused the United Stat...,Iran’s Rouhani raps new U.S. nuclear plan as t...


In [13]:
# Class balance of the binarised sensitivity label after title de-duplication.
cor_1['sensitivity'].value_counts()

0    2927
1    2880
Name: sensitivity, dtype: int64

## 外交部-敏感

In [14]:
# Rows whose label is 1 (sensitive).
is_sensitive = cor_1['sensitivity'] == 1
sen_1 = cor_1.loc[is_sensitive]
print(sen_1.shape)
sen_1.head()

(2880, 4)


Unnamed: 0,title,sensitivity,content,title_content
6,"Russia, Japan call for Korean Peninsula denucl...",1,"TOKYO, February 7. /TASS/. The common objectiv...","Russia, Japan call for Korean Peninsula denucl..."
7,UK’s May lays out measures against Russia,1,PRIME MINISTER THERESA MAY’S LIST OF ACTIONS A...,UK’s May lays out measures against Russia. PRI...
8,Russia receives list of 23 Russian diplomats e...,1,\n\nRead also\n\n \n\n\n\nUK to expel 23 Russi...,Russia receives list of 23 Russian diplomats e...
9,US ’stands in solidarity’ with UK on Russia,1,"The White House says it ""stands in solidarity""...",US ’stands in solidarity’ with UK on Russia. T...
10,Russia to expel UK diplomats as row over spy a...,1,\nMOSCOW/LONDON - Russia will soon expel Briti...,Russia to expel UK diplomats as row over spy a...


## 外交部-不敏感

In [15]:
# Rows whose label is 0 (not sensitive).
is_not_sensitive = cor_1['sensitivity'] == 0
unsen_1 = cor_1.loc[is_not_sensitive]
print(unsen_1.shape)
unsen_1.head()

(2927, 4)


Unnamed: 0,title,sensitivity,content,title_content
0,"Despite US Warning, Turkey Has No Plans to Aba...",0,"Recai Berber, member of Turkish parliament fro...","Despite US Warning, Turkey Has No Plans to Aba..."
1,Moscow denounces ’anti-Russian’ US nuclear policy,0,MOSCOW (AFP) - Moscow on Saturday (Feb 3) deno...,Moscow denounces ’anti-Russian’ US nuclear pol...
2,Iran’s Rouhani raps US nuclear expansion plan ...,0,Dubai: Iran accused the United States on Sunda...,Iran’s Rouhani raps US nuclear expansion plan ...
3,Iranian President Hassan Rouhani raps new US n...,0,Iran accused the United States on Sunday of th...,Iranian President Hassan Rouhani raps new US n...
4,Iran’s Rouhani raps new U.S. nuclear plan as t...,0,DUBAI (Reuters) - Iran accused the United Stat...,Iran’s Rouhani raps new U.S. nuclear plan as t...


## 合并数据

In [16]:
# Stack the sensitive rows on top of the non-sensitive ones.
circ_sen_raw = pd.concat([sen_1, unsen_1], axis=0)

# Rebuild the combined text column from string-cast title and content.
# NOTE(review): the separator here is the full-width '。' whereas the loading
# cell joined the same columns with '. ' -- confirm which one is intended.
title_text = circ_sen_raw['title'].astype(str)
content_text = circ_sen_raw['content'].astype(str)
circ_sen_raw['title_content'] = title_text + '。' + content_text

# Running row number, assigned before de-duplication.
circ_sen_raw['index'] = range(len(circ_sen_raw))

print('去重前： ', circ_sen_raw.shape)
circ_sen_raw = circ_sen_raw.drop_duplicates(subset=['title_content'])
print('去重后： ', circ_sen_raw.shape)
circ_sen_raw.head()

去重前：  (5807, 5)
去重后：  (5807, 5)


Unnamed: 0,title,sensitivity,content,title_content,index
6,"Russia, Japan call for Korean Peninsula denucl...",1,"TOKYO, February 7. /TASS/. The common objectiv...","Russia, Japan call for Korean Peninsula denucl...",0
7,UK’s May lays out measures against Russia,1,PRIME MINISTER THERESA MAY’S LIST OF ACTIONS A...,UK’s May lays out measures against Russia。PRIM...,1
8,Russia receives list of 23 Russian diplomats e...,1,\n\nRead also\n\n \n\n\n\nUK to expel 23 Russi...,Russia receives list of 23 Russian diplomats e...,2
9,US ’stands in solidarity’ with UK on Russia,1,"The White House says it ""stands in solidarity""...",US ’stands in solidarity’ with UK on Russia。Th...,3
10,Russia to expel UK diplomats as row over spy a...,1,\nMOSCOW/LONDON - Russia will soon expel Briti...,Russia to expel UK diplomats as row over spy a...,4


# 预处理数据

In [17]:
# Run every title through the project helper ``pre.handle_contents``
# (presumably text cleaning -- see the ``pre`` module) and report how many
# entries came back.
titles = pre.handle_contents(circ_sen_raw['title'].tolist())
print(len(titles))

# Write one processed title per line. The context manager closes the file
# even if a write fails; plain write mode suffices since nothing is read back.
save_filename = 'data/titles.txt'
with open(save_filename, "w", encoding='UTF-8') as fid:
    fid.writelines(data + '\n' for data in titles)

5807


In [18]:
# Run every article body through the project helper ``pre.handle_contents``
# (presumably text cleaning -- see the ``pre`` module), then show the count
# and the first two processed entries as a sanity check.
contents = pre.handle_contents(circ_sen_raw['content'].tolist())
print(len(contents))
print(contents[:2])

# Write one processed body per line. The context manager closes the file
# even if a write fails; plain write mode suffices since nothing is read back.
coprus_save_filename = 'data/contents.txt'
with open(coprus_save_filename, "w", encoding='UTF-8') as f:
    f.writelines(data + '\n' for data in contents)

5807
['tokyo february ta common objective japan russia term north korean issue denuclearization korean peninsula party continue cooperate closely including ensure compliance security council resolution stance reiterated kenji kanasugi directorgeneral japanese foreign ministry asian oceanian affair bureau represents country talk situation korean peninsula russian deputy foreign minister igor morgulov japanese foreign ministry reported wednesday meeting tokyo senior diplomat exchanged view situation north korea welcomed dialogue seoul pyongyang aimed pyeongchang game successful tuesday morgulov japanese counterpart takeo mori round consultation bilateral relation peace treaty russian foreign ministry talk focused issue joint economic activity southern kuril island', 'prime minister theresa list action russia expel russian diplomat identified undeclared intelligence officer week leave single biggest expulsion reflecting time russia acted britain freeze russian asset evidence threaten life

In [20]:
# Labels in the same row order as the merged frame.
label = circ_sen_raw['sensitivity'].tolist()
print(len(label))

# Write one label per line. The context manager closes the file even if a
# write fails; plain write mode suffices since nothing is read back.
coprus_save_filename = 'data/labels.txt'
with open(coprus_save_filename, "w", encoding='UTF-8') as f:
    f.writelines(str(data) + '\n' for data in label)

5807


In [22]:
# Save the merged frame (without the pandas index) as an Excel workbook.
# Create the target directory first so the export also works on a fresh
# checkout where data/result does not exist yet.
result_path = 'data/result/title_content_label.xlsx'
os.makedirs(os.path.dirname(result_path), exist_ok=True)
circ_sen_raw.to_excel(result_path, index = False)
circ_sen_raw.shape

(5807, 5)

# 保存本文件

In [70]:
if 0:  # disabled by default; change the condition to run the HTML export
    import datetime as dt

    def output_HTML(read_file, output_file):
        '''
        Convert a notebook file to a standalone HTML file.

        read_file   -- path of the source '.ipynb' notebook
        output_file -- path of the '.html' file to write (UTF-8)
        '''
        from nbconvert import HTMLExporter
        import nbformat
        exporter = HTMLExporter()
        output_notebook = nbformat.read(read_file, as_version=4)
        output, _ = exporter.from_notebook_node(output_notebook)
        # Context manager so the handle is flushed and closed deterministically.
        with open(output_file, 'w', encoding='utf-8') as html_out:
            html_out.write(output)

    html_file_folder = 'html_files'
    os.makedirs(html_file_folder, exist_ok=True)

    today = dt.datetime.now().strftime('%Y%m%d')
    current_file = 'circ_cor_model_1_pre.ipynb'
    # Build the path with os.path.join: the hard-coded backslash was an invalid
    # string escape and only acted as a directory separator on Windows.
    output_file = os.path.join(
        html_file_folder,
        '%s_%s.html' % (os.path.splitext(current_file)[0], today),
    )
    output_HTML(current_file, output_file)