- 保监会 相关性模型 1 预处理

# 基本设置

In [2]:
import jieba
import sys
import re
import time
import string

%matplotlib inline
import numpy as np
import pandas as pd
# import pre_cor
import os
from sqlalchemy import create_engine
from pandas.io import sql

import warnings
warnings.filterwarnings('ignore')

In [3]:
def set_ch():
    """Configure matplotlib so that Chinese text can be displayed in plots."""
    from pylab import mpl

    mpl.rcParams.update({
        # Default sans-serif font used for rendering.
        'font.sans-serif': ['FangSong'],
        # Avoid the minus sign '-' showing up as a box in saved figures.
        'axes.unicode_minus': False,
    })

set_ch()

In [23]:
def _write_lines(path, lines):
    """Write every item of *lines* to *path*, one per line, encoded as UTF-8.

    The parent folder is created when it does not exist yet, and the file is
    closed even if writing fails part-way through.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding='UTF-8') as fid:
        fid.writelines(str(line) + '\n' for line in lines)


def pre_save(pre_func, data, save_folder):
    """Preprocess the titles and contents of *data* and save the corpus files.

    Args:
        pre_func: callable that takes a list of raw texts and returns a list
            of preprocessed strings (one per input text).
        data: table-like object supporting ``data[column].tolist()`` for the
            columns ``'title'``, ``'content'`` and ``'label'``, plus ``.shape``
            and ``.to_excel(path, index=...)``.
        save_folder: output folder prefix; it is concatenated as-is with
            ``'corpus/...'``, so it should end with a path separator.

    Writes ``titles.txt``, ``contents.txt``, ``labels.txt`` and
    ``title_content_label.xlsx`` into ``<save_folder>corpus/``.
    """
    print(data.shape)
    print('save_folder: ', save_folder)

    titles = pre_func(data['title'].tolist())
    print('title num: ', len(titles))
    _write_lines(save_folder + 'corpus/titles.txt', titles)

    contents = pre_func(data['content'].tolist())
    print('content num: ', len(contents))
    # Show a sample only when there is one; an empty table must not crash.
    if contents:
        print(contents[0])
    _write_lines(save_folder + 'corpus/contents.txt', contents)

    label = data['label'].tolist()
    print('label num: ', len(label))
    _write_lines(save_folder + 'corpus/labels.txt', label)

    data.to_excel(save_folder + 'corpus/title_content_label.xlsx', index=False)
    

## 预处理设置

In [18]:
import nltk
from nltk.stem import WordNetLemmatizer
from string import digits
import re

# Stop-word lookup table: a dict keyed by stop word (the value is always 1),
# filled from a UTF-8 text file that holds one stop word per line.
stopwords = {}
# The context manager releases the file handle even if reading fails.
with open("corpus/stopwords.txt", encoding='UTF-8') as stw:
    for ws in stw:
        # Drop line terminators so the key is the bare word.
        stopwords[ws.replace("\n", "").replace("\r", "")] = 1

In [19]:
def handle_contents(l_contents):
    """Preprocess every document in *l_contents* with ``handle_content``.

    Returns a list holding one preprocessed string per input item, in the
    same order as the input.
    """
    return [handle_content(document) for document in l_contents]

In [20]:
def handle_content(content):
    """Turn one raw document into a space-separated string of cleaned tokens.

    The input is converted with ``str()``; if it is empty or whitespace-only
    the result is ``""``. Otherwise the text goes through:

    1. ``clean_sent`` on the whole text,
    2. sentence splitting with ``nltk.sent_tokenize``,
    3. ``clean_sent`` on every sentence,
    4. word tokenisation with ``nltk.word_tokenize`` followed by
       ``clean_word``; only tokens longer than 3 characters are kept,
    5. lower-casing and removal of tokens present in ``stopwords``,
    6. lemmatisation with ``WordNetLemmatizer``.
    """
    content = str(content)
    if content.strip() == "":
        return ""

    # Steps 1-3: clean the text, split it into sentences, clean each sentence.
    cleaned_text = clean_sent(content)
    sentences = [clean_sent(piece) for piece in nltk.sent_tokenize(cleaned_text)]

    # Step 4: tokenise and keep only tokens that are still longer than
    # 3 characters once clean_word has been applied.
    tokens = []
    for sentence in sentences:
        for token in nltk.word_tokenize(sentence):
            stripped = clean_word(token)
            if len(stripped) > 3:
                tokens.append(stripped)

    # Step 5: lower-case, then drop stop words.
    lowered = [token.lower() for token in tokens]
    kept = [token for token in lowered if token not in stopwords]

    # Step 6: lemmatise and join.
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(token) for token in kept)

In [21]:
# Line breaks, tabs and non-breaking spaces are each flattened to one space.
_WHITESPACE_TO_SPACE = str.maketrans({"\n": " ", "\r": " ", "\t": " ", "\xa0": " "})
# Full-width punctuation mapped to its ASCII counterpart.
_FULLWIDTH_TO_ASCII = str.maketrans({"，": ",", "。": ".", "！": "!", "？": "?"})

# Patterns are raw strings (so ``\s`` is not an invalid string escape) and are
# compiled once instead of on every call.
_SLASH_AT_RE = re.compile(r"//@(.*?)[:\s]")    # "//@name" up to the next ':' or whitespace
_AT_RE = re.compile(r"@(.*?)[:\s]")            # "@name" up to the next ':' or whitespace
_BRACKETED_RE = re.compile(r"\[[^\[\]]*?\]")   # "[...]" without nested brackets
_SLASHES_RE = re.compile(r"//(.*?)[:\s]")      # "//text" up to the next ':' or whitespace


def clean_sent(sent):
    """Normalise whitespace and punctuation in *sent* and strip markup-like spans.

    Args:
        sent: the text to clean (a ``str``).

    Returns:
        The cleaned text. Each of ``\\n``, ``\\r``, ``\\t`` and ``\\xa0`` becomes
        one space (so ``\\r\\n`` becomes two spaces); ``//@...``, ``@...`` and
        ``//...`` spans ending at the next colon or whitespace are removed
        together with that terminator; ``[...]`` spans are removed; full-width
        ``，。！？`` are replaced by ``, . ! ?``.
    """
    sent = sent.translate(_WHITESPACE_TO_SPACE)

    sent = _SLASH_AT_RE.sub("", sent)
    sent = _AT_RE.sub("", sent)
    sent = _BRACKETED_RE.sub("", sent)

    sent = sent.translate(_FULLWIDTH_TO_ASCII)
    # Runs after the "@" patterns, so it only catches "//" spans without "@".
    return _SLASHES_RE.sub("", sent)

In [22]:
# Matches every character that is not an ASCII letter; compiled once.
_NON_ASCII_LETTER_RE = re.compile(r"[^a-zA-Z]")


def clean_word(s):
    """Return *s* with everything except ASCII letters removed.

    Punctuation, special characters, digits and Chinese characters are all
    dropped, e.g. ``'abc123def456ghi789zero0'`` -> ``'abcdefghizero'``.
    No separate digit-stripping pass is needed: digits are already covered
    by the character class.
    """
    return _NON_ASCII_LETTER_RE.sub('', s)

# 行业分类数据--网信办

## 导入数据

In [27]:
folder = 'industy_data/20180910' # data folder

In [28]:
# Load every workbook found in `folder`; the file name without its extension
# becomes the class label of all rows read from that workbook.
# os.listdir returns entries in arbitrary order, so sort for reproducible output.
filename_list = sorted(os.listdir(folder))
frames = []
for index, filename in enumerate(filename_list):
    class_name = os.path.splitext(filename)[0]
    file_path = '%s/%s'%(folder, filename)
    print(index, class_name, file_path)

    tmp_data = pd.read_excel(file_path)
    # Renaming requires exactly three columns; presumably they are stored in
    # url / title / content order -- verify against the workbooks.
    tmp_data.columns = ['url', 'title', 'content']
    tmp_data['label'] = class_name
    print(tmp_data.shape)

    frames.append(tmp_data)

# Concatenate once: growing a DataFrame inside the loop re-copies all
# previously loaded rows on every iteration.
industy_data = pd.concat(frames, axis = 0) if frames else pd.DataFrame()

print('industy_data: ', industy_data.shape)
# Title and body joined into one text field, separated by '. '.
industy_data['title_content'] = industy_data['title'] + '. ' + industy_data['content']
industy_data.head()

0 体育 industy_data/20180910/体育.xlsx
(1999, 4)
1 军事 industy_data/20180910/军事.xlsx
(1999, 4)
2 政治 industy_data/20180910/政治.xlsx
(1999, 4)
3 文化 industy_data/20180910/文化.xlsx
(1999, 4)
4 法制 industy_data/20180910/法制.xlsx
(1999, 4)
5 社会 industy_data/20180910/社会.xlsx
(1999, 4)
6 科技 industy_data/20180910/科技.xlsx
(1999, 4)
7 经贸 industy_data/20180910/经贸.xlsx
(1999, 4)
industy_data:  (15992, 4)


Unnamed: 0,url,title,content,label,title_content
0,http://sports.inquirer.net/317017/im-club-man-...,‘I’m a club man’: Mourinho insists Manchester ...,\nJose Mourinho insists he is only interested...,体育,‘I’m a club man’: Mourinho insists Manchester ...
1,http://www.thejakartapost.com/news/2018/08/27/...,’No Ronaldo no problem’ insists Real coach Lop...,Real Madrid coach Julen Lopetegui insisted aft...,体育,’No Ronaldo no problem’ insists Real coach Lop...
2,https://www.usatoday.com/story/sports/tennis/2...,1 and done: Halep 1st No. 1 seed to lose 1st U...,NEW YORK (AP) — Simona Halep made a quick-as-c...,体育,1 and done: Halep 1st No. 1 seed to lose 1st U...
3,https://www.usatoday.com/story/sports/nfl/2018...,13 NFL players who were top performers in 2018...,With a fluctuating cast of characters and opaq...,体育,13 NFL players who were top performers in 2018...
4,https://www.indiatoday.in/sports/cricket/story...,18-year-old Prithvi Shaw trains with Team Indi...,Prithvi Shaw has joined the Indian cricket tea...,体育,18-year-old Prithvi Shaw trains with Team Indi...


In [17]:
# Count the number of rows per class label.
industy_data['label'].value_counts()

科技    1999
体育    1999
军事    1999
文化    1999
经贸    1999
法制    1999
社会    1999
政治    1999
Name: label, dtype: int64

In [14]:
# industy_data.to_excel('industy_data/20180910_industy_data.xlsx', index = False)

## 预处理

In [26]:
# text = industy_data['title_content'].tolist()
# print(len(text))
# text[0]

In [25]:
# Preprocess the titles and contents with handle_contents and let pre_save
# write the resulting corpus files under this output folder.
save_folder = 'industy_result/'
pre_save(handle_contents, industy_data, save_folder)

(15992, 5)
save_folder:  industy_result/
title num:  15992
content num:  15992
jose mourinho insists future prosperity manchester united position troubled club mourinho future united manager subject discussion tempestuous period dating frustrating closeseason transfer window conjecture mourinho happy effort executive vicechairman woodward handle player signing trafford advertisement equally damaging mourinho relationship star player paul pogba close scrutiny comment france midfielder mourinho dour uncooperative mood ahead team premier league meeting mauricio pochettino tottenham monday talkative moment reveal gratitude support fan game season supporter fantastic match mourinho team matter career selfish thinking club happy supporter support team leicester normal situation team playing winning supportive team brighton didn play lost match fantastic feeling hope feeling player feel player ready winning style advertisement mourinho refused banal topic medium claim left frustrated defeat b

# 倾向性--网信办

## 导入数据

In [31]:
folder = 'tendency_data/20180914' # data folder

In [30]:
# Load every workbook found in `folder`; the file name without its extension
# becomes the class label of all rows read from that workbook.
# os.listdir returns entries in arbitrary order, so sort for reproducible output.
filename_list = sorted(os.listdir(folder))
frames = []
for index, filename in enumerate(filename_list):
    class_name = os.path.splitext(filename)[0]
    file_path = '%s/%s'%(folder, filename)
    print(index, class_name, file_path)

    tmp_data = pd.read_excel(file_path)
    # Renaming requires exactly three columns; presumably they are stored in
    # url / title / content order -- verify against the workbooks.
    tmp_data.columns = ['url', 'title', 'content']
    tmp_data['label'] = class_name
    print(tmp_data.shape)

    frames.append(tmp_data)

# Concatenate once: growing a DataFrame inside the loop re-copies all
# previously loaded rows on every iteration.
tendency_data = pd.concat(frames, axis = 0) if frames else pd.DataFrame()

print('tendency_data: ', tendency_data.shape)
# Title and body joined into one text field, separated by '. '.
tendency_data['title_content'] = tendency_data['title'] + '. ' + tendency_data['content']
tendency_data.head()

0 倾向性-中 tendency_data/20180914/倾向性-中.xlsx
(2942, 4)
1 倾向性-正 tendency_data/20180914/倾向性-正.xlsx
(2007, 4)
2 倾向性-负 tendency_data/20180914/倾向性-负.xlsx
(2966, 4)
industy_data:  (7915, 4)


Unnamed: 0,url,title,content,label,title_content
0,https://www.straitstimes.com/singapore/environ...,More shopping malls seek help to fix rodent pr...,Published3 hours ago\nSue-Ann Tansuetan@sph.co...,倾向性-中,More shopping malls seek help to fix rodent pr...
1,https://www.thetimes.co.uk/edition/times2/what...,What’s on TV tonight,"\nJuly 27 2018, 12:01am, The Times\nViewing gu...",倾向性-中,"What’s on TV tonight. \nJuly 27 2018, 12:01am,..."
2,https://www.hindustantimes.com/india-news/form...,Former Sabarimala temple board chief plans Jal...,A former president of the Travancore Devaswom ...,倾向性-中,Former Sabarimala temple board chief plans Jal...
3,http://nationalinterest.org/blog/middle-east-w...,Can Trump Get America Out of Afghanistan?,\n Last week the White House ordered its top...,倾向性-中,Can Trump Get America Out of Afghanistan?. \n...
4,https://www.voanews.com/a/winners-of-2018-phil...,Winners of 2018 Philippine-Based Magsaysay Awa...,"\nEast Asia \n\n\n\nJuly 26, 2018 8:20 AM\n\n\...",倾向性-中,Winners of 2018 Philippine-Based Magsaysay Awa...


In [None]:
# Count the number of rows per tendency label. This cell sits in the tendency
# section, so it must inspect tendency_data rather than industy_data.
tendency_data['label'].value_counts()

## 预处理

In [119]:
# Quick check of the preprocessing on the first two documents.
# NOTE(review): cor_data is only assigned in a later cell of this notebook
# (from cor_data_raw['content']), so that cell has to be run first.
handle_contents(cor_data[:2])

['recai berber turkish parliament ruling justice development party chairman parliamentary turkishrussian friendship recalled ankara decided purchase system holding talk reaching relevant agreement ally decision contradict membership nato allied relation united framework agreement matter technology exchange russia turkey nato ally opposing question berber sputnik alexey malgavkous sanction turkey purchase russian system reportshe contradictory message public issued department pentagon president situation international community understand message reflects true administration department statement consultation position matter defined turkey nato doubt purchase turkey relevant decision clinched agreement develop entire subsequent process statement department oblige turkey berber emphasized echoed beyazt karatas retired majorgeneral turkish force cited ankara current sharp antiamerican stance repeatedly obstacle turkey attempt purchase longrange defense system decision separately nato provi

In [120]:
# Relevant documents: preprocess the 'content' column and save one document per line.
cor_data = cor_data_raw['content'].tolist()
print(len(cor_data))
corpus_cor = handle_contents(cor_data)

save_filename = 'data/{0}/corpus_pre_cor_0809.txt'.format(folder)
print(save_filename)
# The context manager closes the file even if a write fails.
with open(save_filename, "w", encoding='UTF-8') as fid:
    fid.writelines(document + '\n' for document in corpus_cor)

5776
data/20180808/corpus_pre_cor_0809.txt


In [121]:
# Irrelevant documents: preprocess the 'content' column and save one document per line.
uncor_data = uncor_data_raw['content'].tolist()
print(len(uncor_data))
corpus_uncor = handle_contents(uncor_data)

save_filename = 'data/{0}/corpus_pre_uncor_0809.txt'.format(folder)
print(save_filename)
# The context manager closes the file even if a write fails.
with open(save_filename, "w", encoding='UTF-8') as fid:
    fid.writelines(document + '\n' for document in corpus_uncor)

5461
data/20180808/corpus_pre_uncor_0809.txt


# 保存本文件

In [70]:
if 0:  # disabled; change the condition to export this notebook as HTML
    import datetime as dt

    def output_HTML(read_file, output_file):
        """Convert the notebook *read_file* (.ipynb) into the HTML file *output_file*."""
        from nbconvert import HTMLExporter
        import nbformat
        exporter = HTMLExporter()
        output_notebook = nbformat.read(read_file, as_version=4)
        output, _resources = exporter.from_notebook_node(output_notebook)
        # Close the output file deterministically instead of leaking the handle.
        with open(output_file, 'w', encoding='utf-8') as html_fid:
            html_fid.write(output)

    html_file_folder = 'html_files'
    os.makedirs(html_file_folder, exist_ok=True)

    today = dt.datetime.now().strftime('%Y%m%d')
    current_file = 'circ_cor_model_1_pre.ipynb'
    # os.path.join replaces the hard-coded backslash, which was both an
    # invalid string escape ('\%') and a Windows-only separator.
    output_file = os.path.join(
        html_file_folder,
        '%s_%s.html' % (os.path.splitext(current_file)[0], today),
    )
    output_HTML(current_file, output_file)