- 保监会 相关性模型 1 预处理

# 基本设置

In [1]:
import jieba
import sys
import re
import time
import string

%matplotlib inline
import numpy as np
import pandas as pd
# import pre_cor
import os
from sqlalchemy import create_engine
from pandas.io import sql

import warnings
warnings.filterwarnings('ignore')

In [2]:
def set_ch():
    """Configure matplotlib so that Chinese text can be displayed in figures."""
    from pylab import mpl

    mpl.rcParams.update({
        'font.sans-serif': ['FangSong'],  # default font family
        'axes.unicode_minus': False,      # keep the minus sign '-' from rendering as a box
    })


set_ch()

# 导入数据

In [3]:
folder = '20180808' # sub-folder of data/ that the input workbooks are read from and the corpora are written to

## 外交部-相关

In [88]:
# Load the labelled "relevant" articles from the workbook under data/<folder>/.
file_name = '{0}/mfa_cor_english_2.xlsx'.format(folder)
print(file_name)
cor_1 = pd.read_excel('data/%s'%file_name)
# Overwrite the workbook's header with fixed names; this requires the sheet to have exactly three columns.
cor_1.columns = ['title', 'sensitivity', 'content']
print(cor_1.shape)
cor_1.head()

20180808/mfa_cor_english_2.xlsx
(6000, 3)


Unnamed: 0,title,sensitivity,content
0,"Despite US Warning, Turkey Has No Plans to Aba...",,"Recai Berber, member of Turkish parliament fro..."
1,Moscow denounces ’anti-Russian’ US nuclear policy,,MOSCOW (AFP) - Moscow on Saturday (Feb 3) deno...
2,Iran’s Rouhani raps US nuclear expansion plan ...,,Dubai: Iran accused the United States on Sunda...
3,Iranian President Hassan Rouhani raps new US n...,,Iran accused the United States on Sunday of th...
4,Iran’s Rouhani raps new U.S. nuclear plan as t...,,DUBAI (Reuters) - Iran accused the United Stat...


## 外交部-不相关

In [85]:
# Load the labelled "irrelevant" articles from the workbook under data/<folder>/.
file_name = '{0}/mfa_uncor_english_2_1.xlsx'.format(folder)
print(file_name)
uncor_1 = pd.read_excel('data/%s'%file_name)
# Overwrite the workbook's header with fixed names; this requires the sheet to have exactly two columns.
uncor_1.columns = ['title', 'content']
print(uncor_1.shape)
uncor_1.head()

20180808/mfa_uncor_english_2_1.xlsx
(5497, 2)


Unnamed: 0,title,content
0,Thai vendors rubbish criticism of famous Mae K...,BANGKOK (THE NATION/ASIA NEWS NETWORK) - Vendo...
1,French Politician Arrested for Justifying Terr...,Police arrested Stephane Poussier in his home ...
2,The classic explanation for the Black Death pl...,Plague came to Europe in the 14th century and ...
3,BRIEF-Sinopec Oilfield Service Gets Approval T...,Jan 17 (Reuters) - Sinopec Oilfield Service Co...
4,BRIEF-Sinopec Oilfield Service Says Lock-Up Pe...,Dec 22 (Reuters) - Sinopec Oilfield Service Co...


## 合并数据

In [86]:
# Relevant articles: keep only the first row for every distinct title.
print('去重前：', cor_1.shape)
cor_data_raw = cor_1.drop_duplicates(subset='title')
print('去重后：', cor_data_raw.shape)
cor_data_raw.head()

去重前： (6000, 4)
去重后： (5807, 4)


Unnamed: 0,title,sensitivity,content,title_content
0,"Despite US Warning, Turkey Has No Plans to Aba...",,"Recai Berber, member of Turkish parliament fro...","Despite US Warning, Turkey Has No Plans to Aba..."
1,Moscow denounces ’anti-Russian’ US nuclear policy,,MOSCOW (AFP) - Moscow on Saturday (Feb 3) deno...,Moscow denounces ’anti-Russian’ US nuclear pol...
2,Iran’s Rouhani raps US nuclear expansion plan ...,,Dubai: Iran accused the United States on Sunda...,Iran’s Rouhani raps US nuclear expansion plan ...
3,Iranian President Hassan Rouhani raps new US n...,,Iran accused the United States on Sunday of th...,Iranian President Hassan Rouhani raps new US n...
4,Iran’s Rouhani raps new U.S. nuclear plan as t...,,DUBAI (Reuters) - Iran accused the United Stat...,Iran’s Rouhani raps new U.S. nuclear plan as t...


In [87]:
# Irrelevant articles: keep only the first row for every distinct title.
print('去重前：', uncor_1.shape)
uncor_data_raw = uncor_1.drop_duplicates(subset='title')
print('去重后：', uncor_data_raw.shape)
uncor_data_raw.head()

去重前： (5497, 2)
去重后： (5492, 2)


Unnamed: 0,title,content
0,Thai vendors rubbish criticism of famous Mae K...,BANGKOK (THE NATION/ASIA NEWS NETWORK) - Vendo...
1,French Politician Arrested for Justifying Terr...,Police arrested Stephane Poussier in his home ...
2,The classic explanation for the Black Death pl...,Plague came to Europe in the 14th century and ...
3,BRIEF-Sinopec Oilfield Service Gets Approval T...,Jan 17 (Reuters) - Sinopec Oilfield Service Co...
4,BRIEF-Sinopec Oilfield Service Says Lock-Up Pe...,Dec 22 (Reuters) - Sinopec Oilfield Service Co...


In [8]:
# Relevant and irrelevant articles together.
# Label each class on a copy (.assign) instead of writing a column into the
# de-duplicated frames, which would mutate them and trigger a chained-assignment warning.
labelled_cor = cor_data_raw.assign(label=1)
labelled_uncor = uncor_data_raw.assign(label=0)
data_raw = pd.concat([labelled_cor, labelled_uncor])
print('去重前：', data_raw.shape)
# keep=False removes every row whose title occurs more than once in the combined frame.
data_raw = data_raw.drop_duplicates(subset='title', keep=False)
print('去重后：', data_raw.shape)

# Split back into the two classes, keeping only the columns used downstream.
cor_data_raw = data_raw.loc[data_raw['label'] == 1, ['title', 'content']]
uncor_data_raw = data_raw.loc[data_raw['label'] == 0, ['title', 'content']]
print('cor_data_raw: ', cor_data_raw.shape)
print('uncor_data_raw: ', uncor_data_raw.shape)

去重前： (11299, 5)
去重后： (11237, 5)
cor_data_raw:  (5776, 2)
uncor_data_raw:  (5461, 2)


# 预处理数据

In [9]:
# NOTE(review): no visible cell creates the `title_content` column, so on a fresh
# kernel the lookup below would raise KeyError. The saved outputs suggest it held the
# title followed by the content; it is rebuilt that way when missing.
# TODO confirm the separator that was originally used.
if 'title_content' in cor_1.columns:
    text = cor_1['title_content'].tolist()
else:
    text = (cor_1['title'].astype(str) + ' ' + cor_1['content'].astype(str)).tolist()
print(len(text))
text[0]

6000




In [22]:
# Replace layout characters with a space rather than deleting them, so that the
# words on either side of a non-breaking space / newline / tab are not glued together.
text[0].replace('\xa0', ' ').replace('\n', ' ').replace('\t', ' ')



## Tokenization & Segmentation 单词化 / 语块化 分词

### Sentence Tokenize（分割句子）

In [24]:
from nltk.tokenize import sent_tokenize  
# Split the first document into a list of sentences and display it.
sent_tokenize_list = sent_tokenize(text[0])
sent_tokenize_list

 'Recai Berber, member of\xa0Turkish parliament from\xa0the ruling Justice and Development Party and chairman of\xa0the parliamentary Turkish-Russian Friendship Group, has recalled that Ankara decided to\xa0purchase the S-400 systems after\xa0holding talks and reaching relevant agreements with\xa0all its allies.',
 '"Consequently, we know that this decision cannot contradict our membership in NATO and allied relations with\xa0the United States.',
 'Within the framework of\xa0the agreement on\xa0S-400, it is also a matter of\xa0technology exchange between\xa0Russia and Turkey.',
 'So NATO allies opposing it is out\xa0of the question," Berber said.',
 '©\n                    Sputnik/ Alexey MalgavkoUS May Sanction Turkey Following Purchase of Russian S-400 Systems - ReportsHe pointed to "contradictory messages to\xa0the public " issued by\xa0the US State Department, the Pentagon and the US President, which he said led to\xa0the situation when "neither we nor the international community c

### Word Tokenize(分割单词)

In [27]:
from nltk.tokenize import word_tokenize

# Tokens of the first sentence only, followed by a blank separator line.
word_list = word_tokenize(sent_tokenize_list[0])
print(word_list, end='\n\n')

# Tokens of the whole first document.
word_tokenize_list = word_tokenize(text[0])
print(word_tokenize_list)




## Normalization 数据标准化

### Noise Removal 非文本数据去除
- 对于自己爬虫爬下来的文本(如HTML格式)需要做非文本数据去除。
- 这一步主要是针对我们用爬虫收集的语料数据，由于爬下来的内容中有很多html的一些标签，需要去掉。少量的非文本内容的可以直接用Python的正则表达式(re)删除, 复杂的则可以用beautifulsoup来去除。另外还有一些特殊的非英文字符(non-alpha),也可以用Python的正则表达式(re)删除。

### Spell Check 拼写检查
- 由于英文文本中可能有拼写错误，因此一般需要进行拼写检查。如果确信我们分析的文本没有拼写问题，可以略去此步。
- 拼写检查，我们一般用pyenchant类库完成。

In [29]:
# from enchant.checker import SpellChecker

# chkr = SpellChecker("en_US")
# chkr.set_text("Many peope likee to watch In the Name of People.")
# for err in chkr:
#     print("ERROR:", err.word)

### Part-Of-Speech Tagging and POS Tagger(对词进行标注)
- lemmatization在词性标注后效果比较好
- 进行词性分析，去掉动词、助词等

In [35]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag  # takes the tokens of one sentence; tagging is done per sentence

print('-- 1 ', sent_tokenize_list[0])
word_tokenize_list = word_tokenize(sent_tokenize_list[0])
print('-- 2 ', word_tokenize_list)
# Store the result under its own name: assigning it to `pos_tag` would replace the
# imported function with a list and make every later pos_tag(...) call fail.
pos_tags = pos_tag(word_tokenize_list)
print('-- 3 ', pos_tags)



### Stemming / Lemmatization 词干提取/词形还原
- 先词形还原后词干提取，归一化不同词性的单词。仅词形还原可能会有复数还原不全的问题。
- 词干提取(stemming)和词形还原(lemmatization)是英文文本预处理的特色。
> - 两者其实有共同点，即都是要找到词的原始形式。
> - 只不过词干提取(stemming)会更加激进一点，它在寻找词干的时候可能会得到不是词的词干。比如“imaging”的词干可能得到的是“imag”，并不是一个词。而词形还原则保守一些，它一般只对能够还原成一个正确的词的词进行处理。个人比较喜欢使用词形还原而不是词干提取。

- 在nltk中，做词干提取的方法有PorterStemmer，LancasterStemmer和SnowballStemmer。
- 个人推荐使用SnowballStemmer。这个类可以处理很多种语言，当然，除了中文。

In [36]:
from nltk.stem import SnowballStemmer

# Demonstrate stemming on a single word with the English Snowball stemmer.
stemmer = SnowballStemmer("english") # Choose a language
stemmer.stem("countries") # Stem a word

'countri'

In [38]:
import nltk

# Two sample sentences; only the first one is tokenized and stemmed below.
sent1 = 'The cat is walking in the bedroom.'
sent2 = 'A dog was running across the kitchen.'

tokens_1 = nltk.word_tokenize(sent1)
print(tokens_1)

# Apply the Porter stemmer to every token.
stemmer = nltk.stem.PorterStemmer()
stem_1 = list(map(stemmer.stem, tokens_1))
print(stem_1)

['The', 'cat', 'is', 'walking', 'in', 'the', 'bedroom', '.']
['the', 'cat', 'is', 'walk', 'in', 'the', 'bedroom', '.']


- 如果是做词形还原，则一般可以使用WordNetLemmatizer类，即wordnet词形还原方法，Lemmatization 把一个任何形式的语言词汇还原为一般形式，标记词性的前提下效果比较好。
- 在实际的英文文本挖掘预处理的时候，建议使用基于wordnet的词形还原就可以了。

In [37]:
from nltk.stem import WordNetLemmatizer

# Demonstrate WordNet lemmatization on a single word (no part-of-speech argument is passed).
wnl = WordNetLemmatizer()
print(wnl.lemmatize('countries'))  

country


In [None]:
def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Args:
        treebank_tag: tag string as produced by a Treebank-style tagger (e.g. 'NN', 'VBD').

    Returns:
        wordnet.ADJ / VERB / NOUN / ADV when the tag starts with 'J' / 'V' / 'N' / 'R'
        respectively, otherwise None.
    """
    # Imported here because the notebook never imports `wordnet` at cell level.
    from nltk.corpus import wordnet

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return None

def lemmatize_sentence(sentence):
    """Tokenize and POS-tag `sentence`, then lemmatize every token with WordNet.

    Args:
        sentence: the text of one sentence.

    Returns:
        A list with one lemma per token, in token order. Tokens whose tag has no
        WordNet equivalent (get_wordnet_pos returns None) are lemmatized as nouns.
    """
    # Imported locally so the function works regardless of what the notebook has
    # bound to these names at cell level.
    from nltk import pos_tag, word_tokenize
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    res = []
    for word, tag in pos_tag(word_tokenize(sentence)):
        wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
        res.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
    return res

### Set All Characters to Lowercase 转化为小写
- 由于英文单词有大小写之分，我们期望统计时像“Home”和“home”是一个词。因此一般需要将所有的词都转化为小写。这个直接用python的API（.lower()）就可以搞定。

### Remove Stop Words 去除停用词

In [43]:
# Stop-word removal demo.
# The corpus reader is aliased because a later cell binds the name `stopwords` to a dict.
from nltk.corpus import stopwords as nltk_stopwords

print('-- 1 ', sent_tokenize_list[0])
word_tokenize_list = word_tokenize(sent_tokenize_list[0])
print('-- 2 ', word_tokenize_list)

cachedStopWords = nltk_stopwords.words("english")
print(len(cachedStopWords))

# NLTK's English stop-word list is lower-case, so tokens are lower-cased before the
# membership test (otherwise capitalised stop words such as "The" slip through);
# a set makes each lookup constant-time.
stop_word_set = set(cachedStopWords)
filtered = [w for w in word_tokenize_list if w.lower() not in stop_word_set]
print(filtered)

127


127

## 合并步骤

In [56]:
import nltk
from nltk.stem import WordNetLemmatizer
from string import digits
import re

# Load the project stop-word list (one entry per line) into a dict used for `in` lookups.
stopwords = {}
# `with` guarantees the file is closed even if reading fails part-way.
with open("corpus/stopwords.txt", encoding='UTF-8') as stw:
    for ws in stw:
        # Strip only the line terminators; any other whitespace in the entry is kept.
        ws = ws.replace("\n", "").replace("\r", "")
        stopwords[ws] = 1

In [57]:
def handle_contents(l_contents):
    """Run handle_content on every item of `l_contents` and return the results as a list, in order."""
    return [handle_content(item) for item in l_contents]

In [78]:
def handle_content(content):
    """Convert one raw document into a single space-separated line of cleaned tokens.

    Args:
        content: the document; it is converted with str() first.

    Returns:
        The processed tokens joined by single spaces, or "" when the document is
        empty or consists only of whitespace.
    """
    content = str(content)
    if content.strip() == "":
        return ""

    # 1. Clean the document as a whole.
    content = clean_sent(content)

    # 2-3. Split into sentences, then clean each sentence once more.
    sentences = [clean_sent(piece) for piece in nltk.sent_tokenize(content)]

    # 4. Tokenize. clean_word strips digits, punctuation and every other
    #    non-alphabetic character; only tokens longer than 3 characters are kept.
    tokens = []
    for sentence in sentences:
        cleaned = (clean_word(tok) for tok in nltk.word_tokenize(sentence))
        tokens.extend(tok for tok in cleaned if len(tok) > 3)

    # 5. Lower-case and drop stop words.
    lowered = (tok.lower() for tok in tokens)
    kept = [tok for tok in lowered if tok not in stopwords]

    # 6. Lemmatize.
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(tok) for tok in kept)

In [81]:
def clean_sent(sent):
    """Normalise whitespace and strip mention / bracket / '//' markup from a string.

    Steps, in order:
      1. a CR+LF pair, and then every remaining line feed, carriage return, tab and
         non-breaking space, is replaced by one space;
      2. spans starting with "//@" and then spans starting with "@" are removed, each
         running up to and including the first following colon or whitespace character;
      3. "[...]" spans that contain no nested brackets are removed;
      4. the full-width marks ， 。 ！ ？ are replaced by , . ! ?;
      5. spans starting with "//" are removed, again up to and including the first
         following colon or whitespace character.

    Args:
        sent: the text to clean (str).

    Returns:
        The cleaned text. Removed spans are not replaced by anything, so consecutive
        spaces may remain.
    """
    # CR+LF must be handled before the single characters; otherwise it is never
    # matched and every Windows line break turns into two spaces.
    sent = sent.replace('\r\n', ' ')
    for layout_char in ('\n', '\r', '\t', '\xa0'):
        sent = sent.replace(layout_char, ' ')

    # Raw strings keep `\s` a regex escape instead of an invalid string escape.
    sent = re.sub(r'//@(.*?)[:\s]', "", sent)
    sent = re.sub(r"@(.*?)[:\s]", "", sent)
    sent = re.sub(r"\[[^\[\]]*?\]", "", sent)

    sent = sent.translate(str.maketrans("，。！？", ",.!?"))

    sent = re.sub(r"//(.*?)[:\s]", "", sent)
    return sent

In [60]:
def clean_word(s):
    """Return `s` with every character other than the ASCII letters a-z / A-Z removed.

    Digits, punctuation, special characters and Chinese characters are all dropped.
    """
    return re.sub(r"[^a-zA-Z]", "", s)

In [80]:
# Preview the first two raw relevant documents. `cor_data` is built here because no
# earlier cell defines it, so a top-to-bottom run would otherwise raise NameError.
cor_data = cor_data_raw['content'].tolist()
cor_data[:2]

['Recai Berber, member of\xa0Turkish parliament from\xa0the ruling Justice and Development Party and chairman of\xa0the parliamentary Turkish-Russian Friendship Group, has recalled that Ankara decided to\xa0purchase the S-400 systems after\xa0holding talks and reaching relevant agreements with\xa0all its allies.\n"Consequently, we know that this decision cannot contradict our membership in NATO and allied relations with\xa0the United States. Within the framework of\xa0the agreement on\xa0S-400, it is also a matter of\xa0technology exchange between\xa0Russia and Turkey. So NATO allies opposing it is out\xa0of the question," Berber said.\n\n                        ©\n                    Sputnik/ Alexey MalgavkoUS May Sanction Turkey Following Purchase of Russian S-400 Systems - ReportsHe pointed to "contradictory messages to\xa0the public " issued by\xa0the US State Department, the Pentagon and the US President, which he said led to\xa0the situation when "neither we nor the international

In [82]:
# Preview the cleaned form of the first two relevant documents. The list is taken
# straight from cor_data_raw so this cell does not rely on `cor_data` already existing.
handle_contents(cor_data_raw['content'].tolist()[:2])

['recai berber turkish parliament ruling justice development party chairman parliamentary turkishrussian friendship recalled ankara decided purchase system holding talk reaching relevant agreement ally decision contradict membership nato allied relation united framework agreement matter technology exchange russia turkey nato ally opposing question berber sputnik alexey malgavkous sanction turkey purchase russian system reportshe contradictory message public issued department pentagon president situation international community understand message reflects true administration department statement consultation position matter defined turkey nato doubt purchase turkey relevant decision clinched agreement develop entire subsequent process statement department oblige turkey berber emphasized echoed beyazt karatas retired majorgeneral turkish force cited ankara current sharp antiamerican stance repeatedly obstacle turkey attempt purchase longrange defense system decision separately nato provi

In [83]:
# Relevant data: preprocess every document and write one document per line.
cor_data = cor_data_raw['content'].tolist()
print(len(cor_data))
corpus_cor = handle_contents(cor_data)

save_filename = 'data/{0}/corpus_pre_cor_0809.txt'.format(folder)
print(save_filename)
# The context manager closes the file even if a write fails.
with open(save_filename, "w", encoding='UTF-8') as fid:
    fid.writelines(line + '\n' for line in corpus_cor)

5776
data/20180808/corpus_pre_cor_0809.txt


In [74]:
# Irrelevant data: preprocess every document and write one document per line.
uncor_data = uncor_data_raw['content'].tolist()
print(len(uncor_data))
corpus_uncor = handle_contents(uncor_data)

save_filename = 'data/{0}/corpus_pre_uncor_0809.txt'.format(folder)
print(save_filename)
# The context manager closes the file even if a write fails.
with open(save_filename, "w", encoding='UTF-8') as fid:
    fid.writelines(line + '\n' for line in corpus_uncor)

8949
data/20180703/corpus_pre_uncor_0703.txt


# 保存本文件

In [70]:
if 0:
    # Disabled by default: make the guard truthy to export this notebook to HTML.
    import datetime as dt

    def output_HTML(read_file, output_file):
        """Convert the notebook at `read_file` (.ipynb) to HTML and write it to `output_file` (.html)."""
        from nbconvert import HTMLExporter
        import nbformat

        exporter = HTMLExporter()
        output_notebook = nbformat.read(read_file, as_version=4)
        output, _ = exporter.from_notebook_node(output_notebook)
        # The context manager makes sure the HTML file is flushed and closed.
        with open(output_file, 'w', encoding='utf-8') as html_out:
            html_out.write(output)

    html_file_folder = 'html_files'
    os.makedirs(html_file_folder, exist_ok=True)

    today = dt.datetime.now().strftime('%Y%m%d')
    current_file = 'circ_cor_model_1_pre.ipynb'
    # os.path.join replaces the hard-coded backslash, which was an invalid string
    # escape and only acted as a path separator on Windows.
    output_file = os.path.join(
        html_file_folder,
        '%s_%s.html' % (os.path.splitext(current_file)[0], today),
    )
    output_HTML(current_file, output_file)