# Table of Contents
 <p><div class="lev1 toc-item"><a href="#搜狐新闻数据" data-toc-modified-id="搜狐新闻数据-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>搜狐新闻数据</a></div><div class="lev1 toc-item"><a href="#全网新闻数据" data-toc-modified-id="全网新闻数据-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>全网新闻数据</a></div><div class="lev1 toc-item"><a href="#任务数据" data-toc-modified-id="任务数据-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>任务数据</a></div>

In [1]:
import os
import json
import jieba
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
def load_data(path):
    """
    Read a GB18030-encoded text file and return its full contents.

    Args:
        path: Location of the file to read.

    Returns:
        str: The decoded text of the whole file.
    """
    source = os.path.join(path)
    with open(source, mode='r', encoding='gb18030') as handle:
        contents = handle.read()
    return contents

In [3]:
def load_mdata(path):
    """
    Load the ``abstract`` field of every record in a JSON Lines file.

    Each non-blank line of the file is parsed as one JSON object and its
    ``abstract`` value is collected. Blank lines (for example a trailing
    empty line) are skipped instead of crashing the JSON parser.

    Args:
        path: Location of the JSON Lines file.

    Returns:
        list: The ``abstract`` value of each record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``abstract`` key.
    """
    input_file = os.path.join(path)
    data = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    # NOTE(review): assumes the task file is UTF-8 (the JSON interchange
    # encoding) — confirm against the actual raw_data.json.
    with open(input_file, 'r', encoding='utf-8') as f:
        # Iterate lazily instead of materialising every line first.
        for line in f:
            if not line.strip():
                continue
            item = json.loads(line)
            data.append(item['abstract'])
    return data

In [4]:
# Spans that newcut keeps in one piece rather than handing to the segmenter:
# runs of ASCII digits/letters/dots, text enclosed in 《…》, and 1-10
# characters enclosed in “…”.
# Raw strings are used so that regex escapes such as \d and \. are not
# treated as (invalid) Python string escapes.
not_cuts = re.compile(r'([\da-zA-Z\.]+)|《(.*?)》|“(.{1,10})”')
# Matches every character that is NOT in the allowed set (U+4E00-U+9FA5,
# ASCII digits and letters, and the punctuation listed in the class);
# newcut replaces each such character with a space.
re_replace = re.compile(r'[^\u4e00-\u9fa50-9a-zA-Z\%《》\(\)（）“”·\.]')

In [5]:
def newcut(s):
    """
    Segment a string into a list of tokens.

    Characters matched by ``re_replace`` are first replaced with spaces.
    The text is then walked span by span using ``not_cuts``:

    * text between protected spans is segmented with ``jieba.lcut``
      (``HMM=False``);
    * a protected span that starts with 《 or “ is emitted as three tokens:
      the opening mark, the enclosed text, and the closing mark;
    * any other protected span is emitted unchanged as a single token.

    Args:
        s (str): Text to segment.

    Returns:
        list: Tokens in their original order.
    """
    s = re_replace.sub(' ', s)
    tokens = []
    cursor = 0

    for match in not_cuts.finditer(s):
        begin, end = match.span()
        # Segment whatever lies between the previous protected span and this one.
        tokens += jieba.lcut(s[cursor:begin], HMM=False)
        protected = s[begin:end]
        if protected[0] in (u'《', u'“'):
            tokens += [protected[0], protected[1:-1], protected[-1]]
        else:
            tokens.append(protected)
        cursor = end

    # Segment the tail after the last protected span.
    tokens += jieba.lcut(s[cursor:], HMM=False)
    return tokens

def _closing_index(tokens, start, opener, closer):
    """
    Return the index of the ``closer`` token that balances ``tokens[start]``.

    Nested ``opener`` tokens are counted so the index returned belongs to the
    opener at ``start``. Returns ``None`` if that opener is never closed.
    """
    depth = 0
    for pos in range(start, len(tokens)):
        if tokens[pos] == opener:
            depth += 1
        elif tokens[pos] == closer:
            depth -= 1
            if depth == 0:
                return pos
    return None


def clean(s):
    """
    Clean a token list.

    * Single-space tokens are dropped.
    * A bracketed group, ``(`` … ``)`` or ``（`` … ``）``, is dropped together
      with its brackets. Each opener is paired with its own balancing closer,
      so tokens between two separate groups are kept. An opener with no
      balancing closer is left in place.
    * A ``%`` token is merged onto the preceding kept token (``'50', '%'``
      becomes ``'50%'``); a ``%`` with no preceding kept token is dropped.

    The input list is not modified.

    Args:
        s (list): Tokens to clean.

    Returns:
        list: A new list containing the surviving tokens in order.
    """
    brackets = {'(': ')', '（': '）'}
    # Work on a copy so the caller's list is left untouched.
    tokens = list(s)
    for i in range(len(tokens)):
        tok = tokens[i]
        if tok is None:
            # Already removed as part of an earlier bracketed group.
            continue
        if tok == ' ':
            tokens[i] = None
        elif tok in brackets:
            end = _closing_index(tokens, i, tok, brackets[tok])
            if end is not None:
                for k in range(i, end + 1):
                    tokens[k] = None
        elif tok == '%':
            # Guard i > 0: index -1 would wrap around to the last token.
            if i > 0 and tokens[i - 1] is not None:
                tokens[i - 1] = tokens[i - 1] + '%'
            tokens[i] = None
    return [tok for tok in tokens if tok is not None]

In [6]:
def parse(html):
    """
    Yield the body of every ``<content>…</content>`` element in ``html``.

    All matches are collected first (the pattern also spans newlines), then
    iterated under a ``tqdm`` progress bar.

    Args:
        html (str): Text to scan.

    Yields:
        dict: ``{'content': <text between the tags>}`` for each match,
        in document order. Empty elements yield an empty string.
    """
    content_re = re.compile('<content>(.*?)</content>', re.S)
    for body in tqdm(content_re.findall(html)):
        yield {'content': body}
        
def write_to_file(content, filename):
    """
    Append ``content`` to ``filename`` as one line of JSON.

    The record is serialised with ``ensure_ascii=False`` so non-ASCII text is
    written as-is, and the file is written as UTF-8. The file is opened in
    append mode, so existing lines are kept.

    Args:
        content: JSON-serialisable object to write.
        filename: Path of the output file.

    Raises:
        TypeError: If ``content`` is not JSON-serialisable. Serialisation
            happens before the file is opened, so the file is not created
            or touched in that case.
    """
    line = json.dumps(content, ensure_ascii=False) + '\n'
    # The with-block closes the file; no explicit close() is needed.
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(line)

# 搜狐新闻数据

In [7]:
# Read the whole Sohu news dump into memory as one string.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_path = '/Users/lizhn7/Downloads/DATA/chinese_news/news_sohusite_xml.dat'
rawData = load_data(data_path)

In [8]:
# Segment and clean every non-empty <content> entry and append it to
# content_1.json, one JSON object per line.
# NOTE(review): write_to_file opens the output in append mode, so re-running
# this cell appends the same records again — remove the file first if a
# fresh output is needed.
content = parse(rawData)
filename = '/Users/lizhn7/Downloads/DATA/chinese_news/content_1.json'
for c in content:
    if c['content'] != '':
        write_to_file({'contWords': clean(newcut(c['content']))}, filename)

  0%|          | 0/1411996 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/kz/hqjl_dfx3g3_2vxylxlj1s940000gn/T/jieba.cache
Loading model cost 1.069 seconds.
Prefix dict has been built succesfully.
100%|██████████| 1411996/1411996 [50:37<00:00, 464.86it/s] 


In [11]:
# Count the lines of content_1.json (write_to_file emits one record per line).
!wc -l /Users/lizhn7/Downloads/DATA/chinese_news/content_1.json

 1298156 /Users/lizhn7/Downloads/DATA/chinese_news/content_1.json


# 全网新闻数据

In [9]:
# Read the whole all-site news dump into memory as one string; this rebinds
# data_path and rawData, replacing the values from the Sohu section.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_path = '/Users/lizhn7/Downloads/DATA/chinese_news/news_tensite_xml.dat'
rawData = load_data(data_path)

In [10]:
# Segment and clean every non-empty <content> entry and append it to
# content_2.json, one JSON object per line.
# NOTE(review): write_to_file opens the output in append mode, so re-running
# this cell appends the same records again — remove the file first if a
# fresh output is needed.
content = parse(rawData)
filename = '/Users/lizhn7/Downloads/DATA/chinese_news/content_2.json'
for c in content:
    if c['content'] != '':
        write_to_file({'contWords': clean(newcut(c['content']))}, filename)

100%|██████████| 1294233/1294233 [49:44<00:00, 433.70it/s] 


In [13]:
# Count the lines of content_2.json (write_to_file emits one record per line).
!wc -l /Users/lizhn7/Downloads/DATA/chinese_news/content_2.json

 1143529 /Users/lizhn7/Downloads/DATA/chinese_news/content_2.json


# 任务数据

In [15]:
# Load the 'abstract' text of every task record, then segment, clean and
# append each one to content_3.json, one JSON object per line.
# Unlike the news sections, rawData is a list of strings here.
# NOTE(review): write_to_file opens the output in append mode, so re-running
# this cell appends the same records again — remove the file first if a
# fresh output is needed.
rawData = load_mdata('/Users/lizhn7/Documents/Github/深度炼丹炉/causal_relation_extraction/raw_data.json')
filename = '/Users/lizhn7/Downloads/DATA/chinese_news/content_3.json'
for s in rawData:
    write_to_file({'contWords': clean(newcut(s))}, filename)

In [16]:
# Count the lines of content_3.json (write_to_file emits one record per line).
!wc -l /Users/lizhn7/Downloads/DATA/chinese_news/content_3.json

   21273 /Users/lizhn7/Downloads/DATA/chinese_news/content_3.json
