# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Dataset-Preprocessing" data-toc-modified-id="Dataset-Preprocessing-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Dataset Preprocessing</a></div><div class="lev1 toc-item"><a href="#Word-Segmentation" data-toc-modified-id="Word-Segmentation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Word Segmentation</a></div><div class="lev1 toc-item"><a href="#Tokenize-Text" data-toc-modified-id="Tokenize-Text-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Tokenize Text</a></div>

# Dataset Preprocessing 

In [2]:
import json
import os

In [2]:
# Collect the text of every record whose 'media-type' is 'Blog' from the raw
# JSON-lines dump. The file is streamed line by line so the whole dump never
# has to sit in memory at once.
# NOTE(review): the dump is assumed to be UTF-8 encoded; the encoding is spelled
# out so the read does not depend on the platform default -- confirm.
content = []
with open('/Users/lizhn7/Downloads/DATA/news/sample-1M.jsonl', encoding='utf-8') as f:
    for line in f:
        item = json.loads(line)
        if item['media-type'] == 'Blog':
            content.append(item['content'])

def write_to_file(content, name):
    """
    Append one record to a JSON-lines file.

    :param content: JSON-serialisable object to store.
    :param name: path of the output file; it is opened in append mode, so
                 existing lines are kept and the record is added at the end.

    The record is written as UTF-8 on a single line, with non-ASCII
    characters kept as-is (ensure_ascii=False).
    """
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(name, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
        
def gen_data(data):
    """
    Lazily wrap each element of `data` in a {'content': element} record.

    :param data: any iterable of items (a list, a generator, ...).
    :return: generator yielding one dict per item, in input order.
    """
    # Iterate directly instead of indexing, so inputs without len() or
    # integer indexing work too.
    for entry in data:
        yield {
            'content': entry
        }

# Store every collected text as its own {'content': ...} line in data.json.
# NOTE(review): write_to_file opens the file in append mode, so running this
# cell again adds a second copy of every record to the file.
for record in gen_data(content):
    write_to_file(record, '/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/data.json')

- **Load Data**

In [3]:
def load_data(path, name):
    """
    Load data from a JSON-lines file.

    Each non-blank line of the file is parsed as one JSON object and the
    value stored under `name` is collected.

    :param path: path of the JSON-lines file.
    :param name: key to extract from every record.
    :return: list with one extracted value per record, in file order.
    :raises KeyError: if a record has no `name` key.
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = []
    # Read as UTF-8 explicitly: this notebook writes its data file as UTF-8,
    # and relying on the platform default encoding would garble or reject
    # non-ASCII text on systems where the default differs.
    with open(path, encoding='utf-8') as f:
        # Stream the file instead of buffering every line first.
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at the end).
                continue
            data.append(json.loads(line)[name])
    return data

In [4]:
# Reload the 'content' value of every record from the data.json file that the
# preprocessing cell writes.
content = load_data('/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/data.json', 'content')

# Word Segmentation

In [5]:
from nltk import regexp_tokenize
from tqdm import tqdm

In [6]:
def cut(s):
    """
    Word segmentation

    Split the string `s` into tokens with NLTK's regexp_tokenize. A token is
    either a run of dotted capitals (an abbreviation such as U.S.A.) or a
    word that may contain internal '-', '&' or apostrophe joiners.

    The inline (?x) flag has to be the very first thing in the pattern:
    Python 3.11+ raises an error for a global flag that follows any other
    pattern text (previously the leading newline and indentation), and
    Python 3.6-3.10 only accepted it with a DeprecationWarning.

    :param s: text to tokenize.
    :return: list of matched tokens, in order of appearance.
    """
    pattern = r'''(?x)               # set flag to allow verbose regexps
              (?:[A-Z]\.)+           # abbreviations, e.g. U.S.A.
              |\w+(?:[-&']\w+)*      # words w/ optional internal hyphens/apostrophe
           '''
    return regexp_tokenize(s, pattern=pattern)

def clean(s):
    """
    Clean data

    Normalise a list of tokens in place and return it:

    * a token made up only of digits becomes '0';
    * a 'p' or 'a' token directly followed by an 'm' token is merged into a
      single 'pm' / 'am' token;
    * a lone 's' token is removed.

    Removed tokens are dropped from the list instead of being left behind as
    '' placeholders, so token counts and space-joined text are not distorted
    by empty entries. Any empty-string token already present is dropped too.

    :param s: list of string tokens; modified in place.
    :return: the same list object, cleaned.
    """
    last = len(s) - 1
    for i, tok in enumerate(s):
        if tok.isdigit():
            s[i] = '0'
        elif tok in ('p', 'a') and i < last and s[i + 1] == 'm':
            # Fold the following 'm' into this token and blank it out; the
            # blank is filtered away below.
            s[i] = tok + 'm'
            s[i + 1] = ''
        elif tok == 's':
            s[i] = ''
    # Slice assignment keeps the caller's list object while removing blanks.
    s[:] = [tok for tok in s if tok != '']
    return s

In [7]:
# Tokenize every document with cut() and normalise its tokens with clean();
# tqdm wraps the iteration to display a progress bar.
contWords = [clean(cut(c)) for c in tqdm(content)]

100%|██████████| 265512/265512 [01:43<00:00, 2574.16it/s]


# Tokenize Text

In [40]:
# Scratch check: see how cut() tokenizes a word carrying a possessive "'s".
cut("readed's")

["readed's"]

In [44]:
# Preview the first document's tokens joined back into one string.
' '.join(contWords[0])

'the return of the nike air max sensation has 80 s babies hyped home style the return of the nike air max sensation has 80 s babies hyped posted on sep 22 2015 if you were a basketball fan who was born in the 80 s you were lucky enough to witness the beauty that is 90 s basketball it was truly a great time to be basketball fan if you played close attention to what the players were wearing on their feet you would have also noticed the wide array of footwear these player used to rock one of those happens to the the nike air max sensation which is also set to receive the retro treatment this year originally released back in read more author kicksonfire share this post on googlefacebooktwitter'

In [12]:
from keras.preprocessing.text import Tokenizer
from nltk import WordNetLemmatizer
import numpy as np

Using TensorFlow backend.


In [14]:
# Lowercase each token and pass it through the WordNet lemmatizer, one
# document at a time; the result replaces contWords.
wnl = WordNetLemmatizer()
contWords = [
    [wnl.lemmatize(word.lower()) for word in doc]
    for doc in tqdm(contWords)
]

100%|██████████| 265512/265512 [10:03<00:00, 439.74it/s]          


In [21]:
# Lowercase every token of every document; the result replaces contWords.
# NOTE(review): the lemmatization cell already lowercases each token before
# lemmatizing it, so this looks like an alternative to that cell rather than
# a follow-up step -- confirm which of the two is meant to run.
contWords = [[t.lower() for t in toks] for toks in tqdm(contWords)]

100%|██████████| 265512/265512 [01:56<00:00, 2281.78it/s] 


In [22]:
# Take a shallow copy of the document list, then turn each document's token
# list back into a single space-separated string.
tok_sentWords = contWords.copy()
tokTexts = list(map(' '.join, tok_sentWords))

In [23]:
# Fit a Keras tokenizer on the joined documents; filters='' leaves the
# tokenizer with no characters to strip.
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(tokTexts)

# Forward (word -> index) and reverse (index -> word) vocabulary lookups.
word2index = tokenizer.word_index
index2word = dict((idx, word) for word, idx in word2index.items())

# Number of tokens in each document, followed by summary statistics.
sentLens = np.array([len(words) for words in tok_sentWords])
length_stats = (np.min(sentLens), np.max(sentLens), np.mean(sentLens), np.median(sentLens))
print('Number of sentences: \t{:d}'.format(len(sentLens)))
print('Distribution of sentence lengths (number of words):')
print('Min: {:d}   Max: {:d}   Mean: {:.3f}   Med: {:.3f}'.format(*length_stats))
print('Found %s unique tokens.' % len(word2index))

Number of sentences: 	265512
Distribution of sentence lengths (number of words):
Min: 7   Max: 17413   Mean: 396.790   Med: 233.000
Found 1060614 unique tokens.


In [17]:
# NOTE(review): this cell repeats the tokenizer/statistics cell directly above
# it statement for statement. Running it rebuilds tokenizer, word2index,
# index2word and sentLens from the current tokTexts / tok_sentWords, replacing
# the values produced by the earlier cell. Presumably one of the two copies
# can be removed -- confirm which run is the intended one.
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(tokTexts)
# Forward and reverse vocabulary lookups.
word2index = tokenizer.word_index
index2word = {i: w for w, i in word2index.items()}
# Number of tokens in each document.
sentLens = np.array([len(i) for i in tok_sentWords])
print('Number of sentences: \t{:d}'.format(len(sentLens)))
print('Distribution of sentence lengths (number of words):')
print('Min: {:d}   Max: {:d}   Mean: {:.3f}   Med: {:.3f}'.format(np.min(sentLens), np.max(sentLens), np.mean(sentLens), np.median(sentLens)))
print('Found %s unique tokens.' % len(word2index))

Number of sentences: 	265512
Distribution of sentence lengths (number of words):
Min: 7   Max: 17413   Mean: 396.790   Med: 233.000
Found 1039702 unique tokens.
Found 1039702 unique tokens.


In [40]:
# Scratch check that a possessive token can be detected by its "'s" suffix.
# The literal must not carry a trailing space, otherwise endswith("'s") is
# False and the check no longer matches the recorded True result.
s = "boy's"
s.endswith("'s")

True

In [None]:
def clean(w):
    """
    Clean data

    Clean a list of tokens and return the surviving tokens as a new list.

    NOTE(review): the original cell was unfinished -- it began with a dangling
    `if w.endswith("'s")` (no colon, no body) and then looped over an
    undefined name `s`. This version assumes that `w` is a list of string
    tokens and that the dangling check was meant to strip a trailing
    possessive "'s" from each token. Confirm both assumptions before use.
    NOTE(review): this definition shares its name with the earlier
    `clean(s)` and replaces it once this cell has been run.

    Rules, applied token by token:

    * a trailing "'s" is stripped from the token;
    * ' ' and the quote marks “, ” and " are dropped;
    * everything from an opening (, （ or 《 through the first matching
      closer after it is dropped; an opener without a closer is kept;
    * '%' is appended to the preceding token if that token survived, and
      the '%' token itself is dropped;
    * '於' is replaced by '于' and '後' by '后'.

    :param w: list of string tokens; it is not modified.
    :return: new list holding the cleaned tokens.
    """
    closers = {'(': ')', '（': '）', '《': '》'}
    dropped = {' ', '“', '”', '"'}
    replacements = {'於': '于', '後': '后'}

    # Work on a copy; None marks a token scheduled for removal.
    s = [tok[:-2] if tok.endswith("'s") else tok for tok in w]
    for i, tok in enumerate(s):
        if tok is None:
            # Already removed as part of an earlier bracketed span.
            continue
        if tok in dropped:
            s[i] = None
        elif tok in closers:
            closer = closers[tok]
            for j in range(i + 1, len(s)):
                if s[j] == closer:
                    s[i:j + 1] = [None] * (j - i + 1)
                    # Stop at the first closer so text after it is kept.
                    break
        elif tok == '%':
            # i > 0 keeps a leading '%' from wrapping around to s[-1].
            if i > 0 and s[i - 1] is not None:
                s[i - 1] += '%'
            s[i] = None
        elif tok in replacements:
            s[i] = replacements[tok]
    return [tok for tok in s if tok is not None]

- **Save Dictionaries**

In [18]:
import pickle

In [24]:
# Persist both vocabulary mappings as one pickled tuple, written with the
# highest pickle protocol available.
with open('/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/index.pkl', 'wb') as pkl_file:
    pickle.dump((word2index, index2word), pkl_file, protocol=pickle.HIGHEST_PROTOCOL)