# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Dataset-Preprocessing" data-toc-modified-id="Dataset-Preprocessing-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Dataset Preprocessing</a></div><div class="lev2 toc-item"><a href="#Save-Data" data-toc-modified-id="Save-Data-11"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Save Data</a></div><div class="lev2 toc-item"><a href="#Load-Data" data-toc-modified-id="Load-Data-12"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Load Data</a></div><div class="lev1 toc-item"><a href="#Word-Segmentation" data-toc-modified-id="Word-Segmentation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Word Segmentation</a></div><div class="lev1 toc-item"><a href="#Tokenize-Text" data-toc-modified-id="Tokenize-Text-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Tokenize Text</a></div><div class="lev1 toc-item"><a href="#Create-Word-Embeddings-with-GloVe" data-toc-modified-id="Create-Word-Embeddings-with-GloVe-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Create Word Embeddings with GloVe</a></div><div class="lev2 toc-item"><a href="#Read-GloVe" data-toc-modified-id="Read-GloVe-41"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Read GloVe</a></div><div class="lev2 toc-item"><a href="#Use-Glove-to-Initialize-Embedding-Matrix" data-toc-modified-id="Use-Glove-to-Initialize-Embedding-Matrix-42"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Use Glove to Initialize Embedding Matrix</a></div>

# Dataset Preprocessing 

In [50]:
import json
import os
from tqdm import tqdm

## Save Data

In [2]:
# Collect the article text of every record whose 'media-type' is 'Blog'.
# The file is streamed line by line (one JSON object per line) instead of
# being materialised with readlines(), and it is decoded explicitly as UTF-8
# so the result does not depend on the platform's default encoding.
content = []
with open('/Users/lizhn7/Downloads/DATA/news/sample-1M.jsonl', encoding='utf-8') as f:
    for line in f:
        item = json.loads(line)
        if item['media-type'] == 'Blog':
            content.append(item['content'])

def write_to_file(content, name):
    """
    Append ``content`` to the file ``name`` as a single JSON line.

    :param content: JSON-serialisable object to write
    :param name: path of the output file; created if missing, appended to otherwise
    """
    # The with-block closes the file on exit, so no explicit close() is needed.
    # ensure_ascii=False keeps non-ASCII characters readable; the file is UTF-8.
    with open(name, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
        
def gen_data(data):
    """
    Yield one ``{'content': element}`` record per element of ``data``.

    :param data: any iterable (list, generator, ...); elements are passed through as-is
    :return: generator of single-key dicts, in the order of ``data``
    """
    # Iterate directly instead of indexing, so inputs without len()/[] work too.
    for text in data:
        yield {'content': text}

# Write every record as one JSON line. The file is opened once in 'w' mode:
# appending per record reopened the file for each of the records, and
# re-running the cell duplicated the whole dataset in data.json.
with open('/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/data.json', 'w', encoding='utf-8') as f:
    for record in gen_data(content):
        f.write(json.dumps(record, ensure_ascii=False) + '\n')

## Load Data

In [51]:
def load_data(path, name):
    """
    Load one field from every record of a JSON Lines file.

    :param path: path of a UTF-8 text file holding one JSON object per line
    :param name: key to extract from each object
    :return: list with the value stored under ``name`` for each record, in file order
    :raises KeyError: if a record has no ``name`` key
    """
    data = []
    # Decode explicitly as UTF-8: the data file is written with
    # ensure_ascii=False, so relying on the platform default encoding can fail.
    with open(path, encoding='utf-8') as f:
        # Stream the file instead of holding every raw line in memory.
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data.append(json.loads(line)[name])
    return data

def add_token(s):
    """
    Mark sentence ends: append 'endtok' to every whitespace-separated word
    that finishes with '.', '!' or '?', and re-join the words with single spaces.
    """
    terminators = ('.', '!', '?')
    marked = [
        word + 'endtok' if word.endswith(terminators) else word
        for word in s.split()
    ]
    return ' '.join(marked)

In [52]:
# Read the 'content' field of every record in data.json, then tag sentence
# ends with add_token(). `content` is rebound to the tagged texts.
content = load_data('/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/data.json', 'content')
content = [add_token(c) for c in tqdm(content)]

100%|██████████| 265512/265512 [01:02<00:00, 4249.29it/s]


# Word Segmentation

In [54]:
from nltk import regexp_tokenize

In [85]:
def cut(s):
    """
    Word segmentation.

    Splits ``s`` into tokens with NLTK's ``regexp_tokenize``: runs of
    capital-letter abbreviations (``U.S.A.``) are kept whole, and words may
    contain internal ``-``, ``&`` or ``'``. All other characters act as
    separators and are dropped.

    :param s: text to tokenise
    :return: list of token strings
    """
    # The inline (?x) flag must be the very first characters of the pattern:
    # Python 3.11+ raises "global flags not at the start of the expression"
    # when anything (even whitespace) precedes it.
    pattern = r'''(?x)               # verbose mode; the flag must open the pattern
              (?:[A-Z]\.)+           # abbreviations, e.g. U.S.A.
              |\w+(?:[-&']\w+)*      # words w/ optional internal hyphens/apostrophe
            '''
    return regexp_tokenize(s, pattern=pattern)

def clean(s):
    """
    Clean a list of tokens.

    Rules, applied to each token in order:
      * a token containing any ASCII digit becomes '0';
      * 'p' or 'a' directly followed by 'm' is merged into 'pm' / 'am';
      * a stand-alone 's' is dropped;
      * a trailing "'s" is stripped.
    Tokens that end up empty are removed.

    :param s: list of token strings; it is NOT modified
    :return: new list of cleaned tokens
    """
    # Work on a copy so the caller's list is left untouched.
    tokens = list(s)
    last = len(tokens) - 1
    for i in range(len(tokens)):
        # Read from the working copy: a previous merge may have blanked this slot.
        tok = tokens[i]
        if any(ch in '0123456789' for ch in tok):
            tok = '0'
        if tok in ('p', 'a') and i < last and tokens[i + 1] == 'm':
            # Merge the split "p m" / "a m" pair and blank the consumed 'm'.
            tok += 'm'
            tokens[i + 1] = ''
        if tok == 's':
            tok = ''
        if tok.endswith("'s"):
            tok = tok[:-2]
        tokens[i] = tok
    return [tok for tok in tokens if tok != '']

In [86]:
# One cleaned token list per document: tokenise with cut(), then normalise with clean().
contWords = [clean(cut(c)) for c in tqdm(content)]

100%|██████████| 265512/265512 [03:23<00:00, 1307.07it/s]


# Tokenize Text

In [67]:
from keras.preprocessing.text import Tokenizer
import numpy as np

Using TensorFlow backend.


In [111]:
# Shallow copy: the outer list is new, the inner token lists are shared with contWords.
tok_sentWords = contWords.copy()
# Re-join each token list into one space-separated string for the Keras Tokenizer.
tokTexts = [' '.join(i) for i in tok_sentWords]
# num_words=None keeps the full vocabulary; filters='' passes no characters to
# filter out (the text was already cleaned above); lower=True lowercases tokens.
tokenizer = Tokenizer(num_words=None,
                      filters='',
                      lower=True)
tokenizer.fit_on_texts(tokTexts)
word2index = tokenizer.word_index
# Inverse mapping: index -> word.
index2word = {i: w for w, i in word2index.items()}
# NOTE: each entry of tok_sentWords is a whole document from `content`, so the
# "sentence" statistics printed below are per-document token counts.
sentLens = np.array([len(i) for i in tok_sentWords])
print('Number of sentences: \t{:d}'.format(len(sentLens)))
print('Distribution of sentence lengths (number of words):')
print('Min: {:d}   Max: {:d}   Mean: {:.3f}   Med: {:.3f}'.format(np.min(sentLens), np.max(sentLens), np.mean(sentLens), np.median(sentLens)))
print('Found %s unique tokens.' % len(word2index))

Number of sentences: 	265512
Distribution of sentence lengths (number of words):
Min: 0   Max: 12998   Mean: 403.903   Med: 234.000
Found 855058 unique tokens.


# Create Word Embeddings with GloVe

In [158]:
# Seed given to np.random.seed() before the random embedding initialisation.
SEED=42
# Number of rows of the embedding matrix built below.
VOCAB_SIZE = 20000
# Width of each embedding vector; the GloVe file read below is the 200d variant.
EMBEDDING_DIM = 200

## Read GloVe

In [156]:
# Count the GloVe entries (one per line) in pure Python instead of shelling out
# to `wc -l`: this works on every platform and also counts a final line that
# has no trailing newline, so the array sized from it has a row for every entry.
# Binary mode avoids decoding the file just to count lines.
with open('/Users/lizhn7/Downloads/DATA/glove/glove.twitter.27B.200d.txt', 'rb') as fp:
    glove_n_symbols = sum(1 for _ in fp)
glove_n_symbols

1193513

In [None]:
# Read the GloVe file into
#   glove_index_dict:        word -> row number
#   glove_embedding_weights: (glove_n_symbols, EMBEDDING_DIM) array of vectors
glove_index_dict = {}
glove_embedding_weights = np.empty((glove_n_symbols, EMBEDDING_DIM))
globale_scale = 0.1
# Decode explicitly as UTF-8 so reading does not depend on the platform default.
with open('/Users/lizhn7/Downloads/DATA/glove/glove.twitter.27B.200d.txt', 'r', encoding='utf-8') as fp:
    index = 0
    for line in fp:
        if not line.strip():
            continue
        # Split from the right: the last EMBEDDING_DIM fields are the vector and
        # whatever precedes them is the word. A plain split() would mis-parse a
        # word that is itself a whitespace character.
        fields = line.rstrip().rsplit(' ', EMBEDDING_DIM)
        if len(fields) != EMBEDDING_DIM + 1:
            raise ValueError('Malformed GloVe entry #{:d}: expected {:d} values, got {:d}'.format(
                index, EMBEDDING_DIM, len(fields) - 1))
        word = fields[0]
        glove_index_dict[word] = index
        glove_embedding_weights[index, :] = [float(n) for n in fields[1:]]
        index += 1
# np.empty leaves unfilled rows uninitialised; drop any so they cannot leak
# garbage into later statistics such as glove_embedding_weights.std().
glove_embedding_weights = glove_embedding_weights[:index]
glove_embedding_weights *= globale_scale

## Use Glove to Initialize Embedding Matrix

In [152]:
from nltk import PorterStemmer, LancasterStemmer, WordNetLemmatizer

In [None]:
# Generate a random embedding with the same scale as the (rescaled) GloVe weights.
# A uniform distribution on [-a, a] has standard deviation a / sqrt(3), so
# a = std * sqrt(3) = std * sqrt(12) / 2 reproduces the GloVe standard deviation.
np.random.seed(SEED)
shape = (VOCAB_SIZE, EMBEDDING_DIM)
scale = glove_embedding_weights.std() * np.sqrt(12) / 2 
embedding = np.random.uniform(low=-scale, high=scale, size=shape)

In [None]:
# Word normalisers, tried in this order as fallbacks when a vocabulary word
# has no exact GloVe entry.
# NOTE(review): WordNetLemmatizer presumably needs the NLTK 'wordnet' corpus
# to be downloaded beforehand; confirm in the target environment.
wnl = WordNetLemmatizer()
porter = PorterStemmer()
lancaster = LancasterStemmer()

In [None]:
# Copy GloVe vectors into `embedding` for the words with index 1..VOCAB_SIZE-1.
# Row 0 is never assigned here and keeps its random initialisation.
count = 0
n_considered = 0
for i in range(1, VOCAB_SIZE):
    # Use .get so a vocabulary smaller than VOCAB_SIZE does not raise KeyError.
    w = index2word.get(i)
    if w is None:
        continue
    n_considered += 1
    g = glove_index_dict.get(w)
    if g is None:
        # Fall back to normalised forms, mildest first; stop at the first hit
        # so later normalisers only run when needed.
        for normalise in (wnl.lemmatize, porter.stem, lancaster.stem):
            g = glove_index_dict.get(normalise(w))
            if g is not None:
                break
    if g is not None:
        embedding[i, :] = glove_embedding_weights[g, :]
        count += 1
# Report the hit rate over the tokens actually examined (guarding against zero).
print('{num_tokens}-{per:.2f}% tokens in vocab found in glove and copied to embedding.'.format(num_tokens=count, per=count/float(max(n_considered, 1))*100))

- **Save Dictionaries**

In [18]:
import pickle

In [24]:
# Persist both vocabulary mappings as one (word2index, index2word) tuple,
# using the highest pickle protocol available.
vocab_maps = (word2index, index2word)
with open('/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/index.pkl', 'wb') as fp:
    pickle.dump(vocab_maps, fp, protocol=pickle.HIGHEST_PROTOCOL)