In [1]:
import sys
import pickle
import nltk
import re
import numpy as np
import random
import os

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ygao\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def make_sentences(datafile):
    """
    We will break the raw text in datafile into
    sentences. The nltk package will handle edge cases
    (ex. Mr. Potter) and will give us the list of sentences.
    """

    with open(datafile, 'r') as f:
        text = f.read()

    sentences = nltk.tokenize.sent_tokenize(text)
    return sentences

In [4]:
def prepare_data(filename,savename):

    original_data = filename
    # Break into sentences
    original_sentences = make_sentences(original_data)
    print("We have %i sentences." %(len(original_sentences)))
    
    print(original_sentences[0])
    # Store sentences into data file
    with open(savename, 'wb') as f:
        pickle.dump(original_sentences, f)

In [5]:
prepare_data('A_tale_of_two_cities.100kwords.original.txt','originaltwotale.p',)

We have 5809 sentences.
"She was mightily pleased to have your message, when I gave it her.


In [11]:
prepare_data('A_tale_of_two_cities.100kwords.obfuscated.txt','obfuscatedtwotale.p',)

We have 5809 sentences.
"She was mightily pleased to have your message, when I gave it her.


In [15]:
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_UNK = "_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

_WORD_SPLIT = re.compile("([.,!?\"':;)(])")
_DIGIT_RE = re.compile(R"\d")

In [91]:
def basic_tokenizer(sentence):
    """ Split sentence into list of tokens """
    words = []
    for space_separated_item in sentence.strip().split():
        words.extend(_WORD_SPLIT.split(space_separated_item))
    return [w for w in words if w] # if w removes the ""

def get_vocab(tokenized, max_vocab_size):
    """
    Get vocab_list, vocab_dict and rev_vocab_dict given the
    tokenized sentences.
    """
    # Replace word count
    vocab = {}
    for sentence in tokenized:
        for word in sentence:
            if word in vocab:
                vocab[word] += 1
            else:
                vocab[word] = 1
    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    if len(vocab_list) > max_vocab_size:
        vocab_list = vocab_list[:max_vocab_size]

    # Get vocab dict (word -> token) and rev dict (token -> word)
    vocab_dict = dict([(x,y) for (y,x) in enumerate(vocab_list)])
    rev_vocab_dict = {v: k for k, v in vocab_dict.items()}

    return vocab_list, vocab_dict, rev_vocab_dict

def sentence_to_token_ids(sentence, vocab_dict, target_lang,
    normalize_digits=True):
    """
    Convert a single sentence of words to token ids. If it is the target
    language, we will append an EOS token to the end.
    """
    if not normalize_digits:
        # replace words not in vocab_dict with UNK_ID
        tokens = [vocab_dict.get(w, UNK_ID) for w in sentence]
    else:
        tokens = [vocab_dict.get(_DIGIT_RE.sub("0", w), UNK_ID)
            for w in sentence]

    # Append EOS token if target langauge sentence

    return tokens


def data_to_token_ids(tokenized, vocab_dict,normalize_digits=True):
    """
    Convert tokens into ids used vocab_dict and normalize all digits
    to 0.
    """
    data_as_tokens = []
    seq_lens = []
    max_len = max(len(sentence) for sentence in tokenized)# 
    for sentence in tokenized:
        sentence=sentence[:max_len]
        token_ids = sentence_to_token_ids(sentence, vocab_dict, normalize_digits)
        # Padding
        data_as_tokens.append(token_ids + [PAD_ID]*(max_len - len(token_ids)))
        # Store original sequence length
        seq_lens.append(len(token_ids))

    return np.array(data_as_tokens), np.array(seq_lens)

def process_data(datafile, max_vocab_size):
    """
    Read the sentences from our datafiles.
    """
    with open(datafile, 'rb') as f:
        sentences = pickle.load(f)

    # Split into tokens
    tokenized = []
    for i in range(len(sentences)):
        tokenized.append(basic_tokenizer(sentences[i]))

    # Get vocab information
    vocab_list, vocab_dict, rev_vocab_dict = get_vocab(tokenized,
        max_vocab_size)

    # Convert data to token ids
    data_as_tokens, seq_lens = data_to_token_ids(tokenized, vocab_dict, normalize_digits=True)

    return tokenized, seq_lens, vocab_dict, rev_vocab_dict

In [92]:
def compare_a_the(x_text,y_text):
    theid=vocab_to_int["the"]
    aid=vocab_to_int["a"]
    total=0
    mix=0
    for i,word in enumerate(x_text):
        if word==theid or word==aid:
            total+=1
            if x_text[i]!=y_text[i]:
                mix+=1
    return mix,total

In [93]:
def process_file_by_dict(filename,vocab_dict):
    with open(filename, 'rb') as f:
        sentences = pickle.load(f)

    # Split into tokens
    tokenized = []
    for i in range(len(sentences)):
        tokenized.append(basic_tokenizer(sentences[i]))
    
    data_as_tokens, seq_lens = data_to_token_ids(tokenized, vocab_dict, normalize_digits=True)
    
    return tokenized,seq_lens

In [101]:
token_original,len_original, vocab_to_int, int_to_vocab = \
        process_data('originaltwotale.p', max_vocab_size=8000)

In [102]:
token_obfused,len_obfused=process_file_by_dict("obfuscatedtwotale.p",vocab_to_int)

In [103]:
len(token_original), len(token_obfused)

(5809, 5809)

In [109]:
token_label[:2], token_obfused[:2],len_original[:10], len_obfused[:10],max(len_original)

([['"',
   'She',
   'was',
   'mightily',
   'pleased',
   'to',
   'have',
   'your',
   'message',
   ',',
   'when',
   'I',
   'gave',
   'it',
   'her',
   '.'],
  ['Not',
   'that',
   'she',
   'showed',
   'she',
   'was',
   'pleased',
   ',',
   'but',
   'I',
   'suppose',
   'she',
   'was',
   '.',
   '"']],
 [['"',
   'She',
   'was',
   'mightily',
   'pleased',
   'to',
   'have',
   'your',
   'message',
   ',',
   'when',
   'I',
   'gave',
   'it',
   'her',
   '.'],
  ['Not',
   'that',
   'she',
   'showed',
   'she',
   'was',
   'pleased',
   ',',
   'but',
   'I',
   'suppose',
   'she',
   'was',
   '.',
   '"']],
 array([16, 15, 30, 14, 17, 27, 12, 13, 10, 25]),
 array([16, 15, 30, 14, 17, 27, 12, 13, 10, 25]),
 190)

In [108]:
with open('twotaledumped.p', 'wb') as f:
        pickle.dump((token_original,len_original,token_obfused,len_obfused), f)

In [90]:
mix,total=compare_a_the(int_label,int_text)
mix/total

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()