In [2]:
import nltk
import re
import os

## 1. Initial cleaning

Read original data

In [4]:
original_raw = open('original.txt').read()
modern_raw = open('modern.txt').read()

A function for processing text

In [5]:
def process_text(data, sub_list=['ì','î','`', 'ó']):
    for sub_char in sub_list:
        data = re.sub(sub_char, '', data)
        
    data = re.sub('í', "'", data)
    
    # Delete empty lines
    while ('\n\n' in data):
        data = re.sub('\n\n', '\n', data)
    
    # Remove blanks at end of paragraph
    while (' \n' in data):
        data = re.sub(' \n', '\n', data)
        
    # Mask \n to prevent from removing
    data = re.sub('\n', '<\n>', data)
    
    # Seperate punctuations, insert space before and after 
    
    data = re.sub('([.,!?;:])', r' \1 ', data)
    data = re.sub('\s{2,}', ' ', data)
    
    # Put \n back
    data = re.sub('<\n>', '\n', data)
    return data

In [6]:
original_all_cleaned = process_text(original_raw)
modern_all_cleaned = process_text(modern_raw)

In [1]:
#original_all_cleaned

Save processed files. (Make sure to delete original ones from directory first)

In [7]:
open('original_all_cleaned.txt', 'w').write(original_all_cleaned)

In [8]:
open('modern_all_cleaned.txt', 'w').write(modern_all_cleaned)

## 2. Split into train/valid/test sets

In [22]:
check_equal = (len(original_all_cleaned.splitlines())==len(modern_all_cleaned.splitlines()))
                   
print('Check original & modern text has same num of paragraph:', check_equal)

('Check original & modern text has same num of paragraph:', True)


In [23]:
train_size = int(len(original_all_cleaned.splitlines()) * 0.6)
valid_size = int(len(original_all_cleaned.splitlines()) * 0.2) + 1
test_size = int(len(original_all_cleaned.splitlines()) * 0.2) + 1

print(len(original_all_cleaned.splitlines()))
print(train_size + valid_size + test_size)

3269
3269


In [26]:
train_ori_list = original_all_cleaned.splitlines()[: train_size]
train_ori = '\n'.join(train_ori_list)

valid_ori_list = original_all_cleaned.splitlines()[train_size: train_size + valid_size]
valid_ori = '\n'.join(valid_ori_list)

test_ori_list = original_all_cleaned.splitlines()[train_size + valid_size : ]
test_ori = '\n'.join(test_ori_list)

train_mod_list = modern_all_cleaned.splitlines()[: train_size]
train_mod = '\n'.join(train_mod_list)

valid_mod_list = modern_all_cleaned.splitlines()[train_size: train_size + valid_size]
valid_mod = '\n'.join(valid_mod_list)

test_mod_list = modern_all_cleaned.splitlines()[train_size + valid_size : ]
test_mod = '\n'.join(test_mod_list)

In [29]:
open('train.original.nltktok', 'w').write(train_ori)
open('valid.original.nltktok', 'w').write(valid_ori)
open('test.original.nltktok', 'w').write(test_ori)
open('train.modern.nltktok', 'w').write(train_mod)
open('valid.modern.nltktok', 'w').write(valid_mod)
open('test.modern.nltktok', 'w').write(test_mod)

## 2. More cleaning with NLTK (in progress)

In [20]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/yuweiwang/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [21]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

In [51]:
original_corpus = PlaintextCorpusReader(os.getcwd(), 'original_all_cleaned.txt')
modern_corpus = PlaintextCorpusReader(os.getcwd(), 'modern_all_cleaned.txt')

In [52]:
original_para = original_corpus.paras()
print("Total paragraphs in this corpus:", len(original_para))

original_sent = original_corpus.sents()
print("Total sentences in this corpus: ", len(original_sent))
print("First sentence in this corpus: ", original_sent[0])

original_word = original_corpus.words()
print("Words in this corpus: ", original_word)

('Total paragraphs in this corpus:', 3269)
('Total sentences in this corpus: ', 7656)
('First sentence in this corpus: ', [u'It', u'was', u'the', u'best', u'of', u'times', u',', u'it', u'was', u'the', u'worst', u'of', u'times', u',', u'it', u'was', u'the', u'age', u'of', u'wisdom', u',', u'it', u'was', u'the', u'age', u'of', u'foolishness', u',', u'it', u'was', u'the', u'epoch', u'of', u'belief', u',', u'it', u'was', u'the', u'epoch', u'ofincredulity', u',', u'it', u'was', u'the', u'season', u'of', u'Light', u',', u'it', u'was', u'the', u'season', u'of', u'Darkness', u',', u'it', u'wasthe', u'spring', u'of', u'hope', u',', u'it', u'was', u'the', u'winter', u'of', u'despair', u',', u'we', u'had', u'everything', u'before', u'us', u',', u'we', u'had', u'nothing', u'before', u'us', u',', u'we', u'were', u'all', u'going', u'direct', u'to', u'Heaven', u',', u'we', u'were', u'all', u'goingdirect', u'the', u'other', u'wayin', u'short', u',', u'the', u'period', u'was', u'so', u'far', u'like', u

## 3. Check load data function (in progress)

### 3.1 Shakespear text input

In [67]:
inp_data = open('sample',"r").readlines()

In [10]:
def preprocess(text_rows):
	return [row.strip().lower().split(' ') for row in text_rows]

In [73]:
print(inputs[0:5])

[['a', 'jumbled', 'confession', 'can', 'only', 'receive', 'a', 'jumbled', 'absolution', '.'], ['i', 'love', 'rich', "capulet's", 'daughter', '.'], ["we're", 'bound', 'to', 'each', 'other', 'in', 'every', 'possible', 'way', ',', 'except', 'we', 'need', 'you', 'to', 'marry', 'us', '.'], ["i'll", 'tell', 'you', 'more', 'later', 'about', 'when', 'and', 'where', 'we', 'met', ',', 'how', 'we', 'fell', 'in', 'love', ',', 'and', 'how', 'we', 'exchanged', 'promises', ',', 'but', 'now', "i'm", 'begging', 'you', ':', 'please', ',', 'agree', 'to', 'marry', 'us', 'today', '.'], ['holy', 'saint', 'francis', ',', 'this', 'is', 'a', 'drastic', 'change', '!']]


### 3.2 New input

In [30]:
new_data = open('train.original.nltktok',"r").readlines()
new_inputs = preprocess(new_data)
print(new_inputs[0:3])

[['it', 'was', 'the', 'best', 'of', 'times', ',', 'it', 'was', 'the', 'worst', 'of', 'times', ',', 'it', 'was', 'the', 'age', 'of', 'wisdom', ',', 'it', 'was', 'the', 'age', 'of', 'foolishness', ',', 'it', 'was', 'the', 'epoch', 'of', 'belief', ',', 'it', 'was', 'the', 'epoch', 'of', 'incredulity', ',', 'it', 'was', 'the', 'season', 'of', 'light', ',', 'it', 'was', 'the', 'season', 'of', 'darkness', ',', 'it', 'was', 'the', 'spring', 'of', 'hope', ',', 'it', 'was', 'the', 'winter', 'of', 'despair', ',', 'we', 'had', 'everything', 'before', 'us', ',', 'we', 'had', 'nothing', 'before', 'us', ',', 'we', 'were', 'all', 'going', 'direct', 'to', 'heaven', ',', 'we', 'were', 'all', 'going', 'direct', 'the', 'other', 'wayin', 'short', ',', 'the', 'period', 'was', 'so', 'far', 'like', 'the', 'present', 'period', ',', 'that', 'some', 'of', 'its', 'noisiest', 'authorities', 'insisted', 'on', 'its', 'being', 'received', ',', 'for', 'good', 'or', 'for', 'evil', ',', 'in', 'the', 'superlative', 'deg