In [78]:
import nltk
import re
import os

## 1. Initial cleaning

Read original data

In [135]:
# Read both corpora inside `with` blocks so each file handle is closed
# as soon as its contents are in memory.
with open('original_all.txt') as original_file:
    original_raw = original_file.read()
with open('modern_all.txt') as modern_file:
    modern_raw = modern_file.read()

A function for processing text

In [136]:
def process_text(data, sub_list=['ì','î','`', 'ó']):
    """Clean raw text and pad punctuation with spaces, keeping line breaks.

    Steps, in order:
      1. delete every entry of ``sub_list`` (matched literally, not as regex);
      2. replace 'í' with an apostrophe;
      3. surround each of ``. , ! ? ; :`` with a space on both sides;
      4. collapse runs of two or more non-newline whitespace characters
         into one space;
      5. strip whitespace that sits directly before a newline;
      6. drop empty lines.

    Args:
        data: the text to clean.
        sub_list: strings to delete from ``data``. The default list is only
            read here, never mutated.

    Returns:
        The cleaned text. Newlines between non-empty lines are preserved, so
        a file with one paragraph per line keeps that layout.
    """
    # Literal replacement: an entry such as '.' or '+' must not be interpreted
    # as regular-expression syntax.
    # NOTE(review): deleting 'ó' joins the neighbouring words when it stands in
    # for a dash in the input (e.g. "wayin short") — confirm this is intended.
    for sub_char in sub_list:
        data = data.replace(sub_char, '')

    data = data.replace('í', "'")

    # Separate punctuation: insert a space before and after each mark.
    data = re.sub(r'([.,!?;:])', r' \1 ', data)

    # Collapse horizontal whitespace only. Matching '\n' here would turn the
    # " \n" produced by the padding above into a space and merge lines.
    data = re.sub(r'[^\S\n]{2,}', ' ', data)

    # Remove blanks at the end of each line (runs after the padding step,
    # which itself leaves a blank after line-final punctuation).
    data = re.sub(r'[^\S\n]+\n', '\n', data)

    # Delete empty lines, including those that only held whitespace.
    data = re.sub(r'\n{2,}', '\n', data)

    return data

In [137]:
# Clean both raw corpora with process_text's default settings.
original_all_cleaned, modern_all_cleaned = map(
    process_text, (original_raw, modern_raw)
)

Save processed files. (Make sure to delete original ones from directory first)

In [138]:
# Write through a context manager so the file is flushed and closed before
# later cells read 'original_all_cleaned.txt' back in.
with open('original_all_cleaned.txt', 'w') as original_out:
    original_out.write(original_all_cleaned)

In [139]:
# Write through a context manager so the file is flushed and closed before
# later cells read 'modern_all_cleaned.txt' back in.
with open('modern_all_cleaned.txt', 'w') as modern_out:
    modern_out.write(modern_all_cleaned)

## 2. More cleaning with NLTK (in progress)

In [20]:
# Fetch NLTK's 'punkt' tokenizer package.
# NOTE(review): presumably needed by the corpus reader's sentence splitting
# used further down — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/yuweiwang/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [21]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

In [51]:
# The cleaned files were written with relative paths, so the current working
# directory serves as the corpus root for both readers.
corpus_root = os.getcwd()
original_corpus = PlaintextCorpusReader(corpus_root, 'original_all_cleaned.txt')
modern_corpus = PlaintextCorpusReader(corpus_root, 'modern_all_cleaned.txt')

In [52]:
# Paragraph view of the original corpus; report how many paragraphs it holds.
original_para = original_corpus.paras()
print("Total paragraphs in this corpus:", len(original_para))

# Sentence view: report the sentence count and show the first sentence
# (a list of tokens) as a sanity check on the cleaning step.
original_sent = original_corpus.sents()
print("Total sentences in this corpus: ", len(original_sent))
print("First sentence in this corpus: ", original_sent[0])

# Word view: note this prints the view object itself, not its length,
# unlike the two counts above.
original_word = original_corpus.words()
print("Words in this corpus: ", original_word)

('Total paragraphs in this corpus:', 3269)
('Total sentences in this corpus: ', 7656)
('First sentence in this corpus: ', [u'It', u'was', u'the', u'best', u'of', u'times', u',', u'it', u'was', u'the', u'worst', u'of', u'times', u',', u'it', u'was', u'the', u'age', u'of', u'wisdom', u',', u'it', u'was', u'the', u'age', u'of', u'foolishness', u',', u'it', u'was', u'the', u'epoch', u'of', u'belief', u',', u'it', u'was', u'the', u'epoch', u'ofincredulity', u',', u'it', u'was', u'the', u'season', u'of', u'Light', u',', u'it', u'was', u'the', u'season', u'of', u'Darkness', u',', u'it', u'wasthe', u'spring', u'of', u'hope', u',', u'it', u'was', u'the', u'winter', u'of', u'despair', u',', u'we', u'had', u'everything', u'before', u'us', u',', u'we', u'had', u'nothing', u'before', u'us', u',', u'we', u'were', u'all', u'going', u'direct', u'to', u'Heaven', u',', u'we', u'were', u'all', u'goingdirect', u'the', u'other', u'wayin', u'short', u',', u'the', u'period', u'was', u'so', u'far', u'like', u

## 3. Check load data function (in progress)

### 3.1 Shakespeare text input

In [67]:
# Read the sample rows inside a `with` block so the file handle is closed.
with open('sample', "r") as sample_file:
    inp_data = sample_file.readlines()

In [70]:
def preprocess(text_rows):
    """Turn raw text rows into lists of lower-cased tokens.

    Each row is stripped of surrounding whitespace, lower-cased and split on
    single spaces, so consecutive spaces produce empty-string tokens and an
    empty row produces [''].

    Args:
        text_rows: iterable of strings, one row of text each.

    Returns:
        A list with one token list per input row, in input order.
    """
    tokenised = []
    for line in text_rows:
        tokenised.append(line.strip().lower().split(' '))
    return tokenised

# Tokenise the rows read from the 'sample' file.
inputs = preprocess(inp_data)

In [73]:
# Show the first five tokenised rows.
print(inputs[:5])

[['a', 'jumbled', 'confession', 'can', 'only', 'receive', 'a', 'jumbled', 'absolution', '.'], ['i', 'love', 'rich', "capulet's", 'daughter', '.'], ["we're", 'bound', 'to', 'each', 'other', 'in', 'every', 'possible', 'way', ',', 'except', 'we', 'need', 'you', 'to', 'marry', 'us', '.'], ["i'll", 'tell', 'you', 'more', 'later', 'about', 'when', 'and', 'where', 'we', 'met', ',', 'how', 'we', 'fell', 'in', 'love', ',', 'and', 'how', 'we', 'exchanged', 'promises', ',', 'but', 'now', "i'm", 'begging', 'you', ':', 'please', ',', 'agree', 'to', 'marry', 'us', 'today', '.'], ['holy', 'saint', 'francis', ',', 'this', 'is', 'a', 'drastic', 'change', '!']]


### 3.2 New input

In [140]:
# Read the cleaned corpus inside a `with` block so the file handle is closed,
# then tokenise it the same way as the sample input.
with open('original_all_cleaned.txt', "r") as cleaned_file:
    new_data = cleaned_file.readlines()
new_inputs = preprocess(new_data)
# Preview the first 100 tokens of the first row.
print(new_inputs[0][0:100])

['it', 'was', 'the', 'best', 'of', 'times', ',', 'it', 'was', 'the', 'worst', 'of', 'times', ',', 'it', 'was', 'the', 'age', 'of', 'wisdom', ',', 'it', 'was', 'the', 'age', 'of', 'foolishness', ',', 'it', 'was', 'the', 'epoch', 'of', 'belief', ',', 'it', 'was', 'the', 'epoch', 'ofincredulity', ',', 'it', 'was', 'the', 'season', 'of', 'light', ',', 'it', 'was', 'the', 'season', 'of', 'darkness', ',', 'it', 'wasthe', 'spring', 'of', 'hope', ',', 'it', 'was', 'the', 'winter', 'of', 'despair', ',', 'we', 'had', 'everything', 'before', 'us', ',', 'we', 'had', 'nothing', 'before', 'us', ',', 'we', 'were', 'all', 'going', 'direct', 'to', 'heaven', ',', 'we', 'were', 'all', 'goingdirect', 'the', 'other', 'wayin', 'short', ',', 'the', 'period', 'was']


Process split data

In [116]:
def pro_data(input_file, save_name):
    """Run ``process_text`` over one file and write the result to another.

    Args:
        input_file: path of the text file to read.
        save_name: path to write the processed text to. It is opened in 'w'
            mode, so an existing file at that path is overwritten.
    """
    # Context managers close both handles even if processing raises.
    with open(input_file) as source:
        raw_text = source.read()
    cleaned_text = process_text(raw_text)
    with open(save_name, 'w') as target:
        target.write(cleaned_text)

In [141]:
# (input path, output path) for every split, processed in this order.
para_file_pairs = [
    ('train.original.para', 'train.original.newpara'),
    ('train.modern.para', 'train.modern.newpara'),
    ('valid.original.para', 'valid.original.newpara'),
    ('valid.modern.para', 'valid.modern.newpara'),
    ('test.original.para', 'test.original.newpara'),
    ('test.modern.para', 'test.modern.newpara'),
]
for source_path, target_path in para_file_pairs:
    pro_data(source_path, target_path)