# EB5002 Text Processing

# Data Exploration

## Set the working directory

In [1]:
import os

# Project root directory. It defaults to the original author's checkout location;
# set the EB5002_HOME environment variable to run this notebook from another
# location without editing the cell.
PROJECT_DIR = os.environ.get(
    'EB5002_HOME',
    os.path.join(os.path.sep, 'home', 'tkokkeng', 'Documents', 'EB5002-TextProcessing'))

# Later cells open files with paths relative to the working directory (e.g. 'data/...').
os.chdir(PROJECT_DIR)
os.getcwd()

'/home/tkokkeng/Documents/EB5002-TextProcessing'

In [2]:
import sys

# Make the `source` folder under the current working directory importable.
# The membership check keeps re-runs of this cell from adding a duplicate entry.
source_dir = os.path.join(os.getcwd(), 'source')
if source_dir not in sys.path:
    sys.path.append(source_dir)
sys.path

['/home/tkokkeng/Documents/EB5002-TextProcessing',
 '/home/tkokkeng/Documents/EB5002-TextProcessing/source',
 '/home/tkokkeng/python/python367/ptvenv/lib/python36.zip',
 '/home/tkokkeng/python/python367/ptvenv/lib/python3.6',
 '/home/tkokkeng/python/python367/ptvenv/lib/python3.6/lib-dynload',
 '/usr/lib/python3.6',
 '',
 '/home/tkokkeng/python/python367/ptvenv/lib/python3.6/site-packages',
 '/home/tkokkeng/.local/lib/python3.6/site-packages',
 '/usr/local/lib/python3.6/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/home/tkokkeng/.local/lib/python3.6/site-packages/IPython/extensions',
 '/home/tkokkeng/.ipython']

## Import libraries

In [3]:
import pandas as pd
import re
import io
import json

In [4]:
# Show up to 200 characters per column when displaying frames.
# The fully-qualified option name is used instead of the abbreviated 'max_colwidth',
# which is resolved by pattern matching and could become ambiguous.
pd.set_option('display.max_colwidth', 200)

## Load data, pre-process and save to file

Before this step, the irrelevant paragraphs at the top and bottom of the text were removed manually from the data.

In [5]:
# Pre-process one book: copy the manually trimmed text to a new file, dropping
# lines that are not part of the running prose.

# Lines matching any of these patterns (anchored at the start of the line by
# re.match) are dropped. Raw strings keep the regexes exactly as before while
# avoiding invalid escape sequences such as '\ ' in ordinary string literals.
drop_patterns = (
    re.compile(r'^\ *\[.*\]$'),  # remove e.g. [Picture: ...]
    re.compile(r'^\ *Chapter|\ *chapter|\ *CHAPTER'),  # remove e.g. Chapter I
    re.compile(r'^\ *[\*]+\ *'),  # remove combinations of * * * * * * *
)

input_path = os.path.join('data', '1289-0-lesstopbottom.txt')
output_path = os.path.join('data', '1289-0-preprocessed.txt')

# The output file is opened first, so it is created (possibly empty) even when
# the input file is missing. The `with` block guarantees it is closed if an
# error occurs while filtering.
with open(output_path, mode='w', encoding='utf-8') as outfile:
    if os.path.isfile(input_path):
        # 'utf-8-sig' ignores a leading Byte Order Marker (BOM).
        with open(input_path, mode='r', encoding='utf-8-sig') as infile:
            for a_line in infile:
                # a_line.isupper() removes titles written entirely in capitals.
                if not (a_line.isupper()
                        or any(pattern.match(a_line) for pattern in drop_patterns)):
                    outfile.write(a_line)

In [6]:
# Read the whole pre-processed book into memory as one string.
preprocessed_path = os.path.join('data', '1289-0-preprocessed.txt')
with open(preprocessed_path, mode='r', encoding='utf-8') as infile:
    book1 = infile.read()

In [7]:
# Preview the first 2000 characters of the pre-processed text.
book1[:2000]

'UNDER none of the accredited ghostly circumstances, and environed by none\nof the conventional ghostly surroundings, did I first make acquaintance\nwith the house which is the subject of this Christmas piece.  I saw it in\nthe daylight, with the sun upon it.  There was no wind, no rain, no\nlightning, no thunder, no awful or unwonted circumstance, of any kind, to\nheighten its effect.  More than that: I had come to it direct from a\nrailway station: it was not more than a mile distant from the railway\nstation; and, as I stood outside the house, looking back upon the way I\nhad come, I could see the goods train running smoothly along the\nembankment in the valley.  I will not say that everything was utterly\ncommonplace, because I doubt if anything can be that, except to utterly\ncommonplace people—and there my vanity steps in; but, I will take it on\nmyself to say that anybody might see the house as I saw it, any fine\nautumn morning.\n\nThe manner of my lighting on it was this.\n\nI

## Tokenise the data

In [8]:
# Choose the sentence and word tokenizers used by the rest of the notebook.
try:  # Use the default NLTK tokenizer.
    from nltk import word_tokenize, sent_tokenize
    # Smoke test: confirm the tokenizers actually run.
    # Sometimes it doesn't work on some machines because of setup issues.
    word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])
except Exception:  # Not a bare `except:`, so KeyboardInterrupt / SystemExit still propagate.
    # Fall back to a naive sentence tokenizer and toktok.
    from nltk.tokenize import ToktokTokenizer

    def sent_tokenize(text):
        """Split text into sentences with a simple regex heuristic.

        A split happens on a run of spaces that follows '.' or '?' and precedes
        an uppercase letter, unless the character two places before the
        punctuation is an uppercase letter.
        See https://stackoverflow.com/a/25736515/610569
        """
        return re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', text)

    # Use the toktok tokenizer that requires no dependencies.
    toktok = ToktokTokenizer()
    word_tokenize = toktok.tokenize

In [9]:
# Split the book into sentences, then each sentence into lower-cased tokens.
book1_tokenised = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(book1)
]

In [10]:
# Show the tokens of the first sentence.
book1_tokenised[0]

['under',
 'none',
 'of',
 'the',
 'accredited',
 'ghostly',
 'circumstances',
 ',',
 'and',
 'environed',
 'by',
 'none',
 'of',
 'the',
 'conventional',
 'ghostly',
 'surroundings',
 ',',
 'did',
 'i',
 'first',
 'make',
 'acquaintance',
 'with',
 'the',
 'house',
 'which',
 'is',
 'the',
 'subject',
 'of',
 'this',
 'christmas',
 'piece',
 '.']

In [11]:
# Number of sentences produced by sent_tokenize for this book.
len(book1_tokenised)

507

## Pre-process, tokenise and save data to file

In [12]:
# Base names of the books to process; the loop below appends the
# '-lesstopbottom.txt' and '-preprocessed.txt' suffixes to build file paths.
filenames = ['1289-0', '1400-0', '1467-0', '27924-0', '564-0']  # add all the filenames of the books here.

* The irrelevant top and bottom parts of each text were removed manually, and the resulting files were saved to the folder .../manual-processed
* The pre-processed files will be saved to the folder .../pre-processed
* The consolidated tokenised data will be saved in .../final/all_books.txt

In [13]:
# Patterns for lines to drop (anchored at the start of the line by re.match).
# Raw strings keep the regexes exactly as before while avoiding invalid escape
# sequences such as '\ ' in ordinary string literals.
_PICTURE_RE = re.compile(r'^\ *\[.*\]$')  # e.g. [Picture: ...]
_CHAPTER_RE = re.compile(r'^\ *Chapter|\ *chapter|\ *CHAPTER')  # e.g. Chapter I
_ASTERISKS_RE = re.compile(r'^\ *[\*]+\ *')  # combinations of * * * * * * *


def _is_noise_line(a_line):
    """Return True if the line should be dropped from the pre-processed text.

    A line is dropped when it is a bracketed note such as [Picture: ...], has
    all of its cased characters in uppercase (titles), starts with a chapter
    heading, or starts with asterisks.
    """
    return bool(_PICTURE_RE.match(a_line)
                or a_line.isupper()
                or _CHAPTER_RE.match(a_line)
                or _ASTERISKS_RE.match(a_line))


# Tokenised sentences of every book, concatenated in the order of `filenames`.
all_books = []

for filename in filenames:

    input_path = os.path.join('data', 'manual-processed', filename + '-lesstopbottom.txt')
    output_path = os.path.join('data', 'pre-processed', filename + '-preprocessed.txt')

    # The output file is opened first, so it is created (possibly empty) even
    # when the input file is missing. The `with` block guarantees it is closed
    # if an error occurs while filtering.
    with open(output_path, mode='w', encoding='utf-8') as outfile:
        if os.path.isfile(input_path):
            # 'utf-8-sig' ignores a leading Byte Order Marker (BOM).
            with open(input_path, mode='r', encoding='utf-8-sig') as infile:
                outfile.writelines(a_line for a_line in infile if not _is_noise_line(a_line))

    # Re-read the filtered text as a single string for sentence tokenisation.
    with open(output_path, mode='r', encoding='utf-8') as infile:
        a_book = infile.read()

    # One list of lower-cased tokens per sentence.
    a_book_tokenised = [ list(map(str.lower, word_tokenize(sent))) for sent in sent_tokenize(a_book) ]
    all_books.extend(a_book_tokenised)

In [14]:
# Write the consolidated tokenised sentences to disk as JSON.
# ensure_ascii=False keeps non-ASCII characters as-is in the UTF-8 file.
all_books_path = os.path.join('data', 'final', 'all_books.txt')
with io.open(all_books_path, 'w', encoding='utf8') as outfile:
    json.dump(all_books, outfile, ensure_ascii=False)

## Test the saved file

In [15]:
# Read the saved JSON file back into memory.
saved_books_path = os.path.join('data', 'final', 'all_books.txt')
with io.open(saved_books_path, encoding='utf8') as infile:
    tokenized_text = json.load(infile)


In [16]:
# Check the data re-loaded from disk (`tokenized_text`), not the in-memory
# `all_books`; otherwise the saved file is never actually tested.
assert tokenized_text == all_books, "saved file does not round-trip to the in-memory data"
len(tokenized_text)

8975

In [17]:
# Spot-check one sentence from the data re-loaded from disk (`tokenized_text`)
# rather than the in-memory `all_books`, so this inspects the saved file.
tokenized_text[2000]

['i',
 'informed',
 'him',
 'in',
 'exchange',
 'that',
 'my',
 'christian',
 'name',
 'was',
 'philip.',
 '“i',
 'don',
 '’',
 't',
 'take',
 'to',
 'philip',
 ',',
 '”',
 'said',
 'he',
 ',',
 'smiling',
 ',',
 '“for',
 'it',
 'sounds',
 'like',
 'a',
 'moral',
 'boy',
 'out',
 'of',
 'the',
 'spelling-book',
 ',',
 'who',
 'was',
 'so',
 'lazy',
 'that',
 'he',
 'fell',
 'into',
 'a',
 'pond',
 ',',
 'or',
 'so',
 'fat',
 'that',
 'he',
 'couldn',
 '’',
 't',
 'see',
 'out',
 'of',
 'his',
 'eyes',
 ',',
 'or',
 'so',
 'avaricious',
 'that',
 'he',
 'locked',
 'up',
 'his',
 'cake',
 'till',
 'the',
 'mice',
 'ate',
 'it',
 ',',
 'or',
 'so',
 'determined',
 'to',
 'go',
 'a',
 'bird',
 '’',
 's-nesting',
 'that',
 'he',
 'got',
 'himself',
 'eaten',
 'by',
 'bears',
 'who',
 'lived',
 'handy',
 'in',
 'the',
 'neighborhood',
 '.']