# Pre-processing Texts

This notebook is used to cleaned the text and put into chunks based on the model requirements.

Current process:
* Clean off top and bottom unnecessary words e.g. table of content
* Chunk into 450 tokens (for BERT) - can be changed later depends on the model
* Put into rows with Author name as csv


In [325]:
import nltk
import numpy as np
import random
import pandas as pd
import re
from collections import OrderedDict, defaultdict
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/Hadoop/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [326]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

## Support Functions

In [327]:
def split_by_chapter(text):
    thisdict = defaultdict()
    # split by chapter with one or two digits number with newline
    split_text = re.split("chapter (\d{1,2})+", text, flags=re.IGNORECASE)
    i = 0
    for s in split_text:
        if i%2 ==0 or i==0:
            Chapter = "chapter_" + str(int(i/2+1))
            thisdict[Chapter] = s
        i+=1
    return thisdict

In [328]:
def split_by_chapter_lst(text , lst):
    lst_j = '|'.join(lst)
    print(lst_j)
    thisdict = defaultdict()
    # split by chapter with one or two digits number with newline
    split_text = re.split(lst_j, text, flags=re.IGNORECASE)
    i = 0
    for s in split_text:
        Chapter = "chapter_" + str(i+1)
        thisdict[Chapter] = s
        i+=1
    return thisdict


In [329]:
def nth_split(s, delim, n): 
    p, c = -1, 0
    while c < n:  
        p = s.index(delim, p + 1)
        c += 1
        print(p,c)
    return [s[:p], s[p + len(delim):]]

## Read Data

In [330]:
# Author = "Charles_Dickens"
# Book = "Oliver_Twist"
# lst = [' CHAPTER I.' , ' CHAPTER II.' , ' CHAPTER III.' , ' CHAPTER IV.' ,
#  ' CHAPTER V.', ' CHAPTER VI.' , ' CHAPTER VII.' , ' CHAPTER VIII.',
#  ' CHAPTER IX .' ,' CHAPTER X.' ,' CHAPTER XI.' ,' CHAPTER XII.' ,' CHAPTER XIII.' ,
#  ' CHAPTER XIV.' ,' CHAPTER XV.' ,' CHAPTER XVI.' ,' CHAPTER XVII.' ,' CHAPTER XVIII.' ,
#  ' CHAPTER XIX .' ,' CHAPTER XX.' ,' CHAPTER XXI.' ,' CHAPTER XXII .' ,' CHAPTER XXIII.' ,
#  ' CHAPTER XXIV.' ,' CHAPTER XXV .' ,' CHAPTER XXVI.' ,' CHAPTER XXVII.' ,' CHAPTER XXVIII.' ,
#  ' CHAPTER XXIX.' ,' CHAPTER XXX.' ,' CHAPTER XXXI.' ,' CHAPTER XXXII.' ,' CHAPTER XXXIII.' ,
#  ' CHAPTER XXXIV .' ,' CHAPTER XXXV.' ,' CHAPTER XXXVI.' ,' CHAPTER XXXVII.' ,' CHAPTER XXXVIII.' ,
#  ' CHAPTER XXXIX.' ,' CHAPTER XL.' ,' CHAPTER XLI.' ,' CHAPTER XLII.' ,' CHAPTER XLIII.' ,
#  ' CHAPTER XLIV.' ,' CHAPTER XLV.' ,' CHAPTER XLVI.' ,' CHAPTER XLVII.' ,' CHAPTER XLVIII.' ,
#  ' CHAPTER XLIX.' ,' CHAPTER L.' ,' CHAPTER LI.' ,' CHAPTER LII.' ,' CHAPTER LIII.' ]
# first_clean = ' CHAPTER I.'
# n_split = 1
# last_clean = 'End of the Project Gutenberg EBook of Oliver Twist, by Charles Dickens' 
# file_name = "data/CD-books/" + Book + ".txt"
# split_func = 'split_by_chapter_lst'

In [331]:
# Author = "Charles_Dickens"
# Book = "The_Pickwick_Papers"
# lst = ['CHAPTER I.' , 'CHAPTER II.' , 'CHAPTER III.' , 'CHAPTER IV.' ,
#  'CHAPTER V.', 'CHAPTER VI.' , 'CHAPTER VII.' , 'CHAPTER VIII.',
#  'CHAPTER IX .' ,'CHAPTER X.' ,'CHAPTER XI.' ,'CHAPTER XII.' ,'CHAPTER XIII.' ,
#  'CHAPTER XIV.' ,'CHAPTER XV.' ,'CHAPTER XVI.' ,'CHAPTER XVII.' ,'CHAPTER XVIII.' ,
#  'CHAPTER XIX .' ,'CHAPTER XX.' ,'CHAPTER XXI.' ,'CHAPTER XXII .' ,'CHAPTER XXIII.' ,
#  'CHAPTER XXIV.' ,'CHAPTER XXV .' ,'CHAPTER XXVI.' ,'CHAPTER XXVII.' ,'CHAPTER XXVIII.' ,
#  'CHAPTER XXIX.' ,'CHAPTER XXX.' ,'CHAPTER XXXI.' ,'CHAPTER XXXII.' ,'CHAPTER XXXIII.' ,
#  'CHAPTER XXXIV .' ,'CHAPTER XXXV.' ,'CHAPTER XXXVI.' ,'CHAPTER XXXVII.' ,'CHAPTER XXXVIII.' ,
#  'CHAPTER XXXIX.' ,'CHAPTER XL.' ,'CHAPTER XLI.' ,'CHAPTER XLII.' ,'CHAPTER XLIII.' ,
#  'CHAPTER XLIV.' ,'CHAPTER XLV.' ,'CHAPTER XLVI.' ,'CHAPTER XLVII.' ,'CHAPTER XLVIII.' ,
#  'CHAPTER XLIX.' ,'CHAPTER L.' ,'CHAPTER LI.' ,'CHAPTER LII.' ,'CHAPTER LIII.','CHAPTER LIV.',
#  'CHAPTER LV.',   'CHAPTER LVI.' , 'CHAPTER LVII.'           ]

# first_clean = ' CHAPTER I.'
# n_split = 2
# last_clean = 'End of the Project Gutenberg EBook of The Pickwick Papers, by Charles Dickens'

# file_name = "data/CD-books/" + Book + ".txt"
# split_func = 'split_by_chapter_lst'

In [332]:
# Author = "Charles_Dickens"
# Book = "The_Life_And_Adventures_Of_Nicholas_Nickleby"
# first_clean = 'CHAPTER 1'
# n_split = 1
# last_clean = 'End of the Project Gutenberg EBook of The Life And Adventures Of Nicholas Nickleby, by Charles Dickens' 
# file_name = "data/CD-books/" + Book + ".txt"
# split_func = 'split_by_chapter'


In [333]:
# Author = "Charles_Dickens"
# Book = "04.The_Old_Curiosity_Shop"

# first_clean = 'CHAPTER 1'
# n_split = 2
# last_clean = 'End of Project Gutenberg’s The Old Curiosity Shop, by Charles Dickens'

# file_name = "data/CD-books/" + Book + ".txt"
# split_func = 'split_by_chapter'

In [359]:
# Author = "Charles_Dickens"
# Book = "05.Barnaby_Rudge"

# first_clean = 'Chapter 1'
# n_split = 2
# last_clean = 'End of the Project Gutenberg EBook of Barnaby Rudge, by Charles Dickens'

# file_name = "data/CD-books/" + Book + ".txt"
# split_func = 'split_by_chapter'
# tgt_file_name = "data/CD-books/"+Author+"_"+ Book + ".csv"


In [None]:
Author = "Charles_Dickens"
Book = "07.The_Life And Adventures Of Martin Chuzzlewit"
lst = ['CHAPTER ONE','CHAPTER TWO','CHAPTER THREE'
first_clean = 'CHAPTER ONE'
n_split = 1
last_clean = 'End of the Project Gutenberg EBook of Life And Adventures Of Martin Chuzzlewit, by Charles Dickens'

file_name = "data/CD-books/" + Book + ".txt"
split_func = 'split_by_chapter'
tgt_file_name = "data/CD-books/"+Author+"_"+ Book + ".csv"

In [360]:
f = open(file_name, "r", encoding="utf8")

In [361]:
cd = f.read().replace('\n', ' ')

### Clean top and botten unnecessary words

In [362]:
cd.find(first_clean)

8096

In [363]:
cd.find(last_clean)

1419281

In [364]:
cd = nth_split(cd,first_clean,n_split)[1]

#cd = cd.split(first_clean)[1]

8096 1
172180 2


In [365]:
len(cd)

1265863

In [366]:
cd = cd.split(last_clean)[0]

### Clean words, space, newline

In [367]:
cd = re.sub('([.,!?()""])', r' \1 ', cd)

In [368]:
if split_func == 'split_by_chapter':
    cd_dict = split_by_chapter(cd)
else:
    cd_dict = split_by_chapter_lst(cd,lst)

In [369]:
for key,item in cd_dict.items():
    ## remove addtional space
    #hemingway_dict[key] = item.strip(' ')
    ## remove newline and space at beginning and end
    cd_dict[key] = re.sub(' +', ' ', item)  

### Chunk data into 450 tokens each

BERT can hadle up to 512.

In [370]:
# First check the chapter length
for key,item in cd_dict.items():
    if len(item.split()) > 450:
        print(key, len(item.split()))

chapter_1 4848
chapter_2 2217
chapter_3 3614
chapter_4 4915
chapter_5 2099
chapter_6 4409
chapter_7 2644
chapter_8 4671
chapter_9 2148
chapter_10 4745
chapter_11 2638
chapter_12 3804
chapter_13 2917
chapter_14 4532
chapter_15 2252
chapter_16 4445
chapter_17 2259
chapter_18 4834
chapter_19 2317
chapter_20 5256
chapter_21 1849
chapter_22 4957
chapter_23 2308
chapter_24 4566
chapter_25 2568
chapter_26 4939
chapter_27 2059
chapter_28 4811
chapter_29 2267
chapter_30 4071
chapter_31 2911
chapter_32 5088
chapter_33 2067
chapter_34 4956
chapter_35 2108
chapter_36 4755
chapter_37 2572
chapter_38 3579
chapter_39 3569
chapter_40 4616
chapter_41 2688
chapter_42 4435
chapter_43 2881
chapter_44 3852
chapter_45 3368
chapter_46 3746
chapter_47 3464
chapter_48 3919
chapter_49 3159
chapter_50 5007
chapter_51 1947
chapter_52 3112
chapter_53 3979
chapter_54 3983
chapter_55 3630
chapter_56 4667
chapter_57 2913
chapter_58 4911
chapter_59 2365
chapter_60 4495
chapter_61 3028
chapter_62 4603
chapter_63 2516
c

In [371]:
# create two empty lists for df 
chapterindex = []
text = []

for key,item in cd_dict.items():
    # wordcount starts from 0
    wordcount = 0 
    # keep adding sentences until 512 tokens
    chapter_chunk_text = ""
    # sentences in each chapter
    sentences = tokenizer.tokenize(item)
    # loop through each sentence 
    for sent in sentences:
        # check the word count per sentence
        wv = len(sent.split())
        # if adding this sentence makes it over 450 tokens 
        if wordcount + wv >= 450:
            # push row and clean cache
            chapterindex.append(key)
            text.append(chapter_chunk_text)
            wordcount = 0
            chapter_chunk_text = sent
        # if not exceed, append the text and add wordcount
        chapter_chunk_text += ' ' + sent
        wordcount += wv
    # once a chapter finished, push all rest text 
    chapterindex.append(key)
    text.append(chapter_chunk_text)
        

In [372]:
# create dataframe 
cd_df = pd.DataFrame(
    {'chapter': chapterindex,
     'text': text
    })

In [373]:
# create total words count per row
cd_df['totalwords'] = cd_df['text'].str.split().str.len()

In [374]:
cd_df.iloc[0]['text']

' 0 It was on one of those mornings , common in early spring , when the year , fickle and changeable in its youth like all other created things , is undecided whether to step backward into winter or forward into summer , and in its uncertainty inclines now to the one and now to the other , and now to both at once--wooing summer in the sunshine , and lingering still with winter in the shade--it was , in short , on one of those mornings , when it is hot and cold , wet and dry , bright and lowering , sad and cheerful , withering and genial , in the compass of one short hour , that old John Willet , who was dropping asleep over the copper boiler , was roused by the sound of a horse’s feet , and glancing out at window , beheld a traveller of goodly promise , checking his bridle at the Maypole door . He was none of your flippant young fellows , who would call for a tankard of mulled ale , and make themselves as much at home as if they had ordered a hogshead of wine; none of your audacious yo

In [375]:
cd_df['author'] = Author

### Save as csv with author name

In [376]:

cd_df.to_csv(tgt_file_name)