In [1]:
#Preprocessing code
from cleaner_utils import super_cleaner
from preprocessing_utils import book_to_sentences, whole_word_MO_tokenization_and_masking
from gutenberg.acquire import load_etext

#Training code
from transformers import BertConfig
from transformers import BertForMaskedLM
from transformers import BertTokenizer
from transformers import AdamW
from transformers import Trainer, TrainingArguments

#General imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import re
import json
import torch

In [2]:
# Read scraped metadata from the gutenberg metadata database 
#(Original data was scraped by using https://github.com/c-w/gutenberg)
#The data is then further preprocessed by https://github.com/hugovk/gutenberg-metadata so it is actually usable.

f = open('gutenberg-metadata.json', 'r')
metadata = json.load(f)
f.close()

In [3]:
#retrieve how many english books there are that we can use
english_book_keys = [key for key in metadata.keys() if metadata[key]['language'] == ['en']]
len(english_book_keys)

13142

In [4]:
#Randomly select 20 books that we can query
np.random.seed(42)
rand_20_books = [x for x in np.random.choice(english_book_keys, size=20)]

In [5]:
# Titles and authors for the first 5 books
for book_id in rand_20_books[:5]:
    print(book_id, metadata[book_id]['author'], metadata[book_id]['title'])

16968 ['Browne, Porter Emerson', 'Towne, Charles Hanson'] ['The Bad Man: A Novel']
1741 ['Packard, Frank L. (Frank Lucius)'] ['The White Moll']
14575 ['Cable, George Washington'] ['Bylow Hill']
14334 ['Bower, B. M.'] ['The Range Dwellers']
22924 ['Douglas, Alan, Captain'] ['Pathfinder; or, The Missing Tenderfoot']


In [6]:
# The third book cant be retrieved because of faults in retrieval. This happens sometimes.
import traceback
import sys

try:
    super_cleaner(load_etext(14575), -1, verify_deletions=True)
except Exception as e:
    try:
        exc_info = sys.exc_info()
    finally:
        # Display the *original* exception
        traceback.print_exception(*exc_info)
        del exc_info


Traceback (most recent call last):
  File "<ipython-input-6-0358ce9648a3>", line 6, in <module>
    super_cleaner(load_etext(14575), -1, verify_deletions=True)
  File "C:\Users\s145733\Anaconda3\lib\site-packages\gutenberg\acquire\text.py", line 152, in load_etext
    text = cache.read().decode('utf-8')
  File "C:\Users\s145733\Anaconda3\lib\gzip.py", line 292, in read
    return self._buffer.read(size)
  File "C:\Users\s145733\Anaconda3\lib\gzip.py", line 470, in read
    self._read_eof()
  File "C:\Users\s145733\Anaconda3\lib\gzip.py", line 516, in _read_eof
    raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
gzip.BadGzipFile: CRC check failed 0x0 != 0xd0c5998f


In [7]:
# original unprocessed text
text = load_etext(50000)[:1000]

text

'The Project Gutenberg EBook of John Gutenberg, by Franz von Dingelstedt\r\n\r\nThis eBook is for the use of anyone anywhere at no cost and with\r\nalmost no restrictions whatsoever.  You may copy it, give it away or\r\nre-use it under the terms of the Project Gutenberg License included\r\nwith this eBook or online at www.gutenberg.org/license\r\n\r\n\r\nTitle: John Gutenberg\r\n       First Master Printer, His Acts and Most Remarkable\r\n       Discourses and his Death\r\n\r\nAuthor: Franz von Dingelstedt\r\n\r\nRelease Date: September 17, 2015 [EBook #50000]\r\n\r\nLanguage: English\r\n\r\nCharacter set encoding: UTF-8\r\n\r\n*** START OF THIS PROJECT GUTENBERG EBOOK JOHN GUTENBERG ***\r\n\r\n\r\n\r\n\r\nProduced by WebRover, Chris Curnow and the Online\r\nDistributed Proofreading Team at http://www.pgdp.net (This\r\nfile was produced from images generously made available\r\nby The Internet Archive)\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nJOHN GUTENBERG,\r\n\r\n  _First Master Printer

In [8]:
#Text with formatting
print(text)

The Project Gutenberg EBook of John Gutenberg, by Franz von Dingelstedt

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org/license


Title: John Gutenberg
       First Master Printer, His Acts and Most Remarkable
       Discourses and his Death

Author: Franz von Dingelstedt

Release Date: September 17, 2015 [EBook #50000]

Language: English

Character set encoding: UTF-8

*** START OF THIS PROJECT GUTENBERG EBOOK JOHN GUTENBERG ***




Produced by WebRover, Chris Curnow and the Online
Distributed Proofreading Team at http://www.pgdp.net (This
file was produced from images generously made available
by The Internet Archive)









JOHN GUTENBERG,

  _First Master Printer,_
  His Acts, and most remarkable Discourses,
  and his Death.

  FROM THE G

In [9]:
# Use the cleaner to retrieve cleaned text from the first book of the random selection
cleaned_book = super_cleaner(load_etext(16968), -1, verify_deletions=False)

In [10]:
#Text is now a single very long string
cleaned_book[:3000]

" Looking back now, after so many months of struggle and foreboding, he wondered how he had ever had the high courage to come to this strange country. Had he been a few years older he would not have started forth--he was sure of that now. But the flame of youth was in him, the sure sense that he could conquer where others had miserably failed; and, like all virile young Americans, he had love of adventure, and zest for the unknown was in his blood. The glamour of Arizona lured him; the color of these great hills and mountains he had come to love captivated him from the first. It was as if a siren beckoned, and he had to follow. For days he had been worried almost to the breaking point. Things had not shaped themselves as he had planned. Event piled upon event, and now disaster--definite disaster--threatened to descend upon him. All morning, despite the intense heat, he had been about the ranch, appraising this and that, mentally; pottering in the shed; looking at his horses--the few th

In [11]:
# Use helper function to process it into sentences using spaCy and with corrections
sentences = book_to_sentences(cleaned_book)
sentences[:10]

[' Looking back now, after so many months of struggle and foreboding, he wondered how he had ever had the high courage to come to this strange country.',
 'Had he been a few years older he would not have started forth--he was sure of that now.',
 'But the flame of youth was in him, the sure sense that he could conquer where others had miserably failed; and, like all virile young Americans, he had love of adventure, and zest for the unknown was in his blood.',
 'The glamour of Arizona lured him; the color of these great hills and mountains he had come to love captivated him from the first.',
 'It was as if a siren beckoned, and he had to follow.',
 'For days he had been worried almost to the breaking point.',
 'Things had not shaped themselves as he had planned.',
 'Event piled upon event, and now disaster--definite disaster--threatened to descend upon him.',
 'All morning, despite the intense heat, he had been about the ranch, appraising this and that, mentally; pottering in the shed; 

In [12]:
#Some of the very short sentences in the pre-training data
print(sorted(sentences, key=len)[:200])

['Si.', 'Ah!', 'No!', 'Ah!', 'Ah!', 'Si.', 'So!', 'Ah!', 'No.', 'Ah!', 'Ah!', 'Oh!', 'Ah!', 'So!', 'Eh?', 'So!', 'No!', 'Ah!', 'Bah!', 'God!', 'Pah!', "H'm!", 'Why?', "H'm!", 'Yes!', 'Yes!', 'Come!', 'I am.', 'I am!', 'Sure!', "Red'!", 'Bueno!', 'Bueno!', 'Bueno!', 'Quick!', 'Pedro!', 'Sabbe!', 'Eh?" "', 'Bueno!', 'Bueno!', 'Oh!" "', 'Never!', 'Never!', 'Humph!', 'Bueno!', 'No." "', 'Bueno.', 'Hello!', 'But no!', 'He was.', 'Zey go.', 'Why?" "', "I 'ave.", 'Why not?', 'he said.', 'he said.', 'he said.', "Rob 'im?", 'Shut up!', 'For why?', 'he said.', 'Children?', 'he cried.', 'she said.', 'he cried.', 'he ended.', 'he asked.', 'Business?', 'No money.', 'And love!', 'Lopez?" "', 'he asked.', 'he cried.', 'he asked.', 'he cried.', 'he cried.', 'he cried.', 'he whined.', 'he yelled.', 'she cried.', 'he begged.', 'Of course.', 'It is you!', 'Pardon me!', 'Really?" "', 'A what?" "', "Like 'ell!", 'Y-y-y-yes.', 'Only good.', 'Joking?" "', 'Ze ranger.', 'No, indeed!', 'Let her go?', 'Not at a

In [13]:
# Load some stuff
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
nlp = spacy.load("en_core_web_sm")


In [14]:
#Amount of tokens in this book which will be used to train BERT
#Warning can be safely ignored as this part is just for counting tokens


book_inputs = tokenizer(cleaned_book, 
                   add_special_tokens=False,
                   return_attention_mask=False,
                   return_token_type_ids=False)

book_num_tokens = len(book_inputs['input_ids'])

book_num_tokens

Token indices sequence length is longer than the specified maximum sequence length for this model (75308 > 512). Running this sequence through the model will result in indexing errors


75308

In [15]:
#Find the longest sequence in terms of BERT-tokens
sentence_inputs = tokenizer(sentences, 
                            return_attention_mask=False,
                            return_token_type_ids=False)
longest_sequence = max([len(x) for x in sentence_inputs['input_ids']])
longest_sequence

510

In [16]:
sentence_inputs

{'input_ids': [[101, 2559, 2067, 2085, 1010, 2044, 2061, 2116, 2706, 1997, 5998, 1998, 18921, 5092, 4667, 1010, 2002, 4999, 2129, 2002, 2018, 2412, 2018, 1996, 2152, 8424, 2000, 2272, 2000, 2023, 4326, 2406, 1012, 102], [101, 2018, 2002, 2042, 1037, 2261, 2086, 3080, 2002, 2052, 2025, 2031, 2318, 5743, 1011, 1011, 2002, 2001, 2469, 1997, 2008, 2085, 1012, 102], [101, 2021, 1996, 8457, 1997, 3360, 2001, 1999, 2032, 1010, 1996, 2469, 3168, 2008, 2002, 2071, 16152, 2073, 2500, 2018, 28616, 6906, 6321, 3478, 1025, 1998, 1010, 2066, 2035, 6819, 15928, 2063, 2402, 4841, 1010, 2002, 2018, 2293, 1997, 6172, 1010, 1998, 27838, 3367, 2005, 1996, 4242, 2001, 1999, 2010, 2668, 1012, 102], [101, 1996, 22439, 1997, 5334, 26673, 2032, 1025, 1996, 3609, 1997, 2122, 2307, 4564, 1998, 4020, 2002, 2018, 2272, 2000, 2293, 14408, 21967, 2032, 2013, 1996, 2034, 1012, 102], [101, 2009, 2001, 2004, 2065, 1037, 19558, 10272, 17799, 1010, 1998, 2002, 2018, 2000, 3582, 1012, 102], [101, 2005, 2420, 2002, 2018, 2

In [17]:
inputs_sent0 = whole_word_MO_tokenization_and_masking(tokenizer, nlp, sentences[0])

loading: 14:08:02.383538
 Looking back now, after so many months of struggle and foreboding, he wondered how he had ever had the high courage to come to this strange country.
['looking', 'back', 'now', ',', 'after', 'so', 'many', 'months', 'of', 'struggle', 'and', 'fore', '##bo', '##ding', ',', 'he', 'wondered', 'how', 'he', 'had', 'ever', 'had', 'the', 'high', 'courage', 'to', 'come', 'to', 'this', 'strange', 'country', '.']
pos-start: 14:08:02.404483
SPACE looking back now, after so many months of struggle and foreboding, he wondered how he had ever had the high courage to come to this strange country.
VERB looking back now, after so many months of struggle and foreboding, he [MASK] how he [MASK] ever [MASK] the high courage to [MASK] to this strange country.
ADV looking [MASK] [MASK], after [MASK] many months of struggle and foreboding, he wondered [MASK] he had [MASK] had the high courage to come to this strange country.
PUNCT looking back now [MASK] after so many months of struggl

In [18]:
inputs_sent1 = whole_word_MO_tokenization_and_masking(tokenizer, nlp, sentences[1])

loading: 14:08:02.436399
Had he been a few years older he would not have started forth--he was sure of that now.
['had', 'he', 'been', 'a', 'few', 'years', 'older', 'he', 'would', 'not', 'have', 'started', 'forth', '-', '-', 'he', 'was', 'sure', 'of', 'that', 'now', '.']
pos-start: 14:08:02.454376
VERB had he [MASK] a few years older he would not [MASK] [MASK] forth - - he [MASK] sure of that now.
PRON had [MASK] been a few years older [MASK] would not have started forth - - [MASK] was sure of that now.
DET had he been [MASK] few years older he would not have started forth - - he was sure of [MASK] now.
ADJ had he been a [MASK] years [MASK] he would not have started forth - - he was [MASK] of that now.
NOUN had he been a few [MASK] older he would not have started forth - - he was sure of that now.
AUX had he been a few years older he [MASK] not have started forth - - he was sure of that now.
PART had he been a few years older he would [MASK] have started forth - - he was sure of that n

In [19]:
inputs_sent0

{'input_ids': tensor([[ 101, 2559, 2067,  ...,    0,    0,    0],
        [ 101, 2559, 2067,  ...,    0,    0,    0],
        [ 101, 2559,  103,  ...,    0,    0,    0],
        ...,
        [ 101, 2559, 2067,  ...,    0,    0,    0],
        [ 101, 2298, 2067,  ...,    0,    0,    0],
        [ 101, 2559, 2067,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[ 101, 2559, 2067,  ...,    0,    0,    0],
        [ 101, 2559, 2067,  ...,    0,    0,    0],
        [ 101, 2559, 2067,  ...,    0,    0,    0],
        ...,
      

In [20]:
#Future work: combine sentences for more efficient batchings
concat_inputs = {k: torch.cat((v, inputs_sent1[k]), 0) for k,v in inputs_sent0.items()}
concat_inputs

{'input_ids': tensor([[ 101, 2559, 2067,  ...,    0,    0,    0],
         [ 101, 2559, 2067,  ...,    0,    0,    0],
         [ 101, 2559,  103,  ...,    0,    0,    0],
         ...,
         [ 101, 2018, 2002,  ...,    0,    0,    0],
         [ 101, 2031, 2002,  ...,    0,    0,    0],
         [ 101, 2018, 2002,  ...,    0,    0,    0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[ 101, 2559, 2067,  ...,    0,    0,    0],
         [ 101, 2559, 2067,  ...,    0,    0,    0],
         [ 101, 2559, 2067,  ...,    0,    0,    

In [21]:
concat_inputs['input_ids'].shape

torch.Size([25, 512])

In [22]:
class MODataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = {key: val for key, val in encodings.items() if key != 'labels'}
        self.labels = encodings['labels']

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MODataset(concat_inputs)
train_dataset

<__main__.MODataset at 0x24980f305b0>

In [24]:
model = BertForMaskedLM(config=BertConfig())

In [31]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=1,  # batch size per device during training
    #per_device_eval_batch_size=256,   # batch size for evaluation
    learning_rate=1e-5,     
    logging_dir='./logs',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=None            # evaluation dataset
)

In [32]:
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=25.0, style=ProgressStyle(description_wid…

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 4.00 GiB total capacity; 3.00 GiB already allocated; 5.17 MiB free; 3.00 GiB reserved in total by PyTorch)

In [34]:
torch.cuda.is_available()

True

In [35]:
torch.cuda.get_device_name(0)

'Quadro M1200'