In [1]:
#Preprocessing code
from cleaner_utils import super_cleaner
from preprocessing_utils import whole_word_MO_tokenization_and_masking
from pretraining_data_utils import book_properties, make_df_book_properties

from gutenberg.acquire import load_etext

#Training code
from transformers import BertConfig
from transformers import BertForMaskedLM
from transformers import BertTokenizer
from transformers import AdamW
from transformers import Trainer, TrainingArguments

from torch.utils.data import DataLoader

#General imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import re
import json
import torch
import logging

In [2]:
# Read scraped metadata from the gutenberg metadata database 
#(Original data was scraped by using https://github.com/c-w/gutenberg)
#The data is then further preprocessed by https://github.com/hugovk/gutenberg-metadata so it is actually usable.

f = open('gutenberg-metadata.json', 'r')
metadata = json.load(f)
f.close()

In [3]:
#retrieve how many english books there are that we can use
english_book_keys = [key for key in metadata.keys() if metadata[key]['language'] == ['en']]
len(english_book_keys)

13142

In [4]:
#Randomly select 20 books that we can query
np.random.seed(42)
rand_20_books = [x for x in np.random.choice(english_book_keys, size=20)]

In [5]:
# Titles and authors for the first 5 books
for book_id in rand_20_books[:5]:
    print(book_id, metadata[book_id]['author'], metadata[book_id]['title'])

16968 ['Browne, Porter Emerson', 'Towne, Charles Hanson'] ['The Bad Man: A Novel']
1741 ['Packard, Frank L. (Frank Lucius)'] ['The White Moll']
14575 ['Cable, George Washington'] ['Bylow Hill']
14334 ['Bower, B. M.'] ['The Range Dwellers']
22924 ['Douglas, Alan, Captain'] ['Pathfinder; or, The Missing Tenderfoot']


In [6]:
# The third book cant be retrieved because of faults in retrieval. This happens sometimes.
import traceback
import sys

try:
    super_cleaner(load_etext(14575), -1, verify_deletions=True)
except Exception as e:
    try:
        exc_info = sys.exc_info()
    finally:
        # Display the *original* exception
        traceback.print_exception(*exc_info)
        del exc_info


Traceback (most recent call last):
  File "<ipython-input-6-0358ce9648a3>", line 6, in <module>
    super_cleaner(load_etext(14575), -1, verify_deletions=True)
  File "C:\Users\s145733\Anaconda3\lib\site-packages\gutenberg\acquire\text.py", line 152, in load_etext
    text = cache.read().decode('utf-8')
  File "C:\Users\s145733\Anaconda3\lib\gzip.py", line 292, in read
    return self._buffer.read(size)
  File "C:\Users\s145733\Anaconda3\lib\gzip.py", line 470, in read
    self._read_eof()
  File "C:\Users\s145733\Anaconda3\lib\gzip.py", line 516, in _read_eof
    raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
gzip.BadGzipFile: CRC check failed 0x0 != 0xd0c5998f


In [7]:
# original unprocessed text
text = load_etext(50000)[:500]

text

'The Project Gutenberg EBook of John Gutenberg, by Franz von Dingelstedt\r\n\r\nThis eBook is for the use of anyone anywhere at no cost and with\r\nalmost no restrictions whatsoever.  You may copy it, give it away or\r\nre-use it under the terms of the Project Gutenberg License included\r\nwith this eBook or online at www.gutenberg.org/license\r\n\r\n\r\nTitle: John Gutenberg\r\n       First Master Printer, His Acts and Most Remarkable\r\n       Discourses and his Death\r\n\r\nAuthor: Franz von Dingelstedt\r\n\r\nRelease Da'

In [8]:
#Text with formatting
print(text)

The Project Gutenberg EBook of John Gutenberg, by Franz von Dingelstedt

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org/license


Title: John Gutenberg
       First Master Printer, His Acts and Most Remarkable
       Discourses and his Death

Author: Franz von Dingelstedt

Release Da


In [9]:
# Use the cleaner to retrieve cleaned text from the first book of the random selection
sentences = super_cleaner(load_etext(16968), -1, verify_deletions=False)

In [10]:
#Text is now a list of paragraphs
sentences[:10]

[' Looking back now, after so many months of struggle and foreboding, he wondered how he had ever had the high courage to come to this strange country. Had he been a few years older he would not have started forth--he was sure of that now. But the flame of youth was in him, the sure sense that he could conquer where others had miserably failed; and, like all virile young Americans, he had love of adventure, and zest for the unknown was in his blood. The glamour of Arizona lured him; the color of these great hills and mountains he had come to love captivated him from the first. It was as if a siren beckoned, and he had to follow.',
 'For days he had been worried almost to the breaking point. Things had not shaped themselves as he had planned. Event piled upon event, and now disaster--definite disaster--threatened to descend upon him.',
 'All morning, despite the intense heat, he had been about the ranch, appraising this and that, mentally; pottering in the shed; looking at his horses--t

In [11]:
#with some short sentences
sorted(sentences, key=len)[:20]

['"No."',
 '"No."',
 '"No."',
 '"Yes."',
 '"Why?"',
 '"Yes."',
 '"Gun?"',
 '"Pells?"',
 '"Really?"',
 '"A what?"',
 '"Joking?"',
 '"I have?"',
 '"I ain\'t!"',
 '"Kiss me!"',
 '"Uh--huh!"',
 '"Yes; why?"',
 'She nodded.',
 '"In a way."',
 '"What for?"',
 '"Yes, sir!"']

In [12]:
#Find some properties about the book
book_properties(sentences)

[2041, 5, 1532, 75308, 353]

In [13]:
#Setting to ignore warnings about sequences being longer than BERT can handle
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

np.random.seed(42)
rand_100_books = [x for x in np.random.choice(english_book_keys, size=100)]
df_books = make_df_book_properties(rand_100_books)

In [14]:
#Sort df and account for the fact that the column has both text and numbers
df_books.loc[pd.to_numeric(df_books['Shortest sentence (char)'], errors='coerce').sort_values().index]

Unnamed: 0,book_id,num_sentences,Shortest sentence (char),Longest sentence (char),Total tokens,Longest sequence (tokens)
98,15920,2790,4,1228,105158,282
65,16733,2314,5,1848,125803,460
51,14578,1256,5,2301,119420,565
54,21268,1971,5,1763,78698,374
44,2166,1517,5,2974,104315,670
...,...,...,...,...,...,...
47,12298,44,320,2303,9823,526
2,14575,Failed,Failed,Failed,Failed,Failed
15,10188,Failed,Failed,Failed,Failed,Failed
62,20039,Failed,Failed,Failed,Failed,Failed


In [15]:
#21350

In [16]:
bookoi = 1655
sorted(super_cleaner(load_etext(bookoi), -1, verify_deletions=False), key=len)[:30]

['"No."',
 '"No."',
 '"Ay."',
 '"No."',
 '"Go!"',
 '"Wot?"',
 '"Yep."',
 '"Nor--"',
 '"Sure?"',
 '"Sure."',
 '"What?"',
 '"Time!"',
 '"I had."',
 '"Where?"',
 ' Contents:',
 '"And now?"',
 '"No, suh."',
 '"And you?"',
 '"One inch."',
 '"Think so?"',
 '"\'Ow close?"',
 'Bill nodded.',
 '"But now! now!"',
 '"I know, but--"',
 '"Listen, Joy--"',
 '"And if I win?"',
 '"Good society?"',
 '"The gold-dust."',
 '"Hang who?  Me?"',
 '"Streak of fat?"']

In [None]:
has_plus_sign = 0
no_plus_sign = 0
for i, book_id in enumerate(english_book_keys):
    if i % 1000 == 0:
        print(i)
    try:
        text = load_etext(int(book_id))
        if '+' in text:
            has_plus_sign += 1
        else:
            no_plus_sign += 1
    except:
        continue
    
print('{} with plus sign'.format(has_plus_sign))
print('{} without plus sign'.format(no_plus_sign))

In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
nlp = spacy.load("en_core_web_sm")

In [19]:
inputs_sent0 = whole_word_MO_tokenization_and_masking(tokenizer, nlp, sentences[0])

loading: 11:46:24.958870


In [20]:
inputs_sent1 = whole_word_MO_tokenization_and_masking(tokenizer, nlp, sentences[1])

loading: 11:46:25.067581


In [21]:
inputs_sent0

{'input_ids': tensor([[ 101, 2559, 2067,  ...,    0,    0,    0],
        [ 101, 2559, 2067,  ...,    0,    0,    0],
        [ 101, 2559,  103,  ...,    0,    0,    0],
        ...,
        [ 101, 2559, 2067,  ...,    0,    0,    0],
        [ 101, 2298, 2067,  ...,    0,    0,    0],
        [ 101, 2559, 2067,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[ 101, 2559, 2067,  ...,    0,    0,    0],
        [ 101, 2559, 2067,  ...,    0,    0,    0],
        [ 101, 2559, 2067,  ...,    0,    0,    0],
        ...,
      

In [22]:
#Future work: combine sentences for more efficient batchings
concat_inputs = {k: torch.cat((v, inputs_sent1[k]), 0) for k,v in inputs_sent0.items()}
concat_inputs

{'input_ids': tensor([[ 101, 2559, 2067,  ...,    0,    0,    0],
         [ 101, 2559, 2067,  ...,    0,    0,    0],
         [ 101, 2559,  103,  ...,    0,    0,    0],
         ...,
         [ 101, 2005, 2420,  ...,    0,    0,    0],
         [ 101, 2005, 2154,  ...,    0,    0,    0],
         [ 101, 2005, 2420,  ...,    0,    0,    0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[ 101, 2559, 2067,  ...,    0,    0,    0],
         [ 101, 2559, 2067,  ...,    0,    0,    0],
         [ 101, 2559, 2067,  ...,    0,    0,    

In [23]:
concat_inputs['input_ids'].shape

torch.Size([29, 512])

In [24]:
class MODataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = {key: val for key, val in encodings.items() if key != 'labels'}
        self.labels = encodings['labels']

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MODataset(concat_inputs)
train_dataset

<__main__.MODataset at 0x19515c4cfd0>

In [25]:
train_dataloader = DataLoader(train_dataset)

In [26]:
bert_tiny_config = {"hidden_size": 128, 
                    "hidden_act": "gelu", 
                    "initializer_range": 0.02, 
                    "vocab_size": 30522, 
                    "hidden_dropout_prob": 0.1, 
                    "num_attention_heads": 2, 
                    "type_vocab_size": 2, 
                    "max_position_embeddings": 512, 
                    "num_hidden_layers": 2, 
                    "intermediate_size": 512, 
                    "attention_probs_dropout_prob": 0.1}

model = BertForMaskedLM(config=BertConfig(**bert_tiny_config))

In [27]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=1,  # batch size per device during training
    #per_device_eval_batch_size=256,   # batch size for evaluation
    learning_rate=1e-5,     
    logging_dir='./logs',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataloader,         # training dataset
    eval_dataset=None            # evaluation dataset
)

In [28]:
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='iâ€¦

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=29.0, style=ProgressStyle(description_widâ€¦

TypeError: 'DataLoader' object is not subscriptable

In [None]:
torch.cuda.is_available()

In [None]:
torch.cuda.get_device_name(0)