In [1]:
#libraries involved in cleaning
from cleaner_utils import super_cleaner
from pretraining_data_utils import make_book_token_frequency, token_freq_df_to_dict, \
                                    all_available_tokens_from_df, optimize_book_subset_ratio
from pretraining_data_utils import book_properties, make_df_book_properties
from pretraining_data_utils import SentenceChunker, SentenceWriter
from gutenberg.acquire import load_etext


#Library utilities
from tokenizer.tokenizer import StrategizedTokenizer
from dataset.dataset import StrategizedTokenizerDataset
from dataset.dataset import DefaultTokenizerDataset

#Training code
from transformers import BertConfig
from transformers import BertForMaskedLM
from transformers import BertTokenizer
from transformers import AdamW
from transformers import Trainer, TrainingArguments

from torch.utils.data import DataLoader

#General imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import re
import json
import torch
import logging
import pickle
import os

from more_itertools import take
from datetime import datetime

In [2]:
# Directory with cached / pre-processed files. Later cells build paths as
# `cache_dir + filename`, so the trailing slash is required.
cache_dir = 'cached_files/'

In [3]:
# Read scraped metadata from the gutenberg metadata database
#(Original data was scraped by using https://github.com/c-w/gutenberg)
#The data is then further preprocessed by https://github.com/hugovk/gutenberg-metadata so it is actually usable.

# Use a context manager so the file handle is closed even if json.load raises.
with open(cache_dir + 'gutenberg-metadata.json', 'r') as f:
    metadata = json.load(f)

In [4]:
#Collect the keys of all books whose metadata lists English as the only language, and count them
english_book_keys = [
    book_key
    for book_key, book_meta in metadata.items()
    if book_meta['language'] == ['en']
]
len(english_book_keys)

13142

In [5]:
# The third book can't be retrieved because of faults in retrieval. This happens sometimes.
# If loading or cleaning raises, the traceback is printed instead of aborting the notebook.
import traceback
import sys

try:
    super_cleaner(load_etext(14575), -1, verify_deletions=True)
except Exception:
    # Display the *original* exception
    traceback.print_exception(*sys.exc_info())


True 
True  Note: Project Gutenberg also has an HTML version of this file which includes the original illustrations. See 14575-h.htm or 14575-h.zip: (https://www.gutenberg.org/dirs/1/4/5/7/14575/14575-h/14575-h.htm) or (https://www.gutenberg.org/dirs/1/4/5/7/14575/14575-h.zip)
True 
True 
True BYLOW HILL
True by
True GEORGE W. CABLE
True With Illustrations by F. C. Yohn
True Charles Scribner's Sons New York
True MCMII
True 
True 
True 
True [Illustration: "Father," laughed the daughter, "isn't this rather youngish?"]
True 
True  CONTENTS
True  I. RUTH AND GODFREY II. ISABEL III. ARTHUR AND LEONARD IV. AND BRING DOWN THE REMAINDER V. SKY AND POOL VI. IN THE PUBLIC EYE VII. THE HOUR STRIKES VIII. GIVE YOU FIVE MINUTES IX. THE YOUNG YEAR SMILES X. THE STORM REGATHERS XI. HAS IT COME TO THIS? XII. THE LANTERN QUENCHED XIII. BABY XIV. THE TALKATIVE LEONARD XV. THE THIN ICE BREAKS XVI. MUST GIVE YOU UP XVII. SLEEP, OF A SORT XVIII. MISSING XIX. A DOUBLE STILL HUNT XX. A DOUBLE RETURN XXI. EV

False In a way she was; and particularly when they fondly rallied her upon her new accession of motherly practical manner, and she laughed with them, and ended with that merry, mellow sigh which still gave Ruth new pride in her and new hope. But another source of Ruth's new hope was that Arthur, who had written to the bishop and resigned his calling the day after Mrs. Morris's little namesake was born, had at length withdrawn his letter.
False "It is to your brother we owe its withdrawal," said the bishop, privately, to Ruth.
False She beamed gratefully, but did not tell him that, after the long, secret conference between her brother and the rector, Leonard had come to her and wept for Arthur the only tears he had ever shed in her presence. Now Leonard had found occasion to go West for a time, though he still held his office; and Arthur was filling the rectorate almost in the old first way. On some small parish matter the rustic vestryman with the spectacled daughter came to Arthur's l

In [6]:
#retrieve how many english books are actually loadable
#If books aren't cached this may take a while because it needs to scrape the books from gutenberg.org
#Therefore a pre-processed file is provided
loadable_keys_path = cache_dir + 'loadable_english_book_keys.pkl'
if os.path.isfile(loadable_keys_path):
    # NOTE(review): pickle.load can execute arbitrary code; only load a cache file you trust.
    with open(loadable_keys_path, 'rb') as f:
        loadable_english_book_keys = pickle.load(f)
else:
    loadable_english_book_keys = []
    for i, key in enumerate(english_book_keys):
        # Progress marker every 1000 books
        if i % 1000 == 0:
            print(i, datetime.now())
        try:
            load_etext(int(key))
        except Exception:
            # The book could not be loaded: skip it. Catching Exception instead of a bare
            # `except:` keeps KeyboardInterrupt / kernel interrupts working during the long scrape.
            continue
        loadable_english_book_keys.append(key)
    with open(loadable_keys_path, 'wb') as f:
        pickle.dump(loadable_english_book_keys, f)

len(loadable_english_book_keys)

12640

In [7]:
#Randomly select 10 and 20 books that we can query.
#np.random.choice samples with replacement by default; the fixed seed keeps the selection reproducible.
np.random.seed(42)
rand_10_books = list(np.random.choice(loadable_english_book_keys, size=10))
rand_20_books = list(np.random.choice(loadable_english_book_keys, size=20))
#Two separate statements: `print(a), print(b)` builds a tuple, which made the cell echo (None, None).
print(rand_10_books)
print(rand_20_books)

['17255', '1742', '14870', '14596', '23436', '22563', '15306', '15976', '1344', '13579']
['15116', '23050', '22669', '22310', '18782', '10343', '1650', '21698', '16831', '11194', '14752', '14429', '16170', '2078', '13766', '12310', '23892', '16144', '22293', '19224']


(None, None)

In [8]:
# Author and title metadata for the first 5 randomly selected books
# 1 book isn't actually loadable, see below.
for book_id in rand_10_books[:5]:
    book_meta = metadata[book_id]
    print(book_id, book_meta['author'], book_meta['title'])

17255 ['Alma-Tadema, Laurence'] ['The Wings of Icarus: Being the Life of one Emilia Fletcher']
1742 ['Davis, Richard Harding'] ['Miss Civilization: A Comedy in One Act']
14870 ['Hopkinson, Alfred, Sir'] ['Rebuilding Britain: A Survey of Problems of Reconstruction After the World War']
14596 ['Inge, William Ralph'] ['Christian Mysticism']
23436 ['Anonymous'] ['Aladdin or The Wonderful Lamp']


Text preprocessing
===================

In [9]:
# original unprocessed text
text = load_etext(50000)[:500]
text

'The Project Gutenberg EBook of John Gutenberg, by Franz von Dingelstedt\r\n\r\nThis eBook is for the use of anyone anywhere at no cost and with\r\nalmost no restrictions whatsoever.  You may copy it, give it away or\r\nre-use it under the terms of the Project Gutenberg License included\r\nwith this eBook or online at www.gutenberg.org/license\r\n\r\n\r\nTitle: John Gutenberg\r\n       First Master Printer, His Acts and Most Remarkable\r\n       Discourses and his Death\r\n\r\nAuthor: Franz von Dingelstedt\r\n\r\nRelease Da'

In [10]:
#Text with formatting
print(text)

The Project Gutenberg EBook of John Gutenberg, by Franz von Dingelstedt

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org/license


Title: John Gutenberg
       First Master Printer, His Acts and Most Remarkable
       Discourses and his Death

Author: Franz von Dingelstedt

Release Da


Use the cleaner to retrieve cleaned text from a sample book (Gutenberg id 16968).
The _super_cleaner_ strips headers/disclaimers/tables that are not required for our purposes.

In [11]:
sentences = super_cleaner(load_etext(16968), -1, verify_deletions=False)

In [12]:
#Text is now a list of paragraphs
sentences[:10]

[' "And now for business," Lopez said. "And remember zat he what tells a lie shall be right away shotted." In his excitement he lost the little English he had.',
 ' "Put all ze men outside," Lopez ordered. Venustiano and Pedro, his chief lieutenants, obeyed at once, forcing them to march ahead of them, and standing guard over them near a great cactus bush a few feet from the adobe. "Leave ze women with me," the bandit continued. "But first, Alvarada, you find ze cook. I am \'ongry."',
 ' "Red" Giddings had been on the ranch with Gilbert since the very beginning. He came from the North with the young man, willing to stake all on this one venture. Like young Jones, he was not afraid. He was an efficient, well-set-up young fellow, with three consuming passions: Arizona, his harmonica, and Angela Hardy. The first saw a lot of "Red"; the second touched his lips frequently; but as for Angela--well, perhaps the poor boy kissed his harmonica so often in order to forget her lips. But if his own

In [13]:
#with some short sentences
sorted(sentences, key=len)[:20]

['"No."',
 '"Gun?"',
 '"Why?"',
 '"Yes."',
 '"Pells?"',
 '"A what?"',
 '"I have?"',
 '"Joking?"',
 '"Really?"',
 '"I ain\'t!"',
 '"Kiss me!"',
 '"Uh--huh!"',
 '"In a way."',
 '"What for?"',
 '"Yes, sir!"',
 '"Yes; why?"',
 'She nodded.',
 '"All those?"',
 '"You won\'t?"',
 'She started.']

In [14]:
list(sentences)[:10]

[' "And now for business," Lopez said. "And remember zat he what tells a lie shall be right away shotted." In his excitement he lost the little English he had.',
 ' "Put all ze men outside," Lopez ordered. Venustiano and Pedro, his chief lieutenants, obeyed at once, forcing them to march ahead of them, and standing guard over them near a great cactus bush a few feet from the adobe. "Leave ze women with me," the bandit continued. "But first, Alvarada, you find ze cook. I am \'ongry."',
 ' "Red" Giddings had been on the ranch with Gilbert since the very beginning. He came from the North with the young man, willing to stake all on this one venture. Like young Jones, he was not afraid. He was an efficient, well-set-up young fellow, with three consuming passions: Arizona, his harmonica, and Angela Hardy. The first saw a lot of "Red"; the second touched his lips frequently; but as for Angela--well, perhaps the poor boy kissed his harmonica so often in order to forget her lips. But if his own

In [15]:
#Find some properties about the book
book_properties(sentences)

[2041, 5, 1532, 75140, 353]

Tokenization in practice
============

In [16]:
#initialize custom tokenizer
ST_tokenizer = StrategizedTokenizer(padding=False)

In [17]:
inputs = ST_tokenizer.tokenize("Anne went to the Albert Heijn at 5 o'clock to buy some milk for me.")
inputs

{'input_ids': tensor([[  101,   103,  2253,  2000,  1996,   103,   103,   103,   103,  2012,
          1019,  1051,  1005,  5119,  2000,  4965,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,   103,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
          1019,  1051,  1005,  5119,  2000,   103,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,   103,  1996,  4789,  2002, 28418,  2078,   103,
          1019,  1051,  1005,  5119,   103,  4965,  2070,  6501,   103,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,   103,  4789,  2002, 28418,  2078,  2012,
          1019,  1051,  1005,  5119,  2000,  4965,   103,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
           103,  1051,  1005,  5119,  2000,  4965,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,  1996,  4789,  2002, 28418,  2078,  201

In [18]:
#Masks are at different places
for masked_line in ST_tokenizer.convert_ids_to_tokens(inputs['input_ids']):
    print(masked_line)

['[CLS]', '[MASK]', 'went', 'to', 'the', '[MASK]', '[MASK]', '[MASK]', '[MASK]', 'at', '5', 'o', "'", 'clock', 'to', 'buy', 'some', 'milk', 'for', 'me', '.', '[SEP]']
['[CLS]', 'anne', '[MASK]', 'to', 'the', 'albert', 'he', '##ij', '##n', 'at', '5', 'o', "'", 'clock', 'to', '[MASK]', 'some', 'milk', 'for', 'me', '.', '[SEP]']
['[CLS]', 'anne', 'went', '[MASK]', 'the', 'albert', 'he', '##ij', '##n', '[MASK]', '5', 'o', "'", 'clock', '[MASK]', 'buy', 'some', 'milk', '[MASK]', 'me', '.', '[SEP]']
['[CLS]', 'anne', 'went', 'to', '[MASK]', 'albert', 'he', '##ij', '##n', 'at', '5', 'o', "'", 'clock', 'to', 'buy', '[MASK]', 'milk', 'for', 'me', '.', '[SEP]']
['[CLS]', 'anne', 'went', 'to', 'the', 'albert', 'he', '##ij', '##n', 'at', '[MASK]', 'o', "'", 'clock', 'to', 'buy', 'some', 'milk', 'for', 'me', '.', '[SEP]']
['[CLS]', 'anne', 'went', 'to', 'the', 'albert', 'he', '##ij', '##n', 'at', '5', 'o', "'", 'clock', 'to', 'buy', 'some', '[MASK]', 'for', 'me', '.', '[SEP]']
['[CLS]', 'anne', 'we

In [19]:
#load original tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [20]:
# Join every masked token sequence back into a readable string using the original BERT tokenizer
masked_token_rows = ST_tokenizer.convert_ids_to_tokens(inputs['input_ids'])
for token_row in masked_token_rows:
    print(tokenizer.convert_tokens_to_string(token_row))

[CLS] [MASK] went to the [MASK] [MASK] [MASK] [MASK] at 5 o ' clock to buy some milk for me . [SEP]
[CLS] anne [MASK] to the albert heijn at 5 o ' clock to [MASK] some milk for me . [SEP]
[CLS] anne went [MASK] the albert heijn [MASK] 5 o ' clock [MASK] buy some milk [MASK] me . [SEP]
[CLS] anne went to [MASK] albert heijn at 5 o ' clock to buy [MASK] milk for me . [SEP]
[CLS] anne went to the albert heijn at [MASK] o ' clock to buy some milk for me . [SEP]
[CLS] anne went to the albert heijn at 5 o ' clock to buy some [MASK] for me . [SEP]
[CLS] anne went [MASK] the albert heijn at 5 o ' clock [MASK] buy some milk for me . [SEP]
[CLS] anne went to the albert heijn at 5 o ' clock to buy some milk for [MASK] . [SEP]
[CLS] anne went to the albert heijn at 5 o ' clock to buy some milk for me [MASK] [SEP]
[CLS] anne go to the albert heijn at 5 o ' clock to buy some milk for i . [SEP]
[CLS] anne went to the albert heijn at o ' clock 5 to buy some milk for me . [SEP]


In [21]:
# Tokenize a verse line with the strategized tokenizer and print every masked variant as text
no_ner_text = 'Now why did Arthur Hoare pull out   A sovereign with a happy shout   And give it rashly to his scout,     Who almost had a fit?'
no_ner_text_inputs = ST_tokenizer.tokenize(no_ner_text)

no_ner_token_rows = ST_tokenizer.convert_ids_to_tokens(no_ner_text_inputs['input_ids'])
for token_row in no_ner_token_rows:
    print(tokenizer.convert_tokens_to_string(token_row))

[CLS] [MASK] why did arthur hoare pull out a sovereign with a happy shout and give it rashly to his scout , who almost had a fit ? [SEP]
[CLS] now [MASK] did arthur hoare pull out a sovereign with a happy shout and give it [MASK] [MASK] to his scout , who [MASK] had a fit ? [SEP]
[CLS] now why [MASK] arthur hoare [MASK] out a sovereign with a happy shout and [MASK] it rashly to his scout , who almost [MASK] a fit ? [SEP]
[CLS] now why did [MASK] [MASK] [MASK] pull out a sovereign with a happy shout and give it rashly to his scout , who almost had a fit ? [SEP]
[CLS] now why did arthur hoare pull [MASK] a sovereign [MASK] a happy shout and give it rashly [MASK] his scout , who almost had a fit ? [SEP]
[CLS] now why did arthur hoare pull out a sovereign with a happy shout and give it rashly to his scout , who almost had a fit ? [SEP]
[CLS] now why did arthur hoare pull out [MASK] sovereign with [MASK] happy shout and give it rashly to his scout , who almost had [MASK] fit ? [SEP]
[CLS] n

Gutenberg book-selection
==============

In [22]:
#Setting to ignore warnings about sequences being longer than BERT can handle
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
#`datetime` is already imported in the imports cell at the top of the notebook, so it is not re-imported here.
#The two timestamps bracket the call so the elapsed time is visible in the output.
print(datetime.now())
df_books_10 = make_df_book_properties(rand_10_books)
print(datetime.now())

2021-08-12 14:59:28.063094
2021-08-12 15:00:17.059902


In [23]:
df_books_10

Unnamed: 0,book_id,num_sentences,Shortest sentence (char),Longest sentence (char),Total tokens,Longest sequence (tokens)
0,17255,677,8,2083,41672,522
1,1742,239,15,1051,8481,253
2,14870,317,22,5189,62696,1022
3,14596,528,18,7685,115181,1887
4,23436,11,41,293,579,71
5,22563,358,6,2344,16372,530
6,15306,390,9,1672,40223,383
7,15976,1419,10,1724,78126,453
8,1344,263,6,6391,27092,1536
9,13579,870,13,1939,65500,447


In [24]:
#Sort the 10-book frame by its 'Shortest sentence (char)' column (ascending order).
#NOTE(review): an earlier remark said this column may hold both text and numbers; sort_values is
#applied to the column as-is here, with no conversion -- confirm the column is numeric.
df_books_10.sort_values(by='Shortest sentence (char)')[:10]

Unnamed: 0,book_id,num_sentences,Shortest sentence (char),Longest sentence (char),Total tokens,Longest sequence (tokens)
5,22563,358,6,2344,16372,530
8,1344,263,6,6391,27092,1536
0,17255,677,8,2083,41672,522
6,15306,390,9,1672,40223,383
7,15976,1419,10,1724,78126,453
9,13579,870,13,1939,65500,447
1,1742,239,15,1051,8481,253
3,14596,528,18,7685,115181,1887
2,14870,317,22,5189,62696,1022
4,23436,11,41,293,579,71


In [25]:
#Books ordered by total token count, largest first: head() shows the five biggest books.
#Token counts differ a lot between books; some books have very few tokens.
df_books_10.sort_values(by='Total tokens', ascending=False).head()

Unnamed: 0,book_id,num_sentences,Shortest sentence (char),Longest sentence (char),Total tokens,Longest sequence (tokens)
3,14596,528,18,7685,115181,1887
7,15976,1419,10,1724,78126,453
9,13579,870,13,1939,65500,447
2,14870,317,22,5189,62696,1022
0,17255,677,8,2083,41672,522


In [26]:
#Ascending sort, so tail() again shows the five books with the most tokens (in reverse order);
#use head() on this ordering to see the books with the fewest tokens.
df_books_10.sort_values(by='Total tokens').tail()

Unnamed: 0,book_id,num_sentences,Shortest sentence (char),Longest sentence (char),Total tokens,Longest sequence (tokens)
0,17255,677,8,2083,41672,522
2,14870,317,22,5189,62696,1022
9,13579,870,13,1939,65500,447
7,15976,1419,10,1724,78126,453
3,14596,528,18,7685,115181,1887


In [27]:
#Retrieve token occurences per book in a dataframe and another dataframe with total number of tokens
print(datetime.now())
df_book_token_freq_10, df_10_total_tokens = make_book_token_frequency(rand_10_books)
print(datetime.now())

2021-08-12 15:00:17.184397


100%|██████████| 10/10 [00:08<00:00,  1.12it/s]

2021-08-12 15:00:27.985041





In [28]:
# DataFrame is obviously very sparse
df_book_token_freq_10[:10]

Unnamed: 0,[PAD],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],...,##！,##（,##）,##，,##－,##．,##／,##：,##？,##～
17255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1742,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14870,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14596,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23436,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22563,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15306,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15976,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1344,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13579,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
#total number of tokens per book
df_10_total_tokens

17255     41679.0
1742       8491.0
14870     62696.0
14596    115181.0
23436       579.0
22563     16372.0
15306     40223.0
15976     78170.0
1344      27092.0
13579     65500.0
dtype: float64

In [30]:
#Total number of tokens in our small set
df_10_total_tokens.sum()

455983.0

In [31]:
#All tokens which are present in our subsample of 10 books
all_present_tokens_10 = all_available_tokens_from_df(df_book_token_freq_10)
all_present_tokens_10, len(all_present_tokens_10)

(array([  999,  1000,  1002, ..., 29645, 29664, 29667], dtype=int64), 15198)

In [32]:
#Show first 3 entries
tokens_per_book_10 = token_freq_df_to_dict(df_book_token_freq_10, df_10_total_tokens)
take(3, tokens_per_book_10.items())

[('17255',
  {'tokens': array([  999,  1000,  1005, ..., 29591, 29602, 29667], dtype=int64),
   'total_tokens': 41679.0}),
 ('1742',
  {'tokens': array([  999,  1000,  1005, ..., 28838, 29122, 29586], dtype=int64),
   'total_tokens': 8491.0}),
 ('14870',
  {'tokens': array([  999,  1000,  1005, ..., 29598, 29602, 29609], dtype=int64),
   'total_tokens': 62696.0})]

In [33]:
print(optimize_book_subset_ratio(all_present_tokens_10, tokens_per_book_10, threshold = 1e5))

book best:  22563 new tokens:  3650 book_total_tokens:  16372.0 ratio:  0.22294160762277057
book best:  23436 new tokens:  125 book_total_tokens:  579.0 ratio:  0.2158894645941278
book best:  1344 new tokens:  2211 book_total_tokens:  27092.0 ratio:  0.08161080761848516
book best:  1742 new tokens:  368 book_total_tokens:  8491.0 ratio:  0.0433400070663055
book best:  15306 new tokens:  2157 book_total_tokens:  40223.0 ratio:  0.053626034855679586
{'subset_booklist': ['22563', '23436', '1344', '1742', '15306'], 'subset_total_tokens': 92757.0, 'subset_present_tokens': array([  999.,  1000.,  1005., ..., 29602., 29664., 29667.]), 'subset_unique_tokens': 8511}


In [34]:
df_book_token_freq = pd.read_csv(os.path.join('../LessIsMore-cache','df_book_token_freq.csv'), index_col=0)
df_book_token_freq.head()

Unnamed: 0,[PAD],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],...,##！,##（,##）,##，,##－,##．,##／,##：,##？,##～
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
#Total number of unique tokens in the data: vocabulary columns whose summed count is non-zero
np.count_nonzero(df_book_token_freq.sum(axis=0))

27833

In [36]:
df_total_tokens = pd.read_csv(os.path.join('../LessIsMore-cache','df_total_tokens.csv'), index_col=0).squeeze()
df_total_tokens.head()

1       7640.0
10    901551.0
11     36249.0
12     40831.0
13      6731.0
Name: 0, dtype: float64

In [37]:
df_total_tokens.sort_values()[:10]

23398     0.0
10802     0.0
23524     0.0
2305      0.0
232       2.0
22818     4.0
19937    15.0
22335    22.0
20086    25.0
23147    34.0
Name: 0, dtype: float64

In [38]:
#Why do certain books have 0 tokens?
#Well because it is an illustration-only book
super_cleaner(load_etext(23398), -1, verify_deletions=True)

True 
True 
True 
True 
True  _INFANT'S CABINET_
True  _OF_
True  BIRDS & BEASTS.
True  _LONDON. Printed & Sold by Harvey & Darton._ 55, Gracechurch Street, 1820. Price 6d.
True  [Illustration: The Stork.]
True  [Illustration: The Robin.]
True  [Illustration: The Hyena.]
True  [Illustration: The Lion.]
True  [Illustration: The Rhinoceros.]
True  [Illustration: The Camel.]
True  [Illustration: The Swan.]
True  [Illustration: The Vulture.]
True  [Illustration: The Lark.]
True  [Illustration: The Turkey.]
True  [Illustration: The Fox.]
True  [Illustration: The Greyhound.]
True  [Illustration: The Elephant.]
True  [Illustration: The Zebra.]
True  [Illustration: The Crow.]
True  [Illustration: The Cock.]
True  [Illustration: The Pigeon.]
True  [Illustration: The Goldfinch.]
True  [Illustration: The Buffalo.]
True  [Illustration: The Hog.]
True  [Illustration: The Horse.]
True  [Illustration: The Stag.]
True  [Illustration: The Chaffinch.]
True  [Illustration: The Peacock.]
True  [Illustrati

[]

In [39]:
# Or because it is a DVD-cover and we only use the .txt file
super_cleaner(load_etext(10802), -1, verify_deletions=True)

True to complying with copyright laws. PGLAF has not verified that all the eBook files on these discs meet the copyright laws in countries outside of the United States. PGLAF recommends that you verify this before using these files and requests that you advise us of any problems by email to copyright AT pglaf.org
True ** A note on CD and DVD disc capacity. It turns out that disk drive manufacturers (including the people who make CD and DVD burners and blank discs) measure disk space differently than the rest of the computer world. To them, 1MB, which is 1 megabyte, is 1,000,000 bytes. For the rest of the computer world, 1MB is 1,046,576 bytes. We mention this because people might read their DVD disc package and expect it to hold 4.7GB, but be surprised to find it can only hold about 4.37GB as the rest of the world measures space.
True  


[]

In [40]:
#Some books just have very little parsable information. This is often the case with books that are really, really old
#(e.g. written pre-1800s). The English in these books is often very different from modern-day English.

print(super_cleaner(load_etext(19937), -1))
print(super_cleaner(load_etext(232), -1))

['produced from scanned images of public domain material from the Google Print project.)']
['by Virgil']


In [41]:
#How many tokens do we have in total available?
df_total_tokens.sum()

939505600.0

In [42]:
# Load the cached 'subset_meta_ratio_100K' result when the pickle exists;
# if the file is absent, subset_ratio_100K is left undefined.
subset_ratio_100K_path = cache_dir + 'subset_meta_ratio_100K.pkl'
if os.path.isfile(subset_ratio_100K_path):
    with open(subset_ratio_100K_path, 'rb') as f:
        subset_ratio_100K = pickle.load(f)

In [43]:
print(subset_ratio_100K)

{'subset_booklist': ['232', '22818', '22335', '23594', '20086', '20360', '10557', '19571', '19177', '14100', '13536', '23436', '129', '21783', '11006', '19937', '22847', '1321', '23147', '21805', '22529', '12474', '13082', '14463', '23538', '13081', '116', '18589', '23446', '23450', '17124', '16780', '23146', '18935', '12554', '17254', '23429', '13203', '17365', '22236', '16169', '18417', '22579', '19634', '24044', '104', '1567', '23315', '24269', '12358', '23880'], 'subset_total_tokens': 99974.0, 'subset_present_tokens': array([  100.,   999.,  1000., ..., 29735., 29737., 29739.]), 'subset_unique_tokens': 13040}


In [44]:
# Load the cached 'subset_meta_1M' result when the pickle exists;
# if the file is absent, subset_1M is left undefined.
subset_1M_path = cache_dir + 'subset_meta_1M.pkl'
if os.path.isfile(subset_1M_path):
    with open(subset_1M_path, 'rb') as f:
        subset_1M = pickle.load(f)

In [45]:
# Load the cached 'subset_meta_ratio_1M' result when the pickle exists;
# if the file is absent, subset_ratio_1M is left undefined.
subset_ratio_1M_path = cache_dir + 'subset_meta_ratio_1M.pkl'
if os.path.isfile(subset_ratio_1M_path):
    with open(subset_ratio_1M_path, 'rb') as f:
        subset_ratio_1M = pickle.load(f)

In [46]:
# Load the cached 'subset_meta_10M' result when the pickle exists;
# if the file is absent, subset_10M is left undefined.
subset_10M_path = cache_dir + 'subset_meta_10M.pkl'
if os.path.isfile(subset_10M_path):
    with open(subset_10M_path, 'rb') as f:
        subset_10M = pickle.load(f)

In [47]:
# Load the cached 'subset_meta_ratio_10M' result when the pickle exists;
# if the file is absent, subset_ratio_10M is left undefined.
subset_ratio_10M_path = cache_dir + 'subset_meta_ratio_10M.pkl'
if os.path.isfile(subset_ratio_10M_path):
    with open(subset_ratio_10M_path, 'rb') as f:
        subset_ratio_10M = pickle.load(f)

In [48]:
# Load the cached 'subset_meta_ratio_100M' result when the pickle exists;
# if the file is absent, subset_ratio_100M is left undefined.
subset_ratio_100M_path = cache_dir + 'subset_meta_ratio_100M.pkl'
if os.path.isfile(subset_ratio_100M_path):
    with open(subset_ratio_100M_path, 'rb') as f:
        subset_ratio_100M = pickle.load(f)

In [49]:
#Make a file with the union of all books in the subsets (easy for processing on the server)

# Fold the book lists together pairwise with np.union1d, starting from the 100K subset
combined_booklist = subset_ratio_100K['subset_booklist']
for larger_subset in (subset_ratio_1M, subset_ratio_10M, subset_ratio_100M):
    combined_booklist = np.union1d(combined_booklist, larger_subset['subset_booklist'])

union_subsets = {'subset_booklist': combined_booklist}

#with open(os.path.join(cache_dir, 'subset_meta_ratio_union.pkl'), 'wb') as f:
    #pickle.dump(union_subsets, f)

Check how many tokens are actually represented by the data
=============

In [50]:
#load original tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [51]:
#size of the vocabulary
len(tokenizer.vocab)

30522

In [52]:
#vocabulary contains a bunch of [unused] tokens which allow people to add their own tokens
#Count the vocabulary entries whose text starts with '[unused'
num_unused = sum(1 for token in tokenizer.vocab if token.startswith('[unused'))
num_unused

994

In [53]:
#Count single-character tokens ('a') and single-character continuation tokens ('##a')
#that do not occur in the 100M subset
#Build a set once: `v in <numpy array>` rescans the whole array for every vocabulary entry.
present_token_ids = set(np.asarray(subset_ratio_100M['subset_present_tokens']).tolist())
num_char = 0
for k, v in tokenizer.vocab.items():
    #'a' or '##a'
    is_single_char = len(k) == 1 or (len(k) == 3 and k.startswith('##'))
    if is_single_char and v not in present_token_ids:
        num_char += 1
num_char

1493

In [54]:
#Which tokens are not represented?
#Skips [unused...] placeholders and single-character tokens ('a' / '##a'); prints and counts the rest.
#Build a set once: `v in <numpy array>` rescans the whole array for every vocabulary entry.
present_token_ids = set(np.asarray(subset_ratio_100M['subset_present_tokens']).tolist())
num_unrepresented = 0
for k, v in tokenizer.vocab.items():
    if v in present_token_ids or k.startswith('[unused'):
        continue
    if len(k) <= 1 or (len(k) == 3 and k.startswith('##')):
        continue
    num_unrepresented += 1
    print(k, v)
num_unrepresented

[PAD] 0
[CLS] 101
[SEP] 102
[MASK] 103
... 2133
km² 3186
soundtrack 6050
remix 6136
°c 6362
uefa 6663
playoff 7808
midfielder 8850
playstation 9160
quarterfinals 9237
pinyin 9973
allmusic 10477
mlb 10901
espn 10978
gameplay 11247
nsw 11524
nascar 11838
itunes 11943
lgbt 12010
mvp 12041
xbox 12202
eurovision 12714
vfl 13480
kolkata 13522
pga 14198
m³ 14241
bundesliga 14250
metacritic 14476
remixes 15193
steelers 15280
airplay 15341
##ии 15414
paralympics 15600
zhao 15634
reggae 15662
linebacker 15674
v8 15754
hindwings 15998
bollywood 16046
podcast 16110
atletico 16132
wwf 16779
transgender 16824
paralympic 17029
postseason 17525
vhs 17550
campeonato 17675
multiplayer 17762
łodz 17814
curated 17940
iphone 18059
gmbh 18289
danielle 18490
qaeda 18659
mixtape 18713
¹⁄₂ 18728
##ław 19704
##qing 19784
saxophonist 19977
preseason 20038
pmid 20117
keyboardist 20173
iucn 20333
pokemon 20421
nrl 20686
motorsports 20711
jaenelle 20757
beyonce 20773
airbus 20901
netflix 20907
motorsport 21044
belg

202

In [55]:
#Print and count every vocabulary entry whose id is absent from the 100M subset
#Build a set once: `v in <numpy array>` rescans the whole array for every vocabulary entry.
present_token_ids = set(np.asarray(subset_ratio_100M['subset_present_tokens']).tolist())
not_used = 0
for k, v in tokenizer.vocab.items():
    if v not in present_token_ids:
        not_used += 1
        print(k, v)
not_used

[PAD] 0
[unused0] 1
[unused1] 2
[unused2] 3
[unused3] 4
[unused4] 5
[unused5] 6
[unused6] 7
[unused7] 8
[unused8] 9
[unused9] 10
[unused10] 11
[unused11] 12
[unused12] 13
[unused13] 14
[unused14] 15
[unused15] 16
[unused16] 17
[unused17] 18
[unused18] 19
[unused19] 20
[unused20] 21
[unused21] 22
[unused22] 23
[unused23] 24
[unused24] 25
[unused25] 26
[unused26] 27
[unused27] 28
[unused28] 29
[unused29] 30
[unused30] 31
[unused31] 32
[unused32] 33
[unused33] 34
[unused34] 35
[unused35] 36
[unused36] 37
[unused37] 38
[unused38] 39
[unused39] 40
[unused40] 41
[unused41] 42
[unused42] 43
[unused43] 44
[unused44] 45
[unused45] 46
[unused46] 47
[unused47] 48
[unused48] 49
[unused49] 50
[unused50] 51
[unused51] 52
[unused52] 53
[unused53] 54
[unused54] 55
[unused55] 56
[unused56] 57
[unused57] 58
[unused58] 59
[unused59] 60
[unused60] 61
[unused61] 62
[unused62] 63
[unused63] 64
[unused64] 65
[unused65] 66
[unused66] 67
[unused67] 68
[unused68] 69
[unused69] 70
[unused70] 71
[unused71] 72
[un

আ 1348
ই 1349
উ 1350
এ 1351
ও 1352
ক 1353
খ 1354
গ 1355
চ 1356
ছ 1357
জ 1358
ট 1359
ড 1360
ণ 1361
ত 1362
থ 1363
দ 1364
ধ 1365
ন 1366
প 1367
ব 1368
ভ 1369
ম 1370
য 1371
র 1372
ল 1373
শ 1374
ষ 1375
স 1376
হ 1377
া 1378
ি 1379
ী 1380
ে 1381
க 1382
ச 1383
ட 1384
த 1385
ந 1386
ன 1387
ப 1388
ம 1389
ய 1390
ர 1391
ல 1392
ள 1393
வ 1394
ா 1395
ி 1396
ு 1397
ே 1398
ை 1399
ನ 1400
ರ 1401
ಾ 1402
ක 1403
ය 1404
ර 1405
ල 1406
ව 1407
ා 1408
ก 1409
ง 1410
ต 1411
ท 1412
น 1413
พ 1414
ม 1415
ย 1416
ร 1417
ล 1418
ว 1419
ส 1420
อ 1421
า 1422
เ 1423
་ 1424
། 1425
ག 1426
ང 1427
ད 1428
ན 1429
པ 1430
བ 1431
མ 1432
འ 1433
ར 1434
ལ 1435
ས 1436
မ 1437
ა 1438
ბ 1439
გ 1440
დ 1441
ე 1442
ვ 1443
თ 1444
ი 1445
კ 1446
ლ 1447
მ 1448
ნ 1449
ო 1450
რ 1451
ს 1452
ტ 1453
უ 1454
ᄀ 1455
ᄂ 1456
ᄃ 1457
ᄅ 1458
ᄆ 1459
ᄇ 1460
ᄉ 1461
ᄊ 1462
ᄋ 1463
ᄌ 1464
ᄎ 1465
ᄏ 1466
ᄐ 1467
ᄑ 1468
ᄒ 1469
ᅡ 1470
ᅢ 1471
ᅥ 1472
ᅦ 1473
ᅧ 1474
ᅩ 1475
ᅪ 1476
ᅭ 1477
ᅮ 1478
ᅯ 1479
ᅲ 1480
ᅳ 1481
ᅴ 1482
ᅵ 1483
ᆨ 1484
ᆫ 1485
ᆯ 1486
ᆷ 1487
ᆸ 1488
ᆼ 1489
ᴬ 1490

##ท 29948
##น 29949
##พ 29950
##ม 29951
##ย 29952
##ร 29953
##ล 29954
##ว 29955
##ส 29956
##อ 29957
##า 29958
##เ 29959
##་ 29960
##། 29961
##ག 29962
##ང 29963
##ད 29964
##ན 29965
##པ 29966
##བ 29967
##མ 29968
##འ 29969
##ར 29970
##ལ 29971
##ས 29972
##မ 29973
##ა 29974
##ბ 29975
##გ 29976
##დ 29977
##ე 29978
##ვ 29979
##თ 29980
##ი 29981
##კ 29982
##ლ 29983
##მ 29984
##ნ 29985
##ო 29986
##რ 29987
##ს 29988
##ტ 29989
##უ 29990
##ᄀ 29991
##ᄂ 29992
##ᄃ 29993
##ᄅ 29994
##ᄆ 29995
##ᄇ 29996
##ᄉ 29997
##ᄊ 29998
##ᄋ 29999
##ᄌ 30000
##ᄎ 30001
##ᄏ 30002
##ᄐ 30003
##ᄑ 30004
##ᄒ 30005
##ᅡ 30006
##ᅢ 30007
##ᅥ 30008
##ᅦ 30009
##ᅧ 30010
##ᅩ 30011
##ᅪ 30012
##ᅭ 30013
##ᅮ 30014
##ᅯ 30015
##ᅲ 30016
##ᅳ 30017
##ᅴ 30018
##ᅵ 30019
##ᆨ 30020
##ᆫ 30021
##ᆯ 30022
##ᆷ 30023
##ᆸ 30024
##ᆼ 30025
##ᴬ 30026
##ᴮ 30027
##ᴰ 30028
##ᴵ 30029
##ᴺ 30030
##ᵀ 30031
##ᵃ 30032
##ᵇ 30033
##ᵈ 30034
##ᵉ 30035
##ᵏ 30037
##ᵒ 30039
##ᵖ 30040
##ᵗ 30041
##ᵣ 30043
##ᵤ 30044
##ᵥ 30045
##ᶜ 30046
##ᶠ 30047
##‐ 30048
##‑ 30049
##‒ 30050


2689

In [56]:
len(subset_ratio_100M['subset_present_tokens'])

27833

In [57]:
def subset_metadata(subset_dict):
    '''
    Print three summary figures for a book subset, one per line:
    1. the number of books in the subset (length of 'subset_booklist')
    2. the value stored under 'subset_total_tokens'
    3. the value stored under 'subset_unique_tokens'
    '''
    book_count = len(subset_dict['subset_booklist'])
    print(book_count)
    # The two remaining figures are stored ready-made in the dict; print them in order.
    for key in ('subset_total_tokens', 'subset_unique_tokens'):
        print(subset_dict[key])

In [58]:
subset_metadata(subset_ratio_100K)

51
99974.0
13040


In [59]:
subset_metadata(subset_ratio_1M)

178
999825.0
24294


In [60]:
subset_metadata(subset_ratio_10M)

656
9977907.0
27607


In [61]:
subset_metadata(subset_ratio_100M)

828
28660288.0
27833


In [62]:
#It seems the cleaner leaves in some other stuff, we leave this in given that it includes the alphabet.
super_cleaner(load_etext(23594), -1)

[' a b c d e f g h i j k l m n o p q r s t u v w x y z &.',
 'Online Distributed Proofreading Team at http://www.pgdp.net (This file was produced from images generously made available by The Internet Archive/American Libraries.)',
 'fi ff fl ffl ffi.']

In [63]:
book_id = 22818

In [64]:
def stored_sentences(book_id, data_dir='../pretraining_data_chunked',
                     filenames=('sentences_8.pkl', 'sentences_32.pkl', 'sentences_128.pkl')):
    '''
    Print the pickled sentence lists stored for one book.

    Prints the book id, then the unpickled content of every file listed in
    `filenames` under `<data_dir>/<book_id>/`, and finally a separator line.

    book_id:   identifier of the book; str(book_id) is used as the sub-directory name.
    data_dir:  directory holding one sub-directory per book. Defaults to the
               location that was previously hard-coded.
    filenames: names of the pickle files to print, in print order.

    Raises whatever open() / pickle.load() raise (e.g. FileNotFoundError when
    one of the files does not exist).
    '''
    print(book_id)
    book_dir = os.path.join(data_dir, str(book_id))
    for filename in filenames:
        with open(os.path.join(book_dir, filename), 'rb') as f:
            # NOTE: unpickling can execute arbitrary code; only load files
            # that were written by this project's own pipeline.
            sentences = pickle.load(f)
        print(sentences)
    print('==============')

In [65]:
stored_sentences(13167)

13167
[' ~Notes on Books.~', ' ~Ready May 2.~', 'Most sincerely yours,', '~Ready May 9.~']
['   Now why did Arthur Hoare pull out   A sovereign with a happy shout   And give it rashly to his scout,     Who almost had a fit?', '  And why so harshly did he pelt   With forks a fresh and timorous Celt   Afraid to utter what he felt?     Arthur had got his Blue!', '  Why of a sudden did he fling   A hard-boiled egg at Eustace Ling,   Forgetting how an egg can sting     The person who is hit?', ' ~The Novel of the Season~', '"A very ~engrossing story~."--Graphic.', '"The ~best mystery novel~ since Sir A. Conan Doyle\'s \'Sign of Four.\'"--Daily Graphic.', '"The ~best story~ of its kind we have read for years."--Guardian.', '"We can hardly praise too highly the beauty and exquisite simplicity of these talks."--Literary World.', '"~Great ingenuity~ is shown in the way in which clue is crossed by counter-clue."--The Daily Telegraph.', 'A striking study of nomadic life among the peasant classes,

The original plan for tokenization may not work as well as desired for certain books with specific text entries
============

Take, for example, book 23880.

In [66]:
tensors = torch.load('../pretraining_data_truncated/23880/tensors_128.pt')
tensors

{'input_ids': tensor([[  101.,   103.,  1010.,  ...,     0.,     0.,     0.],
         [  101., 11914.,   103.,  ...,     0.,     0.,     0.],
         [  101., 11914.,  1010.,  ...,     0.,     0.,     0.],
         ...,
         [  101.,  2146.,  1999.,  ...,     0.,     0.,     0.],
         [  101.,  2146.,  1999.,  ...,     0.,     0.,     0.],
         [  101.,  2146.,  1999.,  ...,     0.,     0.,     0.]]),
 'attention_mask': tensor([[1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         ...,
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.]]),
 'labels': tensor([[  101., 11914.,  1010.,  ...,     0.,     0.,     0.],
         [  101., 11914.,  1010.,  ...,     0.,     0.,     0.],
         [  101., 11914.,  1010.,  ...,     0.,     0.,     0.],
         ...,
         [  101.,  2146.,  1999.,  ...,     0.,     0.,     0.],
         [  101.,

None of the ratios given at the end of one of the longer sentences (e.g. `4/2`, `1/1`, `4/5`) get masked, because of how we replace text with masks in the sentence (a result of the whole-word masking strategy).

In [67]:
# Decode the last 15 rows of `input_ids` back into text, dropping '[PAD]' tokens,
# so the masked variants of a sentence can be compared by eye.
# NOTE(review): relies on the notebook-level names `tensors` (loaded in the cell
# above) and `tokenizer`; in this part of the notebook `tokenizer` is only assigned
# in a later cell -- confirm it is defined before this cell on a fresh run.
for row in tensors['input_ids'][-15:]:
    print(tokenizer.convert_tokens_to_string([x for x in tokenizer.convert_ids_to_tokens(row) if x != '[PAD]']))

[CLS] as may be readily seen by comparing specimens of l . borealis and l . cinereus from mexico ( or also from [MASK] place in north america north of mexico ) , [MASK] description by saussure applies to [MASK] hoary bat ( lasiurus cinereus ) and not to [MASK] red bat ( lasiurus borealis ) . [SEP]
[CLS] as may be readily seen by comparing specimens of l . borealis and l . cinereus from mexico ( or also from any place in north america north of mexico ) , the description by saussure applies to the [MASK] [MASK] bat ( lasiurus cinereus ) and not to the [MASK] bat ( lasiurus borealis ) . [SEP]
[CLS] as may be readily seen by comparing specimens of l . borealis and l . cinereus from mexico ( or also from any place in north america north of mexico ) , the description by saussure applies to the hoary bat ( lasiurus cinereus ) and [MASK] to the red bat ( lasiurus borealis ) . [SEP]
[CLS] as may be readily see by compare specimen of l . boreali and l . cinereus from mexico ( or also from any pl

In [100]:
nlp = spacy.load('en_core_web_sm')

In [105]:
doc = nlp('I am going to Albert Heijn')
for token in doc:
    print('|', token.text, '|', token.pos_, '|', token.lemma_)

(Albert Heijn,)

In [116]:
doc = nlp("I am visiting at 5 o'clock")
doc.ents, doc.ents[0].label_

((5 o'clock,), 'TIME')

In [69]:
doc = nlp('Long inrolled tail; femoral patagium as in the vespertilios. Teeth 4/2, 1/1, 4/5 or 5/5.')

In [70]:
print(tokenizer.convert_ids_to_tokens(tokenizer('Long inrolled tail; femoral patagium as in the vespertilios. Teeth 4/2, 1/1, 4/5 or 5/5.')['input_ids']))

['[CLS]', 'long', 'in', '##roll', '##ed', 'tail', ';', 'fe', '##moral', 'pat', '##agi', '##um', 'as', 'in', 'the', 've', '##sper', '##ti', '##lio', '##s', '.', 'teeth', '4', '/', '2', ',', '1', '/', '1', ',', '4', '/', '5', 'or', '5', '/', '5', '.', '[SEP]']


In [71]:
# spaCy keeps each ratio (e.g. 4/2) as a single token, whereas BERT splits it
# into separate tokens ('4', '/', '2'); print spaCy's text, POS tag and lemma per token.
for token in doc:
    print(token.text, '|', token.pos_, '|', token.lemma_)

Long | ADV | long
inrolled | VERB | inrolle
tail | NOUN | tail
; | PUNCT | ;
femoral | ADJ | femoral
patagium | NOUN | patagium
as | ADP | as
in | ADP | in
the | DET | the
vespertilios | NOUN | vespertilio
. | PUNCT | .
Teeth | PROPN | Teeth
4/2 | NUM | 4/2
, | PUNCT | ,
1/1 | NUM | 1/1
, | PUNCT | ,
4/5 | NUM | 4/5
or | CCONJ | or
5/5 | NUM | 5/5
. | PUNCT | .


In [72]:
def investigate_last_n_tensors(tensor_file, n=15, show_pads=False, start_point=None, tok=None):
    '''
    Decode and print `n` rows of the 'input_ids' tensor stored in `tensor_file`.

    tensor_file: path to a file readable by torch.load that yields a mapping
                 with an 'input_ids' entry (assumed to hold one token-id
                 sequence per row -- TODO confirm for other data directories).
    n:           number of rows to print.
    show_pads:   when False (default) '[PAD]' tokens are removed before the
                 tokens are joined into a string.
    start_point: when None, the last `n` rows are printed; otherwise the rows
                 in the slice [start_point - n : start_point].
    tok:         tokenizer providing convert_ids_to_tokens() and
                 convert_tokens_to_string(). When None, the notebook-level
                 `tokenizer` is used, which must then already be defined.

    Returns None; the decoded rows are printed one per line.
    '''
    tensors = torch.load(tensor_file)

    if start_point is None:
        rows = tensors['input_ids'][-n:]
    else:
        rows = tensors['input_ids'][start_point - n:start_point]

    if tok is None:
        # Backward-compatible fallback on the notebook global.
        tok = tokenizer

    for row in rows:
        tokens = tok.convert_ids_to_tokens(row)
        if not show_pads:
            tokens = [t for t in tokens if t != '[PAD]']
        print(tok.convert_tokens_to_string(tokens))

In [73]:
investigate_last_n_tensors('../pretraining_data_onlylemmatized/116/tensors_32.pt')

[CLS] most people start at our website which have the main pg search facility : www . gutenberg . org [SEP]


In [74]:
investigate_last_n_tensors('../pretraining_data_onlyposbased/116/tensors_32.pt')

[CLS] [MASK] people start at our website which has the [MASK] pg search facility : www . gutenberg . org [SEP]
[CLS] most [MASK] start at our [MASK] which has the main [MASK] [MASK] [MASK] : www . gutenberg . org [SEP]
[CLS] most people [MASK] at our website which [MASK] the main pg search facility : www . gutenberg . org [SEP]
[CLS] most people start [MASK] our website which has the main pg search facility : www . gutenberg . org [SEP]
[CLS] most people start at [MASK] website which has the main pg search facility : www . gutenberg . org [SEP]
[CLS] most people start at our website [MASK] has [MASK] main pg search facility : www . gutenberg . org [SEP]
[CLS] most people start at our website which has the main pg search facility [MASK] www . gutenberg . org [SEP]


In [75]:
investigate_last_n_tensors('../pretraining_data_chunked/116/tensors_32.pt')

[CLS] [MASK] people start at our website which has the [MASK] pg search facility : www . gutenberg . org [SEP]
[CLS] most [MASK] start at our [MASK] which has the main [MASK] [MASK] [MASK] : www . gutenberg . org [SEP]
[CLS] most people [MASK] at our website which [MASK] the main pg search facility : www . gutenberg . org [SEP]
[CLS] most people start [MASK] our website which has the main pg search facility : www . gutenberg . org [SEP]
[CLS] most people start at [MASK] website which has the main pg search facility : www . gutenberg . org [SEP]
[CLS] most people start at our website [MASK] has [MASK] main pg search facility : www . gutenberg . org [SEP]
[CLS] most people start at our website which has the main pg search facility [MASK] www . gutenberg . org [SEP]
[CLS] most people start at our website which have the main pg search facility : www . gutenberg . org [SEP]


In [76]:
investigate_last_n_tensors('../pretraining_data_singlelength/116/tensors_128.pt', 10, start_point=-90)

[CLS] international donation be gratefully accept , but we cannot make any statement concern tax treatment of donation receive from outside the united states . u . s . law alone swamp our small staff . [SEP]
[CLS] [MASK] people start at our website which has the [MASK] pg search facility : www . gutenberg . org [SEP]
[CLS] most [MASK] start at our [MASK] which has the main [MASK] [MASK] [MASK] : www . gutenberg . org [SEP]
[CLS] most people [MASK] at our website which [MASK] the main pg search facility : www . gutenberg . org [SEP]
[CLS] most people start [MASK] our website which has the main pg search facility : www . gutenberg . org [SEP]
[CLS] most people start at [MASK] website which has the main pg search facility : www . gutenberg . org [SEP]
[CLS] most people start at our website [MASK] has [MASK] main pg search facility : www . gutenberg . org [SEP]
[CLS] most people start at our website which has the main pg search facility [MASK] www . gutenberg . org [SEP]
[CLS] most people 

In [77]:
investigate_last_n_tensors('../pretraining_data_truncated/116/tensors_128.pt')

[CLS] this website includes information about project gutenberg - tm , including how to make donations to the project gutenberg literary archive foundation , how to help produce our new ebooks , [MASK] how to subscribe to our email newsletter to hear about new ebooks . [SEP]
[CLS] this website include information about project gutenberg - tm , include how to make donation to the project gutenberg literary archive foundation , how to help produce our new ebooks , and how to subscribe to our email newsletter to hear about new ebooks . [SEP]
[CLS] [MASK] we cannot and do not solicit contributions from states where we have not met the solicitation requirements , we know of no prohibition against accepting unsolicited donations from donors in such states who approach us with offers to donate . [SEP]
[CLS] while [MASK] cannot and do not solicit contributions from states where [MASK] have not met the solicitation requirements , [MASK] know of no prohibition against accepting unsolicited donat

In [78]:
import torch

In [79]:
torch.cuda.is_available()

True

In [80]:
torch.cuda.get_device_name(0)

'GeForce GTX 960'

In [81]:
text = "Anne went to the Albert Heijn at 5 o'clock to buy some milk for me."

In [82]:
default_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
default_tokenizer(text)

{'input_ids': [101, 4776, 2253, 2000, 1996, 4789, 2002, 28418, 2078, 2012, 1019, 1051, 1005, 5119, 2000, 4965, 2070, 6501, 2005, 2033, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [83]:
ST_tokenizer = StrategizedTokenizer(padding=True)
inputs = ST_tokenizer.tokenize(text)
inputs

{'input_ids': tensor([[  101,   103,  2253,  2000,  1996,   103,   103,   103,   103,  2012,
          1019,  1051,  1005,  5119,  2000,  4965,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,   103,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
          1019,  1051,  1005,  5119,  2000,   103,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,   103,  1996,  4789,  2002, 28418,  2078,   103,
          1019,  1051,  1005,  5119,   103,  4965,  2070,  6501,   103,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,   103,  4789,  2002, 28418,  2078,  2012,
          1019,  1051,  1005,  5119,  2000,  4965,   103,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
           103,  1051,  1005,  5119,  2000,  4965,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,  1996,  4789,  2002, 28418,  2078,  201

In [84]:
#1344
#14596

test_book = super_cleaner(load_etext(14596), -1)

In [85]:
longest_sentence = str(test_book[np.argmax([len(par) for par in test_book])])
#longest_sentence = text = '''He is really a good man, and is lucky enough, or the reverse, to win the hand of a delightful young lady whose charms, however, do not command the unanimous approval of the parishioners. ssession of high musical attainments makes her temperament all the more interesting, and accounts for the presence in so remote a district of her German friend whose acute sense of the rius leads to such untoward results. It is hard to say whether the author's talents are best evinced by her true pathos or by the delicate touches of humour which pervade the book.Another cable feature of the novel is an alert skill in construction which stamps it as a thoroughly artistic production.'''
len(longest_sentence)

7685

In [86]:
longest_sentence[:100]

'There is another expression which must be considered in connexion with the mediæval doctrine of deif'

In [87]:
SC = SentenceChunker(default_tokenizer)

In [88]:
tokens, sentences = SC.sentence_chunker(longest_sentence, 128, return_tokens=True)

In [89]:
#1 paragraph gets split into multiple sentences
sentences

['There is another expression which must be considered in connexion with the mediæval doctrine of deification. This is the intellectus agens, or [Greek: nous poiêtikos], which began its long history in Aristotle (De Anima, iii. 5). Aristotle there distinguishes two forms of Reason, which are related to each other as form and matter.Reason becomes all things, for the matter of anything is potentially the whole class to which it belongs; but Reason also makes all things, that is to say, it communicates to things those categories by which they become objects of thought.',
 'This higher Reason is separate and impassible ([Greek: chôristos kai amigês kai apathês]); it is eternal and immortal; while the passive reason perishes with the body. The creative Reason is immanent both in the human mind and in the external world; and thus only is it possible for the mind to know things. Unfortunately, Aristotle says very little more about his [Greek: nous poiêtikos], and does not explain how the two

In [90]:
naval_warfare_paragraph = """I have so far treated blockade as the initial stage of a struggle for the command of the sea. That appears to me to be the logical order of treatment, because when two naval Powers go to war it is almost certain that the stronger of the two will at the outset attempt to blockade the naval forces of the other. The same thing is likely to happen even if the two are approximately equal in naval force, but in that case the blockade is not likely to be of long duration, because both sides will be eager to obtain a decision in the open. The command of the sea is a matter of such vital moment to both sides that each must needs seek to obtain it as soon and as completely as possible, and the only certain way to obtain it is by the destruction of the armed forces of the enemy. The advantage of putting to sea first is in naval warfare the equivalent or counterpart of the advantage in land warfare of first crossing the enemy's frontier. If that advantage is pushed home and the enemy is still unready it must lead to a blockade. It is, moreover, quite possible that even if both belligerents are equally ready—I am[Pg 21] here assuming them to be approximately equal in force—one or other, if not both, may think it better strategy to await developments before risking everything in an attempt to secure an immediate decision. In point of fact, the difference between this policy and the policy of a declared blockade is, as I am about to show, almost imperceptible, especially in modern conditions of naval warfare. It is therefore necessary to consider the subject of blockade more in detail. Other subjects closely associated with this will also have to be considered in some detail before we can grasp the full purport and extent of what is meant by the command of the sea."""

In [91]:
len(default_tokenizer(naval_warfare_paragraph, add_special_tokens=False)['input_ids'])

367

In [92]:
len(default_tokenizer("Yes.", add_special_tokens=False)['input_ids'])

2

In [93]:
class ExampleListDataset(torch.utils.data.Dataset):
    """Three hard-coded token-id sequences (zero-filled to 13 ids each).

    The same tensor serves as both the model input and the labels.
    """

    def __init__(self):
        token_rows = [
            [101, 1996, 2622, 9535, 11029, 26885, 1997, 102, 0, 0, 0, 0, 0],
            [101, 2198, 9535, 11029, 1010, 2011, 8965, 3854, 22033, 9050, 3064, 102, 0],
            [101, 2102, 2023, 26885, 2003, 2005, 1996, 2224, 1997, 3087, 5973, 2012, 102],
        ]
        id_matrix = torch.tensor(np.array(token_rows)).long()
        # Inputs and labels deliberately share one tensor object.
        self.encodings = self.labels = id_matrix

    def __len__(self):
        """Number of example sequences."""
        return len(self.encodings)

    def __getitem__(self, i):
        """Return the i-th example as a dict with 'input_ids' and 'labels'."""
        sample = {}
        sample['input_ids'] = self.encodings[i]
        sample['labels'] = self.labels[i]
        return sample


In [94]:
from transformers.data.data_collator import DataCollatorWithPadding

In [95]:
train_data = ExampleListDataset()

In [96]:
train_data.encodings

tensor([[  101,  1996,  2622,  9535, 11029, 26885,  1997,   102,     0,     0,
             0,     0,     0],
        [  101,  2198,  9535, 11029,  1010,  2011,  8965,  3854, 22033,  9050,
          3064,   102,     0],
        [  101,  2102,  2023, 26885,  2003,  2005,  1996,  2224,  1997,  3087,
          5973,  2012,   102]])

In [97]:
# Configuration for a very small BERT (2 layers, 2 attention heads, hidden size 128)
# used to smoke-test the training code quickly.
bert_tiny_config = {"hidden_size": 128, 
                    "hidden_act": "gelu", 
                    "initializer_range": 0.02, 
                    "vocab_size": 30522,  # presumably the bert-base-uncased vocabulary size -- TODO confirm
                    "hidden_dropout_prob": 0.1, 
                    "num_attention_heads": 2, 
                    "type_vocab_size": 2, 
                    "max_position_embeddings": 128,  # longest sequence the model can take
                    "num_hidden_layers": 2, 
                    "intermediate_size": 512, 
                    "attention_probs_dropout_prob": 0.1}

# The model is built from the config (no from_pretrained call), then switched to training mode.
model = BertForMaskedLM(config=BertConfig(**bert_tiny_config))
model.train();  # trailing ';' suppresses the model repr in the cell output

In [98]:
model.forward(input_ids=train_data.encodings, labels=train_data.encodings)

(tensor(10.3304, grad_fn=<NllLossBackward>),
 tensor([[[ 0.0237, -0.2144, -0.1519,  ..., -0.4443, -0.8854, -0.1942],
          [ 0.0327,  0.0298,  0.1226,  ...,  0.2167,  0.0234, -0.0708],
          [-0.0855, -0.2171,  0.1464,  ..., -0.0432, -0.3879,  0.1810],
          ...,
          [ 0.0290,  0.1397, -0.1061,  ...,  0.0944, -0.3605, -0.1195],
          [-0.0816,  0.1649,  0.0586,  ...,  0.1678,  0.1780, -0.5648],
          [-0.1088,  0.1624,  0.0525,  ..., -0.2676, -0.3393, -0.3727]],
 
         [[ 0.0587, -0.2009, -0.1228,  ..., -0.5022, -0.8236, -0.2353],
          [-0.0910,  0.2893,  0.0682,  ...,  0.0496, -0.0850,  0.2779],
          [-0.1971, -0.0808, -0.3896,  ..., -0.0490, -0.0134,  0.2566],
          ...,
          [-0.2041, -0.1826, -0.0579,  ..., -0.3026, -0.3270,  0.0961],
          [-0.2406,  0.1880, -0.0122,  ...,  0.0579, -0.1970, -0.1904],
          [-0.2120,  0.3371, -0.0657,  ..., -0.2242, -0.3357, -0.3525]],
 
         [[ 0.0768, -0.1245, -0.2199,  ..., -0.4651, -0

In [99]:
# Smoke test: train the tiny model for 3 steps on the hard-coded example dataset.
# NOTE(review): the recorded run of this cell failed with
# "TypeError: __init__() got an unexpected keyword argument 'save_strategy'",
# so the installed transformers version does not accept that argument --
# presumably it is too old; confirm and upgrade (or drop the argument) before re-running.
output_dir = './sample_model'
training_args = TrainingArguments(
    output_dir= os.path.join(output_dir, 'model'),          # output directory
    overwrite_output_dir = True,
    save_strategy='no',  # don't make checkpoints; retraining is easier than continuing for this experiment
    max_steps = 3,
    per_device_train_batch_size=2,  # batch size per device during training
    #per_device_eval_batch_size=256,   # batch size for evaluation
    learning_rate=1e-5,     
    logging_dir= os.path.join(output_dir, 'model', 'logs/'),            # directory for storing logs
    logging_steps=1
)

trainer = Trainer(
    args=training_args,
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    train_dataset=train_data,         # training dataset
    eval_dataset=None            # no evaluation dataset is passed
)

train_output = trainer.train()
#trainer.save_model(os.path.join(output_dir, 'model'))


TypeError: __init__() got an unexpected keyword argument 'save_strategy'

In [None]:
from transformers.data.data_collator import DataCollatorForWholeWordMask
from dataset.dataset import DefaultTokenizerDataset

In [None]:
train_dataset = DefaultTokenizerDataset(datadir='../pretraining_data_chunked', max_seq_length=128)
train_dataset.populate(book_list=[24269])

In [None]:
# Tokenizer for the default (non-strategized) pipeline.
# NOTE(review): truncation / max_length / padding are normally arguments of the
# tokenizer *call* rather than of from_pretrained(); they are kept unchanged here,
# but confirm they have the intended effect.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          truncation=True, 
                                          max_length=128,
                                          padding='max_length')
# Whole-word-mask collator for masked language modelling, masking probability 0.15.
# (The original line assigned `data_collator` twice in a chain; once is enough.)
data_collator = DataCollatorForWholeWordMask(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
# Same tiny BERT configuration as in the earlier In [97] cell (2 layers, 2 attention
# heads, hidden size 128); re-created here so this run starts from a fresh model.
bert_tiny_config = {"hidden_size": 128, 
                    "hidden_act": "gelu", 
                    "initializer_range": 0.02, 
                    "vocab_size": 30522,  # presumably the bert-base-uncased vocabulary size -- TODO confirm
                    "hidden_dropout_prob": 0.1, 
                    "num_attention_heads": 2, 
                    "type_vocab_size": 2, 
                    "max_position_embeddings": 128,  # same value as max_length / max_seq_length in the cells above
                    "num_hidden_layers": 2, 
                    "intermediate_size": 512, 
                    "attention_probs_dropout_prob": 0.1}

# The model is built from the config (no from_pretrained call), then switched to training mode.
model = BertForMaskedLM(config=BertConfig(**bert_tiny_config))
model.train();  # trailing ';' suppresses the model repr in the cell output

In [None]:
# Trainer and TrainingArguments are already imported in the notebook's first cell;
# this import only repeats that.
from transformers import Trainer, TrainingArguments

# Smoke-test settings: 4 optimisation steps, batch size 1, logging every step.
# NOTE(review): an earlier cell's recorded run raised TypeError on `save_strategy`;
# this cell passes the same argument and has not been executed -- confirm the
# installed transformers version accepts it.
training_args = TrainingArguments(
    output_dir="../test-default_bert",
    overwrite_output_dir=True,
    max_steps=4,
    save_strategy='no',
    per_device_train_batch_size=1,
    logging_steps=1,
    
    # Hyperparameters as per the BERT paper which are not default values in TrainingArguments
    warmup_ratio=0.1,
    learning_rate=1e-4,
    weight_decay=0.01
)

# The collator from the cell above does the masking; training itself is started in the next cell.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

In [None]:
trainer.train()

In [None]:
train_dataset.encodings

In [None]:
model