In [1]:
#libraries involved in cleaning
from cleaner_utils import super_cleaner
from pretraining_data_utils import make_book_token_frequency, token_freq_df_to_dict, \
                                    all_available_tokens_from_df, optimize_book_subset, optimize_book_subset_ratio
from pretraining_data_utils import book_properties, make_df_book_properties
from pretraining_data_utils import SentenceChunker
from gutenberg.acquire import load_etext


#Library utilities
from tokenizer.tokenizer import StrategizedTokenizer
from dataset.dataset import StrategizedTokenizerDataset
from dataset.dataset import DefaultTokenizerDataset

#Training code
from transformers import BertConfig
from transformers import BertForMaskedLM
from transformers import BertTokenizer
from transformers import AdamW
from transformers import Trainer, TrainingArguments

from torch.utils.data import DataLoader

#General imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import re
import json
import torch
import logging
import pickle
import os

from more_itertools import take
from datetime import datetime

In [2]:
cache_dir = 'cached_files/'

In [3]:
# Read scraped metadata from the gutenberg metadata database 
#(Original data was scraped by using https://github.com/c-w/gutenberg)
#The data is then further preprocessed by https://github.com/hugovk/gutenberg-metadata so it is actually usable.

f = open(cache_dir + 'gutenberg-metadata.json', 'r')
metadata = json.load(f)
f.close()

In [4]:
#retrieve how many english books there in english
english_book_keys = [key for key in metadata.keys() if metadata[key]['language'] == ['en']]
len(english_book_keys)

13142

In [5]:
# The third book cant be retrieved because of faults in retrieval. This happens sometimes.
import traceback
import sys

try:
    super_cleaner(load_etext(14575), -1, verify_deletions=True)
except Exception as e:
    try:
        exc_info = sys.exc_info()
    finally:
        # Display the *original* exception
        traceback.print_exception(*exc_info)
        del exc_info


Traceback (most recent call last):
  File "<ipython-input-5-0358ce9648a3>", line 6, in <module>
    super_cleaner(load_etext(14575), -1, verify_deletions=True)
  File "C:\Users\s145733\Anaconda3\lib\site-packages\gutenberg\acquire\text.py", line 152, in load_etext
    text = cache.read().decode('utf-8')
  File "C:\Users\s145733\Anaconda3\lib\gzip.py", line 292, in read
    return self._buffer.read(size)
  File "C:\Users\s145733\Anaconda3\lib\gzip.py", line 470, in read
    self._read_eof()
  File "C:\Users\s145733\Anaconda3\lib\gzip.py", line 516, in _read_eof
    raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
gzip.BadGzipFile: CRC check failed 0x0 != 0xd0c5998f


In [6]:
#retrieve how many english books there are actually loadable
#If books arent cached this may take a while because it needs to scrape the books from gutenberg.org
#Therefore i provide a pre-processed file
if os.path.isfile(cache_dir + 'loadable_english_book_keys.pkl'):
    with open(cache_dir + 'loadable_english_book_keys.pkl', 'rb') as f:
        loadable_english_book_keys = pickle.load(f)              
else:
    loadable_english_book_keys = []
    i = 0
    for key in english_book_keys:
        if i % 1000 == 0:
            print(i, datetime.now())
        i += 1
        try:
            load_etext(int(key))
            loadable_english_book_keys.append(key)
        except:
            continue
    with open(cache_dir + 'loadable_english_book_keys.pkl', 'wb') as f:
        pickle.dump(loadable_english_book_keys, f)
            
len(loadable_english_book_keys)

12640

In [7]:
#Randomly select 10 books that we can query
np.random.seed(42)
rand_10_books = [x for x in np.random.choice(loadable_english_book_keys, size=10)]
rand_20_books = [x for x in np.random.choice(loadable_english_book_keys, size=20)]
print(rand_10_books), print(rand_20_books)

['17255', '1742', '14870', '14596', '23436', '22563', '15306', '15976', '1344', '13579']
['15116', '23050', '22669', '22310', '18782', '10343', '1650', '21698', '16831', '11194', '14752', '14429', '16170', '2078', '13766', '12310', '23892', '16144', '22293', '19224']


(None, None)

In [8]:
# Titles and authors for the first 5 books
# 1 book isnt actually loadable, see below.
for book_id in rand_10_books[:5]:
    print(book_id, metadata[book_id]['author'], metadata[book_id]['title'])

17255 ['Alma-Tadema, Laurence'] ['The Wings of Icarus: Being the Life of one Emilia Fletcher']
1742 ['Davis, Richard Harding'] ['Miss Civilization: A Comedy in One Act']
14870 ['Hopkinson, Alfred, Sir'] ['Rebuilding Britain: A Survey of Problems of Reconstruction After the World War']
14596 ['Inge, William Ralph'] ['Christian Mysticism']
23436 ['Anonymous'] ['Aladdin or The Wonderful Lamp']


Text preprocessing
===================

In [9]:
# original unprocessed text
text = load_etext(50000)[:500]
text

'The Project Gutenberg EBook of John Gutenberg, by Franz von Dingelstedt\r\n\r\nThis eBook is for the use of anyone anywhere at no cost and with\r\nalmost no restrictions whatsoever.  You may copy it, give it away or\r\nre-use it under the terms of the Project Gutenberg License included\r\nwith this eBook or online at www.gutenberg.org/license\r\n\r\n\r\nTitle: John Gutenberg\r\n       First Master Printer, His Acts and Most Remarkable\r\n       Discourses and his Death\r\n\r\nAuthor: Franz von Dingelstedt\r\n\r\nRelease Da'

In [10]:
#Text with formatting
print(text)

The Project Gutenberg EBook of John Gutenberg, by Franz von Dingelstedt

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org/license


Title: John Gutenberg
       First Master Printer, His Acts and Most Remarkable
       Discourses and his Death

Author: Franz von Dingelstedt

Release Da


Use the cleaner to retrieve cleaned text from the first book of the random selection.
The _super_cleaner_ strips a headers/disclaimers/tables that are not required for our purposes.

In [11]:

sentences = super_cleaner(load_etext(16968), -1, verify_deletions=False)

In [12]:
#Text is now a list of paragraphs
sentences[:10]

[' "And now for business," Lopez said. "And remember zat he what tells a lie shall be right away shotted." In his excitement he lost the little English he had.',
 ' "Put all ze men outside," Lopez ordered. Venustiano and Pedro, his chief lieutenants, obeyed at once, forcing them to march ahead of them, and standing guard over them near a great cactus bush a few feet from the adobe. "Leave ze women with me," the bandit continued. "But first, Alvarada, you find ze cook. I am \'ongry."',
 ' "Red" Giddings had been on the ranch with Gilbert since the very beginning. He came from the North with the young man, willing to stake all on this one venture. Like young Jones, he was not afraid. He was an efficient, well-set-up young fellow, with three consuming passions: Arizona, his harmonica, and Angela Hardy. The first saw a lot of "Red"; the second touched his lips frequently; but as for Angela--well, perhaps the poor boy kissed his harmonica so often in order to forget her lips. But if his own

In [13]:
#with some short sentences
sorted(sentences, key=len)[:20]

['"No."',
 '"Gun?"',
 '"Why?"',
 '"Yes."',
 '"Pells?"',
 '"A what?"',
 '"I have?"',
 '"Joking?"',
 '"Really?"',
 '"I ain\'t!"',
 '"Kiss me!"',
 '"Uh--huh!"',
 '"In a way."',
 '"What for?"',
 '"Yes, sir!"',
 '"Yes; why?"',
 'She nodded.',
 '"All those?"',
 '"You won\'t?"',
 'She started.']

In [14]:
list(sentences)

[' "And now for business," Lopez said. "And remember zat he what tells a lie shall be right away shotted." In his excitement he lost the little English he had.',
 ' "Put all ze men outside," Lopez ordered. Venustiano and Pedro, his chief lieutenants, obeyed at once, forcing them to march ahead of them, and standing guard over them near a great cactus bush a few feet from the adobe. "Leave ze women with me," the bandit continued. "But first, Alvarada, you find ze cook. I am \'ongry."',
 ' "Red" Giddings had been on the ranch with Gilbert since the very beginning. He came from the North with the young man, willing to stake all on this one venture. Like young Jones, he was not afraid. He was an efficient, well-set-up young fellow, with three consuming passions: Arizona, his harmonica, and Angela Hardy. The first saw a lot of "Red"; the second touched his lips frequently; but as for Angela--well, perhaps the poor boy kissed his harmonica so often in order to forget her lips. But if his own

In [15]:
#Find some properties about the book
book_properties(sentences)

[2041, 5, 1532, 75140, 353]

Tokenization in practice
============

In [16]:
#initialize custom tokenizer
ST_tokenizer = StrategizedTokenizer(padding=False)

In [17]:
inputs = ST_tokenizer.tokenize("Anne went to the Albert Heijn at 5 o'clock to buy some milk for me.")
inputs

{'input_ids': tensor([[  101,   103,  2253,  2000,  1996,   103,   103,   103,   103,  2012,
          1019,  1051,  1005,  5119,  2000,  4965,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,   103,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
          1019,  1051,  1005,  5119,  2000,   103,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,   103,  1996,  4789,  2002, 28418,  2078,   103,
          1019,  1051,  1005,  5119,   103,  4965,  2070,  6501,   103,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,   103,  4789,  2002, 28418,  2078,  2012,
          1019,  1051,  1005,  5119,  2000,  4965,   103,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
           103,  1051,  1005,  5119,  2000,  4965,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,  1996,  4789,  2002, 28418,  2078,  201

In [18]:
#Masks are at different places
for masked_line in ST_tokenizer.convert_ids_to_tokens(inputs['input_ids']):
    print(masked_line)

['[CLS]', '[MASK]', 'went', 'to', 'the', '[MASK]', '[MASK]', '[MASK]', '[MASK]', 'at', '5', 'o', "'", 'clock', 'to', 'buy', 'some', 'milk', 'for', 'me', '.', '[SEP]']
['[CLS]', 'anne', '[MASK]', 'to', 'the', 'albert', 'he', '##ij', '##n', 'at', '5', 'o', "'", 'clock', 'to', '[MASK]', 'some', 'milk', 'for', 'me', '.', '[SEP]']
['[CLS]', 'anne', 'went', '[MASK]', 'the', 'albert', 'he', '##ij', '##n', '[MASK]', '5', 'o', "'", 'clock', '[MASK]', 'buy', 'some', 'milk', '[MASK]', 'me', '.', '[SEP]']
['[CLS]', 'anne', 'went', 'to', '[MASK]', 'albert', 'he', '##ij', '##n', 'at', '5', 'o', "'", 'clock', 'to', 'buy', '[MASK]', 'milk', 'for', 'me', '.', '[SEP]']
['[CLS]', 'anne', 'went', 'to', 'the', 'albert', 'he', '##ij', '##n', 'at', '[MASK]', 'o', "'", 'clock', 'to', 'buy', 'some', 'milk', 'for', 'me', '.', '[SEP]']
['[CLS]', 'anne', 'went', 'to', 'the', 'albert', 'he', '##ij', '##n', 'at', '5', 'o', "'", 'clock', 'to', 'buy', 'some', '[MASK]', 'for', 'me', '.', '[SEP]']
['[CLS]', 'anne', 'we

Gutenberg book-selection
==============

In [19]:
#Setting to ignore warnings about sequences being longer than BERT can handle
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
from datetime import datetime
print(datetime.now())
df_books_10 = make_df_book_properties(rand_10_books)
print(datetime.now())

2021-05-06 15:15:49.617052
2021-05-06 15:16:02.747224


In [20]:
df_books_10

Unnamed: 0,book_id,num_sentences,Shortest sentence (char),Longest sentence (char),Total tokens,Longest sequence (tokens)
0,17255,677,8,2083,41672,522
1,1742,239,15,1051,8481,253
2,14870,317,22,5189,62696,1022
3,14596,528,18,7685,115181,1887
4,23436,11,41,293,579,71
5,22563,358,6,2344,16372,530
6,15306,390,9,1672,40223,383
7,15976,1419,10,1724,78126,453
8,1344,263,6,6391,27092,1536
9,13579,870,13,1939,65500,447


In [21]:
#Sort df and account for the fact that the column has both text and numbers
df_books_10.sort_values(by='Shortest sentence (char)')[:10]

Unnamed: 0,book_id,num_sentences,Shortest sentence (char),Longest sentence (char),Total tokens,Longest sequence (tokens)
5,22563,358,6,2344,16372,530
8,1344,263,6,6391,27092,1536
0,17255,677,8,2083,41672,522
6,15306,390,9,1672,40223,383
7,15976,1419,10,1724,78126,453
9,13579,870,13,1939,65500,447
1,1742,239,15,1051,8481,253
3,14596,528,18,7685,115181,1887
2,14870,317,22,5189,62696,1022
4,23436,11,41,293,579,71


In [22]:
#Some books have very few tokens.
df_books_10.sort_values(by='Total tokens', ascending=False).head()

Unnamed: 0,book_id,num_sentences,Shortest sentence (char),Longest sentence (char),Total tokens,Longest sequence (tokens)
3,14596,528,18,7685,115181,1887
7,15976,1419,10,1724,78126,453
9,13579,870,13,1939,65500,447
2,14870,317,22,5189,62696,1022
0,17255,677,8,2083,41672,522


In [23]:
df_books_10.sort_values(by='Total tokens').tail()

Unnamed: 0,book_id,num_sentences,Shortest sentence (char),Longest sentence (char),Total tokens,Longest sequence (tokens)
0,17255,677,8,2083,41672,522
2,14870,317,22,5189,62696,1022
9,13579,870,13,1939,65500,447
7,15976,1419,10,1724,78126,453
3,14596,528,18,7685,115181,1887


In [24]:
#Retrieve token occurences per book in a dataframe and another dataframe with total number of tokens
print(datetime.now())
df_book_token_freq_10, df_10_total_tokens = make_book_token_frequency(rand_10_books)
print(datetime.now())

2021-05-06 15:16:02.836456


100%|██████████| 10/10 [00:07<00:00,  1.28it/s]

2021-05-06 15:16:12.237926





In [25]:
# DataFrame is obviously very sparse
df_book_token_freq_10[:10]

Unnamed: 0,[PAD],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],...,##！,##（,##）,##，,##－,##．,##／,##：,##？,##～
17255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1742,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14870,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14596,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23436,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22563,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15306,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15976,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1344,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13579,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
#total number of tokens per book
df_10_total_tokens

17255     41679.0
1742       8491.0
14870     62696.0
14596    115181.0
23436       579.0
22563     16372.0
15306     40223.0
15976     78170.0
1344      27092.0
13579     65500.0
dtype: float64

In [27]:
#Total number of tokens in our small set
df_10_total_tokens.sum()

455983.0

In [28]:
#All tokens which are present in our subsample of 20 books
all_present_tokens_10 = all_available_tokens_from_df(df_book_token_freq_10)
all_present_tokens_10, len(all_present_tokens_10)

(array([  999,  1000,  1002, ..., 29645, 29664, 29667], dtype=int64), 15198)

In [29]:
#Show first 3 entries
tokens_per_book_10 = token_freq_df_to_dict(df_book_token_freq_10, df_10_total_tokens)
take(3, tokens_per_book_10.items())

[('17255',
  {'tokens': array([  999,  1000,  1005, ..., 29591, 29602, 29667], dtype=int64),
   'total_tokens': 41679.0}),
 ('1742',
  {'tokens': array([  999,  1000,  1005, ..., 28838, 29122, 29586], dtype=int64),
   'total_tokens': 8491.0}),
 ('14870',
  {'tokens': array([  999,  1000,  1005, ..., 29598, 29602, 29609], dtype=int64),
   'total_tokens': 62696.0})]

In [30]:
optimize_book_subset(all_present_tokens_10, tokens_per_book_10, threshold = 1e5)

book best:  14870 new tokens:  5983
book best:  22563 new tokens:  1634
book best:  1742 new tokens:  345
book best:  23436 new tokens:  69


{'subset_booklist': ['14870', '22563', '1742', '23436'],
 'subset_total_tokens': 88138.0,
 'subset_present_tokens': array([  999.,  1000.,  1005., ..., 29602., 29609., 29664.]),
 'subset_unique_tokens': 8031}

In [31]:
print(optimize_book_subset_ratio(all_present_tokens_10, tokens_per_book_10, threshold = 1e5))

book best:  22563 new tokens:  3650 book_total_tokens:  16372.0 ratio:  0.22294160762277057
book best:  23436 new tokens:  125 book_total_tokens:  579.0 ratio:  0.2158894645941278
book best:  1344 new tokens:  2211 book_total_tokens:  27092.0 ratio:  0.08161080761848516
book best:  1742 new tokens:  368 book_total_tokens:  8491.0 ratio:  0.0433400070663055
book best:  15306 new tokens:  2157 book_total_tokens:  40223.0 ratio:  0.053626034855679586
{'subset_booklist': ['22563', '23436', '1344', '1742', '15306'], 'subset_total_tokens': 92757.0, 'subset_present_tokens': array([  999.,  1000.,  1005., ..., 29602., 29664., 29667.]), 'subset_unique_tokens': 8511}


In [32]:
df_book_token_freq = pd.read_csv(cache_dir + 'df_book_token_freq.csv', index_col=0)
df_book_token_freq.head()

Unnamed: 0,[PAD],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],...,##！,##（,##）,##，,##－,##．,##／,##：,##？,##～
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
#Total number of unique tokens in the data
len(np.flatnonzero(df_book_token_freq.sum(axis=0)))

27833

In [34]:
df_total_tokens = pd.read_csv(cache_dir +'df_total_tokens.csv', index_col=0).squeeze()
df_total_tokens.head()

1       7640.0
10    901551.0
11     36249.0
12     40831.0
13      6731.0
Name: 0, dtype: float64

In [35]:
df_total_tokens.sort_values()[:10]

23398     0.0
10802     0.0
23524     0.0
2305      0.0
232       2.0
22818     4.0
19937    15.0
22335    22.0
20086    25.0
23147    34.0
Name: 0, dtype: float64

In [36]:
#Why do certain books have 0 tokens?
#Well because it is an illustration-only book
super_cleaner(load_etext(23398), -1, verify_deletions=True)

True 
True 
True 
True 
True  _INFANT'S CABINET_
True  _OF_
True  BIRDS & BEASTS.
True  _LONDON. Printed & Sold by Harvey & Darton._ 55, Gracechurch Street, 1820. Price 6d.
True  [Illustration: The Stork.]
True  [Illustration: The Robin.]
True  [Illustration: The Hyena.]
True  [Illustration: The Lion.]
True  [Illustration: The Rhinoceros.]
True  [Illustration: The Camel.]
True  [Illustration: The Swan.]
True  [Illustration: The Vulture.]
True  [Illustration: The Lark.]
True  [Illustration: The Turkey.]
True  [Illustration: The Fox.]
True  [Illustration: The Greyhound.]
True  [Illustration: The Elephant.]
True  [Illustration: The Zebra.]
True  [Illustration: The Crow.]
True  [Illustration: The Cock.]
True  [Illustration: The Pigeon.]
True  [Illustration: The Goldfinch.]
True  [Illustration: The Buffalo.]
True  [Illustration: The Hog.]
True  [Illustration: The Horse.]
True  [Illustration: The Stag.]
True  [Illustration: The Chaffinch.]
True  [Illustration: The Peacock.]
True  [Illustrati

[]

In [37]:
# Or because it is a DVD-cover and we only use the .txt file
super_cleaner(load_etext(10802), -1, verify_deletions=True)

True to complying with copyright laws. PGLAF has not verified that all the eBook files on these discs meet the copyright laws in countries outside of the United States. PGLAF recommends that you verify this before using these files and requests that you advise us of any problems by email to copyright AT pglaf.org
True ** A note on CD and DVD disc capacity. It turns out that disk drive manufacturers (including the people who make CD and DVD burners and blank discs) measure disk space differently than the rest of the computer world. To them, 1MB, which is 1 megabyte, is 1,000,000 bytes. For the rest of the computer world, 1MB is 1,046,576 bytes. We mention this because people might read their DVD disc package and expect it to hold 4.7GB, but be surprised to find it can only hold about 4.37GB as the rest of the world measures space.
True  


[]

In [38]:
#Some books just have very little parsable information. This is often the case with books that are really really old 
#(e.g. writtenpre 1800s). The english in these books is often much different than modern day english.

print(super_cleaner(load_etext(19937), -1))
print(super_cleaner(load_etext(232), -1))

['produced from scanned images of public domain material from the Google Print project.)']
['by Virgil']


In [39]:
#How many tokens do we have in total available?
df_total_tokens.sum()

939505600.0

In [40]:
if os.path.isfile(cache_dir + 'subset_meta_100K.pkl'):
    with open(cache_dir + 'subset_meta_100K.pkl', 'rb') as f:
        subset_100K = pickle.load(f)

In [41]:
if os.path.isfile(cache_dir + 'subset_meta_ratio_100K.pkl'):
    with open(cache_dir + 'subset_meta_ratio_100K.pkl', 'rb') as f:
        subset_ratio_100K = pickle.load(f)

In [42]:
if os.path.isfile(cache_dir + 'subset_meta_1M.pkl'):
    with open(cache_dir + 'subset_meta_1M.pkl', 'rb') as f:
        subset_1M = pickle.load(f)

In [43]:
if os.path.isfile(cache_dir + 'subset_meta_ratio_1M.pkl'):
    with open(cache_dir + 'subset_meta_ratio_1M.pkl', 'rb') as f:
        subset_ratio_1M = pickle.load(f)

In [44]:
if os.path.isfile(cache_dir + 'subset_meta_10M.pkl'):
    with open(cache_dir + 'subset_meta_10M.pkl', 'rb') as f:
        subset_10M = pickle.load(f)

In [45]:
if os.path.isfile(cache_dir + 'subset_meta_ratio_10M.pkl'):
    with open(cache_dir + 'subset_meta_ratio_10M.pkl', 'rb') as f:
        subset_ratio_10M = pickle.load(f)

In [46]:
if os.path.isfile(cache_dir + 'subset_meta_ratio_100M.pkl'):
    with open(cache_dir + 'subset_meta_ratio_100M.pkl', 'rb') as f:
        subset_ratio_100M = pickle.load(f)

In [47]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [48]:
len(tokenizer.vocab)

30522

In [49]:
num_unused = 0
for k,v in tokenizer.vocab.items():
    if k.startswith('[unused'):
        num_unused += 1
num_unused

994

In [50]:
num_char = 0
for k,v in tokenizer.vocab.items():
    #'a' or '##a'
    if not v in subset_ratio_100M['subset_present_tokens'] and (len(k) == 1 or (len(k) == 3 and k.startswith('##'))):
        num_char += 1
        #print(k, v)
num_char

1493

In [51]:
#Which tokens are not represented?
num_unrepresented = 0
for k,v in tokenizer.vocab.items():
    if not v in subset_ratio_100M['subset_present_tokens'] and not k.startswith('[unused') and not len(k) <= 1 and not (len(k) == 3 and k.startswith('##')):
        num_unrepresented += 1
        print(k, v)
num_unrepresented

[PAD] 0
[CLS] 101
[SEP] 102
[MASK] 103
... 2133
km² 3186
soundtrack 6050
remix 6136
°c 6362
uefa 6663
playoff 7808
midfielder 8850
playstation 9160
quarterfinals 9237
pinyin 9973
allmusic 10477
mlb 10901
espn 10978
gameplay 11247
nsw 11524
nascar 11838
itunes 11943
lgbt 12010
mvp 12041
xbox 12202
eurovision 12714
vfl 13480
kolkata 13522
pga 14198
m³ 14241
bundesliga 14250
metacritic 14476
remixes 15193
steelers 15280
airplay 15341
##ии 15414
paralympics 15600
zhao 15634
reggae 15662
linebacker 15674
v8 15754
hindwings 15998
bollywood 16046
podcast 16110
atletico 16132
wwf 16779
transgender 16824
paralympic 17029
postseason 17525
vhs 17550
campeonato 17675
multiplayer 17762
łodz 17814
curated 17940
iphone 18059
gmbh 18289
danielle 18490
qaeda 18659
mixtape 18713
¹⁄₂ 18728
##ław 19704
##qing 19784
saxophonist 19977
preseason 20038
pmid 20117
keyboardist 20173
iucn 20333
pokemon 20421
nrl 20686
motorsports 20711
jaenelle 20757
beyonce 20773
airbus 20901
netflix 20907
motorsport 21044
belg

202

In [52]:
not_used = 0
for k,v in tokenizer.vocab.items():
    if not v in subset_ratio_100M['subset_present_tokens']:
        not_used += 1
        print(k,v)
not_used

[PAD] 0
[unused0] 1
[unused1] 2
[unused2] 3
[unused3] 4
[unused4] 5
[unused5] 6
[unused6] 7
[unused7] 8
[unused8] 9
[unused9] 10
[unused10] 11
[unused11] 12
[unused12] 13
[unused13] 14
[unused14] 15
[unused15] 16
[unused16] 17
[unused17] 18
[unused18] 19
[unused19] 20
[unused20] 21
[unused21] 22
[unused22] 23
[unused23] 24
[unused24] 25
[unused25] 26
[unused26] 27
[unused27] 28
[unused28] 29
[unused29] 30
[unused30] 31
[unused31] 32
[unused32] 33
[unused33] 34
[unused34] 35
[unused35] 36
[unused36] 37
[unused37] 38
[unused38] 39
[unused39] 40
[unused40] 41
[unused41] 42
[unused42] 43
[unused43] 44
[unused44] 45
[unused45] 46
[unused46] 47
[unused47] 48
[unused48] 49
[unused49] 50
[unused50] 51
[unused51] 52
[unused52] 53
[unused53] 54
[unused54] 55
[unused55] 56
[unused56] 57
[unused57] 58
[unused58] 59
[unused59] 60
[unused60] 61
[unused61] 62
[unused62] 63
[unused63] 64
[unused64] 65
[unused65] 66
[unused66] 67
[unused67] 68
[unused68] 69
[unused69] 70
[unused70] 71
[unused71] 72
[un

đ 1102
ƒ 1108
ɐ 1109
ɒ 1111
ɕ 1113
ɛ 1115
ɡ 1116
ɣ 1117
ɨ 1118
ɪ 1119
ɫ 1120
ɬ 1121
ɲ 1123
ɴ 1124
ɹ 1125
ɾ 1126
ʀ 1127
ʁ 1128
ʂ 1129
ʃ 1130
ʉ 1131
ʊ 1132
ʋ 1133
ʌ 1134
ʎ 1135
ʐ 1136
ʑ 1137
ʔ 1139
ʰ 1140
ʲ 1141
ʳ 1142
ʷ 1143
ʸ 1144
ʼ 1146
ʾ 1147
ʿ 1148
ˈ 1149
ː 1150
ˡ 1151
ˢ 1152
ˣ 1153
ˤ 1154
щ 1204
ъ 1205
ы 1206
ь 1207
ю 1209
ђ 1211
ј 1214
љ 1215
њ 1216
ћ 1217
ӏ 1218
ա 1219
բ 1220
գ 1221
դ 1222
ե 1223
թ 1224
ի 1225
լ 1226
կ 1227
հ 1228
մ 1229
յ 1230
ն 1231
ո 1232
պ 1233
ս 1234
վ 1235
տ 1236
ր 1237
ւ 1238
ք 1239
ן 1256
ף 1260
ץ 1262
ء 1269
ـ 1290
ٹ 1301
ں 1306
ھ 1307
ہ 1308
ے 1310
अ 1311
आ 1312
उ 1313
ए 1314
क 1315
ख 1316
ग 1317
च 1318
ज 1319
ट 1320
ड 1321
ण 1322
त 1323
थ 1324
द 1325
ध 1326
न 1327
प 1328
ब 1329
भ 1330
म 1331
य 1332
र 1333
ल 1334
व 1335
श 1336
ष 1337
स 1338
ह 1339
ा 1340
ि 1341
ी 1342
ो 1343
। 1344
॥ 1345
ং 1346
অ 1347
আ 1348
ই 1349
উ 1350
এ 1351
ও 1352
ক 1353
খ 1354
গ 1355
চ 1356
ছ 1357
জ 1358
ট 1359
ড 1360
ণ 1361
ত 1362
থ 1363
দ 1364
ধ 1365
ন 1366
প 1367
ব 1368
ভ 1369

##ა 29974
##ბ 29975
##გ 29976
##დ 29977
##ე 29978
##ვ 29979
##თ 29980
##ი 29981
##კ 29982
##ლ 29983
##მ 29984
##ნ 29985
##ო 29986
##რ 29987
##ს 29988
##ტ 29989
##უ 29990
##ᄀ 29991
##ᄂ 29992
##ᄃ 29993
##ᄅ 29994
##ᄆ 29995
##ᄇ 29996
##ᄉ 29997
##ᄊ 29998
##ᄋ 29999
##ᄌ 30000
##ᄎ 30001
##ᄏ 30002
##ᄐ 30003
##ᄑ 30004
##ᄒ 30005
##ᅡ 30006
##ᅢ 30007
##ᅥ 30008
##ᅦ 30009
##ᅧ 30010
##ᅩ 30011
##ᅪ 30012
##ᅭ 30013
##ᅮ 30014
##ᅯ 30015
##ᅲ 30016
##ᅳ 30017
##ᅴ 30018
##ᅵ 30019
##ᆨ 30020
##ᆫ 30021
##ᆯ 30022
##ᆷ 30023
##ᆸ 30024
##ᆼ 30025
##ᴬ 30026
##ᴮ 30027
##ᴰ 30028
##ᴵ 30029
##ᴺ 30030
##ᵀ 30031
##ᵃ 30032
##ᵇ 30033
##ᵈ 30034
##ᵉ 30035
##ᵏ 30037
##ᵒ 30039
##ᵖ 30040
##ᵗ 30041
##ᵣ 30043
##ᵤ 30044
##ᵥ 30045
##ᶜ 30046
##ᶠ 30047
##‐ 30048
##‑ 30049
##‒ 30050
##– 30051
##— 30052
##― 30053
##‖ 30054
##‘ 30055
##’ 30056
##‚ 30057
##“ 30058
##” 30059
##„ 30060
##† 30061
##‡ 30062
##• 30063
##… 30064
##‰ 30065
##′ 30066
##″ 30067
##› 30068
##‿ 30069
##⁄ 30070
##⁰ 30071
##ⁱ 30072
##⁴ 30073
##⁵ 30074
##⁶ 30075
##⁷ 30076


2689

In [53]:
len(subset_100M['subset_present_tokens'])

NameError: name 'subset_100M' is not defined

In [None]:
def subset_metadata(subset_dict):
    print(len(subset_dict['subset_booklist']))
    print(subset_dict['subset_total_tokens'])
    print(subset_dict['subset_unique_tokens'])

In [None]:
subset_metadata(subset_ratio_100K)

In [None]:
subset_metadata(subset_ratio_1M)

In [None]:
subset_metadata(subset_ratio_10M)

In [None]:
subset_metadata(subset_ratio_100M)

In [None]:
torch.cuda.is_available()

In [None]:
torch.cuda.get_device_name(0)

In [None]:
text = "Anne went to the Albert Heijn at 5 o'clock to buy some milk for me."

In [None]:
default_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
default_tokenizer(text)

In [None]:
ST_tokenizer = StrategizedTokenizer(padding=True)
inputs = ST_tokenizer.tokenize(text)
inputs

In [None]:
for x in ST_tokenizer.convert_ids_to_tokens(inputs['input_ids']):
    print(default_tokenizer.convert_tokens_to_string(x))

In [None]:
test_book = super_cleaner(load_etext(14596), -1)

In [None]:
longest_sentence = str(test_book[np.argmax([len(par) for par in test_book])])
longest_sentence

In [None]:
tokenized_longest_sentence = default_tokenizer(longest_sentence,
                                               add_special_tokens=False)['input_ids']
len(tokenized_longest_sentence)

In [None]:
SC = SentenceChunker()

In [None]:
tokens, sentences = SC.sentence_chunker(longest_sentence, 512, return_tokens=True)

In [68]:
sentences

[' "And now for business," Lopez said. "And remember zat he what tells a lie shall be right away shotted." In his excitement he lost the little English he had.',
 ' "Put all ze men outside," Lopez ordered. Venustiano and Pedro, his chief lieutenants, obeyed at once, forcing them to march ahead of them, and standing guard over them near a great cactus bush a few feet from the adobe. "Leave ze women with me," the bandit continued. "But first, Alvarada, you find ze cook. I am \'ongry."',
 ' "Red" Giddings had been on the ranch with Gilbert since the very beginning. He came from the North with the young man, willing to stake all on this one venture. Like young Jones, he was not afraid. He was an efficient, well-set-up young fellow, with three consuming passions: Arizona, his harmonica, and Angela Hardy. The first saw a lot of "Red"; the second touched his lips frequently; but as for Angela--well, perhaps the poor boy kissed his harmonica so often in order to forget her lips. But if his own

In [None]:
train_dataset = StrategizedTokenizerDataset()
train_dataset.populate()

In [None]:
len(train_dataset.labels)

In [64]:
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
InputDataClass = NewType("InputDataClass", Any)
DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, torch.Tensor]])

In [56]:
DataCollator

<function typing.NewType.<locals>.new_type(x)>

In [62]:
class DataCollatorForStrategizedMasking:
    def __post_init__(self):
        if self.mlm and self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. "
                "You should pass `mlm=False` to train on causal language modeling instead."
            )
    def __init__(self, tokenizer):
        tokenizer: PreTrainedTokenizerBase
        mlm: bool = True
        mlm_probability: float = 0.15
        pad_to_multiple_of: Optional[int] = None
            
            
    def __call__(self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Handle dict or lists with proper padding and conversion to tensor.
        if isinstance(examples[0], (dict, BatchEncoding)):
            batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of)
        else:
            batch = {"input_ids": _collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)}

        # If special token mask has been preprocessed, pop it from the dict.
        special_tokens_mask = batch.pop("special_tokens_mask", None)
        if self.mlm:
            batch["input_ids"], batch["labels"] = self.mask_tokens(
                batch["input_ids"], special_tokens_mask=special_tokens_mask
            )
        else:
            labels = batch["input_ids"].clone()
            if self.tokenizer.pad_token_id is not None:
                labels[labels == self.tokenizer.pad_token_id] = -100
            batch["labels"] = labels
        return batch
        

NameError: name 'PreTrainedTokenizerBase' is not defined

In [67]:
from transformers import LineByLineTextDataset

In [None]:
class ExampleListDataset(torch.utils.data.Dataset):
    def __init__(self):
        test_book = super_cleaner(load_etext(14596), -1)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> Dict[str, torch.tensor]:
        return self.examples[i]


In [None]:
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./oscar.eo.txt",
    block_size=128,
)

In [None]:
bert_tiny_config = {"hidden_size": 128, 
                    "hidden_act": "gelu", 
                    "initializer_range": 0.02, 
                    "vocab_size": 30522, 
                    "hidden_dropout_prob": 0.1, 
                    "num_attention_heads": 2, 
                    "type_vocab_size": 2, 
                    "max_position_embeddings": 512, 
                    "num_hidden_layers": 2, 
                    "intermediate_size": 512, 
                    "attention_probs_dropout_prob": 0.1}

model = BertForMaskedLM(config=BertConfig(**bert_tiny_config))
model.train();

In [None]:
model(**custom_input)

In [None]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=2,  # batch size per device during training
    #per_device_eval_batch_size=256,   # batch size for evaluation
    learning_rate=1e-5,     
    logging_dir='./logs',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=None            # evaluation dataset
)

In [None]:
TrainingArguments(output_dir='./results',
                  dataloader_pin_memory=False)