In [1]:
#libraries involved in cleaning
from cleaner_utils import super_cleaner
from pretraining_data_utils import make_book_token_frequency, token_freq_df_to_dict, \
                                    all_available_tokens_from_df, optimize_book_subset_ratio
from pretraining_data_utils import book_properties, make_df_book_properties
from pretraining_data_utils import SentenceChunker, SentenceWriter
from gutenberg.acquire import load_etext


#Library utilities
from tokenizer.tokenizer import StrategizedTokenizer
from dataset.dataset import StrategizedTokenizerDataset
from dataset.dataset import DefaultTokenizerDataset

#Training code
from transformers import BertConfig
from transformers import BertForMaskedLM
from transformers import BertTokenizer
from transformers import AdamW
from transformers import Trainer, TrainingArguments

from torch.utils.data import DataLoader

#General imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import re
import json
import torch
import logging
import pickle
import os

from more_itertools import take
from datetime import datetime

In [2]:
cache_dir = 'cached_files/'

In [3]:
# Read scraped metadata from the gutenberg metadata database 
#(Original data was scraped by using https://github.com/c-w/gutenberg)
#The data is then further preprocessed by https://github.com/hugovk/gutenberg-metadata so it is actually usable.

f = open(cache_dir + 'gutenberg-metadata.json', 'r')
metadata = json.load(f)
f.close()

In [4]:
#retrieve how many english books there in english
english_book_keys = [key for key in metadata.keys() if metadata[key]['language'] == ['en']]
len(english_book_keys)

13142

In [5]:
# The third book cant be retrieved because of faults in retrieval. This happens sometimes.
import traceback
import sys

try:
    super_cleaner(load_etext(14575), -1, verify_deletions=True)
except Exception as e:
    try:
        exc_info = sys.exc_info()
    finally:
        # Display the *original* exception
        traceback.print_exception(*exc_info)
        del exc_info


Traceback (most recent call last):
  File "<ipython-input-5-0358ce9648a3>", line 6, in <module>
    super_cleaner(load_etext(14575), -1, verify_deletions=True)
  File "C:\ProgramData\Anaconda3\lib\site-packages\gutenberg\acquire\text.py", line 78, in load_etext
    text = cache.read().decode('utf-8')
  File "C:\ProgramData\Anaconda3\lib\gzip.py", line 292, in read
    return self._buffer.read(size)
  File "C:\ProgramData\Anaconda3\lib\gzip.py", line 470, in read
    self._read_eof()
  File "C:\ProgramData\Anaconda3\lib\gzip.py", line 516, in _read_eof
    raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
gzip.BadGzipFile: CRC check failed 0x0 != 0xd0c5998f


In [6]:
#retrieve how many english books there are actually loadable
#If books arent cached this may take a while because it needs to scrape the books from gutenberg.org
#Therefore i provide a pre-processed file
if os.path.isfile(cache_dir + 'loadable_english_book_keys.pkl'):
    with open(cache_dir + 'loadable_english_book_keys.pkl', 'rb') as f:
        loadable_english_book_keys = pickle.load(f)              
else:
    loadable_english_book_keys = []
    i = 0
    for key in english_book_keys:
        if i % 1000 == 0:
            print(i, datetime.now())
        i += 1
        try:
            load_etext(int(key))
            loadable_english_book_keys.append(key)
        except:
            continue
    with open(cache_dir + 'loadable_english_book_keys.pkl', 'wb') as f:
        pickle.dump(loadable_english_book_keys, f)
            
len(loadable_english_book_keys)

12640

In [7]:
#Randomly select 10 books that we can query
np.random.seed(42)
rand_10_books = [x for x in np.random.choice(loadable_english_book_keys, size=10)]
rand_20_books = [x for x in np.random.choice(loadable_english_book_keys, size=20)]
print(rand_10_books), print(rand_20_books)

['17255', '1742', '14870', '14596', '23436', '22563', '15306', '15976', '1344', '13579']
['15116', '23050', '22669', '22310', '18782', '10343', '1650', '21698', '16831', '11194', '14752', '14429', '16170', '2078', '13766', '12310', '23892', '16144', '22293', '19224']


(None, None)

In [8]:
# Titles and authors for the first 5 books
# 1 book isnt actually loadable, see below.
for book_id in rand_10_books[:5]:
    print(book_id, metadata[book_id]['author'], metadata[book_id]['title'])

17255 ['Alma-Tadema, Laurence'] ['The Wings of Icarus: Being the Life of one Emilia Fletcher']
1742 ['Davis, Richard Harding'] ['Miss Civilization: A Comedy in One Act']
14870 ['Hopkinson, Alfred, Sir'] ['Rebuilding Britain: A Survey of Problems of Reconstruction After the World War']
14596 ['Inge, William Ralph'] ['Christian Mysticism']
23436 ['Anonymous'] ['Aladdin or The Wonderful Lamp']


Text preprocessing
===================

In [9]:
# original unprocessed text
text = load_etext(50000)[:500]
text

'The Project Gutenberg EBook of John Gutenberg, by Franz von Dingelstedt\r\n\r\nThis eBook is for the use of anyone anywhere at no cost and with\r\nalmost no restrictions whatsoever.  You may copy it, give it away or\r\nre-use it under the terms of the Project Gutenberg License included\r\nwith this eBook or online at www.gutenberg.org/license\r\n\r\n\r\nTitle: John Gutenberg\r\n       First Master Printer, His Acts and Most Remarkable\r\n       Discourses and his Death\r\n\r\nAuthor: Franz von Dingelstedt\r\n\r\nRelease Da'

In [10]:
#Text with formatting
print(text)

The Project Gutenberg EBook of John Gutenberg, by Franz von Dingelstedt

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org/license


Title: John Gutenberg
       First Master Printer, His Acts and Most Remarkable
       Discourses and his Death

Author: Franz von Dingelstedt

Release Da


Use the cleaner to retrieve cleaned text from the first book of the random selection.
The _super_cleaner_ strips a headers/disclaimers/tables that are not required for our purposes.

In [11]:
sentences = super_cleaner(load_etext(16968), -1, verify_deletions=False)

In [12]:
#Text is now a list of paragraphs
sentences[:10]

[' "And now for business," Lopez said. "And remember zat he what tells a lie shall be right away shotted." In his excitement he lost the little English he had.',
 ' "Put all ze men outside," Lopez ordered. Venustiano and Pedro, his chief lieutenants, obeyed at once, forcing them to march ahead of them, and standing guard over them near a great cactus bush a few feet from the adobe. "Leave ze women with me," the bandit continued. "But first, Alvarada, you find ze cook. I am \'ongry."',
 ' "Red" Giddings had been on the ranch with Gilbert since the very beginning. He came from the North with the young man, willing to stake all on this one venture. Like young Jones, he was not afraid. He was an efficient, well-set-up young fellow, with three consuming passions: Arizona, his harmonica, and Angela Hardy. The first saw a lot of "Red"; the second touched his lips frequently; but as for Angela--well, perhaps the poor boy kissed his harmonica so often in order to forget her lips. But if his own

In [13]:
#with some short sentences
sorted(sentences, key=len)[:20]

['"No."',
 '"Gun?"',
 '"Why?"',
 '"Yes."',
 '"Pells?"',
 '"A what?"',
 '"I have?"',
 '"Joking?"',
 '"Really?"',
 '"I ain\'t!"',
 '"Kiss me!"',
 '"Uh--huh!"',
 '"In a way."',
 '"What for?"',
 '"Yes, sir!"',
 '"Yes; why?"',
 'She nodded.',
 '"All those?"',
 '"You won\'t?"',
 'She started.']

In [14]:
list(sentences)[:10]

[' "And now for business," Lopez said. "And remember zat he what tells a lie shall be right away shotted." In his excitement he lost the little English he had.',
 ' "Put all ze men outside," Lopez ordered. Venustiano and Pedro, his chief lieutenants, obeyed at once, forcing them to march ahead of them, and standing guard over them near a great cactus bush a few feet from the adobe. "Leave ze women with me," the bandit continued. "But first, Alvarada, you find ze cook. I am \'ongry."',
 ' "Red" Giddings had been on the ranch with Gilbert since the very beginning. He came from the North with the young man, willing to stake all on this one venture. Like young Jones, he was not afraid. He was an efficient, well-set-up young fellow, with three consuming passions: Arizona, his harmonica, and Angela Hardy. The first saw a lot of "Red"; the second touched his lips frequently; but as for Angela--well, perhaps the poor boy kissed his harmonica so often in order to forget her lips. But if his own

In [15]:
#Find some properties about the book
book_properties(sentences)

[2041, 5, 1532, 75140, 353]

Tokenization in practice
============

In [16]:
#initialize custom tokenizer
ST_tokenizer = StrategizedTokenizer(padding=False)

In [17]:
inputs = ST_tokenizer.tokenize("Anne went to the Albert Heijn at 5 o'clock to buy some milk for me.")
inputs

{'input_ids': tensor([[  101,   103,  2253,  2000,  1996,   103,   103,   103,   103,  2012,
          1019,  1051,  1005,  5119,  2000,  4965,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,   103,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
          1019,  1051,  1005,  5119,  2000,   103,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,   103,  1996,  4789,  2002, 28418,  2078,   103,
          1019,  1051,  1005,  5119,   103,  4965,  2070,  6501,   103,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,   103,  4789,  2002, 28418,  2078,  2012,
          1019,  1051,  1005,  5119,  2000,  4965,   103,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
           103,  1051,  1005,  5119,  2000,  4965,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,  1996,  4789,  2002, 28418,  2078,  201

In [18]:
#Masks are at different places
for masked_line in ST_tokenizer.convert_ids_to_tokens(inputs['input_ids']):
    print(masked_line)

['[CLS]', '[MASK]', 'went', 'to', 'the', '[MASK]', '[MASK]', '[MASK]', '[MASK]', 'at', '5', 'o', "'", 'clock', 'to', 'buy', 'some', 'milk', 'for', 'me', '.', '[SEP]']
['[CLS]', 'anne', '[MASK]', 'to', 'the', 'albert', 'he', '##ij', '##n', 'at', '5', 'o', "'", 'clock', 'to', '[MASK]', 'some', 'milk', 'for', 'me', '.', '[SEP]']
['[CLS]', 'anne', 'went', '[MASK]', 'the', 'albert', 'he', '##ij', '##n', '[MASK]', '5', 'o', "'", 'clock', '[MASK]', 'buy', 'some', 'milk', '[MASK]', 'me', '.', '[SEP]']
['[CLS]', 'anne', 'went', 'to', '[MASK]', 'albert', 'he', '##ij', '##n', 'at', '5', 'o', "'", 'clock', 'to', 'buy', '[MASK]', 'milk', 'for', 'me', '.', '[SEP]']
['[CLS]', 'anne', 'went', 'to', 'the', 'albert', 'he', '##ij', '##n', 'at', '[MASK]', 'o', "'", 'clock', 'to', 'buy', 'some', 'milk', 'for', 'me', '.', '[SEP]']
['[CLS]', 'anne', 'went', 'to', 'the', 'albert', 'he', '##ij', '##n', 'at', '5', 'o', "'", 'clock', 'to', 'buy', 'some', '[MASK]', 'for', 'me', '.', '[SEP]']
['[CLS]', 'anne', 'we

In [19]:
#load original tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [20]:
for x in ST_tokenizer.convert_ids_to_tokens(inputs['input_ids']):
    print(tokenizer.convert_tokens_to_string(x))

[CLS] [MASK] went to the [MASK] [MASK] [MASK] [MASK] at 5 o ' clock to buy some milk for me . [SEP]
[CLS] anne [MASK] to the albert heijn at 5 o ' clock to [MASK] some milk for me . [SEP]
[CLS] anne went [MASK] the albert heijn [MASK] 5 o ' clock [MASK] buy some milk [MASK] me . [SEP]
[CLS] anne went to [MASK] albert heijn at 5 o ' clock to buy [MASK] milk for me . [SEP]
[CLS] anne went to the albert heijn at [MASK] o ' clock to buy some milk for me . [SEP]
[CLS] anne went to the albert heijn at 5 o ' clock to buy some [MASK] for me . [SEP]
[CLS] anne went [MASK] the albert heijn at 5 o ' clock [MASK] buy some milk for me . [SEP]
[CLS] anne went to the albert heijn at 5 o ' clock to buy some milk for [MASK] . [SEP]
[CLS] anne went to the albert heijn at 5 o ' clock to buy some milk for me [MASK] [SEP]
[CLS] anne go to the albert heijn at 5 o ' clock to buy some milk for i . [SEP]
[CLS] anne went to the albert heijn at o ' clock 5 to buy some milk for me . [SEP]


Gutenberg book-selection
==============

In [21]:
#Setting to ignore warnings about sequences being longer than BERT can handle
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
from datetime import datetime
print(datetime.now())
df_books_10 = make_df_book_properties(rand_10_books)
print(datetime.now())

2021-05-26 10:04:41.379072
2021-05-26 10:05:14.122497


In [22]:
df_books_10

Unnamed: 0,book_id,num_sentences,Shortest sentence (char),Longest sentence (char),Total tokens,Longest sequence (tokens)
0,17255,677,8,2083,41672,522
1,1742,239,15,1051,8481,253
2,14870,317,22,5189,62696,1022
3,14596,528,18,7685,115181,1887
4,23436,11,41,293,579,71
5,22563,358,6,2344,16372,530
6,15306,390,9,1672,40223,383
7,15976,1419,10,1724,78126,453
8,1344,263,6,6391,27092,1536
9,13579,870,13,1939,65500,447


In [23]:
#Sort df and account for the fact that the column has both text and numbers
df_books_10.sort_values(by='Shortest sentence (char)')[:10]

Unnamed: 0,book_id,num_sentences,Shortest sentence (char),Longest sentence (char),Total tokens,Longest sequence (tokens)
5,22563,358,6,2344,16372,530
8,1344,263,6,6391,27092,1536
0,17255,677,8,2083,41672,522
6,15306,390,9,1672,40223,383
7,15976,1419,10,1724,78126,453
9,13579,870,13,1939,65500,447
1,1742,239,15,1051,8481,253
3,14596,528,18,7685,115181,1887
2,14870,317,22,5189,62696,1022
4,23436,11,41,293,579,71


In [24]:
#Some books have very few tokens.
df_books_10.sort_values(by='Total tokens', ascending=False).head()

Unnamed: 0,book_id,num_sentences,Shortest sentence (char),Longest sentence (char),Total tokens,Longest sequence (tokens)
3,14596,528,18,7685,115181,1887
7,15976,1419,10,1724,78126,453
9,13579,870,13,1939,65500,447
2,14870,317,22,5189,62696,1022
0,17255,677,8,2083,41672,522


In [25]:
df_books_10.sort_values(by='Total tokens').tail()

Unnamed: 0,book_id,num_sentences,Shortest sentence (char),Longest sentence (char),Total tokens,Longest sequence (tokens)
0,17255,677,8,2083,41672,522
2,14870,317,22,5189,62696,1022
9,13579,870,13,1939,65500,447
7,15976,1419,10,1724,78126,453
3,14596,528,18,7685,115181,1887


In [26]:
#Retrieve token occurences per book in a dataframe and another dataframe with total number of tokens
print(datetime.now())
df_book_token_freq_10, df_10_total_tokens = make_book_token_frequency(rand_10_books)
print(datetime.now())

2021-05-26 10:05:14.300538


100%|██████████| 10/10 [00:11<00:00,  1.18s/it]

2021-05-26 10:05:29.490304





In [27]:
# DataFrame is obviously very sparse
df_book_token_freq_10[:10]

Unnamed: 0,[PAD],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],...,##！,##（,##）,##，,##－,##．,##／,##：,##？,##～
17255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1742,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14870,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14596,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23436,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22563,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15306,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15976,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1344,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13579,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
#total number of tokens per book
df_10_total_tokens

17255     41679.0
1742       8491.0
14870     62696.0
14596    115181.0
23436       579.0
22563     16372.0
15306     40223.0
15976     78170.0
1344      27092.0
13579     65500.0
dtype: float64

In [29]:
#Total number of tokens in our small set
df_10_total_tokens.sum()

455983.0

In [30]:
#All tokens which are present in our subsample of 20 books
all_present_tokens_10 = all_available_tokens_from_df(df_book_token_freq_10)
all_present_tokens_10, len(all_present_tokens_10)

(array([  999,  1000,  1002, ..., 29645, 29664, 29667], dtype=int64), 15198)

In [31]:
#Show first 3 entries
tokens_per_book_10 = token_freq_df_to_dict(df_book_token_freq_10, df_10_total_tokens)
take(3, tokens_per_book_10.items())

[('17255',
  {'tokens': array([  999,  1000,  1005, ..., 29591, 29602, 29667], dtype=int64),
   'total_tokens': 41679.0}),
 ('1742',
  {'tokens': array([  999,  1000,  1005, ..., 28838, 29122, 29586], dtype=int64),
   'total_tokens': 8491.0}),
 ('14870',
  {'tokens': array([  999,  1000,  1005, ..., 29598, 29602, 29609], dtype=int64),
   'total_tokens': 62696.0})]

In [32]:
print(optimize_book_subset_ratio(all_present_tokens_10, tokens_per_book_10, threshold = 1e5))

book best:  22563 new tokens:  3650 book_total_tokens:  16372.0 ratio:  0.22294160762277057
book best:  23436 new tokens:  125 book_total_tokens:  579.0 ratio:  0.2158894645941278
book best:  1344 new tokens:  2211 book_total_tokens:  27092.0 ratio:  0.08161080761848516
book best:  1742 new tokens:  368 book_total_tokens:  8491.0 ratio:  0.0433400070663055
book best:  15306 new tokens:  2157 book_total_tokens:  40223.0 ratio:  0.053626034855679586
{'subset_booklist': ['22563', '23436', '1344', '1742', '15306'], 'subset_total_tokens': 92757.0, 'subset_present_tokens': array([  999.,  1000.,  1005., ..., 29602., 29664., 29667.]), 'subset_unique_tokens': 8511}


In [33]:
df_book_token_freq = pd.read_csv(os.path.join('../LessIsMore-cache','df_book_token_freq.csv'), index_col=0)
df_book_token_freq.head()

Unnamed: 0,[PAD],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],...,##！,##（,##）,##，,##－,##．,##／,##：,##？,##～
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
#Total number of unique tokens in the data
len(np.flatnonzero(df_book_token_freq.sum(axis=0)))

27833

In [35]:
df_total_tokens = pd.read_csv(os.path.join('../LessIsMore-cache','df_total_tokens.csv'), index_col=0).squeeze()
df_total_tokens.head()

1       7640.0
10    901551.0
11     36249.0
12     40831.0
13      6731.0
Name: 0, dtype: float64

In [36]:
df_total_tokens.sort_values()[:10]

23398     0.0
10802     0.0
23524     0.0
2305      0.0
232       2.0
22818     4.0
19937    15.0
22335    22.0
20086    25.0
23147    34.0
Name: 0, dtype: float64

In [37]:
#Why do certain books have 0 tokens?
#Well because it is an illustration-only book
super_cleaner(load_etext(23398), -1, verify_deletions=True)

True 
True 
True 
True 
True  _INFANT'S CABINET_
True  _OF_
True  BIRDS & BEASTS.
True  _LONDON. Printed & Sold by Harvey & Darton._ 55, Gracechurch Street, 1820. Price 6d.
True  [Illustration: The Stork.]
True  [Illustration: The Robin.]
True  [Illustration: The Hyena.]
True  [Illustration: The Lion.]
True  [Illustration: The Rhinoceros.]
True  [Illustration: The Camel.]
True  [Illustration: The Swan.]
True  [Illustration: The Vulture.]
True  [Illustration: The Lark.]
True  [Illustration: The Turkey.]
True  [Illustration: The Fox.]
True  [Illustration: The Greyhound.]
True  [Illustration: The Elephant.]
True  [Illustration: The Zebra.]
True  [Illustration: The Crow.]
True  [Illustration: The Cock.]
True  [Illustration: The Pigeon.]
True  [Illustration: The Goldfinch.]
True  [Illustration: The Buffalo.]
True  [Illustration: The Hog.]
True  [Illustration: The Horse.]
True  [Illustration: The Stag.]
True  [Illustration: The Chaffinch.]
True  [Illustration: The Peacock.]
True  [Illustrati

[]

In [38]:
# Or because it is a DVD-cover and we only use the .txt file
super_cleaner(load_etext(10802), -1, verify_deletions=True)

True to complying with copyright laws. PGLAF has not verified that all the eBook files on these discs meet the copyright laws in countries outside of the United States. PGLAF recommends that you verify this before using these files and requests that you advise us of any problems by email to copyright AT pglaf.org
True ** A note on CD and DVD disc capacity. It turns out that disk drive manufacturers (including the people who make CD and DVD burners and blank discs) measure disk space differently than the rest of the computer world. To them, 1MB, which is 1 megabyte, is 1,000,000 bytes. For the rest of the computer world, 1MB is 1,046,576 bytes. We mention this because people might read their DVD disc package and expect it to hold 4.7GB, but be surprised to find it can only hold about 4.37GB as the rest of the world measures space.
True  


[]

In [39]:
#Some books just have very little parsable information. This is often the case with books that are really really old 
#(e.g. writtenpre 1800s). The english in these books is often much different than modern day english.

print(super_cleaner(load_etext(19937), -1))
print(super_cleaner(load_etext(232), -1))

['produced from scanned images of public domain material from the Google Print project.)']
['by Virgil']


In [40]:
#How many tokens do we have in total available?
df_total_tokens.sum()

939505600.0

In [41]:
if os.path.isfile(cache_dir + 'subset_meta_ratio_100K.pkl'):
    with open(cache_dir + 'subset_meta_ratio_100K.pkl', 'rb') as f:
        subset_ratio_100K = pickle.load(f)

In [42]:
print(subset_ratio_100K)

{'subset_booklist': ['232', '22818', '22335', '23594', '20086', '20360', '10557', '19571', '19177', '14100', '13536', '23436', '129', '21783', '11006', '19937', '22847', '1321', '23147', '21805', '22529', '12474', '13082', '14463', '23538', '13081', '116', '18589', '23446', '23450', '17124', '16780', '23146', '18935', '12554', '17254', '23429', '13203', '17365', '22236', '16169', '18417', '22579', '19634', '24044', '104', '1567', '23315', '24269', '12358', '23880'], 'subset_total_tokens': 99974.0, 'subset_present_tokens': array([  100.,   999.,  1000., ..., 29735., 29737., 29739.]), 'subset_unique_tokens': 13040}


In [43]:
if os.path.isfile(cache_dir + 'subset_meta_1M.pkl'):
    with open(cache_dir + 'subset_meta_1M.pkl', 'rb') as f:
        subset_1M = pickle.load(f)

In [44]:
if os.path.isfile(cache_dir + 'subset_meta_ratio_1M.pkl'):
    with open(cache_dir + 'subset_meta_ratio_1M.pkl', 'rb') as f:
        subset_ratio_1M = pickle.load(f)

In [45]:
if os.path.isfile(cache_dir + 'subset_meta_10M.pkl'):
    with open(cache_dir + 'subset_meta_10M.pkl', 'rb') as f:
        subset_10M = pickle.load(f)

In [46]:
if os.path.isfile(cache_dir + 'subset_meta_ratio_10M.pkl'):
    with open(cache_dir + 'subset_meta_ratio_10M.pkl', 'rb') as f:
        subset_ratio_10M = pickle.load(f)

In [47]:
if os.path.isfile(cache_dir + 'subset_meta_ratio_100M.pkl'):
    with open(cache_dir + 'subset_meta_ratio_100M.pkl', 'rb') as f:
        subset_ratio_100M = pickle.load(f)

In [48]:
#Make a file with the union of all books in the subsets (easy for processing on the server)

union_subsets = {'subset_booklist': np.union1d(np.union1d(np.union1d(subset_ratio_100K['subset_booklist'],
                                                          subset_ratio_1M['subset_booklist']),
                                                          subset_ratio_10M['subset_booklist']), 
                                               subset_ratio_100M['subset_booklist'])}

with open(os.path.join(cache_dir, 'subset_meta_ratio_union.pkl'), 'wb') as f:
    pickle.dump(union_subsets, f)

Check how many tokens are actually represented by the data
=============

In [49]:
#load original tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [50]:
#size of the vocabulary
len(tokenizer.vocab)

30522

In [51]:
#vocabulary contains a bunch of [unused] tokens which allow people to add their own tokens
num_unused = 0
for k,v in tokenizer.vocab.items():
    if k.startswith('[unused'):
        num_unused += 1
num_unused

994

In [52]:
#Check for tokens which are either alone or a continued token, e.g. 'a' or '##a'
num_char = 0
for k,v in tokenizer.vocab.items():
    #'a' or '##a'
    if not v in subset_ratio_100M['subset_present_tokens'] and (len(k) == 1 or (len(k) == 3 and k.startswith('##'))):
        num_char += 1
        #print(k, v)
num_char

1493

In [53]:
#Which tokens are not represented?
num_unrepresented = 0
for k,v in tokenizer.vocab.items():
    if not v in subset_ratio_100M['subset_present_tokens'] and not k.startswith('[unused') and not len(k) <= 1 and not (len(k) == 3 and k.startswith('##')):
        num_unrepresented += 1
        print(k, v)
num_unrepresented

[PAD] 0
[CLS] 101
[SEP] 102
[MASK] 103
... 2133
km² 3186
soundtrack 6050
remix 6136
°c 6362
uefa 6663
playoff 7808
midfielder 8850
playstation 9160
quarterfinals 9237
pinyin 9973
allmusic 10477
mlb 10901
espn 10978
gameplay 11247
nsw 11524
nascar 11838
itunes 11943
lgbt 12010
mvp 12041
xbox 12202
eurovision 12714
vfl 13480
kolkata 13522
pga 14198
m³ 14241
bundesliga 14250
metacritic 14476
remixes 15193
steelers 15280
airplay 15341
##ии 15414
paralympics 15600
zhao 15634
reggae 15662
linebacker 15674
v8 15754
hindwings 15998
bollywood 16046
podcast 16110
atletico 16132
wwf 16779
transgender 16824
paralympic 17029
postseason 17525
vhs 17550
campeonato 17675
multiplayer 17762
łodz 17814
curated 17940
iphone 18059
gmbh 18289
danielle 18490
qaeda 18659
mixtape 18713
¹⁄₂ 18728
##ław 19704
##qing 19784
saxophonist 19977
preseason 20038
pmid 20117
keyboardist 20173
iucn 20333
pokemon 20421
nrl 20686
motorsports 20711
jaenelle 20757
beyonce 20773
airbus 20901
netflix 20907
motorsport 21044
belg

202

In [54]:
not_used = 0
for k,v in tokenizer.vocab.items():
    if not v in subset_ratio_100M['subset_present_tokens']:
        not_used += 1
        print(k,v)
not_used

[PAD] 0
[unused0] 1
[unused1] 2
[unused2] 3
[unused3] 4
[unused4] 5
[unused5] 6
[unused6] 7
[unused7] 8
[unused8] 9
[unused9] 10
[unused10] 11
[unused11] 12
[unused12] 13
[unused13] 14
[unused14] 15
[unused15] 16
[unused16] 17
[unused17] 18
[unused18] 19
[unused19] 20
[unused20] 21
[unused21] 22
[unused22] 23
[unused23] 24
[unused24] 25
[unused25] 26
[unused26] 27
[unused27] 28
[unused28] 29
[unused29] 30
[unused30] 31
[unused31] 32
[unused32] 33
[unused33] 34
[unused34] 35
[unused35] 36
[unused36] 37
[unused37] 38
[unused38] 39
[unused39] 40
[unused40] 41
[unused41] 42
[unused42] 43
[unused43] 44
[unused44] 45
[unused45] 46
[unused46] 47
[unused47] 48
[unused48] 49
[unused49] 50
[unused50] 51
[unused51] 52
[unused52] 53
[unused53] 54
[unused54] 55
[unused55] 56
[unused56] 57
[unused57] 58
[unused58] 59
[unused59] 60
[unused60] 61
[unused61] 62
[unused62] 63
[unused63] 64
[unused64] 65
[unused65] 66
[unused66] 67
[unused67] 68
[unused68] 69
[unused69] 70
[unused70] 71
[unused71] 72
[un

[unused741] 746
[unused742] 747
[unused743] 748
[unused744] 749
[unused745] 750
[unused746] 751
[unused747] 752
[unused748] 753
[unused749] 754
[unused750] 755
[unused751] 756
[unused752] 757
[unused753] 758
[unused754] 759
[unused755] 760
[unused756] 761
[unused757] 762
[unused758] 763
[unused759] 764
[unused760] 765
[unused761] 766
[unused762] 767
[unused763] 768
[unused764] 769
[unused765] 770
[unused766] 771
[unused767] 772
[unused768] 773
[unused769] 774
[unused770] 775
[unused771] 776
[unused772] 777
[unused773] 778
[unused774] 779
[unused775] 780
[unused776] 781
[unused777] 782
[unused778] 783
[unused779] 784
[unused780] 785
[unused781] 786
[unused782] 787
[unused783] 788
[unused784] 789
[unused785] 790
[unused786] 791
[unused787] 792
[unused788] 793
[unused789] 794
[unused790] 795
[unused791] 796
[unused792] 797
[unused793] 798
[unused794] 799
[unused795] 800
[unused796] 801
[unused797] 802
[unused798] 803
[unused799] 804
[unused800] 805
[unused801] 806
[unused802] 807
[unused8

soundtrack 6050
remix 6136
°c 6362
uefa 6663
playoff 7808
midfielder 8850
playstation 9160
quarterfinals 9237
pinyin 9973
allmusic 10477
mlb 10901
espn 10978
gameplay 11247
nsw 11524
nascar 11838
itunes 11943
lgbt 12010
mvp 12041
xbox 12202
eurovision 12714
##⁺ 12744
vfl 13480
kolkata 13522
pga 14198
m³ 14241
bundesliga 14250
metacritic 14476
remixes 15193
steelers 15280
airplay 15341
##ии 15414
paralympics 15600
zhao 15634
reggae 15662
linebacker 15674
v8 15754
hindwings 15998
bollywood 16046
podcast 16110
atletico 16132
wwf 16779
transgender 16824
paralympic 17029
##₀ 17110
postseason 17525
vhs 17550
campeonato 17675
multiplayer 17762
łodz 17814
curated 17940
iphone 18059
gmbh 18289
danielle 18490
qaeda 18659
mixtape 18713
¹⁄₂ 18728
##ᵢ 19109
##ₙ 19110
##ław 19704
##qing 19784
saxophonist 19977
preseason 20038
pmid 20117
keyboardist 20173
iucn 20333
pokemon 20421
nrl 20686
motorsports 20711
jaenelle 20757
beyonce 20773
airbus 20901
netflix 20907
motorsport 21044
belgarath 21256
iaaf 

2689

In [55]:
len(subset_ratio_100M['subset_present_tokens'])

27833

In [56]:
def subset_metadata(subset_dict):
    '''
    prints:
    Number of books used in subset
    Number of tokens present in subset
    Number of tokens represented by subset
    '''
    print(len(subset_dict['subset_booklist']))
    print(subset_dict['subset_total_tokens'])
    print(subset_dict['subset_unique_tokens'])

In [57]:
subset_metadata(subset_ratio_100K)

51
99974.0
13040


In [58]:
subset_metadata(subset_ratio_1M)

178
999825.0
24294


In [59]:
subset_metadata(subset_ratio_10M)

656
9977907.0
27607


In [60]:
subset_metadata(subset_ratio_100M)

828
28660288.0
27833


In [61]:
#It seems the cleaner leaves in some other stuff, we leave this in given that it includes the alphabet.
super_cleaner(load_etext(23594), -1)

[' a b c d e f g h i j k l m n o p q r s t u v w x y z &.',
 'Online Distributed Proofreading Team at http://www.pgdp.net (This file was produced from images generously made available by The Internet Archive/American Libraries.)',
 'fi ff fl ffl ffi.']

In [62]:
book_id = 22818

In [63]:
def stored_sentences(book_id):
    print(book_id)
    filenames = ['sentences_8.pkl', 'sentences_32.pkl', 'sentences_128.pkl']
    for file in filenames:
        with open(os.path.join('../pretraining_data_chunked', str(book_id), file), 'rb') as f:
            sentences = pickle.load(f)
            print(sentences)
    print('==============')

In [64]:
for book_id in subset_ratio_100K['subset_booklist'][:10]:
    stored_sentences(book_id)

232
['by Virgil']
[]
[]
22818
['   An Alphabet   of Celebrities']
[]
[]
22335
[]
['Transcriber\'s Note: Original spells the title "Nursury." This was retained.']
[]
23594
[]
[' a b c d e f g h i j k l m n o p q r s t u v w x y z &.', 'fi ff fl ffl ffi.']
['Online Distributed Proofreading Team at http://www.pgdp.net (This file was produced from images generously made available by The Internet Archive/American Libraries.)']
20086
[]
['       Where differences between the list of illustrations and the       caption text existed in the original the most comprehensive       description was used for both.']
[]
20360
[]
["   Entered at Stationer's Hall", '  P. 13, l. 7, for mighty read magick.']
['      Go we to the Committee room,     There gleams of light conflict with gloom,     While unread rheams in chaos lye,     Our water closets to supply.', "    Noodles{3}, who rave for abolition     Of th' African's improv'd condition{4},     At your own cost fine projects try;     Dont rob--from pu

The original plan for tokenization may not work as well as desired for certain books with specific text entries
============

Take for example book 23880

In [65]:
stored_sentences(23880)

23880
[]
['Lasiurus borealis ornatus new subspecies', 'The Mexican red bat, thus, is left without a name, and for it I propose', 'University of Kansas Publications Museum of Natural History Volume 5, No. 14, pp. 223-226 December 15, 1951', 'Volume 5, No. 14, pp. 223-226 December 15, 1951']
['Accordingly, the name A[talapha]. mexicana Saussure 1861 falls as a synonym of Lasiurus cinereus cinereus (Beauvois 1796); if the hoary bat of the southern end of the Mexican table land should prove to be subspecifically separable, the name Lasiurus cinereus mexicanus would be available for it.', 'As may be readily seen by comparing specimens of L. borealis and L. cinereus from Mexico (or also from any place in North America north of Mexico), the description by Saussure applies to the hoary bat (Lasiurus cinereus) and not to the red bat (Lasiurus borealis).', 'Long inrolled tail; femoral patagium as in the vespertilios. Teeth 4/2, 1/1, 4/5 or 5/5.']


In [66]:
tensors = torch.load('../pretraining_data_chunked/23880/tensors_128.pt')
tensors

{'input_ids': tensor([[  101.,   103.,  1010.,  ...,     0.,     0.,     0.],
         [  101., 11914.,   103.,  ...,     0.,     0.,     0.],
         [  101., 11914.,  1010.,  ...,     0.,     0.,     0.],
         ...,
         [  101.,  2146.,  1999.,  ...,     0.,     0.,     0.],
         [  101.,  2146.,  1999.,  ...,     0.,     0.,     0.],
         [  101.,  2146.,  1999.,  ...,     0.,     0.,     0.]]),
 'attention_mask': tensor([[1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         ...,
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.]]),
 'labels': tensor([[  101., 11914.,  1010.,  ...,     0.,     0.,     0.],
         [  101., 11914.,  1010.,  ...,     0.,     0.,     0.],
         [  101., 11914.,  1010.,  ...,     0.,     0.,     0.],
         ...,
         [  101.,  2146.,  1999.,  ...,     0.,     0.,     0.],
         [  101.,

None of the ratios as given at the end of one of the longer sentences gets masked because of how we replace text by masks in the sentence (a result of the whole-word mask strategy).

In [67]:
for row in tensors['input_ids'][-15:]:
    print(tokenizer.convert_tokens_to_string([x for x in tokenizer.convert_ids_to_tokens(row) if x != '[PAD]']))

[CLS] as may be readily seen by comparing specimens of l . borealis and l . cinereus from mexico ( or also from any place in north america north of mexico ) , the description by saussure applies to the hoary bat ( lasiurus cinereus ) and [MASK] to the red bat ( lasiurus borealis ) . [SEP]
[CLS] as may be readily see by compare specimen of l . boreali and l . cinereus from mexico ( or also from any place in north america north of mexico ) , the description by saussure apply to the hoary bat ( lasiurus cinereus ) and not to the red bat ( lasiurus boreali ) . [SEP]
[CLS] as may be readily seen by comparing specimens of l . borealis and l . cinereus from mexico ( or also from any place in north america north of mexico ) , the description by saussure applies to the hoary bat ( lasiurus cinereus ) and not to the red bat ( lasiurus borealis ) . [SEP]
[CLS] [MASK] inrolled tail ; femoral patagium as in the vespertilios . teeth 4 / 2 , 1 / 1 , 4 / 5 or 5 / 5 . [SEP]
[CLS] long [MASK] [MASK] [MA

In [30]:
nlp = spacy.load('en_core_web_sm')

In [69]:
doc = nlp('Long inrolled tail; femoral patagium as in the vespertilios. Teeth 4/2, 1/1, 4/5 or 5/5.')

In [70]:
print(tokenizer.convert_ids_to_tokens(tokenizer('Long inrolled tail; femoral patagium as in the vespertilios. Teeth 4/2, 1/1, 4/5 or 5/5.')['input_ids']))

['[CLS]', 'long', 'in', '##roll', '##ed', 'tail', ';', 'fe', '##moral', 'pat', '##agi', '##um', 'as', 'in', 'the', 've', '##sper', '##ti', '##lio', '##s', '.', 'teeth', '4', '/', '2', ',', '1', '/', '1', ',', '4', '/', '5', 'or', '5', '/', '5', '.', '[SEP]']


In [71]:
#Text is parsed in 1 go by Spacy, but is recognized as seperate tokens by BERT
for token in doc:
    print(token.text, '|', token.pos_, '|', token.lemma_)

Long | ADV | long
inrolled | VERB | inrolle
tail | NOUN | tail
; | PUNCT | ;
femoral | ADJ | femoral
patagium | NOUN | patagium
as | ADP | as
in | ADP | in
the | DET | the
vespertilios | NOUN | vespertilio
. | PUNCT | .
Teeth | PROPN | Teeth
4/2 | NUM | 4/2
, | PUNCT | ,
1/1 | NUM | 1/1
, | PUNCT | ,
4/5 | NUM | 4/5
or | CCONJ | or
5/5 | NUM | 5/5
. | PUNCT | .


In [72]:
subset_ratio_100M['subset_booklist'][-5:]

['10625', '22', '19447', '19217', '15476']

In [31]:
text = '''He is really a good man, and is lucky enough, or the reverse, to win the hand of a delightful young lady whose charms, however, do not command the unanimous approval of the parishioners. ssession of high musical attainments makes her temperament all the more interesting, and accounts for the presence in so remote a district of her German friend whose acute sense of the rius leads to such untoward results. It is hard to say whether the author's talents are best evinced by her true pathos or by the delicate touches of humour which pervade the book.Another cable feature of the novel is an alert skill in construction which stamps it as a thoroughly artistic production.'''
doc = nlp(text)

In [32]:
for sent in doc.sents:
    print(sent.text)

He is really a good man, and is lucky enough, or the reverse, to win the hand of a delightful young lady whose charms, however, do not command the unanimous approval of the parishioners.
ssession of high musical attainments makes her temperament all the more interesting, and accounts for the presence in so remote a district of her German friend whose acute sense of the rius leads to such untoward results.
It is hard to say whether the author's talents are best evinced by her true pathos or by the delicate touches of humour which pervade the book.
Another cable feature of the novel is an alert skill in construction which stamps it as a thoroughly artistic production.


In [73]:
torch.cuda.is_available()

True

In [74]:
torch.cuda.get_device_name(0)

'Quadro M1200'

In [75]:
text = "Anne went to the Albert Heijn at 5 o'clock to buy some milk for me."

In [76]:
default_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
default_tokenizer(text)

{'input_ids': [101, 4776, 2253, 2000, 1996, 4789, 2002, 28418, 2078, 2012, 1019, 1051, 1005, 5119, 2000, 4965, 2070, 6501, 2005, 2033, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [77]:
ST_tokenizer = StrategizedTokenizer(padding=True)
inputs = ST_tokenizer.tokenize(text)
inputs

{'input_ids': tensor([[  101,   103,  2253,  2000,  1996,   103,   103,   103,   103,  2012,
          1019,  1051,  1005,  5119,  2000,  4965,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,   103,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
          1019,  1051,  1005,  5119,  2000,   103,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,   103,  1996,  4789,  2002, 28418,  2078,   103,
          1019,  1051,  1005,  5119,   103,  4965,  2070,  6501,   103,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,   103,  4789,  2002, 28418,  2078,  2012,
          1019,  1051,  1005,  5119,  2000,  4965,   103,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
           103,  1051,  1005,  5119,  2000,  4965,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,  1996,  4789,  2002, 28418,  2078,  201

In [38]:
#1344
#14596

test_book = super_cleaner(load_etext(14596), -1)

In [2]:
#longest_sentence = str(test_book[np.argmax([len(par) for par in test_book])])
longest_sentence = text = '''He is really a good man, and is lucky enough, or the reverse, to win the hand of a delightful young lady whose charms, however, do not command the unanimous approval of the parishioners. ssession of high musical attainments makes her temperament all the more interesting, and accounts for the presence in so remote a district of her German friend whose acute sense of the rius leads to such untoward results. It is hard to say whether the author's talents are best evinced by her true pathos or by the delicate touches of humour which pervade the book.Another cable feature of the novel is an alert skill in construction which stamps it as a thoroughly artistic production.'''
len(longest_sentence)

673

In [3]:
SC = SentenceChunker()

In [4]:
tokens, sentences = SC.sentence_chunker(longest_sentence, 128, return_tokens=True)

In [5]:
sentences

["He is really a good man, and is lucky enough, or the reverse, to win the hand of a delightful young lady whose charms, however, do not command the unanimous approval of the parishioners. ssession of high musical attainments makes her temperament all the more interesting, and accounts for the presence in so remote a district of her German friend whose acute sense of the rius leads to such untoward results. It is hard to say whether the author's talents are best evinced by her true pathos or by the delicate touches of humour which pervade the book.Another cable feature of the novel is an alert skill in construction which stamps it as a thoroughly artistic production."]

In [83]:
print(subset_ratio_100K['subset_booklist'])

['232', '22818', '22335', '23594', '20086', '20360', '10557', '19571', '19177', '14100', '13536', '23436', '129', '21783', '11006', '19937', '22847', '1321', '23147', '21805', '22529', '12474', '13082', '14463', '23538', '13081', '116', '18589', '23446', '23450', '17124', '16780', '23146', '18935', '12554', '17254', '23429', '13203', '17365', '22236', '16169', '18417', '22579', '19634', '24044', '104', '1567', '23315', '24269', '12358', '23880']


In [84]:
print(datetime.now())
text_splits_chunk = make_data_splits(14596, max_seq_lengths=[8,32,128], truncate='chunk')
text_splits_trunc = make_data_splits(14596, max_seq_lengths=[8,32,128], truncate=True)
print(datetime.now())

2021-05-26 10:08:59.539493


NameError: name 'make_data_splits' is not defined

In [None]:
os.listdir('../pretraining_data')

In [None]:
class ExampleListDataset(torch.utils.data.Dataset):
    def __init__(self):
        examples = torch.tensor(np.array([[101, 1996, 2622, 9535, 11029, 26885, 1997, 102, 0, 0, 0, 0, 0], 
                             [101,2198, 9535, 11029, 1010, 2011, 8965, 3854, 22033, 9050, 3064, 102, 0],
                             [101, 2102, 2023, 26885, 2003, 2005, 1996, 2224, 1997, 3087, 5973, 2012, 102]])).long()
        self.encodings = examples
        self.labels = examples

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, i):
        return {'input_ids': self.encodings[i],
                'labels': self.labels[i]}


In [None]:
from transformers.data.data_collator import DataCollatorWithPadding

In [None]:
train_data = ExampleListDataset()

In [None]:
train_data.encodings

In [None]:
bert_tiny_config = {"hidden_size": 128, 
                    "hidden_act": "gelu", 
                    "initializer_range": 0.02, 
                    "vocab_size": 30522, 
                    "hidden_dropout_prob": 0.1, 
                    "num_attention_heads": 2, 
                    "type_vocab_size": 2, 
                    "max_position_embeddings": 128, 
                    "num_hidden_layers": 2, 
                    "intermediate_size": 512, 
                    "attention_probs_dropout_prob": 0.1}

model = BertForMaskedLM(config=BertConfig(**bert_tiny_config))
model.train();

In [None]:
output_dir = './test_experiment'
training_args = TrainingArguments(
    output_dir= os.path.join(output_dir, 'model2'),          # output directory
    overwrite_output_dir = True,
    save_strategy='no',  #dont make checkpoints, easier to just retrain than continu given the experiment
    max_steps = 3,
    per_device_train_batch_size=2,  # batch size per device during training
    #per_device_eval_batch_size=256,   # batch size for evaluation
    learning_rate=1e-5,     
    logging_dir= os.path.join(output_dir, 'model2', 'logs/'),            # directory for storing logs
    logging_steps=1
)

trainer = Trainer(
    args=training_args,
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    train_dataset=train_data,         # training dataset
    eval_dataset=None            # evaluation dataset
)

train_output = trainer.train()
#trainer.save_model(os.path.join(output_dir, 'model'))


In [None]:
trainer.save_model(os.path.join(output_dir, 'model2'))

In [None]:
train_output

In [15]:
from transformers.data.data_collator import DataCollatorForWholeWordMask
from dataset.dataset import DefaultTokenizerDataset

In [16]:
train_dataset = DefaultTokenizerDataset(datadir='../pretraining_data_chunked', max_seq_length=128)
train_dataset.populate(book_list=[24269])

128 Loaded books:  [24269]


In [105]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          truncation=True, 
                                          max_length=128,
                                          padding='max_length')
data_collator = data_collator = DataCollatorForWholeWordMask(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [106]:
bert_tiny_config = {"hidden_size": 128, 
                    "hidden_act": "gelu", 
                    "initializer_range": 0.02, 
                    "vocab_size": 30522, 
                    "hidden_dropout_prob": 0.1, 
                    "num_attention_heads": 2, 
                    "type_vocab_size": 2, 
                    "max_position_embeddings": 128, 
                    "num_hidden_layers": 2, 
                    "intermediate_size": 512, 
                    "attention_probs_dropout_prob": 0.1}

model = BertForMaskedLM(config=BertConfig(**bert_tiny_config))
model.train();

In [107]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="../test-default_bert",
    overwrite_output_dir=True,
    max_steps=4,
    save_strategy='no',
    per_device_train_batch_size=1,
    logging_steps=1,
    
    #Hyper parameters as per BERT-paper which are not default values in TrainingArguments
    warmup_ratio=0.1,
    learning_rate=1e-4,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
)




In [108]:
trainer.train()

Step,Training Loss
1,10.3642
2,10.3964
3,10.3294
4,10.3877


TrainOutput(global_step=4, training_loss=10.369400262832642, metrics={'train_runtime': 39.6518, 'train_samples_per_second': 0.101, 'total_flos': 9853183776.0, 'epoch': 0.1, 'init_mem_cpu_alloc_delta': 498556928, 'init_mem_gpu_alloc_delta': 17471488, 'init_mem_cpu_peaked_delta': 14729216, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': -108642304, 'train_mem_gpu_alloc_delta': 52412928, 'train_mem_cpu_peaked_delta': 108793856, 'train_mem_gpu_peaked_delta': 65876992})

In [103]:
train_dataset.encodings

[{'input_ids': tensor([  101,  1162, 14608, 16177, 14608, 29734, 15297,  1158, 29723,  1174,
          29730, 18199,  1159, 29729,  1155, 29727, 15297,  1155, 29735, 29734,
          29739,  1155, 29720, 29727, 24824, 29737, 29732, 15297,  1166, 14608,
          29727, 14608,  1174, 29730, 18199, 15297,  1159, 29727, 29723, 29735,
          29733, 29723, 29734, 14608, 18199,  1012,   102])},
 {'input_ids': tensor([  101,  2665,  1005,  1055,  2460,  2381,  1997,  1996,  2394,  2111,
           1010,  5824,  2581,  1010,  5824,  2620,  1012,  1996,  8416,  3179,
           2003,  1999,  1016, 18709,  1012,  2030,  1015,  5285,  1012,  2035,
           2060,  6572,  2024,  1999,  1015,  5285,  1012,   102])},
 {'input_ids': tensor([  101,  3602,  1011,  1011,  1996,  2206,  3616,  2024,  2012,  2556,
           2041,  1997,  6140,  1024,  7287,  1010, 11118,  1010, 16333,  1010,
          22238,  1010, 24194,  1010, 17528,  1010, 20024,  1010,  4601,  2620,
           1010,  5354,  2581,

In [44]:
model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(128, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=Tr

In [47]:
a = TrainingArguments(output_dir='./test')

In [48]:
a.should_log

AttributeError: 'TrainingArguments' object has no attribute 'should_log'

JIANT
=============

In [26]:
from jiant.proj.simple import runscript as run
import jiant.scripts.download_data.runscript as downloader

In [None]:
EXP_DIR = "/path/to/exp"

# Download the Data
downloader.download_data(["mrpc"], f"{EXP_DIR}/tasks")

In [43]:


# Set up the arguments for the Simple API
args = run.RunConfiguration(
    run_name="simple",
    exp_dir='./test_experiment/model/tasks',
    data_dir=f"{EXP_DIR}/tasks",
    hf_pretrained_model_name_or_path="./test_experiment/model",
    model_weights_path="./test_experiment/model",
    tasks="mrpc",
    train_batch_size=16,
    num_train_epochs=3,
    force_overwrite=True,
    seed = 3
#    seed=3
)

# Run!
run.run_simple(args)

Running from start
  jiant_task_container_config_path: ./test_experiment/model/tasks\run_configs\simple_config.json
  output_dir: ./test_experiment/model/tasks\runs\simple
  hf_pretrained_model_name_or_path: ./test_experiment/model
  model_path: ./test_experiment/model
  model_config_path: ./test_experiment/model/tasks\models\bert\model\config.json
  model_load_mode: partial
  do_train: True
  do_val: True
  do_save: False
  do_save_last: False
  do_save_best: False
  write_val_preds: False
  write_test_preds: False
  eval_every_steps: 0
  save_every_steps: 0
  save_checkpoint_every_steps: 0
  no_improvements_for_n_evals: 0
  keep_checkpoint_when_done: False
  force_overwrite: True
  seed: 3
  learning_rate: 1e-05
  adam_epsilon: 1e-08
  max_grad_norm: 1.0
  optimizer_type: adam
  no_cuda: False
  fp16: False
  fp16_opt_level: O1
  local_rank: -1
  server_ip: 
  server_port: 
device: cuda n_gpu: 1, distributed training: False, 16-bits training: False
Using seed: 3
{
  "jiant_task_conta

Some weights of the model checkpoint at ./test_experiment/model were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ./test_experiment/model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bi

PermissionError: [Errno 13] Permission denied: './test_experiment/model'

In [39]:
np.random.randint(0, 2** 32 -1)

ValueError: high is out of bounds for int32

In [40]:
np.__version__

'1.20.1'

In [32]:
import jiant.proj.main.export_model as export_model

In [33]:
export_model.export_model(
    hf_pretrained_model_name_or_path="bert-base-uncased",
    output_base_path="./jiant/bert-base-uncased",
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#%load_ext tensorboard
#%tensorboard --logdir './logs'

In [None]:
np.array([[101, 1996, 2622, 9535, 11029, 26885, 1997], 
         [101,2198, 9535, 11029, 1010, 2011, 8965, 3854, 22033, 9050, 3064, 102],
         [101, 2102, 2023, 26885, 2003, 2005, 1996, 2224, 1997, 3087, 5973, 2012, 102]])

In [None]:
self.examples

In [None]:
from dataset.dataset import StrategizedTokenizerDataset

In [None]:
train_data.examples

In [None]:
model(**custom_input)

In [None]:
a = ['a', 'b']

In [None]:
a += ['c']

In [None]:
a

GLUE and SentEval benchmarking
==================

In [49]:
from datasets import load_dataset, load_metric

In [114]:
GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]

In [115]:
task = "cola"
model_checkpoint = "test_experiment/model2/"
batch_size = 16

In [116]:
actual_task = "mnli" if task == "mnli-mm" else task
dataset = load_dataset("glue", actual_task)
metric = load_metric('glue', actual_task)

Reusing dataset glue (C:\Users\s145733\.cache\huggingface\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [117]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [118]:
dataset["train"][0]

{'idx': 0,
 'label': 1,
 'sentence': "Our friends won't buy this analysis, let alone the next one we propose."}

In [119]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [120]:
show_random_elements(dataset["train"])

Unnamed: 0,idx,label,sentence
0,1824,unacceptable,"I acknowledged that my father, he was tight as an owl."
1,409,acceptable,For him to do that would be a mistake.
2,4506,acceptable,"Mary sang a song, but Lee never did."
3,4012,unacceptable,John made Mary cooking Korean food.
4,3657,unacceptable,John sounded in the park.
5,2286,acceptable,Clouds cleared from the sky.
6,1679,unacceptable,It is this hat that that he was wearing is certain.
7,1424,acceptable,Who are you gawking at?
8,6912,acceptable,Captain Oates died in order to save his comrades.
9,520,acceptable,The tree dropped fruit to the ground.


In [121]:
metric

Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res

In [122]:
import numpy as np

fake_preds = np.random.randint(0, 2, size=(64,))
fake_labels = np.random.randint(0, 2, size=(64,))
metric.compute(predictions=fake_preds, references=fake_labels)

{'matthews_correlation': 0.14462158210542375}

In [123]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [124]:
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mnli-mm": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

In [None]:
task_to_metrics = {
    "cola": 'matthews_correlation',
    "mnli": "accuracy",
    "mnli-mm": "accuracy",
    "mrpc": ['accuracy', 'f1'],
    "qnli": "accuracy",
    "qqp": ['accuracy', 'f1'],
    "rte": 'accuracy',
    "sst2": 'accuracy',
    "stsb": ['pearson', 'spearmanr'],
    "wnli": 'accuracy',
}

In [125]:
sentence1_key, sentence2_key = task_to_keys[task]
if sentence2_key is None:
    print(f"Sentence: {dataset['train'][0][sentence1_key]}")
else:
    print(f"Sentence 1: {dataset['train'][0][sentence1_key]}")
    print(f"Sentence 2: {dataset['train'][0][sentence2_key]}")

Sentence: Our friends won't buy this analysis, let alone the next one we propose.


In [126]:
def preprocess_function(examples):
    if sentence2_key is None:
        return tokenizer(examples[sentence1_key], truncation=True)
    return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True)

In [127]:
encoded_dataset = dataset.map(preprocess_function, batched=True)

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [128]:
from transformers import AutoModelForSequenceClassification

In [129]:
num_labels = 3 if task.startswith("mnli") else 1 if task=="stsb" else 2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of the model checkpoint at test_experiment/model2/ were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at test_experiment/model

In [130]:
metric_name = "pearson" if task == "stsb" else "matthews_correlation" if task == "cola" else "accuracy"

args = TrainingArguments(
    output_dir = os.path.join(model_checkpoint, 'cola'),
    evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    max_steps=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [131]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if task != "stsb":
        predictions = np.argmax(predictions, axis=1)
    else:
        predictions = predictions[:, 0]
    return metric.compute(predictions=predictions, references=labels)

In [132]:
validation_key = "validation_mismatched" if task == "mnli-mm" else "validation_matched" if task == "mnli" else "validation"
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [133]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=3, training_loss=0.6951454480489095, metrics={'train_runtime': 55.7825, 'train_samples_per_second': 0.054, 'total_flos': 22483142784.0, 'epoch': 0.01, 'init_mem_cpu_alloc_delta': -21917696, 'init_mem_gpu_alloc_delta': 17349632, 'init_mem_cpu_peaked_delta': 21946368, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': -79511552, 'train_mem_gpu_alloc_delta': 52054528, 'train_mem_cpu_peaked_delta': 79540224, 'train_mem_gpu_peaked_delta': 15810048})

In [136]:
results = trainer.evaluate()

In [138]:
results

{'eval_loss': 0.6929337382316589,
 'eval_matthews_correlation': 0.053976421365281024,
 'eval_runtime': 507.7157,
 'eval_samples_per_second': 2.054,
 'epoch': 0.01,
 'eval_mem_cpu_alloc_delta': -3543040,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 3612672,
 'eval_mem_gpu_peaked_delta': 3193344}

In [135]:
trainer.save_model(os.path.join(model_checkpoint, 'cola'))

Hyperparameter search
========

In [140]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

In [141]:
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at test_experiment/model2/ were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at test_experiment/model

In [None]:
trainer.hyperparameter_search(n_trials=1, direction="maximize")

In [101]:
predictions[:3]

[{'label': 'POSITIVE', 'score': 0.9993847012519836},
 {'label': 'POSITIVE', 'score': 0.9993847012519836},
 {'label': 'POSITIVE', 'score': 0.9993847012519836}]

In [100]:
print(datetime.now().strftime("%H:%M:%S.%f"))
predictions = nlp(text_list)
print(datetime.now().strftime("%H:%M:%S.%f"))

16:15:04.979391
16:15:31.633785
