In [1]:
#Custom code in this library
from cleaner_utils import super_cleaner
from pretraining_data_utils import make_book_token_frequency, token_freq_df_to_dict, \
                                    all_available_tokens_from_df, optimize_book_subset, optimize_book_subset_ratio
from pretraining_data_utils import book_properties, make_df_book_properties
from gutenberg.acquire import load_etext


from tokenizer.tokenizer import StrategizedTokenizer
from dataset.dataset import StrategizedTokenizerDataset
from dataset.dataset import DefaultTokenizerDataset

#Training code
from transformers import BertConfig
from transformers import BertForMaskedLM
from transformers import BertTokenizer
from transformers import AdamW
from transformers import Trainer, TrainingArguments

from torch.utils.data import DataLoader

#General imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import re
import json
import torch
import logging
import pickle
import os

from more_itertools import take
from datetime import datetime

ImportError: cannot import name 'MODataset' from 'dataset.dataset' (C:\Users\s145733\Google Drive\Data science in Engineering\Jaar 2\Q3\Thesis\LessIsMore\dataset\dataset.py)

In [None]:
# Relative directory prefix for the cached files that later cells read and write.
cache_dir = 'cached_files/'

In [None]:
# Read the scraped metadata from the Gutenberg metadata database.
# (The original data was scraped using https://github.com/c-w/gutenberg.)
# The data was then further preprocessed by https://github.com/hugovk/gutenberg-metadata so it is actually usable.

# The context manager closes the file even when json.load raises.
with open(cache_dir + 'gutenberg-metadata.json', 'r') as f:
    metadata = json.load(f)

In [None]:
# Collect the keys of all books whose metadata lists exactly ['en'] as language,
# then show how many there are.
english_book_keys = [
    book_key
    for book_key, book_meta in metadata.items()
    if book_meta['language'] == ['en']
]
len(english_book_keys)

In [None]:
# Book 14575 can't be retrieved because of faults in retrieval. This happens sometimes.
# The traceback is displayed instead of letting the exception stop the notebook.
import traceback
import sys  # kept so that any later code relying on this import keeps working

try:
    super_cleaner(load_etext(14575), -1, verify_deletions=True)
except Exception:
    # print_exc() prints the exception that is currently being handled, which
    # is what the former sys.exc_info() / print_exception() / del sequence did by hand.
    traceback.print_exc()


In [None]:
# Determine which English books are actually loadable.
# If the books aren't cached this may take a while, because every book has to be scraped from gutenberg.org.
# Therefore a pre-processed file is provided.
loadable_keys_path = cache_dir + 'loadable_english_book_keys.pkl'

if os.path.isfile(loadable_keys_path):
    # NOTE: pickle.load can execute arbitrary code; only use a cache file from a trusted source.
    with open(loadable_keys_path, 'rb') as f:
        loadable_english_book_keys = pickle.load(f)
else:
    loadable_english_book_keys = []
    for i, key in enumerate(english_book_keys):
        if i % 1000 == 0:
            # Progress indicator: number of keys tried so far plus a timestamp.
            print(i, datetime.now())
        try:
            load_etext(int(key))
        except Exception:
            # Best effort: a book that fails to load is skipped. Catching Exception
            # instead of using a bare `except:` keeps KeyboardInterrupt/SystemExit
            # working, so the long scrape can still be interrupted.
            continue
        loadable_english_book_keys.append(key)
    with open(loadable_keys_path, 'wb') as f:
        pickle.dump(loadable_english_book_keys, f)

len(loadable_english_book_keys)

In [None]:
# Randomly select 10 books that we can query (the seed is fixed so the selection is repeatable).
# NOTE(review): np.random.choice samples with replacement by default, so the
# selection may contain duplicates — confirm that this is intended.
np.random.seed(42)
rand_10_books = list(np.random.choice(loadable_english_book_keys, size=10))
print(rand_10_books)

In [None]:
# Print id, author and title for the first 5 books of the random selection.
# One book isn't actually loadable, see below.
for book_id in rand_10_books[:5]:
    book_meta = metadata[book_id]
    print(book_id, book_meta['author'], book_meta['title'])

Text preprocessing
===================

In [None]:
# Original, unprocessed text: keep only the first 500 characters of book 50000.
# The bare `text` expression displays the repr, so newlines show up escaped.
text = load_etext(50000)[:500]
text

In [None]:
# Text with formatting: print() renders the newlines instead of showing them escaped.
print(text)

Use the cleaner to retrieve the cleaned text from the first book of the random selection.
The _super_cleaner_ strips the headers, disclaimers and tables that are not required for our purposes.

In [None]:

# Clean the full text of book 16968.
# NOTE(review): the meaning of the positional -1 is defined by super_cleaner in cleaner_utils — confirm.
sentences = super_cleaner(load_etext(16968), -1, verify_deletions=False)

In [None]:
# The cleaned text is now a list of paragraphs; show the first 10 entries.
sentences[:10]

In [None]:
# Some entries are very short: show the 20 shortest ones, ordered by length.
sorted(sentences, key=len)[:20]

In [None]:
# NOTE(review): this displays every entry of `sentences`; consider slicing to keep the output short.
list(sentences)

In [None]:
# Compute some properties of the cleaned book via book_properties (from pretraining_data_utils).
book_properties(sentences)

Tokenization in practice
============

In [None]:
# Initialize the custom tokenizer with padding switched off.
ST_tokenizer = StrategizedTokenizer(padding=False)

In [None]:
# Tokenize an example sentence with the custom tokenizer and display the result.
inputs = ST_tokenizer.tokenize("Anne went to the Albert Heijn at 5 o'clock to buy some milk for me.")
inputs

In [None]:
# Convert the ids back to tokens and print each result on its own line:
# the masks are at different places.
masked_lines = ST_tokenizer.convert_ids_to_tokens(inputs['input_ids'])
for masked_line in masked_lines:
    print(masked_line)

Gutenberg book-selection
==============

In [None]:
# Setting to ignore warnings about sequences being longer than BERT can handle
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

# `datetime` is already imported in the notebook's import cell, so it is not
# re-imported here. The timestamps printed before and after the call show how
# long gathering the book properties took.
print(datetime.now())
df_books_10 = make_df_book_properties(rand_10_books)
print(datetime.now())

In [None]:
# Display the properties gathered for the selected books.
df_books_10

In [None]:
# Show the first 10 rows after sorting by 'Shortest sentence (char)'.
# NOTE(review): the original comment says this column holds both text and numbers,
# but nothing in this cell handles mixed types — confirm that sort_values copes with the column.
df_books_10.sort_values(by='Shortest sentence (char)')[:10]

In [None]:
# Some books have very few tokens: show the 5 rows with the lowest 'Total tokens'.
df_books_10.sort_values(by='Total tokens').head()

In [None]:
# The 5 rows with the highest 'Total tokens' (ascending sort, so they come last).
df_books_10.sort_values(by='Total tokens').tail()

In [None]:
# Retrieve the token occurrences per book in one DataFrame and the total number of tokens in another.
# The timestamps printed before and after the call show how long it took.
print(datetime.now())
df_book_token_freq_10, df_10_total_tokens = make_book_token_frequency(rand_10_books)
print(datetime.now())

In [None]:
# The token-frequency DataFrame is obviously very sparse; show its first 10 rows.
df_book_token_freq_10[:10]

In [None]:
# Total number of tokens per book
df_10_total_tokens

In [None]:
# Total number of tokens in our small set (sum over all selected books)
df_10_total_tokens.sum()

In [None]:
# All tokens which are present in our subsample of 10 books, together with how many there are
all_present_tokens_10 = all_available_tokens_from_df(df_book_token_freq_10)
all_present_tokens_10, len(all_present_tokens_10)

In [None]:
# Convert the frequency DataFrame (plus the per-book totals) to a dict and show its first 3 entries
tokens_per_book_10 = token_freq_df_to_dict(df_book_token_freq_10, df_10_total_tokens)
take(3, tokens_per_book_10.items())

In [None]:
# Run the subset optimisation on the small set with threshold = 1e5.
# NOTE(review): the exact meaning of `threshold` is defined in pretraining_data_utils — presumably a token budget; confirm.
optimize_book_subset(all_present_tokens_10, tokens_per_book_10, threshold = 1e5)

In [None]:
# Same inputs and threshold, using the ratio-based variant optimize_book_subset_ratio; the result is printed.
print(optimize_book_subset_ratio(all_present_tokens_10, tokens_per_book_10, threshold = 1e5))

In [30]:
# Load the cached token-frequency table; the first CSV column becomes the index.
df_book_token_freq = pd.read_csv(cache_dir + 'df_book_token_freq.csv', index_col=0)
df_book_token_freq

Unnamed: 0,[PAD],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],...,##！,##（,##）,##，,##－,##．,##／,##：,##？,##～
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1881,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1882,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1883,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Load the cached total-token counts; the first CSV column becomes the index and
# squeeze() collapses the remaining single column into a Series.
df_total_tokens = pd.read_csv(cache_dir +'df_total_tokens.csv', index_col=0).squeeze()
df_total_tokens

1         7642.0
10      901309.0
11       36117.0
12       40833.0
13        6778.0
          ...   
1880    227995.0
1881     98655.0
1882     65838.0
1883    100735.0
1884     15394.0
Name: 0, Length: 1000, dtype: float64

In [32]:
# Same conversion as for the small set, now applied to the cached tables.
tokens_per_book = token_freq_df_to_dict(df_book_token_freq, df_total_tokens)

In [33]:
# Index labels of the 20 entries with the lowest total token count.
df_total_tokens.sort_values()[:20].index

Int64Index([ 239,  117,  156,  232,  127,  129,  212,  114,  116, 1525, 1016,
             115, 1546, 1593, 1065, 1544,  104, 1545, 1341,  235],
           dtype='int64')

In [47]:
# Load the cached metadata of the 100K subset and print every entry.
# The file is opened without an os.path.isfile guard: with the guard, a missing
# cache file skipped the load and the loop below then failed with a NameError
# (or silently reused a stale variable). Now a missing file raises FileNotFoundError.
# NOTE: pickle.load can execute arbitrary code; only load cache files from a trusted source.
with open(cache_dir + 'subset_meta_100K.pkl', 'rb') as f:
    subset_100K = pickle.load(f)

for k, v in subset_100K.items():
    print(k, v)

subset_booklist ['1061', '1034', '114']
subset_total_tokens 99978.0
subset_present_tokens [  101.   102.   999. ... 29604. 29606. 29611.]
subset_unique_tokens 9320


In [48]:
# Load the cached metadata of the 100K "ratio" subset and print every entry.
# The file is opened without an os.path.isfile guard: with the guard, a missing
# cache file skipped the load and the loop below then failed with a NameError
# (or silently reused a stale variable). Now a missing file raises FileNotFoundError.
# NOTE: pickle.load can execute arbitrary code; only load cache files from a trusted source.
with open(cache_dir + 'subset_meta_ratio_100K.pkl', 'rb') as f:
    subset_ratio_100K = pickle.load(f)

for k, v in subset_ratio_100K.items():
    print(k, v)

subset_booklist ['117', '127', '1525', '116', '1546', '1321', '232', '104', '1567', '235', '1016', '1137', '1359', '1064', '1593', '1757', '1336', '1475', '1861', '115', '1331', '1171', '212', '114']
subset_total_tokens 99181.0
subset_present_tokens [  101.   102.   999. ... 29731. 29735. 29739.]
subset_unique_tokens 10892


In [49]:
# Load the cached metadata of the 1M subset and print every entry.
# The file is opened without an os.path.isfile guard: with the guard, a missing
# cache file skipped the load and the loop below then failed with a NameError
# (or silently reused a stale variable). Now a missing file raises FileNotFoundError.
# NOTE: pickle.load can execute arbitrary code; only load cache files from a trusted source.
with open(cache_dir + 'subset_meta_1M.pkl', 'rb') as f:
    subset_1M = pickle.load(f)

for k, v in subset_1M.items():
    print(k, v)

subset_booklist ['22', '1662', '1558', '1034', '235']
subset_total_tokens 999936.0
subset_present_tokens [  101.   102.   999. ... 29604. 29607. 29667.]
subset_unique_tokens 20841


In [50]:
# Load the cached metadata of the 1M "ratio" subset and print every entry.
# The file is opened without an os.path.isfile guard: with the guard, a missing
# cache file skipped the load and the loop below then failed with a NameError
# (or silently reused a stale variable). Now a missing file raises FileNotFoundError.
# NOTE: pickle.load can execute arbitrary code; only load cache files from a trusted source.
with open(cache_dir + 'subset_meta_ratio_1M.pkl', 'rb') as f:
    subset_ratio_1M = pickle.load(f)

for k, v in subset_ratio_1M.items():
    print(k, v)

subset_booklist ['117', '127', '1525', '116', '1546', '1321', '232', '104', '1567', '235', '1016', '1137', '1359', '1064', '1593', '1757', '1336', '1475', '1861', '115', '1331', '1042', '180', '1034', '1870', '1092', '1462', '1663', '1222', '1219', '1031', '101', '212', '1566', '112', '1746', '1221', '1817', '1855', '162', '114', '1179', '1063', '1805', '1615', '1645', '1280', '118', '1215', '1753', '1080', '1189']
subset_total_tokens 999805.0
subset_present_tokens [  100.   101.   102. ... 29736. 29737. 29739.]
subset_unique_tokens 21996


In [51]:
# Load the cached metadata of the 10M subset and print every entry.
# The file is opened without an os.path.isfile guard: with the guard, a missing
# cache file skipped the load and the loop below then failed with a NameError
# (or silently reused a stale variable). Now a missing file raises FileNotFoundError.
# NOTE: pickle.load can execute arbitrary code; only load cache files from a trusted source.
with open(cache_dir + 'subset_meta_10M.pkl', 'rb') as f:
    subset_10M = pickle.load(f)

for k, v in subset_10M.items():
    print(k, v)

subset_booklist ['200', '22', '1662', '101', '1449', '162', '135', '14', '1365', '1156', '118', '1340', '1444', '100', '1615', '1226', '1725', '1391', '1351', '1694', '1399', '1162', '1320', '1452', '152', '1166', '1218', '1198', '1479', '112', '1304', '180', '1804', '1039', '1210', '1470', '1558', '1060', '235', '116', '114', '115', '1065', '1593']
subset_total_tokens 9997943.0
subset_present_tokens [  100.   101.   102. ... 29736. 29737. 29739.]
subset_unique_tokens 26396


In [52]:
# Load the cached metadata of the 10M "ratio" subset and print every entry.
# The file is opened without an os.path.isfile guard: with the guard, a missing
# cache file skipped the load and the loop below then failed with a NameError
# (or silently reused a stale variable). Now a missing file raises FileNotFoundError.
# NOTE: pickle.load can execute arbitrary code; only load cache files from a trusted source.
with open(cache_dir + 'subset_meta_ratio_10M.pkl', 'rb') as f:
    subset_ratio_10M = pickle.load(f)

for k, v in subset_ratio_10M.items():
    print(k, v)

subset_booklist ['117', '127', '1525', '116', '1546', '1321', '232', '104', '1567', '235', '1016', '1137', '1359', '1064', '1593', '1757', '1336', '1475', '1861', '115', '1331', '1042', '180', '1034', '1870', '1092', '1462', '1663', '1222', '1219', '1031', '101', '212', '1566', '112', '1746', '1221', '1817', '1855', '162', '114', '1179', '1063', '1805', '1615', '1280', '118', '1324', '1753', '1708', '1317', '1425', '1645', '1479', '1444', '14', '1418', '1351', '1551', '1873', '1270', '1864', '22', '1818', '1199', '1060', '1446', '1166', '152', '1367', '1210', '1065', '1804', '1638', '1149', '1742', '1847', '1451', '1491', '1234', '1187', '1477', '1662', '1808', '1220', '1758', '1054', '1278', '1764', '1253', '1233', '1478', '1694', '1395', '1470', '1761', '1763', '200', '1670', '1186', '132', '1240', '1816', '1669', '1830', '1020', '1387', '240', '1273', '1050', '1206', '1820', '1045', '1156', '1195', '1594', '1391', '1859', '1824', '1827', '1734', '1169', '1731', '1554', '1160', '1165

In [66]:
# Load the pretrained 'bert-base-uncased' tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# NOTE(review): this displays the entire token -> id vocabulary mapping; consider showing only a sample.
tokenizer.vocab

In [None]:
# Print every vocabulary entry whose id does not occur in the 10M subset,
# ignoring the '[unused...' placeholder entries.
# The ids are collected in a set once: `v in <array>` rescans the whole
# array of present tokens for every single vocabulary entry.
present_token_ids = {int(token_id) for token_id in subset_10M['subset_present_tokens']}

for k, v in tokenizer.vocab.items():
    if v not in present_token_ids and not k.startswith('[unused'):
        print(k, v)


In [64]:
from tokenizer.tokenizer import StrategizedTokenizer

In [68]:
# Default-constructed custom tokenizer; str() shows its string representation.
tok = StrategizedTokenizer()
str(tok)

'<tokenizer.tokenizer.StrategizedTokenizer object at 0x000001FDF9CD6B20>'

In [69]:
# String representation of the BERT tokenizer, for comparison with the custom one.
str(tokenizer)

'<transformers.tokenization_bert.BertTokenizer object at 0x000001FD97F5F070>'

In [None]:
# Check whether PyTorch can use a CUDA device.
torch.cuda.is_available()

In [None]:
# Name of CUDA device 0. get_device_name(0) raises when no CUDA device is
# usable, so a message is shown instead on CPU-only machines.
cuda_device_name = (
    torch.cuda.get_device_name(0)
    if torch.cuda.is_available()
    else 'No CUDA device available'
)
cuda_device_name

In [None]:
# Example sentence for the tokenizer comparison below (rebinds `text` from the earlier preprocessing cell).
text = "Anne went to the Albert Heijn at 5 o'clock to buy some milk for me."

In [None]:
# Tokenize the example sentence with the stock 'bert-base-uncased' tokenizer for comparison.
# NOTE(review): this loads the same pretrained tokenizer as `tokenizer` above.
default_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
default_tokenizer(text)

In [None]:
# Re-create the custom tokenizer, this time with padding=True, and tokenize the same sentence.
# This rebinds `ST_tokenizer` and `inputs` from the earlier tokenization cells.
ST_tokenizer = StrategizedTokenizer(padding=True)
inputs = ST_tokenizer.tokenize(text)
inputs

In [None]:
# Convert the ids back to tokens and print each result on its own line.
token_lines = ST_tokenizer.convert_ids_to_tokens(inputs['input_ids'])
for token_line in token_lines:
    print(token_line)

In [None]:
# NOTE(review): `MODataset` is never imported in this notebook and, according to the
# ImportError recorded under the import cell, cannot be imported from dataset.dataset.
# StrategizedTokenizerDataset (imported in the first cell) is assumed to be its
# replacement — confirm that it takes no constructor arguments and provides populate().
train_dataset = StrategizedTokenizerDataset()
train_dataset.populate()

In [None]:
# Inspect the `labels` attribute of the populated training dataset.
train_dataset.labels

In [None]:
# Hyper-parameters of a very small BERT: 2 hidden layers, 2 attention heads, hidden size 128.
bert_tiny_config = dict(
    hidden_size=128,
    hidden_act="gelu",
    initializer_range=0.02,
    vocab_size=30522,
    hidden_dropout_prob=0.1,
    num_attention_heads=2,
    type_vocab_size=2,
    max_position_embeddings=512,
    num_hidden_layers=2,
    intermediate_size=512,
    attention_probs_dropout_prob=0.1,
)

# Build a masked-LM BERT from the tiny config (no pretrained weights are loaded here)
# and switch it to training mode; the trailing `;` suppresses the cell output.
tiny_bert_config = BertConfig(**bert_tiny_config)
model = BertForMaskedLM(config=tiny_bert_config)
model.train();

In [None]:
# Settings for the training run
training_args = TrainingArguments(
    output_dir='./results',            # output directory
    logging_dir='./logs',              # directory for storing logs
    num_train_epochs=3,                # total number of training epochs
    per_device_train_batch_size=2,     # batch size per device during training
    #per_device_eval_batch_size=256,   # batch size for evaluation
    learning_rate=1e-5,
)

# Trainer wired to the model and training dataset; no evaluation dataset is passed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=None,
)

In [None]:
#trainer.train()