In [1]:
#@title Step 1: Installing Hugging Face Transformer
# TensorFlow is not needed for this notebook, so the preinstalled copy is removed.
!pip uninstall -y tensorflow

# Install `transformers` straight from the GitHub repository (no version pin),
# then list the installed transformers/tokenizers packages for traceability.
!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'
# The notebook was originally written against transformers 2.9.1 / tokenizers 0.7.0.
# The recorded outputs of this run show transformers 4.10.0.dev0 (printed in the
# model config) and tokenizers 0.10.3 (pip download log).
# NOTE(review): an unpinned install from the default branch is not reproducible;
# consider pinning a released version.

Found existing installation: tensorflow 2.6.0
Uninstalling tensorflow-2.6.0:
  Successfully uninstalled tensorflow-2.6.0
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-gv2nj430
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-gv2nj430
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 3.9 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 6.5 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x8

In [2]:
#@title Step 2: Training a tokenizer

%%time 

from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path(".").glob("**/*.txt")]
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()


# Customize training 
tokenizer.train(files=paths, vocab_size=52_000,\
                min_frequency=2, special_tokens=[
                
                "<s>",
                "<pad>",
                "</s>",
                "<unk>",
                "<mask>",
                ])

CPU times: user 6.67 s, sys: 262 ms, total: 6.93 s
Wall time: 3.7 s


In [3]:
#@title Step 3: Saving the files to disk
import os

# One relative directory is used for both creating the folder and saving into
# it, so the two can no longer diverge (previously the folder was created at
# an absolute /content path while the save went to a relative path). The later
# cells read the files back from "./KantaiBERT", which is this same location.
token_dir = 'KantaiBERT'
os.makedirs(token_dir, exist_ok=True)  # no error if the folder already exists

# Writes vocab.json and merges.txt; the returned list of file paths is the
# cell's displayed output.
tokenizer.save_model(token_dir)

['KantaiBERT/vocab.json', 'KantaiBERT/merges.txt']

In [4]:
#@title Step 4: Loading the Trained Tokenizer Files
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the two files saved in Step 3:
# the vocabulary first, the merge table second.
tokenizer_files = ("./KantaiBERT/vocab.json", "./KantaiBERT/merges.txt")
tokenizer = ByteLevelBPETokenizer(*tokenizer_files)

In [5]:
# Encode a sample title with the reloaded tokenizer and show the token strings.
# In the recorded output, tokens that follow a space carry a leading 'Ġ'.
tokenizer.encode("The Critique of Pure Reason").tokens

['The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason']

In [6]:
# Display the Encoding object itself; its repr lists the token count and the
# attributes it exposes (ids, type_ids, tokens, offsets, attention_mask, ...).
tokenizer.encode("The Critique of Pure Reason")

Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [7]:
# Attach a BERT-style post-processor to the underlying tokenizer. It is given
# two (token, id) pairs: the "</s>" pair first and the "<s>" pair second, with
# ids looked up from the trained vocabulary.
sep_pair = ("</s>", tokenizer.token_to_id("</s>"))
cls_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

# Truncate encodings to at most 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [8]:
#@title Step 5: Checking Resource Constraints: GPU and NVIDIA
# Shell out to nvidia-smi to show which GPU, driver and CUDA version the runtime
# provides, plus current memory use (a Tesla K80 in the recorded run).
!nvidia-smi

Fri Aug 27 19:33:22 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P8    32W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [9]:
#@title Checking that PyTorch Sees CUDA
import torch 
# The cell displays whether PyTorch can use a CUDA device (True in the recorded run).
torch.cuda.is_available()

True

In [10]:
#@title Step 6: Defining the configuration of the Model
from transformers import RobertaConfig

# Architecture hyperparameters for the model built in Step 8. Everything not
# listed here keeps the RobertaConfig default (see the printed config below).
config = RobertaConfig(
    vocab_size = 52_000,  # same value as the vocab_size passed to tokenizer.train in Step 2
    max_position_embeddings=514,  # NOTE(review): presumably 512 tokens plus RoBERTa's position offset — confirm
    num_attention_heads = 12,
    num_hidden_layers = 6,
    type_vocab_size = 1,  # a single token-type id
)

In [11]:
# Print the full configuration, including the defaults that were not set explicitly.
print(config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}



In [12]:
#@title Step 7: Re-creating the Tokenizer in Transformers
from transformers import RobertaTokenizer

# Load the vocab.json / merges.txt saved in ./KantaiBERT into a Transformers
# tokenizer. The length limit must be passed as `model_max_length`:
# `max_length` is an encode-time argument, not a constructor argument, so
# passing it to from_pretrained left the tokenizer without a 512-token limit.
tokenizer = RobertaTokenizer.from_pretrained("./KantaiBERT", model_max_length=512)

file ./KantaiBERT/config.json not found


In [13]:
#@title Step 8: Initializing the Model from Scratch
from transformers import RobertaForMaskedLM 

# Build the masked-language-model from `config` alone; no pretrained checkpoint
# is loaded here, so the weights are whatever the constructor initializes.
model = RobertaForMaskedLM(config=config)
# Print the module tree (embeddings, encoder layers, ...) for inspection.
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [15]:
# Total parameter count as reported by the model (83,504,416 in the recorded run).
print(model.num_parameters())

83504416


In [16]:
#@title Exploring the Parameters
# Materialise the model's parameter tensors in a list so they can be counted
# here and indexed by position in the following cell.
LP = list(model.parameters())
lp = len(LP)
print(lp)

# Print every parameter tensor in order (this produces a very long output).
for p, tensor in enumerate(LP):
  print(tensor)

106
Parameter containing:
tensor([[ 1.7141e-02,  4.7471e-02, -4.7089e-05,  ...,  1.2108e-02,
          1.1729e-02,  1.4611e-02],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 2.4339e-03,  1.2675e-02,  2.1783e-03,  ...,  5.6729e-04,
          5.3539e-03,  8.6926e-03],
        ...,
        [-1.1631e-02, -3.6987e-02,  1.2609e-02,  ...,  1.9116e-03,
         -6.8906e-04, -3.5214e-03],
        [-1.6769e-02, -1.4781e-02, -1.3163e-02,  ...,  2.4617e-02,
          2.3126e-02,  2.2799e-02],
        [ 2.3641e-02,  1.2720e-02,  7.0900e-03,  ...,  2.1542e-02,
         -1.1544e-02,  3.5430e-02]], requires_grad=True)
Parameter containing:
tensor([[ 0.0006, -0.0120,  0.0109,  ...,  0.0158,  0.0243,  0.0169],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0234, -0.0109,  0.0099,  ...,  0.0012, -0.0065,  0.0112],
        ...,
        [-0.0427,  0.0405,  0.0053,  ...,  0.0313,  0.0182,  0.0314],
       

In [18]:
#@title Counting the Parameters
# Walk the parameter list LP (length lp) built in the previous cell and add up
# how many scalar weights each tensor holds.
# Per tensor it prints:
#   index, first-dim size, second-dim size, element count   (rank >= 2)
#   index, length, 1                                         (rank 1)
# and finally the grand total.
np = 0  # running total; a plain int (note: this name would shadow a numpy alias)
for p in range(0, lp):
  L1 = len(LP[p])
  # Decide by tensor rank instead of probing with a bare try/except, which
  # also swallowed unrelated errors.
  PL2 = LP[p].dim() > 1
  L2 = len(LP[p][0]) if PL2 else 1
  # numel() is the exact element count; it equals L1 * L2 for 1-D and 2-D
  # tensors and stays correct for higher ranks.
  L3 = LP[p].numel()
  np = np + L3
  if PL2:
    print(p, L1, L2, L3)
  else:
    print(p, L1, L2)


print(np)

0 52000 768 39936000
1 514 768 394752
2 1 768 768
3 768 1
4 768 1
5 768 768 589824
6 768 1
7 768 768 589824
8 768 1
9 768 768 589824
10 768 1
11 768 768 589824
12 768 1
13 768 1
14 768 1
15 3072 768 2359296
16 3072 1
17 768 3072 2359296
18 768 1
19 768 1
20 768 1
21 768 768 589824
22 768 1
23 768 768 589824
24 768 1
25 768 768 589824
26 768 1
27 768 768 589824
28 768 1
29 768 1
30 768 1
31 3072 768 2359296
32 3072 1
33 768 3072 2359296
34 768 1
35 768 1
36 768 1
37 768 768 589824
38 768 1
39 768 768 589824
40 768 1
41 768 768 589824
42 768 1
43 768 768 589824
44 768 1
45 768 1
46 768 1
47 3072 768 2359296
48 3072 1
49 768 3072 2359296
50 768 1
51 768 1
52 768 1
53 768 768 589824
54 768 1
55 768 768 589824
56 768 1
57 768 768 589824
58 768 1
59 768 768 589824
60 768 1
61 768 1
62 768 1
63 3072 768 2359296
64 3072 1
65 768 3072 2359296
66 768 1
67 768 1
68 768 1
69 768 768 589824
70 768 1
71 768 768 589824
72 768 1
73 768 768 589824
74 768 1
75 768 768 589824
76 768 1
77 768 1
78 768 1
7

In [19]:
#@title Step 9: Building the Dataset
%%time 
from transformers import LineByLineTextDataset

# Build the training dataset from ./kant.txt with the tokenizer from Step 7.
# block_size=128 is handed to the dataset class
# (presumably the maximum number of tokens per example — confirm in the library docs).
# The recorded run took about 33 s and produced 170,964 examples (see the training log).
dataset = LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = "./kant.txt",
    block_size = 128,
)



CPU times: user 32.2 s, sys: 1.03 s, total: 33.2 s
Wall time: 33.1 s


In [21]:
#@title Step 10: Defining a Data Collator
from transformers import DataCollatorForLanguageModeling

# Collator used by the Trainer to assemble batches. Masked-language-modeling is
# switched on (mlm=True) with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer, mlm=True, mlm_probability=0.15
)

In [24]:
#@title Step 11: Initializing the Trainer
from transformers import Trainer, TrainingArguments

# Training settings: one epoch, 64 examples per device per step, a checkpoint
# every 10,000 steps with at most two kept, all written under ./KantaiBERT
# (existing contents of that output directory may be overwritten).
training_options = dict(
    output_dir="./KantaiBERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)
training_args = TrainingArguments(**training_options)

# Tie together the model from Step 8, the dataset from Step 9 and the
# collator from Step 10.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [25]:
#@title Step 14: Pre-training the Model
%%time
# Run the training loop configured in the Trainer. The recorded run performed
# 2,672 optimization steps in roughly 19.5 minutes of wall time.
trainer.train()

***** Running training *****
  Num examples = 170964
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 2672


Step,Training Loss
500,6.6056
1000,5.7375
1500,5.267
2000,5.0063
2500,4.8554




Training completed. Do not forget to share your model on huggingface.co/models =)




CPU times: user 19min 35s, sys: 4.62 s, total: 19min 40s
Wall time: 19min 36s


TrainOutput(global_step=2672, training_loss=5.4506196575964285, metrics={'train_runtime': 1176.8811, 'train_samples_per_second': 145.269, 'train_steps_per_second': 2.27, 'total_flos': 873620128952064.0, 'train_loss': 5.4506196575964285, 'epoch': 1.0})

In [29]:
#@title Step 15: Saving the Final Model(+tokenizer + config) to disk
# The recorded log shows config.json and pytorch_model.bin being written here.
# NOTE(review): the Trainer was created without a tokenizer, so the tokenizer
# files in ./KantaiBERT appear to be the ones saved in Step 3 — confirm.
trainer.save_model("./KantaiBERT")

Saving model checkpoint to ./KantaiBERT
Configuration saved in ./KantaiBERT/config.json
Model weights saved in ./KantaiBERT/pytorch_model.bin


In [30]:
#@title Step 16: Language Modeling with the FillMaskPipeline
from transformers import pipeline

# Create a fill-mask pipeline that loads both the trained model and its
# tokenizer from the ./KantaiBERT directory written by the earlier steps.
fill_mask = pipeline(
    "fill-mask",
    model = "./KantaiBERT",
    tokenizer = "./KantaiBERT"
)

loading configuration file ./KantaiBERT/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.10.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

loading configuration file ./KantaiBERT/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
 

In [31]:
# Ask the pipeline for candidates to fill the <mask> slot. The result is a list
# of dicts with score, completed sequence, token id and token string
# (five candidates in the recorded output).
fill_mask("Human thinking involves<mask>.")

[{'score': 0.013052902184426785,
  'sequence': 'Human thinking involves it.',
  'token': 306,
  'token_str': ' it'},
 {'score': 0.010370113886892796,
  'sequence': 'Human thinking involves reason.',
  'token': 393,
  'token_str': ' reason'},
 {'score': 0.007926465943455696,
  'sequence': 'Human thinking involves principles.',
  'token': 670,
  'token_str': ' principles'},
 {'score': 0.00788362231105566,
  'sequence': 'Human thinking involves,.',
  'token': 16,
  'token_str': ','},
 {'score': 0.006756190676242113,
  'sequence': 'Human thinking involves conceptions.',
  'token': 605,
  'token_str': ' conceptions'}]