In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/text-data/kant.txt


#### Pretraining a RoBERTA like model from scratch:  
#### Training a tokenizer and pretraining the transformer

In [2]:
#imports
import accelerate
from accelerate import Accelerator
from tokenizers import ByteLevelBPETokenizer
import os

#### Train the tokenizer using Hugging face BPETokenizer & saving it

In [3]:
def train_bpe_tokenizer(train_text, vocab_size=10000):
    # Initialize the BPE tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Train the tokenizer on the provided text
    tokenizer.train(files=[train_text], vocab_size=vocab_size, min_frequency=2, special_tokens=[
"<s>",
"<pad>",
"</s>",
"<unk>",
"<mask>",
])

    # directory to save the tokenizer on kaggle output/working folder
    token_dir = 'RoberTAlikeModel'
    if not os.path.exists(token_dir):
        os.makedirs(token_dir)
    tokenizer.save_model('RoberTAlikeModel')


In [4]:
# Example text for training
train_text = "/kaggle/input/text-data/kant.txt"
train_bpe_tokenizer(train_text)






#### Loading the tokenizer and then testing on some sample data

In [5]:
# Loading the Trained Tokenizer Files
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
tokenizer = ByteLevelBPETokenizer(
"./RoberTAlikeModel/vocab.json",
"./RoberTAlikeModel/merges.txt",
)

#### Testing the tokenizer on some sample data

In [6]:
tokenizer.encode("For it is in reality vain to profess in difference in regard to such inquiries, the object of which cannot be indifferent to humanity.").tokens

['For',
 'Ġit',
 'Ġis',
 'Ġin',
 'Ġreality',
 'Ġvain',
 'Ġto',
 'Ġprofess',
 'Ġin',
 'Ġdifference',
 'Ġin',
 'Ġregard',
 'Ġto',
 'Ġsuch',
 'Ġinquiries',
 ',',
 'Ġthe',
 'Ġobject',
 'Ġof',
 'Ġwhich',
 'Ġcannot',
 'Ġbe',
 'Ġindifferent',
 'Ġto',
 'Ġhumanity',
 '.']

In [7]:
tokenizer.encode("For it is in reality vain to profess in difference in regard to such inquiries, the object of which cannot be indifferent to humanity.")

Encoding(num_tokens=26, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

#### processing the tokens to fit BERT variant selected, after this tokens will have start and end tokens

In [8]:
tokenizer._tokenizer.post_processor = BertProcessing(
("</s>", tokenizer.token_to_id("</s>")),
("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

#### Example after we process tokenizer

In [9]:
#encode a post-processed sequence:
tokenizer.encode("He will achieve his goal.")

Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [10]:
tokenizer.encode("He will achieve his goal.").tokens

['<s>', 'He', 'Ġwill', 'Ġachieve', 'Ġhis', 'Ġgoal', '.', '</s>']

#### Defining model config: pretraining a RoberTa like transformer using same number of layers and heads as a DistilBert transformer, it has vocab size of 52000,12 attention heads,6 layers

In [11]:
from transformers import RobertaConfig
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
 

In [12]:
print(config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.36.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}



#### Load the trained tokenizer

In [13]:
#Re-creating the Tokenizer in Transformers
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained("./RoberTAlikeModel", max_length=512)

#### Initialize a model from scratch,examine its size after using the config defined earlier in steps

In [14]:
#Initializing a Model From Scratch
from transformers import RobertaForMaskedLM
model = RobertaForMaskedLM(config=config)
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [15]:
# just looking at number of parameters in the model
print(model.num_parameters())

83504416


#### Load dataset line by line to generate samples for batch training

In [16]:
#Building the Dataset
from transformers import LineByLineTextDataset
dataset = LineByLineTextDataset(
tokenizer=tokenizer,
file_path="/kaggle/input/text-data/kant.txt",
block_size=128,
)



#### Define data collator: it will take samples from dataset and collate them into batches resulting in dictionary like objects,also preparing a batched sample process for MLM

In [17]:
#Defining a data collator

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
  

#### Now the training can be done

In [18]:
#Initializing the trainer

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./RoberTAlikeModel",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [19]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,6.7015
1000,5.8453
1500,5.339
2000,5.0193
2500,4.7868
3000,4.5931
3500,4.4685
4000,4.3866
4500,4.2977
5000,4.2688


TrainOutput(global_step=5344, training_loss=4.924521189249918, metrics={'train_runtime': 795.2334, 'train_samples_per_second': 429.972, 'train_steps_per_second': 6.72, 'total_flos': 1896986656521216.0, 'train_loss': 4.924521189249918, 'epoch': 2.0})

#### Saving model+tokenizer+config to the disk

In [20]:
#Saving the final model (+tokenizer + config) to disk

trainer.save_model("./RoberTAlikeModel")

#### import a language modeling fill-mask task, use the trained model and trained tokenizer to perform MLM

In [21]:
#Language modeling with FillMaskPipeline

from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./RoberTAlikeModel",
    tokenizer="./RoberTAlikeModel"
)

In [22]:
fill_mask("Human thinking involves human <mask>.")

[{'score': 0.24949002265930176,
  'token': 393,
  'token_str': ' reason',
  'sequence': 'Human thinking involves human reason.'},
 {'score': 0.03271394222974777,
  'token': 611,
  'token_str': ' cognition',
  'sequence': 'Human thinking involves human cognition.'},
 {'score': 0.02086200937628746,
  'token': 605,
  'token_str': ' conceptions',
  'sequence': 'Human thinking involves human conceptions.'},
 {'score': 0.019538380205631256,
  'token': 531,
  'token_str': ' experience',
  'sequence': 'Human thinking involves human experience.'},
 {'score': 0.015104752033948898,
  'token': 722,
  'token_str': ' laws',
  'sequence': 'Human thinking involves human laws.'}]

#### Goal of this model is to show that we can create datasets to train a transformer for a specific type of complex language modeling task.