<a href="https://colab.research.google.com/github/tiwari-arpit/nlp/blob/main/HegelBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Setup**

In [None]:
!pip install Transformers
!pip install --upgrade accelerate

Collecting accelerate
  Using cached accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Using cached accelerate-1.4.0-py3-none-any.whl (342 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.34.2
    Uninstalling accelerate-0.34.2:
      Successfully uninstalled accelerate-0.34.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
autogluon-multimodal 1.2 requires nvidia-ml-py3==7.352.0, which is not installed.
autogluon-multimodal 1.2 requires accelerate<1.0,>=0.34.0, but you have accelerate 1.4.0 which is incompatible.
autogluon-multimodal 1.2 requires jsonschema<4.22,>=4.18, but you have jsonschema 4.23.0 which is incompatible.
autogluon-multimodal 1.2 requires nltk<3.9,>=3.4.5, but you have nltk 3.9.1 which is incompatible.
autogluon-multimodal 1.2 requires omegaconf<2.3.0,>=2.1.1,

In [None]:
from accelerate import Accelerator

**Load the dataset**

In [None]:
from pathlib import Path

def read_text_corpus(root=".", exclude_dirs=("HegelBERT",)):
  """Collect every ``*.txt`` file below ``root`` and read it as UTF-8 text.

  Args:
    root: Directory that is searched recursively.
    exclude_dirs: Iterable of directory names to skip. A later cell writes
      ``HegelBERT/merges.txt``; without this filter a second "Run All" would
      pull that file into the training corpus.

  Returns:
    A ``(paths, contents)`` tuple. ``paths`` lists every candidate file as a
    string in sorted order, so the corpus does not depend on filesystem
    enumeration order. ``contents`` holds the text of each file that could
    be read, in that same order.
  """
  root_path = Path(root)
  skipped = set(exclude_dirs)
  found = sorted(
    candidate
    for candidate in root_path.glob("**/*.txt")
    # Only the directory components are compared, never the file name itself.
    if skipped.isdisjoint(candidate.relative_to(root_path).parts[:-1])
  )

  contents = []
  for candidate in found:
    try:
      # errors="replace" keeps reading past bytes that are not valid UTF-8.
      contents.append(candidate.read_text(encoding="utf-8", errors="replace"))
    except OSError as e:
      # Best effort: report the unreadable file and continue with the rest.
      print(f"Error reading file {candidate}: {e}")
  return [str(candidate) for candidate in found], contents


paths, file_contents = read_text_corpus()

# One string holding the whole corpus, files separated by a newline.
text = "\n".join(file_contents)

**Train the tokenizer and save its files**

In [None]:
from tokenizers import ByteLevelBPETokenizer

# Special tokens handed to the BPE trainer below.
special_tokens = ["<s>","<pad>","</s>","<unk>","<mask>"]

# Train a byte-level BPE tokenizer on the corpus, passed in as a single document.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    [text],
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=special_tokens,
)






In [None]:
import os
# Directory that receives the tokenizer files (vocab.json and merges.txt).
token_dir = './HegelBERT'
# exist_ok=True makes the cell safe to re-run and removes the separate
# existence check.
os.makedirs(token_dir, exist_ok=True)
# Save into the directory created above instead of repeating the path as a literal.
tokenizer.save_model(token_dir)

['HegelBERT/vocab.json', 'HegelBERT/merges.txt']

**Load the trained tokenizer**

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the two files written when it was saved.
vocab_file = "./HegelBERT/vocab.json"
merges_file = "./HegelBERT/merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)

In [None]:
# Add the start (<s>) and end (</s>) tokens through a BertProcessing post-processor.
end_token = ("</s>", tokenizer.token_to_id("</s>"))
start_token = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(end_token, start_token)

# Truncate encodings to at most 512 tokens.
tokenizer.enable_truncation(max_length=512)

**Define model configuration**

In [None]:
from transformers import RobertaConfig

# Architecture settings for the model. Anything not listed here keeps the
# RobertaConfig default.
config_kwargs = dict(
    vocab_size=52_000,  # same value as the vocab_size used to train the tokenizer
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
config = RobertaConfig(**config_kwargs)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
from transformers import RobertaTokenizerFast

# Reload the trained vocabulary through the Transformers tokenizer class.
# `model_max_length` is the documented argument; `max_len` is only a legacy
# alias for it.
tokenizer = RobertaTokenizerFast.from_pretrained("./HegelBERT", model_max_length=512)

In [None]:
from transformers import RobertaForMaskedLM

# Instantiate the masked-language model from the configuration alone
# (no pretrained checkpoint is loaded here).
model = RobertaForMaskedLM(config)

2025-02-27 04:59:08.957067: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Show the module tree of the model (layers and their dimensions).
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm)

# **Pretraining the Model**

**Building the Dataset**

In [None]:
%%time
from transformers import LineByLineTextDataset

# Build the training dataset from ./hegel.txt with the tokenizer loaded above.
# NOTE(review): presumably one example per line, capped at block_size tokens;
# confirm against the LineByLineTextDataset documentation.
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path="./hegel.txt", block_size=128)



CPU times: user 9.23 s, sys: 286 ms, total: 9.51 s
Wall time: 3.27 s


**Define the Data Collator**

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked-language-model training (mlm=True) with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [None]:
from transformers import Trainer, TrainingArguments

**Initializing the Trainer**

In [None]:
training_args = TrainingArguments(
    # Output location: the same directory that already holds the tokenizer files.
    output_dir="./HegelBERT",
    overwrite_output_dir=True,
    # Training schedule.
    num_train_epochs=1,
    per_device_train_batch_size=64,
    # Checkpoint settings.
    save_steps=10_000,
    save_total_limit=2,
)

In [None]:
# Bundle the model, training arguments, dataset and collator into one Trainer.
trainer = Trainer(model=model, args=training_args, train_dataset=dataset, data_collator=data_collator)

**Pretraining**

In [None]:
%%time
# Run training with the settings in `training_args`; the %%time magic above
# reports how long the cell takes.
trainer.train()

Step,Training Loss
500,7.0835
1000,6.4345
1500,6.2582


**Saving the final model**

In [None]:
# Save the trained model into the directory that already contains the
# tokenizer files.
save_dir = "./HegelBERT"

print("Saving model...")
trainer.save_model(save_dir)
print("Model saved!")


Saving model...
Model saved!


**Example**

In [None]:
from transformers import pipeline

# Model and tokenizer are both loaded from the same saved directory.
checkpoint_dir = "./HegelBERT"
fill_mask = pipeline("fill-mask", model=checkpoint_dir, tokenizer=checkpoint_dir)

Device set to use cpu


In [None]:
# more epochs needed for optimized model
masked_sentence = "To be independent <mask> public opinion is the first formal condition of achieving anything great"

# Kept as the last expression so the predictions are displayed as the cell output.
fill_mask(masked_sentence)

[{'score': 0.12893177568912506,
  'token': 18,
  'token_str': '.',
  'sequence': 'To be independent. public opinion is the first formal condition of achieving anything great'},
 {'score': 0.09787289053201675,
  'token': 16,
  'token_str': ',',
  'sequence': 'To be independent, public opinion is the first formal condition of achieving anything great'},
 {'score': 0.07722752541303635,
  'token': 266,
  'token_str': ' the',
  'sequence': 'To be independent the public opinion is the first formal condition of achieving anything great'},
 {'score': 0.033616770058870316,
  'token': 274,
  'token_str': ' of',
  'sequence': 'To be independent of public opinion is the first formal condition of achieving anything great'},
 {'score': 0.033488426357507706,
  'token': 301,
  'token_str': ' to',
  'sequence': 'To be independent to public opinion is the first formal condition of achieving anything great'}]