# This notebook shows how to train a language model from scratch with your own pre-tokenizer using Hugging Face
Most of the code below was not originally written by me. This notebook is simply a convenient collection of code from the relevant articles I found while training a language model on my pre-tokenized dataset.

# Ensuring that we are using the GPU hardware on Google Colab and that the correct drive is mounted

In [None]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Sat Nov  6 05:54:06 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch to the project folder on the mounted drive; the relative data/ and models/ paths below resolve against it.
cd '/content/drive/My Drive/data_science_projects/transformer_adapters'

/content/drive/My Drive/data_science_projects/transformer_adapters


# 1. Install Dependencies

In [None]:
# Remove the preinstalled TensorFlow, install transformers from the GitHub default branch,
# then list the installed transformers / tokenizers versions.
# NOTE(review): the git install is unpinned, so a later re-run may pull a different
# transformers version than the one this notebook was written against — consider pinning.
!pip uninstall -y tensorflow
!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'

Found existing installation: tensorflow 2.6.0
Uninstalling tensorflow-2.6.0:
  Successfully uninstalled tensorflow-2.6.0
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-m5wiesj8
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-m5wiesj8
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 4.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 68.8 MB/s 
[?25hCollecting sacr

In [None]:
# Confirm that PyTorch can see the GPU and record which device will be used

import torch
cuda_ready = torch.cuda.is_available()
device = torch.device('cuda') if cuda_ready else torch.device('cpu')
print('Using device:', device)

Using device: cuda


# Data

In [None]:
# Peek at the first few entries of the data directory.
!ls -l data/ | head -n 10

total 1801605
-rw------- 1 root root  14427434 Oct 22 00:06 all_10.csv
-rw------- 1 root root  15009474 Oct 22 00:06 all_11.csv
-rw------- 1 root root  14769213 Oct 22 00:06 all_12.csv
-rw------- 1 root root  15624435 Oct 22 00:06 all_13.csv
-rw------- 1 root root  14087597 Oct 22 00:06 all_14.csv
-rw------- 1 root root  14345086 Oct 22 00:06 all_15.csv
-rw------- 1 root root  15137480 Oct 22 00:06 all_16.csv
-rw------- 1 root root  15156153 Oct 22 00:06 all_17.csv
-rw------- 1 root root  15345289 Oct 22 00:06 all_18.csv


In [None]:
import os, re

In [None]:
# Collect the CSV shards in data/ in a deterministic order.
# os.listdir() returns entries in arbitrary order and would also pick up the generated
# train.txt / dev.txt and the saved tokenizer JSON that other cells write into data/,
# so keep only *.csv files and sort them: files[-1] (the dev shard) is then reproducible.
files = sorted(
    f"data/{filename}" for filename in os.listdir("data/") if filename.endswith(".csv")
)

### Prepare Data

In [None]:
# True skips the next cell; set to False to (re)build data/train.txt and data/dev.txt
# from the files listed in `files`.
prepared_data = True

In [None]:
if not prepared_data:
  import pandas as pd

  # Read every training shard first and concatenate once; growing a DataFrame with
  # pd.concat inside the loop re-copies all previously read rows on each iteration.
  train_frames = [pd.read_csv(file, header=None) for file in files[:-1]]
  merged_df = pd.concat(train_frames) if train_frames else pd.DataFrame()

  # The last file is held out as the development set.
  dev_df = pd.read_csv(files[-1], header=None)

  # Write (not append) so that re-running this cell replaces the outputs instead of
  # duplicating every row in train.txt / dev.txt.
  merged_df.to_csv('data/train.txt', header=None, index=None, sep=' ')
  dev_df.to_csv('data/dev.txt', header=None, index=None, sep=' ')

# Tokenize and Encode

In [None]:
# True: build and train a new tokenizer in the cells below and save it to data/cust-tokenizer-data.json.
# False: skip those cells and load the previously saved tokenizer file instead.
train_tokenizer = False

In [None]:
if train_tokenizer:

  from tokenizers import Tokenizer
  from tokenizers.models import WordPiece

  # Start from an untrained WordPiece model that uses "[UNK]" as its unknown token.
  my_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

In [None]:
if train_tokenizer:

  from tokenizers import normalizers
  from tokenizers.normalizers import Lowercase, NFD, StripAccents

  # Only lower-casing is applied; NFD and StripAccents are imported but not used in this cell.
  my_tokenizer.normalizer = normalizers.Sequence([Lowercase()])

In [None]:
if train_tokenizer:
  
  from tokenizers.pre_tokenizers import CharDelimiterSplit

  # Split the text on the "|" character only, so every "|"-separated entry
  # (which may itself contain spaces, e.g. "financial reporting") is one pre-token.
  pre_tokenizer = CharDelimiterSplit('|')

  my_tokenizer.pre_tokenizer = pre_tokenizer

In [None]:
if train_tokenizer:

  from tokenizers.processors import TemplateProcessing

  # Wrap a single sequence as "[CLS] A [SEP]" and a pair as "[CLS] A [SEP] B [SEP]"
  # (the ":1" suffix assigns type id 1 to the second segment).
  # The ids given for [CLS] and [SEP] (1 and 2) line up with their positions in the
  # special_tokens list handed to the WordPiece trainer in this notebook.
  my_tokenizer.post_processor = TemplateProcessing(
      single="[CLS] $A [SEP]",
      pair="[CLS] $A [SEP] $B:1 [SEP]:1",
      special_tokens=[
          ("[CLS]", 1),
          ("[SEP]", 2),
      ],
  )

In [None]:
if train_tokenizer:
  from tokenizers.trainers import WordPieceTrainer

  # Target vocabulary size passed to the WordPiece trainer.
  VOCAB = 9999 # if you know the vocab size before hand or you could just query from your dataset

  # NOTE(review): the name `trainer` is reused further down for the transformers Trainer.
  trainer = WordPieceTrainer(
      vocab_size=VOCAB, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
  )
  # Train on every path in `files` (this includes the file later used as the dev split).
  my_tokenizer.train(files, trainer)

  # Persist the trained tokenizer so it can be reloaded from this JSON file.
  my_tokenizer.save("data/cust-tokenizer-data.json")

In [None]:
# Always (re)load the tokenizer from its saved JSON through PreTrainedTokenizerFast.
# The cells below call encode_plus and hand the tokenizer to the MLM data collator, which
# the raw `tokenizers.Tokenizer` built in the training branch does not support; that branch
# saves to this same file, so loading here works whether or not train_tokenizer is set.
from transformers import PreTrainedTokenizerFast
my_tokenizer = PreTrainedTokenizerFast(tokenizer_file="data/cust-tokenizer-data.json")

# Tell the wrapper which vocabulary entries play the special-token roles. A tokenizer loaded
# from a bare JSON file knows none of them. [PAD] and [MASK] are required for padding and
# masked-language-model training; registering [UNK]/[CLS]/[SEP] as well keeps the data
# collator from selecting them for masking.
my_tokenizer.add_special_tokens({
    'unk_token': '[UNK]',
    'cls_token': '[CLS]',
    'sep_token': '[SEP]',
    'pad_token': '[PAD]',
    'mask_token': '[MASK]',
})


In [None]:
# Quick check: encode a "|"-delimited example and print the resulting token ids.
output = my_tokenizer.encode("python|.net|financial reporting|accountant's report")
print(output)


[1, 1202, 1118, 1004, 6681, 6624, 3021, 2]


# Start Training

In [None]:

# encode_plus returns input_ids together with token_type_ids and attention_mask.
# As the output shows, no padding is added for this single input even with padding=True.
output = my_tokenizer.encode_plus("python|.net|financial reporting|accountant's report", max_length = 128, truncation=True, padding=True)
output

{'input_ids': [1, 1202, 1118, 1004, 6681, 6624, 3021, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Size the model's embedding table from len(tokenizer): it counts the base vocabulary plus
# any tokens added afterwards, whereas `vocab_size` excludes added tokens and would leave
# their ids outside the embedding matrix. (Requires the PreTrainedTokenizerFast wrapper.)
VOCABSIZE = len(my_tokenizer)
# Maximum number of tokens kept per example; longer inputs are truncated.
MAX_LEN = 128

In [None]:
# Sanity check that encode_plus works with the truncation length used for training
x = my_tokenizer.encode_plus(
    "python|.net|financial reporting|accountant's report",
    max_length=MAX_LEN,
    truncation=True,
    padding=True,
)  # if using PreTrainedTokenizerFast
x

{'input_ids': [1, 1202, 1118, 1004, 6681, 6624, 3021, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Initialize the model from a configuration only.
# As we are training from scratch, the config defines the architecture of the model but no
# previously trained weights are restored; the weights are randomly initialized.

from transformers import RobertaConfig
from transformers import RobertaForMaskedLM

# Set a configuration for our RoBERTa model
config = RobertaConfig(
    vocab_size=VOCABSIZE,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
    # RobertaConfig defaults to pad/bos/eos ids 1/0/2, which follow the original RoBERTa
    # vocabulary. In this tokenizer id 1 is [CLS] and [PAD] has a different id, so with the
    # default the model would treat [CLS] as padding (frozen zero embedding, skipped when
    # position ids are computed). Take the ids from the tokenizer instead.
    pad_token_id=my_tokenizer.pad_token_id,
    bos_token_id=my_tokenizer.convert_tokens_to_ids("[CLS]"),
    eos_token_id=my_tokenizer.convert_tokens_to_ids("[SEP]"),
)
# Initialize the model from a configuration without pretrained weights
model = RobertaForMaskedLM(config=config)
print('Num parameters: ',model.num_parameters())

Num parameters:  48721777


In [None]:
import pandas as pd

# Load the prepared text files; each line becomes one row in a single column named 0.
train_df, eval_df = [
    pd.read_csv(path, header=None) for path in ("data/train.txt", "data/dev.txt")
]

In [None]:
# Show (rows, columns) of both splits, one per line, then preview the training rows
for frame in (train_df, eval_df):
    print(frame.shape)
train_df.head()

(1692497, 1)
(48858, 1)


Unnamed: 0,0
0,liaison
1,construction industry|business finance|constru...
2,c++|c#|software development life cycle (sdlc)|...
3,healthcare|culinary arts|investment|business p...
4,project management|project management|supply c...


In [None]:


from torch.utils.data.dataset import Dataset

class CustomDataset(Dataset):
    """Map-style dataset that keeps the token ids of every example in memory.

    Only the ``input_ids`` list of each encoding is stored. Encoding everything in one
    batch would keep the full encoding objects (tokens, offsets, ...) alive and use much
    more RAM, so examples are encoded one at a time and reduced to plain integer lists.
    """

    def __init__(self, df_series, tokenizer, max_length=None):
        """Tokenize every text in ``df_series``.

        Args:
            df_series: iterable of strings (e.g. a pandas Series), one example each.
            tokenizer: object exposing ``encode_plus(text, max_length=..., truncation=...,
                padding=...)`` whose result has an ``input_ids`` attribute.
            max_length: truncation length in tokens. Defaults to the notebook-level
                ``MAX_LEN`` when omitted, which is what existing callers rely on.
        """
        if max_length is None:
            max_length = MAX_LEN
        self.examples = []
        for example in df_series:
            x = tokenizer.encode_plus(example, max_length=max_length, truncation=True, padding=True)
            self.examples.append(x.input_ids)

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the token ids of example ``i`` as a 1-D tensor."""
        return torch.tensor(self.examples[i])
      
# Tokenize column 0 of each frame up front into the in-memory train and evaluation datasets
train_dataset = CustomDataset(train_df[0], my_tokenizer)
eval_dataset = CustomDataset(eval_df[0], my_tokenizer)


In [None]:
from transformers import DataCollatorForLanguageModeling

# Define the Data Collator: it builds masked-language-modelling batches (mlm=True),
# selecting tokens for prediction with probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=my_tokenizer, mlm=True, mlm_probability=0.15
)

# Masked language modelling (mlm) requires the tokenizer to have a mask token;
# it was registered when the tokenizer was loaded earlier in the notebook.


In [None]:
# Training hyper-parameters. The `#@param` markers render these as Colab form fields.
# NOTE(review): the "(default: ...)" remarks do not all match the values set here — confirm
# which defaults they refer to.
# NOTE(review): MAX_LEN repeats the value assigned earlier in the notebook, and SUMMARY_LEN
# is not referenced by any cell visible in this notebook.
OUTPUT_DIR = "models/robertamaskedlm/output" #@param {type: "string"}

TRAIN_BATCH_SIZE = 16  #@param [8, 16, 32] # input batch size for training (default: 64)
VALID_BATCH_SIZE = 8  #@param [8, 16, 32] # input batch size for testing (default: 1000)
TRAIN_EPOCHS = 20  #@param [10, 15, 20] # number of epochs to train (default: 10)
LEARNING_RATE = 1e-3  #@param [1e-3, 1e-4, 1e-5] # learning rate (default: 0.001)
WEIGHT_DECAY = 0.01  #@param [0.001,0.01,0.05] 
SEED = 999   #@param {type: "integer"} # random seed (default: 999)
MAX_LEN = 128 #@param {type: "integer"}
SUMMARY_LEN = 7 #@param {type: "integer"}

In [None]:

from transformers import Trainer, TrainingArguments

# Define the training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    evaluation_strategy = 'epoch',   # run evaluation on eval_dataset at the end of every epoch
    num_train_epochs=TRAIN_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    save_steps=8192,                 # write a checkpoint every 8192 optimisation steps
    save_total_limit=1,              # keep only the most recent checkpoint on disk
    # Use the notebook's configured seed; without this the Trainer falls back to its own
    # default seed and the SEED hyper-parameter has no effect on the run.
    seed=SEED,
)
# Create the trainer for our model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
# Train the model.
# If training is NOT finished in one sitting, resume from the latest checkpoint instead with:
#   trainer.train(resume_from_checkpoint=OUTPUT_DIR + "/checkpoint-99999")
trainer.train()

In [None]:
# Save the trained model to OUTPUT_DIR so it can be reloaded from that folder later.
trainer.save_model(OUTPUT_DIR)

In [None]:
import math

In [None]:
# Perplexity is the exponential of the evaluation loss
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print("Perplexity: {:.2f}".format(perplexity))

# Check Results

In [None]:
# Locations of the saved model and tokenizer.
# NOTE(review): despite its name, tokenizer_folder points at a single JSON file, not a folder.
model_folder = "models/robertamaskedlm/output"
tokenizer_folder = "data/cust-tokenizer-data.json"

In [None]:
# Rebind my_tokenizer to the raw `tokenizers.Tokenizer` loaded from the JSON file.
# Its encode() returns an object exposing .ids, .tokens and .word_ids, which the cells below use.
from tokenizers import Tokenizer
my_tokenizer = Tokenizer.from_file(tokenizer_folder)

In [None]:
# AutoModel is a generic model class that will be instantiated as one of the base model classes of the library when created with the AutoModel.from_pretrained(pretrained_model_name_or_path) or the AutoModel.from_config(config) class methods.
# This class cannot be instantiated using __init__() (throws an error).
# output_hidden_states=True makes the forward pass also return `hidden_states`, which is inspected below.
# NOTE(review): this rebinds `model`, replacing the RobertaForMaskedLM instance created earlier.

from transformers import AutoModel
model = AutoModel.from_pretrained(model_folder, output_hidden_states=True)


In [None]:
# Encode a "|"-delimited example and add a leading batch dimension to its ids.
encoded = my_tokenizer.encode("j2ee|java|python|.net|financial reporting|accountant's report")
input_ids = torch.tensor(encoded.ids).unsqueeze(0) 

In [None]:
# tokens: the produced tokens, including [CLS]/[SEP].
# word_ids: for each token, the index of the "|"-separated entry it came from (None for [CLS]/[SEP]).
print(encoded.tokens)
print(encoded.word_ids)
print(len(encoded.word_ids))
print(input_ids)

['[CLS]', 'j2ee', 'java', 'python', '.net', 'financial reporting', 'accountant', "##'s ", '##report', '[SEP]']
[None, 0, 1, 2, 3, 4, 5, 5, 5, None]
10
tensor([[   1, 1792,  540, 1202, 1118, 1004, 6681, 6624, 3021,    2]])


In [None]:
# Run the model on the example without tracking gradients (inference only).
with torch.no_grad():
  output = model(input_ids)

In [None]:
# last_hidden_state has shape (batch_size, sequence_length, hidden_size) — (1, 10, 768) here.
print(output['last_hidden_state'].shape)
output['last_hidden_state']

torch.Size([1, 10, 768])


tensor([[[-1.1572, -0.0595, -0.3193,  ...,  1.0063, -0.1634, -0.7406],
         [-0.9416, -0.1351, -0.4944,  ...,  0.3276, -0.6364,  0.0367],
         [-0.2532, -0.4811,  0.1043,  ...,  0.4378, -0.0901,  0.0998],
         ...,
         [ 0.8555, -0.1407, -0.3859,  ..., -0.1166,  0.4961,  0.0316],
         [-0.3628,  0.0248, -0.9519,  ...,  0.3995, -0.1347,  0.0526],
         [-0.5398, -0.1964, -1.1223,  ...,  0.4968,  1.0257,  0.4972]]])

In [None]:
# output['hidden_states'] is a tuple of tensors, each of shape (batch_size, sequence_length, hidden_size).
# It has one more entry than the number of transformer layers (7 here for num_hidden_layers=6):
# per the transformers documentation, the first entry is the embedding output, followed by each layer's output.

print(len(output['hidden_states']))
print(output['hidden_states'][0].shape)


7
torch.Size([1, 10, 768])


In [None]:
# len() of the last_hidden_state tensor is its first dimension (batch size, 1 here);
# adding the 7 entries of the hidden_states tuple gives 8.
len(output['last_hidden_state']) + len(output['hidden_states'])

8

In [None]:
# last_hidden_state holds the same values as the final entry of hidden_states
# (the two printouts below are identical).
print(output['last_hidden_state'])
print(output.hidden_states[-1])

tensor([[[-1.1572, -0.0595, -0.3193,  ...,  1.0063, -0.1634, -0.7406],
         [-0.9416, -0.1351, -0.4944,  ...,  0.3276, -0.6364,  0.0367],
         [-0.2532, -0.4811,  0.1043,  ...,  0.4378, -0.0901,  0.0998],
         ...,
         [ 0.8555, -0.1407, -0.3859,  ..., -0.1166,  0.4961,  0.0316],
         [-0.3628,  0.0248, -0.9519,  ...,  0.3995, -0.1347,  0.0526],
         [-0.5398, -0.1964, -1.1223,  ...,  0.4968,  1.0257,  0.4972]]])
tensor([[[-1.1572, -0.0595, -0.3193,  ...,  1.0063, -0.1634, -0.7406],
         [-0.9416, -0.1351, -0.4944,  ...,  0.3276, -0.6364,  0.0367],
         [-0.2532, -0.4811,  0.1043,  ...,  0.4378, -0.0901,  0.0998],
         ...,
         [ 0.8555, -0.1407, -0.3859,  ..., -0.1166,  0.4961,  0.0316],
         [-0.3628,  0.0248, -0.9519,  ...,  0.3995, -0.1347,  0.0526],
         [-0.5398, -0.1964, -1.1223,  ...,  0.4968,  1.0257,  0.4972]]])


# Convert the hidden states into a word embedding that is contextualised

In [None]:
import numpy as np

def get_word_idx(sent: str, word: str):
  """Return the position of `word` among the "|"-separated entries of `sent`.

  The first match is returned; ValueError (from list.index) is raised when
  `word` is not one of the entries.
  """
  entries = sent.split("|")
  return entries.index(word)
 
 
def get_hidden_states(encoded, token_ids_word, model, layers):
  """Return one embedding vector for a word, averaged over its tokens.

  Args:
    encoded: encoding object exposing `.ids` (token ids of a single sequence).
    token_ids_word: index of the token positions that make up the word, e.g. the
      tuple returned by `np.where(...)`.
    model: callable that takes a (1, seq_len) id tensor and returns an object with
      a `hidden_states` sequence of (1, seq_len, hidden_size) tensors.
    layers: indices into `hidden_states` whose outputs are summed.

  Returns:
    A 1-D tensor of size hidden_size: the mean over the selected token positions
    of the summed layer outputs.
  """
  input_ids = torch.tensor(encoded.ids).unsqueeze(0)
  with torch.no_grad():
      output = model(input_ids)

  # Get all hidden states
  states = output.hidden_states
  # Stack and sum all requested layers, then drop ONLY the batch dimension.
  # A bare squeeze() would also collapse a length-1 sequence dimension, after which the
  # token index below would select hidden units instead of token positions.
  output = torch.stack([states[i] for i in layers]).sum(0).squeeze(0)
  # Only select the tokens that constitute the requested word
  word_tokens_output = output[token_ids_word]

  return word_tokens_output.mean(dim=0)
 
 
def get_word_vector(sent, idx, tokenizer, model, layers):
  """Encode `sent` and return the embedding of its `idx`-th word.

  The word's tokens are those whose entry in the encoding's `word_ids`
  equals `idx`; their hidden states are combined by get_hidden_states.
  """
  encoding = tokenizer.encode(sent)
  # positions of every token that belongs to the word of interest
  word_ids = np.array(encoding.word_ids)
  matching_positions = np.where(word_ids == idx)
  return get_hidden_states(encoding, matching_positions, model, layers)

In [None]:
# Hidden-state layers to sum for the embedding; None falls back to the last two layers
layers = None
if layers is None:
  layers = [-2, -1]

### The entity "Risk Management" is contextualised differently depending on its surrounding entities in the 'sentence'.

In [None]:
# Cosine similarity between two 1-D embedding vectors (dim=0 because each vector is 1-D).
# Defined here so that the cell runs on a fresh kernel: `cos` has no other definition.
cos = torch.nn.CosineSimilarity(dim=0)

# Embedding of "risk management" next to a finance-related entity ...
sent = "credit risk|risk management" 
idx = get_word_idx(sent, "risk management")
word_embedding_1 = get_word_vector(sent, idx, my_tokenizer, model, layers)

# ... and next to a security-related entity.
sent = "cyber security|risk management" 
idx = get_word_idx(sent, "risk management")
word_embedding_2 = get_word_vector(sent, idx, my_tokenizer, model, layers)

print(cos(word_embedding_1,word_embedding_2))

tensor(0.9464)


# Reference
- https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb#scrollTo=ri2BIQKqjfHm
- https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
- https://medium.com/analytics-vidhya/create-a-tokenizer-and-train-a-huggingface-roberta-model-from-scratch-f3ed1138180c
- https://discuss.huggingface.co/t/generate-raw-word-embeddings-using-transformer-models-like-bert-for-downstream-process/2958