In [None]:
# # Mounting drive for running the code through Google colab 
# from google.colab import drive
# drive.mount('/content/drive')

**The Python version we use: 3.7.13**

# 1- Tokenizer

In [None]:
# # We won't need TensorFlow here
# !pip uninstall -y tensorflow

# # Install `transformers` from master
# # Install huggingface-hub-0.8.1 pyyaml-6.0 tokenizers-0.12.1 transformers-4.22.0.dev0
# !pip install git+https://github.com/huggingface/transformers
# !pip list | grep -E 'transformers|tokenizers'

In [None]:
from tokenizers import SentencePieceBPETokenizer

# Untrained BPE tokenizer; it is fitted on the ECG training file in a later cell.
tokenizer = SentencePieceBPETokenizer()

In [None]:
# Corpus used to fit the tokenizer. The same file is loaded again later as the MLM training split.
paths = '/kaggle/input/ecgbert/250 Hz/ecg_train.csv' ###path to the train_dataset
# Reserved tokens handed to the tokenizer trainer and to the transformers wrapper.
# NOTE(review): presumably they receive ids 0..6 in list order; the default-constructed
# RobertaConfig used later relies on its own default pad/bos/eos ids — confirm they line up.
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<cls>", "<sep>", "<mask>"]

In [None]:
# Fit the BPE vocabulary on the training corpus.
# vocab_size here should stay in sync with RobertaConfig(vocab_size=...) further down:
# the model's embedding table must be at least as large as the tokenizer vocabulary.
tokenizer.train(
    paths,
    vocab_size=52_000,
    min_frequency=2, #The minimum frequency a pair should have in order to be merged.
    show_progress=True,
    limit_alphabet=100, #The maximum different characters to keep in the alphabet.
    special_tokens=special_tokens
)
# Report the vocabulary size actually reached after training.
print("voc size", tokenizer.get_vocab_size())

# Making Our Tokenizer 

In [None]:
# Directory the trained tokenizer is saved to.
# NOTE: `address` is reassigned by later cells (tokenizer reload, training output dir,
# final model dir), so its value depends on which cell ran last.
address = "tokenizer_pretrained" #Replace your local address here

In [None]:
import transformers
# Wrap the trained tokenizer in the transformers fast-tokenizer interface so it can be
# saved with save_pretrained() and reloaded with AutoTokenizer.
# NOTE: this rebinds `tokenizer`; re-running the cell would wrap the wrapper itself,
# so restart from the tokenizer-creation cell if it has to be executed again.
tokenizer = transformers.PreTrainedTokenizerFast(tokenizer_object=tokenizer, special_tokens=special_tokens) 

In [None]:
# Register every special token on the wrapper: first the literal, then the id the
# vocabulary assigns to it (same order of assignments as writing them out one by one).
for _role, _literal in (
    ("bos", "<s>"),
    ("pad", "<pad>"),
    ("eos", "</s>"),
    ("unk", "<unk>"),
    ("cls", "<cls>"),
    ("sep", "<sep>"),
    ("mask", "<mask>"),
):
    setattr(tokenizer, f"{_role}_token", _literal)
    setattr(tokenizer, f"{_role}_token_id", tokenizer.convert_tokens_to_ids(_literal))

In [None]:
#!mkdir HeartBert
# Write the tokenizer files into the `address` directory so a later cell can reload
# them with AutoTokenizer.from_pretrained.
tokenizer.save_pretrained(address)

# 2- Heart MLM Model 

# HeartBert---training phase

In [None]:
# # Check that we have a GPU (for colab)
# !nvidia-smi

In [None]:
# Check that PyTorch sees it
# import torch
# torch.cuda.is_available()

In [None]:
# import tensorflow as tf
# print("TPU devices:", tf.config.experimental.list_logical_devices('TPU'))

In [None]:
# import math, re, os
# import tensorflow as tf
# import numpy as np
# from matplotlib import pyplot as plt
# from kaggle_datasets import KaggleDatasets
# from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
# print("Tensorflow version " + tf.__version__)
# AUTO = tf.data.experimental.AUTOTUNE

# # Detect TPU, return appropriate distribution strategy
# try:
#     tpu = tf.distribute.cluster_resolver.TPUClusterResolver() 
#     print('Running on TPU ', tpu.master())
# except ValueError:
#     tpu = None

# if tpu:
#     tf.config.experimental_connect_to_cluster(tpu)
#     tf.tpu.experimental.initialize_tpu_system(tpu)
#     strategy = tf.distribute.experimental.TPUStrategy(tpu)
# else:
#     strategy = tf.distribute.get_strategy() 

# print("REPLICAS: ", strategy.num_replicas_in_sync)

# 2-1-  Model Config 


In [None]:
from transformers import RobertaConfig

# Architecture of the masked-language model: a 6-layer, 12-head RoBERTa-style encoder.
# Every setting not listed here keeps the RobertaConfig default.
config = RobertaConfig(
    vocab_size=52_000,  # must be >= the trained tokenizer's vocabulary size (same value is requested in tokenizer.train)
    max_position_embeddings=514,  # NOTE(review): presumably 512 usable positions plus RoBERTa's padding offset — confirm
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,  # single segment type
)

# 2-2- Tokenizer

In [None]:
from transformers import TFAutoModel, AutoTokenizer

# Reload the tokenizer from the directory the tokenizer section saved it to.
# The path is relative, matching the save step, so the notebook also works outside
# Kaggle; on Kaggle the working directory is /kaggle/working, so this resolves to the
# same location as the previous absolute path.
address = 'tokenizer_pretrained'
tokenizer = AutoTokenizer.from_pretrained(address)

# 2-3- MLM definition 

In [None]:
from transformers import RobertaForMaskedLM

# Build the masked-LM from the config alone: no pretrained checkpoint is loaded here,
# so the model is trained from scratch.
model = RobertaForMaskedLM(config=config)
# Parameter count, as a quick sanity check of the model size.
print(model.num_parameters())

# 2-4- Dataset Generation

In [None]:
from torch.utils.data import Dataset
import os
from typing import Dict
import torch

class LineByLineTextDataset(Dataset):
    """Dataset in which every non-empty line of a text file is one tokenized sample.

    Each item is a dict ``{"input_ids": tensor}`` holding a 1-D ``torch.long`` tensor
    of token ids, truncated to at most ``block_size`` ids.
    """

    def __init__(self, file_path: str, block_size: int, tokenizer=None):
        """Read ``file_path`` and tokenize all of its non-empty lines in one batch.

        Args:
            file_path: Path to a UTF-8 text file with one sample per line.
            block_size: Maximum number of token ids kept per sample (passed to the
                tokenizer as ``max_length`` together with ``truncation=True``).
            tokenizer: Callable tokenizer whose result exposes ``["input_ids"]``.
                When omitted, the notebook-level ``tokenizer`` variable is used,
                which is what this class always did before the parameter existed.

        Raises:
            ValueError: If ``file_path`` is not an existing file, or if no tokenizer
                was passed and no global ``tokenizer`` is defined.
        """
        if not os.path.isfile(file_path):
            raise ValueError(f"Input file path {file_path} not found")

        if tokenizer is None:
            # Backward-compatible fallback to the tokenizer defined in the notebook.
            tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise ValueError("No tokenizer was passed and no global `tokenizer` is defined")

        lines = []
        with open(file_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip('\n')
                if len(line) != 0:
                    # Whitespace-only lines are reported but still kept as samples,
                    # which is what the original loop did.
                    if line.isspace():
                        print('this line contains space!')
                    lines.append(line)

        if not lines:
            # Nothing to tokenize: expose an empty dataset rather than calling the
            # tokenizer with an empty batch.
            self.examples = []
            return

        batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)
        self.examples = [
            {"input_ids": torch.tensor(ids, dtype=torch.long)}
            for ids in batch_encoding["input_ids"]
        ]

    def __len__(self):
        """Number of samples (non-empty lines) in the dataset."""
        return len(self.examples)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return sample ``i``; a slice returns a plain list of sample dicts."""
        return self.examples[i]


In [None]:
%%time
########## Replace your file_path to train, val, and test files
# NOTE: `bs` is the block size, i.e. the maximum number of tokens kept per line.
# It is not a batch size; batch sizes are set in TrainingArguments.
bs = 128

# Training split (the same file the tokenizer was fitted on).
train_dataset = LineByLineTextDataset(
    # tokenizer=tokenizer,
    file_path= '/kaggle/input/ecgbert/250 Hz/ecg_train.csv',
    block_size=bs,
)

# Validation split, evaluated once per epoch during training.
eval_dataset = LineByLineTextDataset(
    # tokenizer=tokenizer,
    file_path='/kaggle/input/ecgbert/250 Hz/ecg_val.csv',
    block_size=bs,
)

# Held-out test split, only used in the evaluation section after training.
test_dataset = LineByLineTextDataset(
    # tokenizer=tokenizer,
    file_path='/kaggle/input/ecgbert/250 Hz/ecg_test.csv',
    block_size=bs,
)

In [None]:
# Number of training samples, i.e. non-empty lines read from the training file.
len(train_dataset)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator that batches the samples and applies masked-language-model masking on the fly
# (mlm=True) with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# 2-5- Train

In [None]:
######################################################################################
################################### ATTENTION!########################################
### If this is the first time you run, run this cell; otherwise, skip this cell. ###
######################################################################################

import pickle

# Files holding the per-epoch loss histories between notebook sessions.
path_loss_eval = 'loss_eval.pickle' #path to loss_eval.pickle
path_loss_train = 'loss_train.pickle' #path to loss_train.pickle

# Start both histories empty and persist them, overwriting any earlier files.
loss_eval, loss_train = [], []

for _target, _history in ((path_loss_train, loss_train), (path_loss_eval, loss_eval)):
    with open(_target, 'wb') as handle:
        pickle.dump(_history, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
from transformers import TrainerCallback
import pickle

path_loss_eval = 'loss_eval.pickle' #path to loss_eval.pickle
path_loss_train = 'loss_train.pickle' #path to loss_train.pickle


with open(path_loss_train, 'rb') as handle:
  loss_train = pickle.load(handle)

with open(path_loss_eval, 'rb') as handle:
  loss_eval = pickle.load(handle)

# the server can be lost in the middle of saving losses so we use the minimum length of stored losses between train and eval
if len(loss_train)!=len(loss_eval):
  print('difference between length of train loss and eval loss :(')
  loss_train = loss_train[0:min(len(loss_train),len(loss_eval))]
  loss_eval = loss_eval[0:min(len(loss_train),len(loss_eval))]

class PrinterCallback(TrainerCallback):
    """Trainer callback that prints each log record and persists the loss histories.

    Relies on the notebook-level lists ``loss_train`` / ``loss_eval`` and the paths
    ``path_loss_train`` / ``path_loss_eval``. Each history is re-pickled after every
    append so an interrupted session loses at most the entry being written.
    """

    @staticmethod
    def _persist(history, path):
        """Overwrite ``path`` with the pickled ``history`` list."""
        with open(path, 'wb') as handle:
            pickle.dump(history, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record 'loss' (training) or 'eval_loss' (evaluation) from a log record.

        A record that contains both keys is counted as a training record only.
        """
        print('my logs:',logs)
        if not logs:
            # `logs` defaults to None; there is nothing to record in that case.
            return
        if 'loss' in logs:
            loss_train.append(logs['loss'])
            self._persist(loss_train, path_loss_train)
        elif 'eval_loss' in logs:
            loss_eval.append(logs['eval_loss'])
            self._persist(loss_eval, path_loss_eval)


In [None]:
# pip install --upgrade transformers

In [None]:
# pip install transformers==4.38.2

In [None]:
# !pip install accelerate -U

In [None]:
# from transformers import Trainer, TrainerCallback
# import torch_xla.core.xla_model as xm

# class PrinterCallback(TrainerCallback):
#     def on_epoch_end(self, args, state, control, **kwargs):
#         print(f"Epoch {state.epoch} done.")

In [None]:
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy

# Directory receiving the training checkpoints (`address` is reused with other values elsewhere in the notebook).
address = 'training_arguments'

# Training configuration: log, evaluate and checkpoint once per epoch, and reload the
# best checkpoint when training ends.
training_args = TrainingArguments(
    output_dir= address, #your address to the output directory
    report_to = 'none',
    overwrite_output_dir=True,
    num_train_epochs=400,  # upper bound; the EarlyStoppingCallback given to the Trainer may stop sooner
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # gradient_accumulation_steps = 1,
    save_steps=10_000,  # NOTE(review): presumably has no effect while save_strategy is 'epoch' — confirm
    save_total_limit=2,
    prediction_loss_only=True,
    #logging_steps=50,
    logging_strategy = 'epoch',                            
    eval_strategy = IntervalStrategy.EPOCH,  # NOTE(review): older transformers releases name this `evaluation_strategy` — confirm against the installed version
    save_strategy = 'epoch',
    metric_for_best_model = 'loss',  # NOTE(review): presumably resolved to the evaluation loss, lower is better — confirm
    load_best_model_at_end=True,
)

In [None]:
# !pip install "transformers[torch]" --upgrade
# !pip install accelerate --upgrade
# !pip install datasets --upgrade

In [None]:
# Assemble the Trainer. The model is passed as-is: Trainer moves it to the device
# selected by TrainingArguments, so forcing `.to('cuda')` here is unnecessary and
# crashes on machines without a CUDA device.
# PrinterCallback records the per-epoch losses; EarlyStoppingCallback stops training
# after 50 evaluations without improvement of the tracked metric.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks = [PrinterCallback, EarlyStoppingCallback(early_stopping_patience=50, early_stopping_threshold=0)]
)

In [None]:
######################################################################################
################################### ATTENTION!########################################
### If this is the first time you run, run this cell (skip the next); otherwise, skip this cell (run the next). ###
######################################################################################

import time

# Wall-clock timing of a training run that starts from scratch (no checkpoint is resumed).
start_time = time.time()

trainer.train()  # only first time running 

end_time = time.time()
print(f"Training took {end_time - start_time:.2f} seconds")

In [None]:
%%time
# Continue training from a checkpoint in the Trainer's output directory.
# NOTE(review): presumably raises if no checkpoint exists yet — confirm; run the
# from-scratch cell instead on the very first run.
trainer.train(resume_from_checkpoint=True) 

# 2-6- Evaluation

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Global figure styling for all plots below (uniform 16 pt text).
# NOTE(review): 'Times New Roman' may not be installed on the runtime; matplotlib
# presumably falls back to its default font with a warning — confirm.
plt.rcParams.update({
    'font.family': 'Times New Roman',
    'font.size': 16,
    'axes.titlesize': 16,
    'axes.labelsize': 16,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
})

In [None]:
# Training vs. validation loss. The x axis is the 1-based index of the entries that
# PrinterCallback collected (one per logging/evaluation event, configured per epoch).
plt.plot(np.arange(1,len(loss_train)+1),loss_train,)
plt.plot(np.arange(1,len(loss_eval)+1),loss_eval,)
plt.legend(['loss_train','loss_eval'])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('loss per epoch')
# plt.show()
# The figure is written to disk instead of being shown explicitly.
plt.savefig('loss_per_epoch.png')

In [None]:
class EvaluationMetrics:
    """Metrics derived from the model's cross-entropy loss."""

    def perplexity(self, crossentropy):
        """Return exp(crossentropy), the perplexity for the given loss.

        ``crossentropy`` is the loss of the model w.r.t. its input; a scalar gives a
        scalar, a sequence of losses gives an element-wise NumPy array.
        """
        ppl = np.exp(crossentropy)
        return ppl

In [None]:
# Shared EvaluationMetrics instance used by the perplexity plots and the final test score.
evaluation_metrics = EvaluationMetrics()

In [None]:
# Perplexity curves: perplexity() exponentiates the whole loss list element-wise.
plt.plot(np.arange(1,len(loss_train)+1),evaluation_metrics.perplexity(loss_train))
plt.plot(np.arange(1,len(loss_eval)+1),evaluation_metrics.perplexity(loss_eval))
plt.legend(['ppl_train','ppl_eval'])
plt.xlabel('epoch')
plt.ylabel('ppl')
plt.title('ppl per epoch')
plt.show()

Evaluation on the test set

In [None]:
# Per-sample loss on the test set.
#
# trainer.evaluate() fires on_log, and PrinterCallback appends every 'eval_loss' to
# loss_eval AND rewrites loss_eval.pickle. Swapping the list in memory is not enough:
# the pickle on disk would still end up holding test losses. The callback is therefore
# detached while the test set is scored and re-attached afterwards, even on error.
import copy  # no longer needed in this cell; kept so cells relying on this import keep working

_detached_logger = trainer.pop_callback(PrinterCallback)

loss_test = []
try:
    for i in range(len(test_dataset)):
        # A one-element slice is a list holding a single example; it is passed to
        # evaluate() as a dataset of size 1.
        o = trainer.evaluate(eval_dataset=test_dataset[i:i+1])
        print(o['eval_loss'])
        loss_test.append(o['eval_loss'])
finally:
    if _detached_logger is not None:
        trainer.add_callback(_detached_logger)

# One point per test sample (not per epoch). The output file name is unchanged so
# existing references to it keep working.
plt.plot(np.arange(1,len(loss_test)+1),loss_test)
plt.legend(['loss_test'])
plt.xlabel('sample')
plt.ylabel('loss')
plt.title('loss per sample')
# plt.show()
plt.savefig('test_loss_per_epoch.png')


In [None]:
# Perplexity of each individual test sample (exp of the per-sample losses in loss_test).
plt.plot(np.arange(1,len(loss_test)+1),evaluation_metrics.perplexity(loss_test))
plt.legend(['ppl_test'])
plt.xlabel('sample')
plt.ylabel('ppl')
plt.title('ppl per sample')
# plt.show()
# The figure is written to disk instead of being shown explicitly.
plt.savefig('ppl_per_sample.png')

In [None]:
# Perplexity of the model over the whole test set (train or validation data can be
# passed as eval_dataset instead).
#
# PrinterCallback is detached during the call: otherwise on_log would append the test
# loss to loss_eval and overwrite loss_eval.pickle, corrupting the stored validation
# history. It is re-attached afterwards, even if evaluation fails.
_detached_logger = trainer.pop_callback(PrinterCallback)
try:
    test_metrics = trainer.evaluate(eval_dataset=test_dataset)
finally:
    if _detached_logger is not None:
        trainer.add_callback(_detached_logger)

print('PPL:',evaluation_metrics.perplexity(test_metrics['eval_loss']))

# 2-7 Saving model

In [None]:
# Save the trained model into the `model` directory (relative to the working directory).
address = 'model'
trainer.save_model(address)

# NOTE(review): the sample log below appears to come from an earlier run that saved to
# ./HeartBert/mlm_model; with address = 'model' the files are written under ./model.
# Saving model checkpoint to ./HeartBert/mlm_model
# Configuration saved in ./HeartBert/mlm_model/config.json
# Model weights saved in ./HeartBert/mlm_model/pytorch_model.bin

# 3- Test



Aside from looking at the training and eval losses going down, the easiest way to check whether our language model is learning anything interesting is via the `FillMaskPipeline`.

Pipelines are simple wrappers around tokenizers and models, and the 'fill-mask' one will let you input a sequence containing a masked token (here, `<mask>`) and return a list of the most probable filled sequences, with their probabilities.



In [None]:
# from transformers import pipeline

# fill_mask = pipeline(
#     "fill-mask",
#     model= address,
#     tokenizer= address
# )