In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer

In [None]:
# One pickle file per split, read through the "pandas" loader of `datasets`.
data_files = {
    "train": "swerick_data_random_train.pkl",
    "test": "swerick_data_random_test.pkl",
    "valid": "swerick_data_random_valid.pkl",
}
swerick_dataset = load_dataset("pandas", data_files=data_files)
swerick_dataset  # last expression of the cell: display the loaded splits

In [None]:
from transformers import PreTrainedTokenizerFast

# Wrap the tokenizer stored in "tokenizer_swerick.json" as a Hugging Face fast
# tokenizer and declare its special tokens (unknown, padding, CLS, SEP, mask).
# This is the tokenizer later handed to the model wrapper and the MLM collator.
wrapped_tokenizer = PreTrainedTokenizerFast(
    #tokenizer_object=tokenizer,
    tokenizer_file="tokenizer_swerick.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

In [None]:
# Load the tokenizer identified by "swerick_tokenizer".
# NOTE(review): no later cell visible here uses `tokenizer`; the training cells
# use `wrapped_tokenizer` instead — confirm whether this cell is still needed.
tokenizer=AutoTokenizer.from_pretrained("swerick_tokenizer")

In [None]:
# Tokenizer comparison: load the KBLab Swedish BERT tokenizer as the baseline
# whose vocabulary is compared against the swerick tokenizer below.
base_tokenizer = AutoTokenizer.from_pretrained("KBLab/bert-base-swedish-cased")

In [None]:
def replace(x):
    """Return the token `x` with every '##' occurrence removed."""
    return x.replace('##', "")


# Vocabulary entries of both tokenizers with the '##' marker stripped.
swerick_voc = [replace(token) for token in wrapped_tokenizer.vocab.keys()]
base_voc = [replace(token) for token in base_tokenizer.vocab.keys()]

In [None]:
import pretraing_tokenizer
# Compare the swerick vocabulary with the baseline vocabulary using the
# project helper `get_vocab_sim`, then print the five values it returns.
# NOTE(review): the helper is not visible here. Judging by the variable names
# and the print labels, `inter` is presumably the overlap, `f`/`s` relate to
# the first/second vocabulary, `jaccard` is the Jaccard similarity and
# `vocab_f` the entries only present in the swerick vocabulary — confirm
# against `pretraing_tokenizer`.
inter,f,s,jaccard,vocab_f= pretraing_tokenizer.get_vocab_sim(swerick_voc,base_voc)
print(inter)
print(f,s)
print("similarity of Jaccard",jaccard)
print("New Vocab added in tokenizer of swerick", vocab_f)

In [None]:
import pickle
# Load the pre-tokenized dataset object from a pickle file; its "train" and
# "test" entries are fed to the DataLoaders further down.
# SECURITY: pickle.load can execute arbitrary code while deserializing — only
# use this with a file that was produced locally / comes from a trusted source.
with open("from_scratc_dataset","rb") as f:
    tokenized_datasets = pickle.load(f)

In [None]:
from transformers import BertConfig as TransformersBertConfig
import os
import sys
from typing import Optional, cast
from omegaconf import DictConfig
from omegaconf import OmegaConf as om


class BertConfig(TransformersBertConfig):
    """Configuration class for MosaicBert.

    Behaves like the Hugging Face ``BertConfig`` it derives from, but stores
    one extra attribute (``alibi_starting_size``) and defaults the attention
    dropout probability to 0.0.
    """

    def __init__(
        self,
        alibi_starting_size: int = 512,
        attention_probs_dropout_prob: float = 0.0,
        **kwargs,
    ):
        """Create the configuration.

        Args:
            alibi_starting_size (int): Determines how large of an alibi tensor
                to create when initializing the model. Can be ignored in most
                cases. Defaults to 512.
            attention_probs_dropout_prob (float): Attention dropout, turned off
                by default in Mosaic BERT (otherwise, Flash Attention will be
                off by default). Defaults to 0.0.
            **kwargs: Forwarded unchanged to the parent configuration class.
        """
        # Pass the dropout explicitly so the 0.0 default declared above is the
        # value the parent constructor receives.
        parent_kwargs = dict(
            kwargs, attention_probs_dropout_prob=attention_probs_dropout_prob)
        super().__init__(**parent_kwargs)
        self.alibi_starting_size = alibi_starting_size

# Load the MosaicBERT recipe (model, optimizer, scheduler, ... sections) from YAML.
with open("examples/examples/benchmarks/bert/yamls/main/mosaic-bert-base-uncased.yaml") as f:
    yaml_cfg = om.load(f)
cfg = cast(DictConfig, yaml_cfg)
print(cfg)

pretrained_model_name = "KBLab/bert-base-swedish-cased"
# `model_config` is looked up with a None default, so it may be missing from
# the YAML. Fall back to an empty mapping: expanding None with `**` below
# would raise a TypeError.
model_config = cfg.model.get('model_config', None)
if not model_config:
    model_config = {}
print(model_config)

# Start from the pretrained model's configuration and apply the YAML overrides.
config = BertConfig.from_pretrained(pretrained_model_name, **model_config)
print(config)

# Round the vocabulary size up to the next multiple of 8.
if config.vocab_size % 8 != 0:
    config.vocab_size += 8 - (config.vocab_size % 8)


In [None]:
import bert_layers as bert_layers_module
from composer.metrics.nlp import (BinaryF1Score, LanguageCrossEntropy,
                                  MaskedAccuracy)
from composer.models.huggingface import HuggingFaceModel
# Training/evaluation metrics; both are configured with ignore_index=-100.
metrics = [
    LanguageCrossEntropy(ignore_index=-100),
    MaskedAccuracy(ignore_index=-100),
]

# Build the masked-LM network from `config` and wrap it for Composer.
model = HuggingFaceModel(
    model=bert_layers_module.BertForMaskedLM(config),
    tokenizer=wrapped_tokenizer,
    use_logits=True,
    metrics=metrics,
)

# Padding for divisibility by 8.
# We have to do it again here because wrapping by HuggingFaceModel changes it.
if config.vocab_size % 8:
    config.vocab_size += 8 - config.vocab_size % 8
model.model.resize_token_embeddings(config.vocab_size)
 

In [None]:
import torch
# Restore the model weights from a previously saved training checkpoint.
# map_location="cpu" deserializes every tensor onto the CPU, so the load does
# not depend on the GPU the checkpoint was written from and the optimizer
# state stored alongside the weights does not occupy GPU memory.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
state_dict = torch.load("From_scratch_train/ep0-ba16494-rank0.pt", map_location="cpu")
# Last expression of the cell: shows the result of load_state_dict.
model.load_state_dict(state_dict["state"]["model"])

In [None]:
# Inspect which entries are stored under the checkpoint's "optimizers" state.
state_dict["state"]["optimizers"].keys()

In [None]:
# Total number of elements over all model parameters, computed once and bound
# to both names, then printed in scientific notation.
model_size = n_params = sum(param.numel() for param in model.parameters())
print(f'{n_params=:.4e}')

In [None]:
# Print the 'eos_token_id' entry of the train loader's dataset config
# (looked up with `.get` and no explicit default).
print(cfg.train_loader.dataset.get('eos_token_id'))

In [None]:
from transformers import DataCollatorForLanguageModeling
# Masking probability handed to the masked-language-modelling collator.
mlm_probability = 0.30
collate_fn = DataCollatorForLanguageModeling(
    tokenizer=wrapped_tokenizer,
    mlm=mlm_probability is not None,  # MLM is enabled whenever a probability is set
    mlm_probability=mlm_probability,
)

# Sequence-boundary token ids: [SEP] is used as the end-of-sequence id.
eos_token_id = wrapped_tokenizer.sep_token_id
# Read the BOS id from the BOS attribute; this previously read `eos_token_id`,
# a copy-paste mismatch.
# NOTE(review): neither a bos nor an eos token is passed where
# `wrapped_tokenizer` is constructed in this notebook, so the value is
# presumably unchanged (None) — confirm whether [CLS] was intended instead.
bos_token_id = wrapped_tokenizer.bos_token_id


In [None]:
from torch.utils.data import DataLoader

# Build the train and test loaders with identical settings (MLM collator,
# batches of 64, 4 worker processes).
# NOTE(review): no `shuffle` argument is passed for the training loader —
# confirm the training split is meant to be read in stored order.
train_dataloader, test_dataloader = (
    DataLoader(
        tokenized_datasets[split],
        collate_fn=collate_fn,
        batch_size=64,
        num_workers=4,
    )
    for split in ("train", "test")
)

In [None]:
from composer import algorithms
from composer.optim import DecoupledAdamW
from composer.optim.scheduler import (ConstantWithWarmupScheduler,
                                      CosineAnnealingWithWarmupScheduler,
                                      LinearWithWarmupScheduler)


# List of Composer algorithms passed to the Trainer below.
# Caution: this rebinds `algorithms` from the imported module to a list, so
# the statement only works right after the import above has been (re)run.
algorithms = [algorithms.LowPrecisionLayerNorm()]

In [None]:

# Optimizer hyper-parameters are taken from the `optimizer` section of the
# YAML config and forwarded by name.
optimizer = DecoupledAdamW(
    model.parameters(),
    **{name: cfg.optimizer[name] for name in ("lr", "betas", "eps", "weight_decay")},
)

# Left disabled: restoring the optimizer state saved in the checkpoint.
#optimizer.load_state_dict(state_dict["state"]["optimizers"]["DecoupledAdamW"])

# Linear schedule with warmup, configured from the `scheduler` section.
scheduler = LinearWithWarmupScheduler(
    t_warmup=cfg.scheduler.t_warmup,
    alpha_f=cfg.scheduler.alpha_f,
)


In [None]:
from composer import Trainer


# Assemble the Composer Trainer; arguments are grouped by concern.
trainer = Trainer(
    run_name="Scratch_model",
    seed=cfg.seed,
    # Model and optimisation components
    model=model,
    algorithms=algorithms,
    optimizers=optimizer,
    schedulers=scheduler,
    # Data
    train_dataloader=train_dataloader,
    eval_dataloader=test_dataloader,
    # Run length and numeric precision
    max_duration=cfg.max_duration,
    precision=cfg.precision,
    # Checkpointing: save every epoch into "From_scratch_train"
    save_folder="From_scratch_train",
    save_interval="1ep",
    save_num_checkpoints_to_keep=100,
    save_overwrite=True,
    # Console reporting
    progress_bar=cfg.progress_bar,
    log_to_console=True,
    console_log_interval="1ep",
    # Options currently left disabled:
    #loggers=logger,
    #autoresume=True,
    #load_path="From_scratch_train/latest-rank0.pt"
)

In [None]:
# Run training with the Trainer configured in the previous cell.
trainer.fit()