In [2]:
from transformers import AutoModelForMaskedLM
import transformers
from transformers import AutoModelForMaskedLM, BertTokenizer, pipeline
from transformers import BertTokenizer, BertConfig,AutoConfig
import torch
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments
from transformers import Trainer
import math
from torch.utils.data import DataLoader
from transformers import default_data_collator
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

In [9]:
def tokenize_function(examples,tokenizer):
    result = tokenizer(examples["texte"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

In [10]:
def group_texts(examples,chunk_size):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [19]:
model_checkpoint = "KBLab/bert-base-swedish-cased"
model =  AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model.load_state_dict(torch.load("finetuning_manual.pth"))

Some weights of the model checkpoint at KBLab/bert-base-swedish-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [21]:
print(model.parameters())

<generator object Module.parameters at 0x74a00c22cba0>


In [8]:
#datasest
data_files = {"train": "swerick_data_train.pkl", "test": "swerick_data_test.pkl"}
swerick_dataset = load_dataset("pandas",data_files=data_files)
print(swerick_dataset)

DatasetDict({
    train: Dataset({
        features: ['protocole', 'texte', '__index_level_0__'],
        num_rows: 104
    })
    test: Dataset({
        features: ['protocole', 'texte', '__index_level_0__'],
        num_rows: 26
    })
})


In [11]:
tokenized_datasets = swerick_dataset.map(
      lambda examples: tokenize_function(examples, tokenizer), batched=True, remove_columns=["texte", "protocole",'__index_level_0__']
)
tokenized_datasets

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (26821 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/26 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 104
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 26
    })
})

In [12]:
chunk_size = 128
lm_datasets = tokenized_datasets.map( lambda examples: group_texts(examples,chunk_size), batched=True) #dataset with chunk
lm_datasets

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 102248
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 22431
    })
})

In [6]:
config = AutoConfig.from_pretrained(
    "mosaicml/mosaic-bert-base",
    vocab_size=len(tokenizer),
    n_ctx=chunk_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)



config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

In [13]:
print(swerick_dataset)

DatasetDict({
    train: Dataset({
        features: ['protocole', 'texte', '__index_level_0__'],
        num_rows: 104
    })
    test: Dataset({
        features: ['protocole', 'texte', '__index_level_0__'],
        num_rows: 26
    })
})


In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)#add the MASK term

In [15]:
batch_size = 64
train_dataloader = DataLoader(
    lm_datasets["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
train_dataloader = [
    inputs.to(device) for inputs in train_dataloader
]

In [17]:
len(train_dataloader)

1598

In [18]:
len(lm_datasets["train"])

102248

In [1]:
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    #tokenizer_object=tokenizer,
    tokenizer_file="tokenizer_swerick.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import BertConfig as TransformersBertConfig
import os
import sys
from typing import Optional, cast
from omegaconf import DictConfig
from omegaconf import OmegaConf as om


class BertConfig(TransformersBertConfig):

    def __init__(
        self,
        alibi_starting_size: int = 512,
        attention_probs_dropout_prob: float = 0.0,
        **kwargs,
    ):
        """Configuration class for MosaicBert.

        Args:
            alibi_starting_size (int): Use `alibi_starting_size` to determine how large of an alibi tensor to
                create when initializing the model. You should be able to ignore this parameter in most cases.
                Defaults to 512.
            attention_probs_dropout_prob (float): By default, turn off attention dropout in Mosaic BERT
                (otherwise, Flash Attention will be off by default). Defaults to 0.0.
        """
        super().__init__(
            attention_probs_dropout_prob=attention_probs_dropout_prob, **kwargs)
        self.alibi_starting_size = alibi_starting_size

with open("examples/examples/benchmarks/bert/yamls/main/mosaic-bert-base-uncased.yaml") as f:
        yaml_cfg = om.load(f)
cfg = cast(DictConfig, yaml_cfg)  
print(cfg)  
        

pretrained_model_name = "KBLab/bert-base-swedish-cased"
model_config=cfg.model.get('model_config', None)
print(model_config)
config = BertConfig.from_pretrained(
        pretrained_model_name, **model_config)
print(config)
if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)

import bert_layers as bert_layers_module
from composer.metrics.nlp import (BinaryF1Score, LanguageCrossEntropy,
                                  MaskedAccuracy)
from composer.models.huggingface import HuggingFaceModel
model = bert_layers_module.BertForMaskedLM(config)

    # We have to do it again here because wrapping by HuggingFaceModel changes it
if config.vocab_size % 8 != 0:
    config.vocab_size += 8 - (config.vocab_size % 8)
model.resize_token_embeddings(config.vocab_size)

{'data_local': 'swerick_mosaic', 'data_remote': None, 'max_seq_len': 128, 'tokenizer_name': 'swerick_tokenizer', 'mlm_probability': 0.3, 'run_name': 'mosaic-bert-base-uncased', 'model': {'name': 'mosaic_bert', 'pretrained_model_name': None, 'tokenizer_name': '${tokenizer_name}', 'model_config': {'num_attention_heads': 12, 'num_hidden_layers': 12, 'attention_probs_dropout_prob': 0.0}}, 'train_loader': {'name': 'text', 'dataset': {'local': '${data_local}', 'remote': '${data_remote}', 'split': 'swerick_data_sentence_train', 'tokenizer_name': '${tokenizer_name}', 'max_seq_len': '${max_seq_len}', 'shuffle': True, 'mlm_probability': '${mlm_probability}'}, 'drop_last': True, 'num_workers': 1}, 'eval_loader': {'name': 'text', 'dataset': {'local': '${data_local}', 'remote': '${data_remote}', 'split': 'swerick_data_sentence_test', 'tokenizer_name': '${tokenizer_name}', 'max_seq_len': '${max_seq_len}', 'shuffle': False, 'mlm_probability': 0.15}, 'drop_last': False, 'num_workers': 8}, 'scheduler':



Embedding(50328, 768, padding_idx=0)

In [3]:
from transformers import DataCollatorForLanguageModeling
mlm_probability = 0.30
collate_fn = DataCollatorForLanguageModeling(
        tokenizer=wrapped_tokenizer,
        mlm=mlm_probability is not None,
        mlm_probability=mlm_probability)

In [4]:
import pickle
with open("from_scratc_dataset","rb") as f:
    tokenized_datasets = pickle.load(f)

In [5]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_datasets["train"],collate_fn=collate_fn,batch_size=64,num_workers=4)
test_dataloader = DataLoader(tokenized_datasets["test"],collate_fn=collate_fn,batch_size=64,num_workers=4)

In [6]:
from transformers import TrainingArguments
from transformers import Trainer
import preprocessing
batch_size = 64
num_epochs=100
# Show the training loss with every epoch
logging_steps = len(tokenized_datasets["train"]) // batch_size
print(len(tokenized_datasets["train"]) // batch_size)
model_name ="scratch"

trainer = preprocessing.create_trainer(model,model_name,batch_size,logging_steps,train_dataset=tokenized_datasets["train"],eval_dataset=tokenized_datasets["test"],data_collator=collate_fn,tokenizer=wrapped_tokenizer,num_epochs=100)

52588


In [7]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlaurinemeier[0m ([33muppsala_ml[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/5258900 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.54 GiB (GPU 0; 23.67 GiB total capacity; 8.89 GiB already allocated; 409.38 MiB free; 8.95 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f7dddfed710>> (for post_run_cell), with arguments args (<ExecutionResult object at 7f7dddfbbc10, execution_count=7 error_before_exec=None error_in_exec=CUDA out of memory. Tried to allocate 1.54 GiB (GPU 0; 23.67 GiB total capacity; 8.89 GiB already allocated; 409.38 MiB free; 8.95 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF info=<ExecutionInfo object at 7f7da7e1bd90, raw_cell="trainer.train()" store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/home/laurinemeier/swerick/from_scratch_model.ipynb#X41sZmlsZQ%3D%3D> result=None>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [41]:
import os
import tempfile
from composer.utils import export_for_inference

save_format="torchscript"
working_dir=tempfile.TemporaryDirectory()
model_save_path=os.path.join(working_dir.name,"model.pt")

export_for_inference(model=model,save_format=save_format,save_path=model_save_path)

Scripting with torch.jit.script failed and sample inputs are not provided for tracing with torch.jit.trace
Traceback (most recent call last):
  File "/home/laurinemeier/.local/lib/python3.11/site-packages/composer/utils/inference.py", line 207, in export_for_inference
    export_model = torch.jit.script(model)
                   ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/laurinemeier/.local/lib/python3.11/site-packages/torch/jit/_script.py", line 1284, in script
    return torch.jit._recursive.create_script_module(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/laurinemeier/.local/lib/python3.11/site-packages/torch/jit/_recursive.py", line 480, in create_script_module
    return create_script_module_impl(nn_module, concrete_type, stubs_fn)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/laurinemeier/.local/lib/python3.11/site-packages/torch/jit/_recursive.py", line 492, in create_script_module_impl
    method_stubs = stubs_fn(nn_mo

RuntimeError: Scritping and tracing failed! No model is getting exported.

In [37]:
model.state_dict

<bound method Module.state_dict of HuggingFaceModel(
  (model): BertForMaskedLM(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(50328, 768, padding_idx=0)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertUnpadAttention(
              (self): BertUnpadSelfAttention(
                (dropout): Dropout(p=0.0, inplace=False)
                (Wqkv): Linear(in_features=768, out_features=2304, bias=True)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
    

In [19]:
!python3 convert_composer_to_hf.py --composer_path From_scratch_train/ep0-ba15356-rank0.pt --hf_output model_0/ --output_precision bf16


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Downloading checkpoint from From_scratch_train/ep0-ba15356-rank0.pt -> /tmp/tmp70xilx7w/local-composer-checkpoint.pt
Loading checkpoint into CPU RAM...
##############################
Saving HF Model Config...
{'alibi_starting_size': 512, 'architectures': ['BertForMaskedLM'], 'attention_probs_dropout_prob': 0.0, 'classifier_dropout': None, 'gradient_checkpointing': False, 'hidden_act': 'gelu', 'hidden_dropout_prob': 0.1, 'hidden_size': 768, 'initializer_range': 0.02, 'intermediate_size': 3072, 'layer_norm_eps': 1e-12, 'max_position_embeddings': 512, 'model_type': 'bert', 'num_attention_heads': 12, 'num_hidden_layers': 12, 'output_past': True, 'pad_token_id': 0, 'position_embedding_type': 'absolute', 'transfor

In [31]:
import torch
from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained("model1",trust_remote_code=True)

from transformers import pipeline
fill_mask = pipeline("fill-mask", model=model, tokenizer=wrapped_tokenizer)
fill_mask("hey [MASK]")

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.


ValueError: Unrecognized configuration class <class 'transformers_modules.model1.configuration_mpt.MPTConfig'> for this kind of AutoModel: AutoModelForMaskedLM.
Model type should be one of AlbertConfig, BartConfig, BertConfig, BigBirdConfig, CamembertConfig, ConvBertConfig, Data2VecTextConfig, DebertaConfig, DebertaV2Config, DistilBertConfig, ElectraConfig, ErnieConfig, EsmConfig, FlaubertConfig, FNetConfig, FunnelConfig, IBertConfig, LayoutLMConfig, LongformerConfig, LukeConfig, MBartConfig, MegaConfig, MegatronBertConfig, MobileBertConfig, MPNetConfig, MvpConfig, NezhaConfig, NystromformerConfig, PerceiverConfig, QDQBertConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, SqueezeBertConfig, TapasConfig, Wav2Vec2Config, XLMConfig, XLMRobertaConfig, XLMRobertaXLConfig, XmodConfig, YosoConfig.