In [1]:
import sys

# Put "/app" on the import path exactly once, so re-running this cell does not
# append duplicates. NOTE(review): presumably this is what makes the later
# `src.models...` import resolvable — confirm.
app = "/app"
if sys.path.count(app) == 0:
    sys.path.append(app)
sys.path

['/Users/lemberge/code/dendritic/notebooks',
 '/opt/homebrew/Cellar/python@3.11/3.11.5/Frameworks/Python.framework/Versions/3.11/lib/python311.zip',
 '/opt/homebrew/Cellar/python@3.11/3.11.5/Frameworks/Python.framework/Versions/3.11/lib/python3.11',
 '/opt/homebrew/Cellar/python@3.11/3.11.5/Frameworks/Python.framework/Versions/3.11/lib/python3.11/lib-dynload',
 '',
 '/Users/lemberge/code/dendritic/.venv/lib/python3.11/site-packages',
 '/app']

In [2]:
# Print the working directory of the kernel; the relative "../" paths used in
# this notebook are resolved against it.
!pwd

/Users/lemberge/code/dendritic/notebooks


In [3]:
!ls /data/text/emboj_abstracts

ls: /data/text/emboj_abstracts: No such file or directory


In [4]:
from datasets import load_dataset

# Load the train/test splits with the built-in "text" builder; both file
# names are given relative to data_dir.
dataset = load_dataset(
    path="text",
    data_files=dict(train="train/examples.txt", test="test/examples.txt"),
    data_dir="../data/text/emboj_abstracts/",
)
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 12881
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1802
    })
})

In [5]:
# Hugging Face checkpoint name; used below to load both the tokenizer and the
# model configuration.
MODEL_NAME = "roberta-base"

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"tokenizer vocab size: {tokenizer.vocab_size}")
# model_max_length is available on every loaded tokenizer, whereas the
# max_model_input_sizes map only has entries for checkpoint names hard-coded
# in the library and raises KeyError for anything else (e.g. a local path).
print(f"max length: {tokenizer.model_max_length}")

tokenizer vocab size: 50265
max length: 512


In [7]:
# tokenize the dataset
def tokenization(examples):
    """Tokenize a batch of examples for masked-language-model training.

    Args:
        examples: Batch mapping whose "text" entry holds the raw strings
            (the form passed by ``dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum input length and including the special-tokens mask.
    """
    return tokenizer(
        examples["text"],
        # model_max_length is available on every loaded tokenizer; the
        # max_model_input_sizes[MODEL_NAME] lookup only works for checkpoint
        # names hard-coded in the library and fails with KeyError otherwise.
        max_length=tokenizer.model_max_length,
        truncation=True,
        return_special_tokens_mask=True,
    )
# Run the tokenizer over every split in batches; the resulting columns are
# added alongside the original "text" column.
tokenized = dataset.map(tokenization, batched=True)
tokenized

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 12881
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 1802
    })
})

In [8]:
# Drop the raw strings once they are tokenized. The guard keeps this cell
# idempotent: without it, re-running the cell fails because "text" has already
# been removed from the rebound `tokenized`.
if all("text" in columns for columns in tokenized.column_names.values()):
    tokenized = tokenized.remove_columns(["text"])
tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 12881
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 1802
    })
})

In [9]:
from transformers import DataCollatorForLanguageModeling
# Collator for the masked-language-modelling objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,  # probability of selecting a token for masking
    mlm=True,
)

In [10]:
from transformers import (
    # RobertaForMaskedLM,
    AutoConfig
)
from src.models.modeling_dendroberta import RobertaForMaskedLM

# Fetch the configuration published for MODEL_NAME and build the project's
# RobertaForMaskedLM variant from it.
# NOTE(review): the model is constructed from a config only (no
# from_pretrained call), so presumably its weights start untrained — confirm
# against src.models.modeling_dendroberta.
config = AutoConfig.from_pretrained(MODEL_NAME)
model = RobertaForMaskedLM(config=config)

In [11]:
# Total number of elements across all parameter tensors, reported in millions.
model_size = sum(map(lambda t: t.numel(), model.parameters()))
print(f"Model size: {model_size/1000**2:.1f}M parameters")

Model size: 124.7M parameters


In [12]:
from transformers import Trainer, TrainingArguments
from datetime import datetime

# Training configuration, grouped by concern.
# NOTE(review): with the 12,881 training rows shown above, a per-device batch
# of 16, no gradient accumulation and one epoch, a single-device run has about
# 806 optimizer steps — fewer than warmup_steps=1_000 and save_steps=5_000.
# Confirm the warmup length and checkpoint interval are intended.
args = TrainingArguments(
    # --- outputs and logging ---
    output_dir="../models",
    logging_dir=f"../runs/lm-dendroberta-{datetime.now().isoformat().replace(':','-')}",
    logging_steps=100,
    save_steps=5_000,
    push_to_hub=False,
    # --- batching and schedule length ---
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    # --- optimisation ---
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=1_000,
    weight_decay=0.1,
    fp16=False,
    # --- evaluation ---
    evaluation_strategy="steps",
    eval_steps=500,
    prediction_loss_only=True,
)

In [13]:
# Wire the model, data and collator into the Trainer; the "test" split is
# used as the evaluation set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [14]:
# Load the TensorBoard notebook extension, which provides the %tensorboard
# magic used further down in this notebook.
%load_ext tensorboard

In [15]:
# Run the training loop with the Trainer configured above.
# NOTE(review): the recorded run aborted with "MPS backend out of memory"
# while the progress bar was still at step 0 — consider a smaller batch size
# or shorter sequences before re-running.
trainer.train()

  0%|          | 0/12881 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


RuntimeError: MPS backend out of memory (MPS allocated: 29.44 GB, other allocations: 5.31 GB, max allowed: 36.27 GB). Tried to allocate 2.41 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
# launch tensorboard
%tensorboard --logdir runs

In [None]:
# Display the device the model reports, e.g. to check whether it was placed on
# the MPS backend named in the training error above.
model.device