In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, pipeline, TrainingArguments, Trainer

2025-02-17 08:54:10.522040: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# Load all splits of the 'rotten_tomatoes' dataset; the bare name on the last
# line makes the notebook display the resulting DatasetDict summary.
tomatoes = load_dataset(path='rotten_tomatoes')
tomatoes

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [6]:
# Masked language modeling (MLM) setup: the tokenizer and the model are both
# loaded from the same pretrained checkpoint so their vocabularies match.
model_checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
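- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).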

In [7]:
# Preprocessing: tokenization helper applied to each dataset split.
def preprocess_function(examples):
    """Tokenize the ``'text'`` field of ``examples`` with truncation enabled.

    Uses the notebook-level ``tokenizer`` and returns its output unchanged.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True)

# Tokenize the train and test splits in batched mode, then drop the 'label'
# column from each. The generator is consumed in order, so the train split is
# processed first and the test split second.
tokenized_train, tokenized_test = (
    tomatoes[split_name]
    .map(preprocess_function, batched=True)
    .remove_columns('label')
    for split_name in ('train', 'test')
)

In [8]:
# Token masking: the collator is configured for MLM with a masking
# probability of 0.15 and uses the same tokenizer as the model.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [9]:
# Training arguments (hyperparameters and checkpoint/eval schedule) for
# fine-tuning with the masked-LM objective.
args = TrainingArguments(
    output_dir="model",               # checkpoints are written under ./model
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    # Evaluate once per epoch; without this the eval_dataset handed to the
    # Trainer is never evaluated during training (the default strategy is "no").
    eval_strategy="epoch",
    save_strategy="epoch",            # one checkpoint at the end of every epoch
    report_to="none",                 # do not log to external experiment trackers
)

# Assemble the Trainer from the model, the training arguments, the tokenized
# splits, the tokenizer and the masking collator.
trainer = Trainer(
    args=args,
    model=model,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

In [10]:
# Write the pretrained tokenizer files into the 'mlm' directory; the returned
# tuple of written file paths is displayed as the cell output.
tokenizer.save_pretrained(save_directory='mlm')

('mlm/tokenizer_config.json',
 'mlm/special_tokens_map.json',
 'mlm/vocab.txt',
 'mlm/added_tokens.json',
 'mlm/tokenizer.json')

In [10]:
# Train model: run the Trainer's training loop on the tokenized train split.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so the
# run captured here did not finish — re-run to completion before relying on
# the saved 'mlm' model used further down.
trainer.train()

KeyboardInterrupt: 

In [None]:
# Persist the (fine-tuned) model into the 'mlm' directory, alongside the
# tokenizer files saved there earlier in the notebook.
model.save_pretrained(save_directory='mlm')

In [12]:
# Baseline: fill the masked token with the original 'bert-base-cased'
# checkpoint and print each completed sentence.
mask_filler = pipeline(task='fill-mask', model='bert-base-cased')
preds = mask_filler("What a horrible [MASK]!")

for pred in preds:
    print('>>>' + pred["sequence"])

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


>>>What a horrible idea!
>>>What a horrible dream!
>>>What a horrible thing!
>>>What a horrible day!
>>>What a horrible thought!


In [None]:
# Same prompt, but with the model and tokenizer loaded from the local 'mlm'
# directory; each completed sentence is printed followed by the full
# prediction record.
mask_filler = pipeline(task='fill-mask', model='mlm')
preds = mask_filler("What a horrible [MASK]!")

for pred in preds:
    print('>>>' + pred["sequence"], pred)