In [None]:
!pip install transformers datasets torch -q

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments




In [None]:
!pip uninstall transformers -y
!pip install transformers==4.56.2
!pip install datasets -U
!pip install torch


Found existing installation: transformers 4.57.0
Uninstalling transformers-4.57.0:
  Successfully uninstalled transformers-4.57.0
Collecting transformers==4.56.2
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.56.2-py3-none-any.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
Successfully installed transformers-4.56.2




In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments, pipeline
import torch

# --- Step 3: Load tiny dataset ---
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")  # 1% for very fast training

# --- Step 4: Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# --- Step 5: Data collator ---
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# --- Step 6: Load model ---
model = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased")

# --- Step 7: Training arguments ---
training_args = TrainingArguments(
    output_dir="./mlm-wikitext",
    per_device_train_batch_size=16,
    num_train_epochs=1,
    save_steps=500,
    save_total_limit=1,
    logging_steps=100,
    report_to="none",  # disable wandb logging
    fp16=torch.cuda.is_available(),
)

# --- Step 8: Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator
)

# --- Step 9: Train ---
trainer.train()

# --- Step 10: Save model ---
trainer.save_model("./mlm-wikitext-model")

# --- Step 11: Test model ---
fill_mask = pipeline(
    "fill-mask",
    model="./mlm-wikitext-model",
    tokenizer=tokenizer
)

sentence = "The capital of France is [MASK]."
predictions = fill_mask(sentence)
for pred in predictions:
    print(f"{pred['sequence']} | Score: {pred['score']:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/367 [00:00<?, ? examples/s]

Step,Training Loss


Device set to use cuda:0


the capital of france is paris. | Score: 0.1590
the capital of france is marseille. | Score: 0.1410
the capital of france is lyon. | Score: 0.0860
the capital of france is nantes. | Score: 0.0757
the capital of france is toulouse. | Score: 0.0725


In [None]:
sentence = "The fastest land animal is the [MASK]."
predictions = fill_mask(sentence)
for pred in predictions:
    print(f"{pred['sequence']} | Score: {pred['score']:.4f}")


the fastest land animal is the elephant. | Score: 0.0518
the fastest land animal is the jaguar. | Score: 0.0449
the fastest land animal is the deer. | Score: 0.0381
the fastest land animal is the horse. | Score: 0.0320
the fastest land animal is the tiger. | Score: 0.0230


In [None]:
sentence = "The largest ocean on Earth is the [MASK]."
predictions = fill_mask(sentence)
for pred in predictions:
    print(f"{pred['sequence']} | Score: {pred['score']:.4f}")


the largest ocean on earth is the atlantic. | Score: 0.4032
the largest ocean on earth is the pacific. | Score: 0.1211
the largest ocean on earth is the ocean. | Score: 0.0586
the largest ocean on earth is the caribbean. | Score: 0.0509
the largest ocean on earth is the arctic. | Score: 0.0456


In [None]:
sentence = "The first president of the United States was [MASK]."
predictions = fill_mask(sentence)
for pred in predictions:
    print(f"{pred['sequence']} | Score: {pred['score']:.4f}")



the first president of the united states was elected. | Score: 0.0329
the first president of the united states was theodore. | Score: 0.0221
the first president of the united states was lincoln. | Score: 0.0205
the first president of the united states was washington. | Score: 0.0205
the first president of the united states was william. | Score: 0.0190


In [None]:
sentence = "Water boils at [MASK] degrees Celsius."
predictions = fill_mask(sentence)
for pred in predictions:
    print(f"{pred['sequence']} | Score: {pred['score']:.4f}")


water boils at 90 degrees celsius. | Score: 0.0457
water boils at 60 degrees celsius. | Score: 0.0426
water boils at 45 degrees celsius. | Score: 0.0367
water boils at 10 degrees celsius. | Score: 0.0259
water boils at 40 degrees celsius. | Score: 0.0234


In [None]:
sentence = "A group of lions is called a [MASK]."
predictions = fill_mask(sentence)
for pred in predictions:
    print(f"{pred['sequence']} | Score: {pred['score']:.4f}")


a group of lions is called a lion. | Score: 0.3074
a group of lions is called a lions. | Score: 0.0406
a group of lions is called a group. | Score: 0.0391
a group of lions is called a tiger. | Score: 0.0315
a group of lions is called a pack. | Score: 0.0303


In [None]:
sentence = "Many AI developers prefer [MASK] for deep learning projects."
predictions = fill_mask(sentence)
for pred in predictions:
    print(f"{pred['sequence']} | Score: {pred['score']:.4f}")


many ai developers prefer it for deep learning projects. | Score: 0.2353
many ai developers prefer this for deep learning projects. | Score: 0.0888
many ai developers prefer software for deep learning projects. | Score: 0.0389
many ai developers prefer ai for deep learning projects. | Score: 0.0334
many ai developers prefer them for deep learning projects. | Score: 0.0334


In [None]:
sentence = "The programming language [MASK] is widely used for AI."
predictions = fill_mask(sentence)
for pred in predictions:
    print(f"{pred['sequence']} | Score: {pred['score']:.4f}")


the programming language language is widely used for ai. | Score: 0.0658
the programming language syntax is widely used for ai. | Score: 0.0388
the programming language paradigm is widely used for ai. | Score: 0.0374
the programming language python is widely used for ai. | Score: 0.0337
the programming language framework is widely used for ai. | Score: 0.0294


In [4]:
!jupyter nbconvert --ClearMetadataPreprocessor.enabled=True --to notebook --output cleaned.ipynb Untitled_4.ipynb


This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr

In [5]:
!ls


sample_data
