In [None]:
!pip install -q transformers torch sentencepiece accelerate

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    pipeline
)

# Seed PyTorch's default random number generator so that runs are repeatable.
# As the last expression in the cell, the returned Generator object is echoed as output.
torch.manual_seed(42)



<torch._C.Generator at 0x7afcaa94bd10>

In [None]:
# Display name -> checkpoint identifier passed to `from_pretrained` / `pipeline`.
# Every experiment function in this notebook iterates over this mapping, and
# run_text_generation branches on the "BART" key, so renaming that key changes behavior.
MODELS = {
    "BERT": "bert-base-uncased",
    "RoBERTa": "roberta-base",
    "BART": "facebook/bart-base"
}

In [None]:
# Open-ended prompt that run_text_generation tokenizes and feeds to each model.
GEN_PROMPT = "The future of artificial intelligence is"

def run_text_generation():
    """Feed ``GEN_PROMPT`` to every checkpoint in ``MODELS`` and print the result.

    The entry named ``"BART"`` is loaded as a seq2seq model and asked to
    ``generate`` (``max_length=30``); the decoded text is printed. Every other
    entry is loaded as a masked-LM, run through a single forward pass, and
    only the shape of the returned logits is printed.

    An exception raised while handling one model is printed and the loop
    continues with the next model. Returns ``None``; all results go to stdout.
    """
    print("\n=== TEXT GENERATION ===\n")

    for name, model_id in MODELS.items():
        print(f"\n--- {name} ---")

        try:
            # Tokenizer loading and prompt encoding are the same for both
            # branches, so they are done once up front.
            tokenizer = AutoTokenizer.from_pretrained(model_id)
            inputs = tokenizer(GEN_PROMPT, return_tensors="pt")

            if name == "BART":
                model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
                outputs = model.generate(**inputs, max_length=30)
                print(tokenizer.decode(outputs[0], skip_special_tokens=True))
            else:
                model = AutoModelForMaskedLM.from_pretrained(model_id)
                # Inference only: no gradients are needed, so skip autograd
                # bookkeeping for the forward pass.
                with torch.no_grad():
                    outputs = model(**inputs)
                print("Output logits shape:", outputs.logits.shape)

        except Exception as e:
            # Deliberately broad: one failing model should not abort the comparison.
            print("Error:", e)

# Run the text-generation comparison for every entry in MODELS.
run_text_generation()



=== TEXT GENERATION ===


--- BERT ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Output logits shape: torch.Size([1, 8, 30522])

--- RoBERTa ---


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Output logits shape: torch.Size([1, 8, 50265])

--- BART ---


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

The future of artificial intelligence is


In [None]:
# Sentence with one masked slot, written with the "[MASK]" placeholder;
# run_fill_mask replaces it with each tokenizer's own mask token before prediction.
MASK_TEXT = "Artificial intelligence will change the [MASK] forever."

def run_fill_mask():
    """Predict the masked word in ``MASK_TEXT`` with every checkpoint in ``MODELS``.

    Each checkpoint is loaded as a masked-LM, wrapped in a ``fill-mask``
    pipeline, and its first three predictions are printed together with
    their scores. An exception raised for one model is printed and the loop
    moves on. Returns ``None``.
    """
    print("\n=== FILL MASK ===\n")

    for label, checkpoint in MODELS.items():
        print(f"\n--- {label} ---")

        try:
            tok = AutoTokenizer.from_pretrained(checkpoint)
            mlm = AutoModelForMaskedLM.from_pretrained(checkpoint)
            unmasker = pipeline("fill-mask", model=mlm, tokenizer=tok)

            # Swap the "[MASK]" placeholder for whatever mask token this
            # tokenizer defines before running the pipeline.
            predictions = unmasker(MASK_TEXT.replace("[MASK]", tok.mask_token))

            top_three = predictions[:3]
            for candidate in top_three:
                print(f"{candidate['token_str']}  (score={candidate['score']:.4f})")

        except Exception as err:
            print("Error:", err)

# Run the fill-mask comparison for every entry in MODELS.
run_fill_mask()



=== FILL MASK ===


--- BERT ---


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


world  (score=0.7784)
universe  (score=0.0732)
future  (score=0.0691)

--- RoBERTa ---


Device set to use cpu


 world  (score=0.8113)
 workplace  (score=0.0415)
 industry  (score=0.0299)

--- BART ---


Device set to use cpu


 world  (score=0.4523)
 way  (score=0.2696)
 face  (score=0.0250)


In [None]:
# Passage handed to the question-answering pipeline as its `context` argument.
CONTEXT = """
Transformers are deep learning models introduced in 2017.
They rely on self-attention mechanisms and are widely used in NLP tasks.
"""

# Question asked against CONTEXT; the passage states the year explicitly.
QUESTION = "When were transformers introduced?"

def run_question_answering():
    """Ask ``QUESTION`` about ``CONTEXT`` with every checkpoint in ``MODELS``.

    A ``question-answering`` pipeline is built from each checkpoint id (used
    for both model and tokenizer), and the returned answer text and score are
    printed. An exception raised for one model is printed and the loop moves
    on. Returns ``None``.
    """
    print("\n=== QUESTION ANSWERING ===\n")

    for label, checkpoint in MODELS.items():
        print(f"\n--- {label} ---")

        try:
            # The same checkpoint id supplies both the model and the tokenizer.
            qa_kwargs = {"model": checkpoint, "tokenizer": checkpoint}
            answerer = pipeline("question-answering", **qa_kwargs)

            prediction = answerer(question=QUESTION, context=CONTEXT)
            for heading, key in (("Answer:", "answer"), ("Score:", "score")):
                print(heading, prediction[key])

        except Exception as err:
            print("Error:", err)

# Run the question-answering comparison for every entry in MODELS.
run_question_answering()



=== QUESTION ANSWERING ===


--- BERT ---


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Answer: .
They
Score: 0.003237243741750717

--- RoBERTa ---


Device set to use cpu


Answer: .
Score: 0.002986922743730247

--- BART ---


Some weights of BartForQuestionAnswering were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Answer: Transformers are deep learning models introduced in
Score: 0.027988277841359377


| Task           | Model       | Classification        | Observation (What actually happened?)                                                              | Why did this happen? (Architectural Reason)                                                                                                                                               |
| -------------- | ----------- | --------------------- | -------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Generation** | **BERT**    | **Failure**           | Did not generate text. Only returned logits. No sentence continuation.  | BERT is a **bidirectional encoder** trained with **Masked Language Modeling (MLM)**. It has no notion of left-to-right next-token prediction, so free-form generation is impossible.      |
|                | **RoBERTa** | **Failure**           | Same behavior as BERT; returned logits without generating text.                    | RoBERTa is also an **encoder-only model**; its MLM training recipe is better optimized than BERT's, but it still lacks an autoregressive decoder.                                         |
|                | **BART**    | **Partial Success**   | Output was identical to the prompt: `"The future of artificial intelligence is"`. No continuation. | BART is **encoder–decoder**, but it is trained for **denoising / reconstruction**, not causal continuation. With no corruption signal or decoding guidance, it defaults to copying input. |
| **Fill-Mask**  | **BERT**    | **Success**           | Predicted sensible tokens: `world` with high confidence (0.78), followed by `universe` and `future` (about 0.07 each). | This is BERT’s **native task**. MLM directly optimizes masked token prediction using bidirectional context.                                                                               |
|                | **RoBERTa** | **Success**           | Top prediction `world` at 0.81, slightly sharper than BERT’s 0.78; the runners-up (`workplace`, `industry`) are also plausible. | Same MLM objective as BERT but with **more data, no NSP**, and better optimization.                                                                                                       |
|                | **BART**    | **Success**   | Predicted `world`, but with much lower confidence and noisier alternatives.                        | BART can simulate MLM via denoising, but masking is not its primary inference mode.                                                                                                     |
| **QA**         | **BERT**    | **Failure**           | Output was meaningless, extremely low confidence.                                | QA head was randomly initialized. Encoder representations are good, but without fine-tuning, span prediction fails.                                                                   |
|                | **RoBERTa** | **Failure**           | Returned `"."` with very low score.                                                                | Same issue as BERT: strong encoder, but no QA supervision → random span selection.                                                                                                    |
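|                | **BART**    | **Partial Success**   | Returned a long phrase from context: `"Transformers are deep learning models introduced in"`, which stops just before the actual answer (`2017`); score ≈ 0.03. | `BartForQuestionAnswering` uses the same kind of extractive span head (`qa_outputs`) as the encoder-only models, and it was also randomly initialized here. The selected span is therefore essentially arbitrary rather than a summary; landing next to the answer is not evidence of QA ability without fine-tuning. |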




