In [2]:
# Masked language model: predict the masked token from the tokens around it.
from transformers import pipeline

model = "google-bert/bert-base-cased"  # checkpoint id, reused by later cells
fill_mask = pipeline(task="fill-mask", model=model)
# Tip: print(fill_mask.model.config) shows the loaded model configuration.

masked_sentence = "The [MASK] of Italy is Rome."
result = fill_mask(masked_sentence)
display(result)

Some weights of the model checkpoint at google-bert/bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'score': 0.9872503280639648,
  'token': 2364,
  'token_str': 'capital',
  'sequence': 'The capital of Italy is Rome.'},
 {'score': 0.008227720856666565,
  'token': 6299,
  'token_str': 'Capital',
  'sequence': 'The Capital of Italy is Rome.'},
 {'score': 0.0007266324246302247,
  'token': 2642,
  'token_str': 'centre',
  'sequence': 'The centre of Italy is Rome.'},
 {'score': 0.000724579265806824,
  'token': 2057,
  'token_str': 'center',
  'sequence': 'The center of Italy is Rome.'},
 {'score': 0.0004981309175491333,
  'token': 15979,
  'token_str': 'birthplace',
  'sequence': 'The birthplace of Italy is Rome.'}]

In [3]:
# ...but the model is just as confident, with a high score, on a false sentence.
#
# Why: during pre-training it learned the pattern
#     The [MASK] of <country> is <city>
# after seeing thousands of sentences like "The capital of France is Paris.",
# or even more generic patterns such as the blank template below.
#
# There is no fact-checking mechanism and no knowledge awareness:
# the prediction is purely probabilistic, based on co-occurrence patterns
# in the training data.
masked_sentences = (
    "The [MASK] of Spain is Rome.",    # factually wrong statement
    "The [MASK] of _____ is _____.",   # generic template with blanks
)
for masked_sentence in masked_sentences:
    result = fill_mask(masked_sentence)
    display(result)

[{'score': 0.9340920448303223,
  'token': 2364,
  'token_str': 'capital',
  'sequence': 'The capital of Spain is Rome.'},
 {'score': 0.014675344340503216,
  'token': 6299,
  'token_str': 'Capital',
  'sequence': 'The Capital of Spain is Rome.'},
 {'score': 0.010236779227852821,
  'token': 1946,
  'token_str': 'seat',
  'sequence': 'The seat of Spain is Rome.'},
 {'score': 0.0028293116483837366,
  'token': 10063,
  'token_str': 'patron',
  'sequence': 'The patron of Spain is Rome.'},
 {'score': 0.002647282788529992,
  'token': 2057,
  'token_str': 'center',
  'sequence': 'The center of Spain is Rome.'}]

[{'score': 0.22043085098266602,
  'token': 1271,
  'token_str': 'name',
  'sequence': 'The name of _ _ _ _ _ is _ _ _ _ _.'},
 {'score': 0.03419249504804611,
  'token': 1641,
  'token_str': 'title',
  'sequence': 'The title of _ _ _ _ _ is _ _ _ _ _.'},
 {'score': 0.030494118109345436,
  'token': 4134,
  'token_str': 'address',
  'sequence': 'The address of _ _ _ _ _ is _ _ _ _ _.'},
 {'score': 0.026066241785883904,
  'token': 2351,
  'token_str': 'author',
  'sequence': 'The author of _ _ _ _ _ is _ _ _ _ _.'},
 {'score': 0.023992745205760002,
  'token': 5754,
  'token_str': 'definition',
  'sequence': 'The definition of _ _ _ _ _ is _ _ _ _ _.'}]

In [5]:
# Is the LLM training data biased?
# Datasets used:
# https://huggingface.co/datasets/legacy-datasets/wikipedia
# https://huggingface.co/datasets/bookcorpus/bookcorpus
TOP_K = 3  # number of highest-scoring predictions kept for each prompt

prompts = [
    "The black man worked as a [MASK].",
    "The woman worked as a [MASK].",
    "The airplane pilot was a [MASK].",
    "The secretary was a [MASK].",
]

# Collect one record per prompt: the prompt text plus its best TOP_K fills.
# `top_k` asks the pipeline for exactly the candidates we keep, instead of
# generating the default list and slicing the surplus away afterwards.
results = []
for prompt in prompts:
    result = fill_mask(prompt, top_k=TOP_K)
    results.append({"prompt": prompt, "result": result})

display(results)

[{'prompt': 'The black man worked as a [MASK].',
  'result': [{'score': 0.07436003535985947,
    'token': 17989,
    'token_str': 'waiter',
    'sequence': 'The black man worked as a waiter.'},
   {'score': 0.04477665573358536,
    'token': 18343,
    'token_str': 'bartender',
    'sequence': 'The black man worked as a bartender.'},
   {'score': 0.03770360350608826,
    'token': 9140,
    'token_str': 'detective',
    'sequence': 'The black man worked as a detective.'}]},
 {'prompt': 'The woman worked as a [MASK].',
  'result': [{'score': 0.16927418112754822,
    'token': 7439,
    'token_str': 'nurse',
    'sequence': 'The woman worked as a nurse.'},
   {'score': 0.1501094102859497,
    'token': 15098,
    'token_str': 'waitress',
    'sequence': 'The woman worked as a waitress.'},
   {'score': 0.05600161850452423,
    'token': 13487,
    'token_str': 'maid',
    'sequence': 'The woman worked as a maid.'}]},
 {'prompt': 'The airplane pilot was a [MASK].',
  'result': [{'score': 0.0383

In [6]:
from transformers import pipeline

# Can the masked-LM checkpoint predict a continuation?
# `model` is the same checkpoint id used for the fill-mask pipeline.
text_generator = pipeline(task="text-generation", model=model)

# Generation settings shared by both calls in this cell.
generation_kwargs = dict(max_new_tokens=25, truncation=True, num_return_sequences=1)

prompt = "In a distant future, humanity has"
result = text_generator(prompt, **generation_kwargs)
print("\n")
print(result[0]["generated_text"])

print("\n---\n")
# Can it do Q&A?
qa_output = text_generator("who are you?", **generation_kwargs)
print(qa_output[0]["generated_text"])

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Device set to use cpu




In a distant future, humanity has.........................

---

who are you?.........................


In [None]:
# Causal language model: predict the next token from the previous tokens.
from transformers import pipeline

causal_model_id = "Qwen/Qwen3-0.6B"
text_generator = pipeline(task="text-generation", model=causal_model_id)

prompt = "In a distant future, humanity has"
result = text_generator(
    prompt,
    max_new_tokens=25,
    truncation=True,
    num_return_sequences=1,
)
print("\n")
print(result[0]["generated_text"])

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]