In [1]:
# 1) Install / import
!pip install transformers        # run once in Colab or your venv
from transformers import pipeline, DistilBertTokenizer

# 2) —— Example A: sentiment analysis through the 🤗 pipeline API ——

# Texts to classify; the pipeline accepts a whole list in one call.
samples = [
    "This movie was absolutely fantastic!",
    "I really hated how it dragged on and on…",
]

# The checkpoint is named explicitly (a DistilBERT model fine-tuned on SST-2,
# i.e. binary sentiment) instead of relying on the task's default model.
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# One {'label': ..., 'score': ...} dict comes back per input text.
preds = sentiment_pipe(samples)
print("pipeline output:\n", preds)



config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use mps:0


pipeline output:
 [{'label': 'POSITIVE', 'score': 0.999874472618103}, {'label': 'NEGATIVE', 'score': 0.9996241331100464}]


In [2]:
# 3) —— Example B: raw DistilBERT tokenization + introspection ——

# Load the tokenizer that belongs to the base uncased DistilBERT checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

sentence = "Tokenize this sentence using DistilBERT."
# return_tensors="pt" gives PyTorch tensors with a leading batch dimension
# (a batch of one sentence here).
tokens = tokenizer(sentence, return_tensors="pt")

# Pull out IDs & attention mask for the single sentence in the batch.
# Row 0 is indexed explicitly instead of calling .squeeze(): squeeze() drops
# *every* size-1 dimension, so a one-token encoding would collapse to a 0-d
# tensor, .tolist() would return a bare int, and the loop below would fail.
input_ids = tokens["input_ids"][0].tolist()
attention_mask = tokens["attention_mask"][0].tolist()

print(f"\nOriginal sentence:\n{sentence}\n")
print(f"Input IDs:\n{input_ids}\n")
print(f"Attention mask:\n{attention_mask}\n")

# Map each ID back to its token with a direct vocabulary lookup.
# convert_ids_to_tokens returns the raw vocabulary entries (including the
# "##" word-piece prefix), without the string clean-up that decode() applies.
print("Decoded tokens:")
for token_id, token in zip(input_ids, tokenizer.convert_ids_to_tokens(input_ids)):
    print(f"{token_id:6d} → {token}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]


Original sentence:
Tokenize this sentence using DistilBERT.

Input IDs:
[101, 19204, 4697, 2023, 6251, 2478, 4487, 16643, 23373, 1012, 102]

Attention mask:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Decoded tokens:
   101 → [CLS]
 19204 → token
  4697 → ##ize
  2023 → this
  6251 → sentence
  2478 → using
  4487 → di
 16643 → ##sti
 23373 → ##lbert
  1012 → .
   102 → [SEP]
