In [1]:
import random

import numpy as np
import pandas as pd
import torch
from transformers import pipeline
from transformers import BertTokenizer, AutoModel

In [2]:
# Build a fill-mask pipeline backed by the 'bert-base-uncased' checkpoint.
unmasker = pipeline(task='fill-mask', model='bert-base-uncased')

# One sentence with a single [MASK] slot for the model to fill in.
test = (
    "Every morning last summer in Greece, "
    "I visited the [MASK] where I would swim, "
    "play in the sand, and sunbathe."
)

# Each entry of `result` carries a candidate token, its score, and the completed sentence.
result = unmasker(test)
result

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cuda:0


[{'score': 0.47743356227874756,
  'token': 3509,
  'token_str': 'beach',
  'sequence': 'every morning last summer in greece, i visited the beach where i would swim, play in the sand, and sunbathe.'},
 {'score': 0.22595980763435364,
  'token': 12212,
  'token_str': 'beaches',
  'sequence': 'every morning last summer in greece, i visited the beaches where i would swim, play in the sand, and sunbathe.'},
 {'score': 0.054036807268857956,
  'token': 4770,
  'token_str': 'pool',
  'sequence': 'every morning last summer in greece, i visited the pool where i would swim, play in the sand, and sunbathe.'},
 {'score': 0.04365421459078789,
  'token': 12679,
  'token_str': 'pools',
  'sequence': 'every morning last summer in greece, i visited the pools where i would swim, play in the sand, and sunbathe.'},
 {'score': 0.029276153072714806,
  'token': 2697,
  'token_str': 'lake',
  'sequence': 'every morning last summer in greece, i visited the lake where i would swim, play in the sand, and sunbathe.

## Sentiment Data

In [3]:
# Dropbox link to the Fed sentiment training data.
url = 'https://www.dropbox.com/scl/fi/i2esmtinb4qor0mzokybp/fed_sentiment_training.csv?rlkey=v9u7afunmy8w0v0lwizba5g25&dl=1'
# sep='\t': the file is parsed as tab-separated even though its name ends in .csv.
df = pd.read_csv(url, sep='\t')

In [4]:
# Preview the first 10 rows: an ID, the sentence text, and its sentiment label.
df.head(10)

Unnamed: 0,ID,text,sentiment
0,157_1,The action was taken to cushion the effects on...,dovish
1,161_2,Such trends could foster inflationary imbalanc...,hawkish
2,52_0,The Federal Open Market Committee at its meeti...,neutral
3,21_5,Although continuing favorable trends bolster l...,dovish
4,78_7,The Committee perceives that the upside and do...,neutral
5,115_6,"Nonetheless, the Committee judges that some in...",hawkish
6,160_4,"As a consequence, the pool of available worker...",hawkish
7,114_3,"Readings on core inflation have been elevated,...",hawkish
8,60_0,The Federal Open Market Committee at its meeti...,dovish
9,80_7,"The Committee judges that, on balance, the ris...",dovish


In [5]:
# Show the complete (untruncated) text of the first row.
df.loc[0, "text"]

'The action was taken to cushion the effects on prospective economic growth in the United States of increasing weakness in foreign economies and of less accommodative financial conditions domestically'

### Tokenizing Text

In [6]:
# Loads the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer # display its settings: [PAD] (id 0) is the padding token; model_max_length is 512

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [9]:
# The tokenizer's vocabulary contains about 30,000 subword tokens.
# Subword tokens look like un, ##happy, etc.
# Each has a unique integer ID that BERT uses internally.

vocab = tokenizer.get_vocab()
print(f"Total number of tokens in vocabulary: {len(vocab)} \n---------")

# Build the list of (token, id) pairs once; recreating the ~30k-item list
# for every draw was loop-invariant work.
vocab_items = list(vocab.items())
for _ in range(10):  # 10 random draws; `_` signals that the loop variable is unused
    word, idx = random.choice(vocab_items)
    print(word, idx)

Total number of tokens in vocabulary: 30522 
---------
unfortunately 6854
##ם 29800
went 2253
年 1840
cites 17248
[unused384] 389
nazis 13157
windy 27370
furnace 17533
robertson 9923


In [10]:
# Takes one text string (the first row) and converts it into PyTorch tensors suitable for BERT.
# padding="max_length" pads shorter inputs with [PAD] up to 100 ids, and
# truncation=True cuts longer inputs down to 100, so the shape is always [1, 100]
# (without truncation a longer text would come back longer than max_length).
encoded_input1 = tokenizer(df.loc[0, "text"],
                           max_length=100,
                           padding="max_length",
                           truncation=True,
                           return_tensors='pt')

In [11]:
# Display everything the tokenizer returned (input_ids, token_type_ids, ...).
encoded_input1

{'input_ids': tensor([[  101,  1996,  2895,  2001,  2579,  2000, 22936,  1996,  3896,  2006,
         17464,  3171,  3930,  1999,  1996,  2142,  2163,  1997,  4852, 11251,
          1999,  3097, 18730,  1998,  1997,  2625, 16222,  5358,  5302,  2850,
          6024,  3361,  3785, 27143,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0

In [12]:
# Each number is the ID of one token in BERT's vocabulary (30,522 entries).
# 101 = [CLS], 102 = [SEP], 0 = [PAD]
# tensor of shape [1, 100]
encoded_input1.input_ids

tensor([[  101,  1996,  2895,  2001,  2579,  2000, 22936,  1996,  3896,  2006,
         17464,  3171,  3930,  1999,  1996,  2142,  2163,  1997,  4852, 11251,
          1999,  3097, 18730,  1998,  1997,  2625, 16222,  5358,  5302,  2850,
          6024,  3361,  3785, 27143,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])

In [13]:
# The attention mask tells BERT which positions are real tokens and which are padding.
# 1 → real token (to be attended to)
# 0 → padding token (ignore it)
# It lines up position-by-position with input_ids.

encoded_input1.attention_mask

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])

In [14]:
# Index row 0 to drop the batch dimension: [1, 100] -> 1D tensor of shape [100]
temp_tokens = encoded_input1.input_ids[0]
print("Tokens IDs:", temp_tokens, sep="\n")

Tokens IDs:
tensor([  101,  1996,  2895,  2001,  2579,  2000, 22936,  1996,  3896,  2006,
        17464,  3171,  3930,  1999,  1996,  2142,  2163,  1997,  4852, 11251,
         1999,  3097, 18730,  1998,  1997,  2625, 16222,  5358,  5302,  2850,
         6024,  3361,  3785, 27143,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])


In [15]:
# Map the integer IDs back to their string tokens, i.e. the subwords BERT works with internally.
decoded_tokens = tokenizer.convert_ids_to_tokens(temp_tokens)
print("\n------------------------------------------\n",
      "Tokens:",
      decoded_tokens,
      sep="\n")


------------------------------------------

Tokens:
['[CLS]', 'the', 'action', 'was', 'taken', 'to', 'cushion', 'the', 'effects', 'on', 'prospective', 'economic', 'growth', 'in', 'the', 'united', 'states', 'of', 'increasing', 'weakness', 'in', 'foreign', 'economies', 'and', 'of', 'less', 'acc', '##om', '##mo', '##da', '##tive', 'financial', 'conditions', 'domestically', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


### Obtaining Embeddings

In [16]:
# Loads the pretrained BERT model weights.
# hidden_size is 768 (see the config printed below), so each input token is
# represented by a 768-dimensional embedding at each layer.
model = AutoModel.from_pretrained("bert-base-uncased",
                                  output_hidden_states=True, # keeps internal layer outputs.
                                  output_attentions=True, # keeps attention matrices
                                  attn_implementation="eager" # NOTE(review): presumably required so attention weights can be returned — confirm
                                  )

# Print the resolved configuration to confirm the options above were applied.
print(model.config)

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dtype": "float32",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_attentions": true,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.57.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [17]:
# Step 1: Get BERT output without computing gradients (inference mode)
# torch.no_grad() disables gradient tracking (saves memory & compute). Normally, PyTorch builds
# a computation graph during forward passes so it can later compute gradients (for training).
with torch.no_grad():
    # ** (double star) unpacks the dictionary into keyword arguments, so each entry
    # of encoded_input1 (input_ids, token_type_ids, attention_mask) is passed by name.
    result1 = model(**encoded_input1)

In [None]:
def greet(name, age):
    """Print a one-line greeting built from ``name`` and ``age``."""
    message = f"Hello {name}, you are {age} years old."
    print(message)


# Toy illustration of ** unpacking: the dict keys match greet's parameter names,
# so the call below is the same as greet(name='Alice', age=25).
person = dict(name='Alice', age=25)
greet(**person)

In [18]:
# Step 2: Extract token-level embeddings from BERT's last layer
# Shape is [batch, sequence length, hidden size]: one 768-dim vector per position,
# padding positions included.
last_hidden_state = result1.last_hidden_state
print(f"Token embeddings shape: {last_hidden_state.shape}")

Token embeddings shape: torch.Size([1, 100, 768])


In [19]:
# Step 3: Get attention mask (1 = real token, 0 = padding)
attention_mask = encoded_input1.attention_mask  # [1, 100]
divider = "\n------------------------------------------\n"
print(f"Attention mask shape: {attention_mask.shape}",
      divider,
      "attention mask:",
      attention_mask,
      divider,
      sep="\n")

Attention mask shape: torch.Size([1, 100])

------------------------------------------

attention mask:
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])

------------------------------------------



In [20]:
# Step 4: Zero out padding token embeddings
# unsqueeze(-1) adds a trailing dimension: [1, 100] → [1, 100, 1]
# This allows broadcasting when multiplying with embeddings [1, 100, 768]:
# rows at padding positions are multiplied by 0, real-token rows by 1.
masked_embeddings = last_hidden_state * attention_mask.unsqueeze(-1)

In [21]:
# Step 5: Compute mean pooling (average of non-padding tokens)
# Numerator: sum over all 100 positions; the padding rows were zeroed in Step 4,
# so only the real-token embeddings contribute.
sum_embeddings = masked_embeddings.sum(dim=1)  # [1, 768]

In [22]:
# Denominator: count how many real tokens (the mask holds a 1 for each of them).
# keepdim=True keeps the result 2-D so it broadcasts against sum_embeddings [1, 768].
num_real_tokens = attention_mask.sum(dim=1, keepdim=True)  # [1, 1]

In [23]:
# Final sentence embedding: average of real token embeddings
# [1, 768] / [1, 1] broadcasts the token count across all 768 features.
mean_embedding1 = sum_embeddings / num_real_tokens  # [1, 768]

In [24]:
# Report the pooled vector's shape and peek at its first ten components.
first_ten = mean_embedding1[0, :10]
print(f"Sentence embedding shape: {mean_embedding1.shape}",
      "\n------------------------------------------\n",
      "First Ten Elements of Embedding:",
      first_ten,
      sep="\n")

Sentence embedding shape: torch.Size([1, 768])

------------------------------------------

First Ten Elements of Embedding:
tensor([-0.7555, -0.1470, -0.0567,  0.2348,  0.1567,  0.0438, -0.0582,  0.4445,
        -0.1379, -0.1501])


In [25]:
# %% Now scale to ALL examples in the dataset

# Use the GPU when one is available; otherwise fall back to CPU so the cell
# still runs on a machine (or Colab runtime) without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Module.to() moves the weights in place, so `model` lives on `device` from here on;
# any inputs passed to it afterwards must be on the same device.
model = model.to(device)

batch_size = 32  # Process 32 texts at once
all_embeddings = []

for i in range(0, len(df), batch_size):
    # .iloc makes the positional slicing explicit, independent of df's index labels.
    batch_texts = df["text"].iloc[i:i+batch_size].tolist()

    # Tokenize the batch.
    # NOTE: max_length=30 truncates longer sentences (row 0 alone has 35 tokens
    # including [CLS]/[SEP]), so these embeddings differ slightly from the
    # max_length=100 single-sentence walk-through earlier in the notebook.
    encoded_input = tokenizer(batch_texts,
                             max_length=30, # for speed
                             padding="max_length",
                             truncation=True,
                             return_tensors='pt').to(device)  # Move batch to the model's device

    # Inference only: no gradients needed.
    with torch.no_grad():
        result = model(**encoded_input)

    # Masked mean pooling: zero the padding rows, sum over the sequence axis,
    # then divide by the number of real tokens in each sentence.
    last_hidden_state = result.last_hidden_state
    attention_mask = encoded_input["attention_mask"]
    masked_embeddings = last_hidden_state * attention_mask.unsqueeze(-1)
    mean_embedding = masked_embeddings.sum(dim=1) / attention_mask.sum(dim=1, keepdim=True)

    # Move back to CPU and convert to numpy; extend() appends one 1-D row per text.
    batch_embeddings = mean_embedding.cpu().numpy()
    all_embeddings.extend(batch_embeddings)

# Stack the per-text vectors into a single [num_texts, hidden_size] matrix.
embeddings_matrix = np.vstack(all_embeddings)

print(f"Embeddings matrix shape: {embeddings_matrix.shape}")
print(f"Number of texts: {embeddings_matrix.shape[0]}")
print(f"Embedding dimension: {embeddings_matrix.shape[1]}")

# Store in dataframe (one numpy vector per row)
df['embedding'] = all_embeddings

Embeddings matrix shape: (1243, 768)
Number of texts: 1243
Embedding dimension: 768


In [26]:
# Confirm the new 'embedding' column was added alongside the original columns.
df.head()

Unnamed: 0,ID,text,sentiment,embedding
0,157_1,The action was taken to cushion the effects on...,dovish,"[-0.759528, -0.2019381, -0.063900575, 0.202355..."
1,161_2,Such trends could foster inflationary imbalanc...,hawkish,"[-0.2612877, -0.2571696, 0.27028108, 0.2405703..."
2,52_0,The Federal Open Market Committee at its meeti...,neutral,"[-0.5020466, -0.20166348, -0.05087775, -0.0904..."
3,21_5,Although continuing favorable trends bolster l...,dovish,"[-0.440186, 0.1610609, 0.30311587, 0.05906713,..."
4,78_7,The Committee perceives that the upside and do...,neutral,"[-0.3763697, 0.045836966, 0.26960185, 0.275766..."
