In [None]:
import random

import numpy as np
import pandas as pd
import torch
from transformers import pipeline
from transformers import BertTokenizer, AutoModel

In [None]:
# Build a fill-mask pipeline on top of the bert-base-uncased checkpoint.
unmasker = pipeline('fill-mask', model='bert-base-uncased')

# Example sentence containing a single [MASK] slot for the model to fill.
test = (
    "Every morning last summer in Greece, "
    "I visited the [MASK] where I would swim, "
    "play in the sand, and sunbathe."
)

# Run the pipeline; the bare `result` expression makes the cell display it.
result = unmasker(test)
result

## Sentiment Data

In [None]:
# Remote location of the sentiment training file.
url = 'https://www.dropbox.com/scl/fi/i2esmtinb4qor0mzokybp/fed_sentiment_training.csv?rlkey=v9u7afunmy8w0v0lwizba5g25&dl=1'
# NOTE(review): the file name ends in .csv but it is parsed as
# tab-separated — confirm this matches the actual file format.
df = pd.read_csv(url, delimiter='\t')

In [None]:
# Preview the first 10 rows of the loaded data.
df.head(n=10)

In [None]:
# Display the "text" value of the row labelled 0.
df["text"].loc[0]

### Tokenizing Text

In [None]:
# Load the tokenizer for the same bert-base-uncased checkpoint used by the
# pipeline and the model in this notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Bare expression so the cell displays the tokenizer object.
tokenizer

In [None]:
# Vocabulary of the tokenizer as a token -> id mapping.
vocab = tokenizer.get_vocab()
print(f"Total number of tokens in vocabulary: {len(vocab)} \n---------")

# Materialize the (token, id) pairs once; rebuilding the list inside the loop
# would copy the entire vocabulary on every iteration.
vocab_items = list(vocab.items())

# Print 10 randomly chosen entries (drawn independently, so repeats are possible).
for _ in range(10):
    word, idx = random.choice(vocab_items)
    print(word, idx)

In [None]:
# Tokenize the first text into a fixed-length sequence of 100 token ids.
# truncation=True is needed for max_length to act as an upper bound: with
# padding="max_length" alone, a text longer than max_length is returned at
# its full length instead of being cut.
encoded_input1 = tokenizer(df.loc[0, "text"],
                           max_length=100,
                           padding="max_length",
                           truncation=True,
                           return_tensors='pt')

print("Tokens:")
# return_tensors='pt' adds a leading batch dimension; take row 0 to get the
# ids of the single encoded text.
temp_tokens = encoded_input1["input_ids"][0]
print(tokenizer.convert_ids_to_tokens(temp_tokens))
print("\n------------------------------------------\n")
print("Tokens IDs:")
print(temp_tokens)

### Obtaining Embeddings

In [None]:
# Options passed when loading the checkpoint: request per-layer hidden states
# and attention weights in the model output.
# NOTE(review): "eager" attention is presumably selected so attention weights
# can be returned — confirm against the installed transformers version.
model_options = {
    "output_hidden_states": True,
    "output_attentions": True,
    "attn_implementation": "eager",
}
model = AutoModel.from_pretrained("bert-base-uncased", **model_options)

# Show the resulting model configuration.
print(model.config)

In [None]:
# Step 1: Forward pass with gradient tracking disabled (inference only)
with torch.no_grad():
    result1 = model(**encoded_input1)

# Step 2: Pull the last layer's per-token embeddings out of the model output
last_hidden_state = result1["last_hidden_state"]
print(f"Token embeddings shape: {last_hidden_state.shape}")

In [None]:
# Step 3: Get attention mask (1 = real token, 0 = padding)
# Shape is [1, seq_len], where seq_len is the padded length produced when
# encoded_input1 was tokenized.
attention_mask = encoded_input1["attention_mask"]  # [1, seq_len]
print("attention mask:")
print(attention_mask)
print("\n------------------------------------------\n")

In [None]:
# Step 4: Zero out padding token embeddings
# unsqueeze(-1) adds a trailing dimension: [1, seq_len] → [1, seq_len, 1]
# This allows broadcasting when multiplying with embeddings [1, seq_len, hidden]
# (seq_len is the padded length of encoded_input1; hidden is the model's embedding size)
masked_embeddings = last_hidden_state * attention_mask.unsqueeze(-1)

# Step 5: Compute mean pooling (average of non-padding tokens)
# Numerator: sum over the token axis; padding positions were zeroed above,
# so only real tokens contribute
sum_embeddings = masked_embeddings.sum(dim=1)  # [1, hidden]

# Denominator: count how many real tokens (sum of the 1s in the mask)
num_real_tokens = attention_mask.sum(dim=1, keepdim=True)  # [1, 1]

# Final sentence embedding: average of real token embeddings
mean_embedding1 = sum_embeddings / num_real_tokens  # [1, hidden]

print(f"Sentence embedding shape: {mean_embedding1.shape}")
print("\n------------------------------------------\n")
print("First Ten Elements of Embedding:")
print(mean_embedding1[0, :10])

In [None]:
# %% Now scale to ALL examples in the dataset

# Run on the GPU when one is available; otherwise fall back to the CPU so the
# cell still works on machines without CUDA instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

batch_size = 32  # Process 32 texts at once
all_embeddings = []  # one 1-D embedding vector per text, in dataframe row order

for i in range(0, len(df), batch_size):
    # Positional slice, so batching does not depend on the index labels
    batch_texts = df["text"].iloc[i:i+batch_size].tolist()

    # Tokenize the batch
    encoded_input = tokenizer(batch_texts,
                             max_length=30, # for speed
                             padding="max_length",
                             truncation=True,
                             return_tensors='pt').to(device)  # Move batch to the model's device

    # Inference only: no gradients needed
    with torch.no_grad():
        result = model(**encoded_input)

    # Mean pooling over real (non-padding) tokens:
    # zero the padding positions, sum over the token axis, then divide by
    # the number of real tokens in each text.
    last_hidden_state = result.last_hidden_state
    attention_mask = encoded_input["attention_mask"]
    masked_embeddings = last_hidden_state * attention_mask.unsqueeze(-1)
    mean_embedding = masked_embeddings.sum(dim=1) / attention_mask.sum(dim=1, keepdim=True)

    # Move back to CPU and convert to numpy
    batch_embeddings = mean_embedding.cpu().numpy()
    # extend() iterates over the first axis, appending one row vector per text
    all_embeddings.extend(batch_embeddings)

# stack into matrix (one row per text)
embeddings_matrix = np.vstack(all_embeddings)

print(f"Embeddings matrix shape: {embeddings_matrix.shape}")
print(f"Number of texts: {embeddings_matrix.shape[0]}")
print(f"Embedding dimension: {embeddings_matrix.shape[1]}")

# Store in dataframe: each cell of the new column holds that row's vector
df['embedding'] = all_embeddings

In [None]:
# Preview the first 5 rows, now including the new 'embedding' column.
df.head(n=5)