In [None]:
# Directory prefix for this notebook; data paths below are built as here + 'data/...'.
here = '1_Method/'

## Define Vanilla BERT Encoder

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.cluster import KMeans
from tqdm import tqdm
tqdm.pandas()

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from string import punctuation
import unicodedata
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity

# Load the pretrained 'bert-base-uncased' tokenizer and encoder that
# get_bert_embedding relies on. The encoder is asked to expose the hidden
# states of every layer and to return dict-style outputs.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased', output_hidden_states=True, return_dict=True)

###############################
# Preprocessing function
###############################
def preprocess_text(text):
    """Normalize a raw text value to clean ASCII, or return None if nothing usable remains.

    The text is NFKD-normalized, every character without an ASCII form is
    dropped (accents are stripped; non-Latin scripts and emoji disappear), and
    all runs of whitespace are collapsed to single spaces.

    Args:
        text: A string, or a missing value (None / NaN).

    Returns:
        The cleaned string, or None when the input is missing, blank, or
        becomes empty once the non-ASCII characters are removed. Callers must
        be prepared to handle None.
    """
    if pd.isnull(text) or text.strip() == "":
        return None

    # Normalize Unicode (remove accents, special characters)
    ascii_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # Collapse redundant whitespace; split()/join also trims both ends
    cleaned = " ".join(ascii_text.split())

    # Input made only of non-ASCII characters is empty by now: treat it like blank input
    return cleaned or None

###############################
# BERT embedding function
###############################
def get_bert_embedding(text):
    """Embed `text` with the module-level BERT `tokenizer` and `model`.

    The text is tokenized (truncated to at most 512 tokens), run through the
    model with gradient tracking disabled, and the second-to-last hidden layer
    is averaged over the token axis.

    Returns:
        The pooled embedding as a flattened (1-D) NumPy array on the CPU.
    """
    # Run on CUDA when it is available, otherwise on the CPU
    target = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(target)

    # Relies on the loaded tokenizer and on a model that exposes hidden states
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to(target)

    with torch.no_grad():
        result = model(**encoded, output_hidden_states=True, return_dict=True)
        # hidden_states[-2] is the second-to-last layer: [batch_size, seq_len, hidden_size]
        penultimate = result.hidden_states[-2]
        pooled = penultimate.mean(dim=1)  # mean pooling over seq_len

    return pooled.cpu().numpy().flatten()

###############################
# Cosine similarity function
###############################
def _as_float_matrix(emb):
    """Convert an embedding (tensor, NumPy array or nested list) to a 2-D float tensor."""
    tensor = torch.as_tensor(emb)
    if not (tensor.is_floating_point() or tensor.is_complex()):
        tensor = tensor.float()
    # A single vector becomes a one-row matrix
    return tensor.unsqueeze(0) if tensor.dim() == 1 else tensor


def compute_similarity(emb1, emb2):
    """Return the pairwise cosine similarity between two sets of embeddings.

    Args:
        emb1: Embeddings of shape [n, d], or a single vector of shape [d].
            May be a torch tensor or anything torch.as_tensor accepts, such as
            the NumPy vectors returned by get_bert_embedding.
        emb2: Embeddings of shape [m, d], or a single vector of shape [d].

    Returns:
        A torch tensor of shape [n, m]; entry (i, j) is the cosine similarity
        between row i of `emb1` and row j of `emb2`. Single vectors count as
        one row.
    """
    rows = torch.nn.functional.normalize(_as_float_matrix(emb1), p=2, dim=1)
    cols = torch.nn.functional.normalize(_as_float_matrix(emb2), p=2, dim=1)
    # Dot products of unit-length rows are cosine similarities
    return torch.mm(rows, cols.t())

#### Get base embeddings

def get_base_embs(phrase, adjective_list):
    """Embed `phrase` once per adjective, substituting the adjective for "...".

    Returns:
        A dict mapping each adjective to the get_bert_embedding result for the
        filled-in phrase.
    """
    return {
        word: get_bert_embedding(phrase.replace("...", f"{word}"))
        for word in adjective_list
    }


# `tokenizer` and `model` are loaded once, before the function definitions in
# this cell, with from_pretrained('bert-base-uncased', ...). Loading them a
# second time here would only repeat that work, so nothing is reloaded.

## Define Other Encoders

In [9]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
tqdm.pandas()

# Encoders keyed by the short name that encode() takes as `name`.
# 'bert' is not listed here; encode() special-cases that name and routes it
# through preprocess_text / get_bert_embedding instead.
models = {
    'sbert': SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
    'qwen': SentenceTransformer("Qwen/Qwen3-Embedding-4B")
}

def encode(models, name, docs):
    """Embed `docs` with the encoder selected by `name`.

    Args:
        models (dict): Maps encoder names to objects exposing
            ``encode(sentences, show_progress_bar=...)`` (SentenceTransformer
            instances in this notebook).
        name (str): ``'bert'`` for the preprocess_text + get_bert_embedding
            pipeline, otherwise a key of `models`.
        docs (pd.Series): The texts to embed.

    Returns:
        For ``'bert'``: a pd.Series, indexed like `docs`, holding one NumPy
        vector per text. For any other name: a list with one embedding (a list
        of floats) per text, in the order of `docs`.

    Raises:
        KeyError: If `name` is neither ``'bert'`` nor a key of `models`.
    """
    if name == 'bert':
        processed = docs.apply(preprocess_text)
        return processed.progress_apply(get_bert_embedding)

    # Hand the encoder a plain list rather than the Series itself: integer
    # lookups on a Series go through its index labels, so a Series whose index
    # is not 0..n-1 could raise KeyError or be read in the wrong order.
    return models[name].encode(list(docs), show_progress_bar=True).tolist()

## Encode Adjectives

In [None]:
import pandas as pd
import pickle

# Load the pickled adjectives from the notebook's data folder.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only open pickles that come from a trusted source.
with open(here + 'data/Adjectives.pkl', 'rb') as f:
  adj = pickle.load(f)

In [None]:
# Wrap the loaded adjectives in a DataFrame with a single 'Adjective' column.
# Note that this rebinds `adj` from the unpickled object to the DataFrame.
adj = pd.DataFrame(adj, columns=['Adjective'])

In [None]:
import inflect

# Shared inflect engine, used by _create_phrase to prepend an indefinite article
p = inflect.engine()


def _create_phrase(word):
    """Build the probe sentence for one adjective.

    The adjective is passed through ``p.a`` (inflect's indefinite-article
    helper) and the result is placed in front of ' life is my ideal life.'.
    """
    with_article = p.a(word)
    return f'{with_article} life is my ideal life.'


# One probe sentence per adjective
adj['Phrase'] = adj['Adjective'].apply(_create_phrase)

In [None]:
# Embed every adjective phrase with each encoder; one column per encoder name.
for encoder_name in ('bert', 'sbert', 'qwen'):
    adj[encoder_name] = encode(models, encoder_name, adj['Phrase'])

Batches:   0%|          | 0/39 [00:00<?, ?it/s]

Batches:   0%|          | 0/39 [00:00<?, ?it/s]

## Encode Ideal Life Responses

In [5]:
import pandas as pd

# Load the ideal-life responses; the 'response' column is what gets embedded below.
df = pd.read_csv(here + "data/IdealLifeResponses.csv")

In [10]:
# Embed every response with each encoder; one column per encoder name.
for encoder_name in ('bert', 'sbert', 'qwen'):
    df[encoder_name] = encode(models, encoder_name, df['response'])

100%|██████████| 44964/44964 [07:17<00:00, 102.68it/s]


Batches:   0%|          | 0/1406 [00:00<?, ?it/s]

Batches:   0%|          | 0/1406 [00:00<?, ?it/s]

In [11]:
# Convert the frame to a list of per-row dicts (column name -> value) for pickling.
dct = df.to_dict(orient='records')

In [12]:
import pickle

# Persist the encoded responses (the list of row dicts in `dct`) as a pickle
# in the notebook's data folder.
with open(here + 'data/IdealLifeResponse_Encoded.pkl', 'wb') as f:
  pickle.dump(dct, f)