In [212]:
import json
import os

import kagglehub
import pandas as pd
import torch
from scipy.spatial.distance import cosine
from transformers import AutoConfig, BertModel, PreTrainedTokenizer, BertConfig, AutoTokenizer

# Download latest version
path = kagglehub.model_download("omlande/bert-multiclassification-sentiment-analysis/pyTorch/default")
print("Path to model files:", path)

# Name of the directory expected to hold the fine-tuned checkpoint.
MODEL_DIR_NAME = "bert_emotion_classifier"

# Load from the files that were just downloaded instead of relying on a
# directory that happens to exist in the current working directory.
# The layout of the download is not visible from this notebook, so probe the
# likely locations for a Hugging Face config.json (TODO confirm layout) and
# fall back to the original relative name when none is found.
_candidate_dirs = [os.path.join(path, MODEL_DIR_NAME), path, MODEL_DIR_NAME]
model_dir = next(
    (d for d in _candidate_dirs if os.path.isfile(os.path.join(d, "config.json"))),
    MODEL_DIR_NAME,
)

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = BertModel.from_pretrained(model_dir, use_safetensors=True)

Path to model files: C:\Users\kevin\.cache\kagglehub\models\omlande\bert-multiclassification-sentiment-analysis\pyTorch\default\1


'\nModelConfigPath = path + "\\bert_emotion_classifier"\nModelConfig = json.loads(ModelPath)\nTokenizerPath = ModelPath + "\\tokenizer_config.json"\nTokenizerConfig = json.loads(TokenizerPath)\n'

In [213]:
def get_word_embedding(word, layer_nums):
    """Return one vector for ``word`` built from the model's hidden states.

    The word is split into subtokens with the module-level ``tokenizer``,
    wrapped in the tokenizer's CLS/SEP tokens, and passed through the
    module-level ``model``. The hidden states of the requested layers are
    averaged, the two special-token positions are dropped, and the remaining
    subtoken vectors are averaged into a single vector.

    Args:
        word: Text to embed (a single word or short phrase).
        layer_nums: An int, or an iterable of ints, indexing
            ``outputs.hidden_states``. Per the transformers documentation,
            index 0 is the embedding-layer output and later indices are the
            encoder layers.

    Returns:
        A 1-D numpy array: the mean over subtokens of the layer-averaged
        hidden states.

    Raises:
        ValueError: If ``word`` produces no subtokens (which would otherwise
            yield a NaN vector) or if ``layer_nums`` is empty.
    """
    pieces = tokenizer.tokenize(word)
    if not pieces:
        raise ValueError("word %r produced no tokens to embed" % (word,))

    # Accept a single layer index as well as a collection of indices.
    if isinstance(layer_nums, int):
        layer_nums = [layer_nums]
    else:
        layer_nums = list(layer_nums)
    if not layer_nums:
        raise ValueError("layer_nums must contain at least one layer index")

    # Add the special tokens, then map the subtokens to vocabulary ids.
    subtokens = [tokenizer.cls_token] + pieces + [tokenizer.sep_token]
    input_ids = tokenizer.convert_tokens_to_ids(subtokens)
    # Build the (1, seq_len) batch on the same device as the model so the
    # function keeps working if the model is moved off the CPU.
    input_ids = torch.tensor(input_ids, device=model.device).unsqueeze(0)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(input_ids, output_hidden_states=True)

    # Average the selected layers element-wise.
    selected = [outputs.hidden_states[i] for i in layer_nums]
    layer_average = torch.mean(torch.stack(selected), dim=0)
    # Take the single batch item and drop the first and last positions
    # (the CLS and SEP tokens added above).
    subtoken_vectors = layer_average[0, 1:-1]
    # Collapse the subtoken vectors into one word vector.
    word_embedding = torch.mean(subtoken_vectors, dim=0)
    # .cpu() is a no-op on CPU tensors and required before .numpy() otherwise.
    return word_embedding.cpu().numpy()

In [214]:
def calculate_similarity(word1, word2, layer_nums):
    """Return the cosine similarity between the embeddings of two words.

    Both words are embedded with ``get_word_embedding`` using the same
    ``layer_nums``; the result is ``1 - cosine_distance`` of the two vectors.
    """
    first_vec, second_vec = [
        get_word_embedding(term, layer_nums) for term in (word1, word2)
    ]
    # scipy's ``cosine`` is a distance, so flip it into a similarity.
    cosine_distance = cosine(first_vec, second_vec)
    return 1 - cosine_distance

In [230]:
# Cosine-similarity spot checks. Cosine similarity can in principle range
# from -1 to 1; identical inputs score 1.00.
pairs = [
    ('happy', 'happy'),      # identical words: sanity check
    ('happy', 'elated'),     # near-synonyms
    ('depressed', 'sad'),    # related negative emotions
    ('car', 'minivan'),      # a minivan is a kind of car
    ('car', 'communism'),    # unrelated concepts
]
# Index into the model's hidden_states used for every comparison below.
EMBEDDING_LAYER = 0
# One row per pair: both words (repr) and the score to two decimals.
ROW_FORMAT = '%r\t%r\t%.2f'
for w1, w2 in pairs:
    score = calculate_similarity(w1, w2, EMBEDDING_LAYER)
    print(ROW_FORMAT % (w1, w2, score))

'happy'	'happy'	1.00
'happy'	'elated'	0.12
'depressed'	'sad'	0.32
'car'	'minivan'	0.24
'car'	'communism'	0.10
