In [212]:
import json
import os

import kagglehub
import pandas as pd
import torch
from nltk.corpus import sentiwordnet as swn
from scipy.spatial.distance import cosine
from transformers import AutoConfig, BertModel, PreTrainedTokenizer, BertConfig, AutoTokenizer

# Download the latest published version of the fine-tuned emotion classifier.
path = kagglehub.model_download("omlande/bert-multiclassification-sentiment-analysis/pyTorch/default")
print("Path to model files:", path)

# Prefer the project-local checkpoint folder. When it is missing, fall back to
# the freshly downloaded copy so the download above is not wasted.
# NOTE(review): assumes the download contains a "bert_emotion_classifier"
# sub-folder -- TODO confirm against the printed path.
model_dir = "bert_emotion_classifier"
if not os.path.isdir(model_dir):
    model_dir = os.path.join(path, "bert_emotion_classifier")

# BertModel loads only the encoder; it is used here for embeddings.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = BertModel.from_pretrained(model_dir, use_safetensors=True)

Path to model files: C:\Users\kevin\.cache\kagglehub\models\omlande\bert-multiclassification-sentiment-analysis\pyTorch\default\1


'\nModelConfigPath = path + "\\bert_emotion_classifier"\nModelConfig = json.loads(ModelPath)\nTokenizerPath = ModelPath + "\\tokenizer_config.json"\nTokenizerConfig = json.loads(TokenizerPath)\n'

In [213]:
def get_word_embedding(word, layer_nums):
    """Embed a single word with the module-level ``tokenizer`` and ``model``.

    The word is split into subtokens, wrapped in the tokenizer's [CLS]/[SEP]
    tokens and passed through the model. The hidden states of the requested
    layers are averaged, the two special-token positions are dropped, and the
    remaining subtoken vectors are averaged into one vector.

    Args:
        word: Text to embed.
        layer_nums: An int, or a sequence of ints, indexing into
            ``outputs.hidden_states``.

    Returns:
        A 1-D numpy array holding the word vector.

    Raises:
        ValueError: If the tokenizer produces no subtokens for ``word``
            (e.g. an empty string). Averaging over zero subtokens would
            otherwise silently return a vector of NaNs.
    """
    pieces = tokenizer.tokenize(word)
    if not pieces:
        raise ValueError(
            f"Cannot embed {word!r}: the tokenizer produced no subtokens."
        )

    # Surround the subtokens with the special tokens and map them to ids.
    tokens = [tokenizer.cls_token, *pieces, tokenizer.sep_token]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    # Outer list adds the batch dimension; build the tensor on the model's
    # device so the call also works when the model has been moved off the CPU.
    input_ids = torch.tensor([token_ids], device=model.device)

    # Accept a single layer index as well as a sequence of them.
    if isinstance(layer_nums, int):
        layer_nums = [layer_nums]

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(input_ids, output_hidden_states=True)

    # Average the selected layers element-wise.
    selected_layers = torch.stack([outputs.hidden_states[i] for i in layer_nums])
    layer_average = selected_layers.mean(dim=0)

    # Take the only batch item, drop [CLS] (first) and [SEP] (last), then
    # average the remaining subtoken vectors into one word vector.
    word_vector = layer_average[0, 1:-1].mean(dim=0)

    # .cpu() is a no-op on CPU and required before .numpy() on other devices.
    return word_vector.cpu().numpy()

In [214]:
def calculate_similarity(word1, word2, layer_nums):
    """Return the cosine similarity between the embeddings of two words.

    Both words are embedded with ``get_word_embedding`` using the same
    ``layer_nums``. The result is ``1 - cosine_distance`` as computed by
    ``scipy.spatial.distance.cosine``.
    """
    # The generator is consumed left to right, so word1 is embedded first.
    first_vec, second_vec = (
        get_word_embedding(word, layer_nums) for word in (word1, word2)
    )
    return 1 - cosine(first_vec, second_vec)

In [230]:
# Similarity queries: cosine similarity of the two word vectors
# (1.00 = same direction; lower scores mean the vectors are less alike).
pairs = [
    ('happy', 'happy'),      # identical words: sanity check, prints 1.00
    ('happy', 'elated'),     # near-synonyms
    ('depressed', 'sad'),    # related negative emotions
    ('car', 'minivan'),      # a minivan is a kind of car
    ('car', 'communism'),    # unrelated concepts
]
row_template = '%r\t%r\t%.2f'
for w1, w2 in pairs:
    # Layer 0 selects outputs.hidden_states[0] inside get_word_embedding.
    score = calculate_similarity(w1, w2, 0)
    print(row_template % (w1, w2, score))

'happy'	'happy'	1.00
'happy'	'elated'	0.12
'depressed'	'sad'	0.32
'car'	'minivan'	0.24
'car'	'communism'	0.10


In [109]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, BertTokenizerFast
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from safetensors.torch import load_file

In [111]:
# Load the classifier: build the architecture from its config file, then
# fill in the weights stored in the safetensors checkpoint.
model_path = "../TM2025_project/bert_emotion_classifier/model.safetensors"
config_path = "../TM2025_project/bert_emotion_classifier/config.json"
state_dict = load_file(model_path)
config = BertConfig.from_json_file(config_path)
model = BertForSequenceClassification(config)
model.load_state_dict(state_dict)
model.eval()  # switch to evaluation mode for inference

# Load tokenizer from the checkpoint's vocabulary with explicit special tokens.
special_tokens = dict(
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]",
)
tokenizer = BertTokenizer(
    vocab_file="../TM2025_project/bert_emotion_classifier/vocab.txt",
    do_lower_case=True,
    **special_tokens,
)

# Load data; `text_column` names the column holding the text to classify.
text_column = "letter"
df = pd.read_csv("../TM2025_project/data/preprocessed_letters.csv")

In [112]:
# Define Dataset class
class LetterDataset(Dataset):
    """Dataset that tokenizes one text per item, on demand.

    Args:
        texts: Indexable collection of texts.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``
            and returning a mapping of tensors.
        max_len: Length every item is padded or truncated to (default 512).
    """

    def __init__(self, texts, tokenizer, max_len=512):
        self.texts, self.tokenizer, self.max_len = texts, tokenizer, max_len

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize text ``idx`` and return its tensors without the leading axis."""
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        # squeeze(0) removes the size-1 leading axis from each tensor so the
        # DataLoader can stack items into batches itself.
        item = {}
        for name, tensor in encoded.items():
            item[name] = tensor.squeeze(0)
        return item

In [115]:
# Wrap the texts in a dataset and serve them to the model in batches of 8.
letters = df[text_column].tolist()
dataset = LetterDataset(letters, tokenizer)
dataloader = DataLoader(dataset, batch_size=8)

In [117]:
# Run predictions: collect the arg-max class index for every text.
all_preds = []
with torch.no_grad():  # inference only, no gradient tracking
    for batch in tqdm(dataloader):
        # Pass only the two inputs the classifier is given here.
        outputs = model(
            **{key: batch[key] for key in ('input_ids', 'attention_mask')}
        )
        preds = outputs.logits.argmax(dim=1)
        all_preds += preds.tolist()

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 51/51 [04:16<00:00,  5.03s/it]


In [119]:
# Translate predicted class indices into label names from the model config.
# Keys are cast to int so they match the integer predictions.
id2label = dict(
    (int(class_id), label) for class_id, label in model.config.id2label.items()
)
predicted_labels = [id2label[class_id] for class_id in all_preds]
df["predicted_sentiment"] = predicted_labels

In [121]:
# Save results inside the project folder, using the same relative project
# root as the model and data paths rather than a machine-specific absolute
# path, so the notebook also runs on other machines.
df.to_csv("../TM2025_project/sentiment_predictions.csv", index=False)
print("Sentiment analysis completed. Results saved to 'sentiment_predictions.csv'.")

Sentiment analysis completed. Results saved to 'sentiment_predictions.csv'.
