In [1]:
import torch
from transformers import AutoModel, AutoTokenizer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd
from transformers import AutoTokenizer
import random
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
# 1. Create the model instance
model = AutoModel.from_pretrained("SamLowe/roberta-base-go_emotions")
model.to(device)
# 2. Load the saved weights
model_path = "model_best_batchsize144.pth"
model.load_state_dict(torch.load(model_path))
model.eval()

Some weights of RobertaModel were not initialized from the model checkpoint at SamLowe/roberta-base-go_emotions and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: Error(s) in loading state_dict for RobertaModel:
	Unexpected key(s) in state_dict: "embeddings.position_ids". 

In [None]:
tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions")
roberta_base_model= AutoModel.from_pretrained("SamLowe/roberta-base-go_emotions")
roberta_base_model.to(device)
model.eval()

In [None]:
sample_num = 50
original_df = pd.read_csv("D:/MELD/MELD.Raw/MELD.Raw/dev_sent_emo.csv")

# Create an empty DataFrame to store the randomly selected rows
selected_rows = []

# Iterate over each unique emotion category
unique_emotions =original_df['Emotion'].unique()
for emotion in unique_emotions:
    # Randomly select min_count rows for each emotion
    sampled_rows = original_df[original_df['Emotion'] == emotion].sample(n=sample_num, random_state=random.randint(1,50),replace=True)
    
    # Append the sampled rows to the selected_rows list
    selected_rows.append(sampled_rows)

# Concatenate the selected rows into a new DataFrame
selected_df = pd.concat(selected_rows)

data = selected_df['Utterance'].to_list()
labels = selected_df['Emotion'].to_numpy()

In [None]:
inputs = tokenizer.batch_encode_plus(
    data,
    return_tensors="pt",
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=512  # or whatever max length you desire
)
input_ids = inputs["input_ids"]
attention_mask = inputs['attention_mask']
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)

In [None]:
def take_output_embd(input_ids_list,attention_masks_list, model):
    batch_size = 10

    # Calculate the number of batches needed
    num_batches = len(input_ids_list) // batch_size
    if len(input_ids_list) % batch_size != 0:
        num_batches += 1

    # Initialize an empty list to store the embeddings
    all_embeddings_out2 = []

    # Iterate over batches
    for i in tqdm(range(num_batches)):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size

        # Extract a batch of input data
        batch_input_ids = input_ids_list[start_idx:end_idx]
        batch_attention_masks = attention_masks_list[start_idx:end_idx]
        batch_input_ids.to(device)
        batch_attention_masks.to(device)
        # Perform inference for the batch
        with torch.no_grad():
            batch_outputs = model(batch_input_ids, attention_mask=batch_attention_masks).last_hidden_state[:,0,:]
        all_embeddings_out2.append(batch_outputs)

    # Concatenate the list of embeddings to get the final result
    return torch.cat(all_embeddings_out2, dim=0)

In [None]:
aligned_embeds = take_output_embd(input_ids,attention_mask,model=model)
go_emo_embeds = take_output_embd(input_ids,attention_mask,model=roberta_base_model)

In [None]:
def plot_pca(embeddings, labels):
    # Example data: Replace with your actual data
    embeddings = embeddings.cpu().numpy()  

    # Perform dimensionality reduction using t-SNE
    tsne = TSNE(n_components=2, random_state=42)
    embeddings_2d = tsne.fit_transform(embeddings)

    # Define a dictionary to map emotions to colors
    emotion_to_color = {
        'neutral': 'blue',
        'surprise': 'green',
        #'fear': 'red',
        'sadness': 'purple',
        'joy': 'yellow',
        #'disgust': 'orange',
        'anger': 'red'
    }  # Define your own color mapping

    # Map emotions to colors
    colors = [emotion_to_color[emotion] for emotion in emotion_to_color.keys()]

    # Create a scatter plot with points colored by labels
    plt.figure(figsize=(8, 6))
    for emotion, color in emotion_to_color.items():
        mask = labels == emotion
        plt.scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1], c=color, label=emotion)

    plt.title("t-SNE Visualization of Embeddings with Colored Emotions")
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")

    # Add a legend to the plot
    plt.legend()

    # Show the plot
    plt.show()

plot_pca(aligned_embeds, labels)
plot_pca(go_emo_embeds, labels)