In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the training utterances (UTF-8 encoded CSV) into a DataFrame.
train_df = pd.read_csv(
    "train_sent_emo.csv",
    encoding="utf-8",
)

In [3]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the tokenizer and the model from the same checkpoint, then place the
# model on the selected device (Module.to() moves in place and returns the module).
tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions")
model = AutoModel.from_pretrained("SamLowe/roberta-base-go_emotions").to(device)

def get_embedding(text):
    """Embed ``text`` with the notebook-level ``tokenizer`` and ``model``.

    The text is tokenized (truncated to at most 512 tokens), run through the
    model with gradient tracking disabled, and the hidden state at the first
    token position of the last layer is returned.

    Args:
        text: The text to embed.

    Returns:
        numpy.ndarray: the first-position hidden state with size-1 dimensions
        squeezed away, copied back to CPU memory.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # The model lives on `device`, so every input tensor has to be moved there too.
    on_device = {}
    for key, value in encoded.items():
        on_device[key] = value.to(device)

    with torch.no_grad():
        hidden_states = model(**on_device).last_hidden_state

    first_token_state = hidden_states[:, 0, :]
    return first_token_state.squeeze().cpu().numpy()



# Embed every utterance once, up front, and keep the vectors in a new
# 'Embedding' column so later cells can reuse them without re-running the model.
train_df['Embedding'] = list(
    map(get_embedding, tqdm(train_df['Utterance'], desc="Extracting Embeddings"))
)

Some weights of RobertaModel were not initialized from the model checkpoint at SamLowe/roberta-base-go_emotions and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Extracting Embeddings: 100%|██████████| 9989/9989 [05:13<00:00, 31.86it/s]


In [4]:
def calculate_distance(embedding1, embedding2, target_device=None):
    """Return the cosine similarity between two embedding vectors.

    NOTE: despite the function's name, the returned value is a *similarity*,
    not a distance: a larger value means the two vectors are closer. Use
    ``1 - calculate_distance(a, b)`` where a cosine distance is required.

    Args:
        embedding1: 1-D array-like of floats (list, NumPy array or tensor).
        embedding2: 1-D array-like of floats with the same length as
            ``embedding1``.
        target_device: Device on which to run the computation. Defaults to
            the notebook-level ``device``.

    Returns:
        float: the cosine similarity of the two vectors. If either vector has
        zero norm the result is 0.0 rather than NaN.
    """
    if target_device is None:
        target_device = device

    # as_tensor avoids the extra copy (and the UserWarning) that torch.tensor()
    # produces when the input is already a tensor. The added leading dimension
    # makes each vector a batch of one, which is what cosine_similarity expects.
    vector1 = torch.as_tensor(embedding1).unsqueeze(0).to(target_device)
    vector2 = torch.as_tensor(embedding2).unsqueeze(0).to(target_device)

    # cosine_similarity normalises its inputs itself and clamps the norms with
    # a small eps, so dividing by the norm beforehand is unnecessary and would
    # turn a zero-norm vector into NaNs (0 / 0).
    cosine_sim = torch.nn.functional.cosine_similarity(vector1, vector2)

    return cosine_sim.item()


def generate_all_triplets_with_criteria(data):
    """Build every easy or semi-hard (anchor, positive, negative) triplet.

    For each row used as anchor, a positive is any row with the same
    'Emotion' whose 'Utterance' text differs from the anchor's, and a negative
    is any row with a different 'Emotion'. Using cosine distance
    ``d = 1 - cosine_similarity`` on the 'Embedding' column:

    * semi-hard: ``d(a, p) < d(a, n) < d(a, p) + margin``
    * easy:      ``d(a, p) < d(a, n)``

    Every semi-hard triplet also satisfies the easy condition, so the union of
    the two reduces to ``d(a, p) < d(a, n)`` and no margin is needed here.

    The comparison is done on true distances: the positive must be *closer*
    to the anchor than the negative. Comparing raw cosine similarities with
    ``<`` would select the opposite set.

    Args:
        data: DataFrame with the columns 'Utterance', 'Emotion' and
            'Embedding' (one equally sized 1-D vector per row).

    Returns:
        pandas.DataFrame with the columns 'anchor', 'positive' and 'negative'
        holding the utterance texts of each kept triplet, ordered by anchor,
        then positive, then negative row position.

    Note:
        The number of triplets can grow with the cube of the number of rows,
        so the result may be very large for a big ``data``.
    """
    columns = ['anchor', 'positive', 'negative']
    if len(data) == 0:
        return pd.DataFrame([], columns=columns)

    utterances = data['Utterance'].to_numpy()
    emotions = data['Emotion'].to_numpy()

    # Stack the per-row vectors once and L2-normalise them, so the cosine
    # similarity of an anchor to every row is a single matrix-vector product
    # instead of one device round trip per pair. The clip guards against a
    # zero-norm embedding producing NaNs.
    embeddings = np.stack([np.asarray(vector, dtype=np.float32) for vector in data['Embedding']])
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit_embeddings = embeddings / np.clip(norms, 1e-8, None)

    triplets = []

    # Wrap the main loop with tqdm for progress bar
    for anchor_pos in tqdm(range(len(data)), total=data.shape[0], desc="Generating Triplets"):
        # Cosine distance from this anchor to every row.
        distances = 1.0 - unit_embeddings @ unit_embeddings[anchor_pos]

        same_emotion = emotions == emotions[anchor_pos]
        # Positives are filtered by utterance text (not row position), so exact
        # duplicates of the anchor's text are skipped as well.
        positive_positions = np.flatnonzero(same_emotion & (utterances != utterances[anchor_pos]))
        negative_positions = np.flatnonzero(~same_emotion)
        negative_distances = distances[negative_positions]

        anchor_text = utterances[anchor_pos]
        for positive_pos in positive_positions:
            # Keep the negatives that lie farther from the anchor than this positive.
            kept_negatives = negative_positions[negative_distances > distances[positive_pos]]
            positive_text = utterances[positive_pos]
            triplets.extend(
                (anchor_text, positive_text, utterances[negative_pos])
                for negative_pos in kept_negatives
            )

    return pd.DataFrame(triplets, columns=columns)

# Build the triplet table from the full training frame (embeddings included).
triplet_data_all_criteria = generate_all_triplets_with_criteria(data=train_df)

Generating Triplets:   0%|          | 0/9989 [00:00<?, ?it/s]

In [None]:
# Persist the generated triplets as a UTF-8 CSV file for later use.
triplet_data_all_criteria.to_csv(
    path_or_buf='triplet_data_easy_and_semi_hard.csv',
    encoding='utf-8',
)