In [None]:
!pip install cohere sentence_transformers torch datasets openai

In [None]:
import gc
import json
import os
import time

import cohere
import psutil
import requests
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from openai import OpenAI
from scipy.stats import pearsonr
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

In [None]:
#Dictionary to keep track of performance metrics.
#The cells below fill it with one entry per embedder/dataset run, keyed like 'nebula_d1'.
performance_dict = {}

In [None]:
#Function to retrieve Nebula Embeddings
def nebula_embedder(sent):
    """
    Requests the Nebula embedding for a single piece of text

    The API key is read from the NEBULA_API_KEY environment variable; when it is
    not set, the in-file placeholder 'nebula_api' is sent instead.

    Parameters:
        Text to embed (formatted into the request payload as a string) -> str

    Returns:
        Value of the 'embedding' field of the JSON response -> presumably list[float]; confirm against the Nebula API

    Raises:
        requests.HTTPError when the service answers with a 4xx/5xx status
        requests.Timeout when the service does not answer within the timeout
    """
    url = "https://api-nebula.symbl.ai/v1/model/embed"

    payload = json.dumps({"text": f"{sent}"})
    headers = {
        'ApiKey': os.environ.get('NEBULA_API_KEY', 'nebula_api'),
        'Content-Type': 'application/json'
    }

    #A timeout keeps one stalled request from hanging the whole benchmark run
    response = requests.post(url, headers=headers, data=payload, timeout=60)

    #Surface HTTP failures directly instead of a KeyError on the missing 'embedding' field
    response.raise_for_status()

    return response.json()['embedding']

#Function to retrieve Cohere Embeddings
def cohere_embedder(sent, model='embed-english-v3.0', input_type='classification'):
    """
    Requests the Cohere embedding for a single piece of text

    The API key is read from the COHERE_API_KEY environment variable; when it is
    not set, the in-file placeholder '<cohere_api_key>' is used instead.

    Parameters:
        Text to embed -> str
        Name of the Cohere embedding model -> str
        Value passed as the embed call's input_type argument -> str

    Returns:
        First (and only) entry of the response's embeddings -> presumably list[float]; confirm against the Cohere SDK
    """
    #Build the client once and reuse it: creating one per sentence adds avoidable
    #overhead to the latency this notebook measures. Re-running the cell redefines
    #the function and therefore resets the cached client.
    client = getattr(cohere_embedder, "_client", None)
    if client is None:
        client = cohere.Client(os.environ.get('COHERE_API_KEY', '<cohere_api_key>'))
        cohere_embedder._client = client

    response = client.embed(
        texts=[sent],
        model=model,
        input_type=input_type
    )

    return response.embeddings[0]

#Function to retrieve OpenAI Embeddings
def openai_embedder(sent, model="text-embedding-3-small"):
    """
    Requests the OpenAI embedding for a single piece of text

    The API key is read from the OPENAI_API_KEY environment variable; when it is
    not set, the in-file placeholder '<openai_api_key>' is used instead.

    Parameters:
        Text to embed; newlines are replaced by spaces before the request -> str
        Name of the OpenAI embedding model -> str

    Returns:
        Embedding of the first (and only) item in the response -> presumably list[float]; confirm against the OpenAI SDK
    """
    #Build the client once and reuse it: creating one per sentence adds avoidable
    #overhead to the latency this notebook measures. Re-running the cell redefines
    #the function and therefore resets the cached client.
    client = getattr(openai_embedder, "_client", None)
    if client is None:
        client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY', '<openai_api_key>'))
        openai_embedder._client = client

    sent = sent.replace("\n", " ")
    return client.embeddings.create(input=[sent], model=model).data[0].embedding

## Dataset 1: Polarity Detection in Amazon Reviews

In [None]:
# Load the "amazon_polarity" dataset and keep the first 1000 rows of its train split
amazon_polarity_dataset = load_dataset("amazon_polarity")
amazon_polarity_subset = amazon_polarity_dataset["train"].select(range(1000))

In [None]:
# Set device: use CUDA when torch reports it as available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def create_dataloaders(encoded_data, batch_size):
    """
    Splits the encoded examples 80/20 and wraps both parts in DataLoaders

    Parameters:
        Encoded examples, each a (label, embedding tensor) pair -> list[tuple]
        Number of examples per batch -> int

    Returns:
        Training loader (shuffled) and validation loader -> tuple[DataLoader, DataLoader]
        Every batch is a (labels, embeddings) pair with both tensors on `device`
    """

    # Split the dataset into training and validation sets (fixed seed for a repeatable split)
    train_data, val_data = train_test_split(encoded_data, test_size=0.2, random_state=42)

    # Function to collate batch
    def collate_batch(batch):
        labels, embeddings = zip(*batch)
        # Labels go to the same device as the embeddings; leaving them on the CPU
        # makes the loss computation fail with a device mismatch when CUDA is used.
        labels = torch.tensor(labels, dtype=torch.long).to(device)
        embeddings = torch.stack(embeddings, dim=0).to(device)
        return labels, embeddings

    # Prepare data loaders
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
    val_loader = DataLoader(val_data, batch_size=batch_size, collate_fn=collate_batch)

    # Sanity check: print the shapes of the first training batch
    for labels, embeddings in train_loader:
        print(f"Dataloader Sample Shape: {labels.shape}, {embeddings.shape}")
        break
    return train_loader, val_loader

In [None]:
#Model Class Definition
class TextClassifier(nn.Module):
    """
    Two-layer feed-forward classifier over fixed-size embeddings

    forward() returns raw, unnormalized class scores (logits). The model is trained
    with nn.CrossEntropyLoss, which applies log-softmax itself; squashing the scores
    through a sigmoid first would confine them to (0, 1) and cap how confident the
    classifier can become. Predicted classes are obtained with argmax over dim 1.

    Parameters:
        Size of the input embeddings -> int
        Width of the hidden layer -> int
        Number of target classes -> int
    """
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Maps a batch of embeddings to a batch of per-class logits"""
        hidden = torch.relu(self.fc1(text))
        return self.fc2(hidden)

#Create a function for Training loop
def train(model, train_dataloader, epoch_num, optimizer, criterion, model_path):
    """
    Runs the optimisation loop and saves the model's state dict once training ends

    Parameters:
        Model to optimise -> torch.nn.Module
        Iterable yielding (labels, inputs) batches -> torch.utils.data.DataLoader
        How many passes to make over the training data -> int
        Optimizer bound to the model's parameters -> torch.optim
        Loss applied to (model output, labels) -> torch.nn.modules.loss
        File path the state dict is written to -> str

    Returns:
        Per-epoch loss, each entry being the sum of that epoch's batch losses -> list[float]
    """

    model.train()
    print("\nTraining..")

    losses_per_epoch = []
    for epoch_idx in range(epoch_num):
        running_loss = 0
        for labels, texts in train_dataloader:
            optimizer.zero_grad()
            batch_loss = criterion(model(texts), labels)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        losses_per_epoch.append(running_loss)
        print(f"Epoch {epoch_idx+1} Loss: {running_loss}")

    #Persist the weights only after the final epoch
    torch.save(model.state_dict(), model_path)

    return losses_per_epoch

#Create a function for Evaluation loop
def evaluate(model, test_dataloader):
    """
    Scores the model on held-out batches using classification accuracy

    Parameters:
        Model to score -> torch.nn.Module
        Iterable yielding (labels, inputs) batches -> torch.utils.data.DataLoader

    Returns:
        Accuracy of the argmax predictions against the labels -> float
    """

    model.eval()
    predicted, expected = [], []

    #Gradients are not needed while scoring
    with torch.no_grad():
        for labels, texts in test_dataloader:
            batch_preds = model(texts).argmax(1)
            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(labels.cpu().numpy())

    acc_score = accuracy_score(expected, predicted)
    print(f"\nTest Accuracy: {acc_score}")

    return acc_score

In [None]:
#Function to train and evaluate the model for the chosen embeddings
def train_and_evaluate_d1(embedder, model_path, dataset=amazon_polarity_subset):
    """
    Trains and evaluates a binary sentiment analyzer using the given embedder to encode the inputs

    Parameters:
    The embedding model of choice; called with one review text, returns its embedding -> func
    Local path where the trained model's parameters will be saved -> str
    Dataset whose rows provide a "content" text and a "label" -> datasets.dataset

    Returns:
    Dictionary with two entries -> dict
        'eval_metrics': [per-epoch training loss (list[float]), test accuracy (float)]
        'compute_metrics': [seconds spent embedding the dataset (float), growth of the process RSS in MiB (float)]
    """

    #Define model parameters
    BATCH_SIZE = 32
    HIDDEN_DIM = 64
    OUTPUT_DIM = 2  # Number of classes
    NUM_EPOCHS = 10
    LEARNING_RATE = 0.001

    memory_usage_init = psutil.Process().memory_info().rss

    #Preprocess and encode dataset using the chosen embedder; only this step is timed
    start = time.time()
    encoded_data = [(example["label"], torch.tensor(embedder(example["content"]))) for example in dataset]
    end = time.time()
    inference_time = round(end-start,2)

    train_loader, test_loader = create_dataloaders(encoded_data, BATCH_SIZE)

    #Set the embedding dimension
    EMBEDDING_DIM = len(encoded_data[0][1])

    #Initialize model, criterion, optimizer
    classifier_model = TextClassifier(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(classifier_model.parameters(), lr=LEARNING_RATE)

    #Perform training
    train_loss = train(model=classifier_model, train_dataloader=train_loader, epoch_num=NUM_EPOCHS, optimizer=optimizer,
                       criterion=criterion, model_path=model_path)

    #Perform evaluation on a fresh model restored from the saved weights.
    #The restored model must live on `device`, because the loaders deliver their batches there.
    classifier_model = TextClassifier(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)
    classifier_model.load_state_dict(torch.load(model_path, map_location=device))

    eval_acc = evaluate(classifier_model, test_loader)

    memory_usage_final = psutil.Process().memory_info().rss

    #RSS difference divided by 1024**2 (MiB when RSS is in bytes)
    memory_usage = (memory_usage_final - memory_usage_init)/(1024 ** 2)

    #Release the model before the next embedder is benchmarked
    classifier_model.cpu()
    del classifier_model
    gc.collect()
    torch.cuda.empty_cache()

    return {'eval_metrics':[train_loss,eval_acc],'compute_metrics':[inference_time, memory_usage]}

### Using Nebula Embeddings Model

In [None]:
# Benchmark the Nebula embedder on Dataset 1 and store its metrics under 'nebula_d1'
print("Embedding Model: Nebula\n")

metrics = train_and_evaluate_d1(embedder=nebula_embedder, model_path='nebula_dataset1.pth')
performance_dict['nebula_d1'] = metrics

### Using Open AI Embeddings Model

In [None]:
# Benchmark the OpenAI embedder on Dataset 1 and store its metrics under 'open_ai_d1'
print("Embedding Model: Open AI\n")

metrics = train_and_evaluate_d1(embedder=openai_embedder, model_path='openai_dataset1.pth')
performance_dict['open_ai_d1'] = metrics

### Using Cohere Embeddings Model

In [None]:
# Benchmark the Cohere embedder on Dataset 1 and store its metrics under 'cohere_d1'
print("Embedding Model: Cohere\n")

metrics = train_and_evaluate_d1(embedder=cohere_embedder, model_path='cohere_dataset1.pth')
performance_dict['cohere_d1'] = metrics

## Dataset 2: Intent detection in Banking Data

In [None]:
# Load the "banking77" dataset and keep the first 1000 rows of its train split
banking_dataset = load_dataset("banking77")
banking_subset = banking_dataset["train"].select(range(1000))

In [None]:
#Function to train and evaluate the model for the chosen embeddings
def train_and_evaluate_d2(embedder, model_path, dataset=banking_subset):
    """
    Trains and evaluates a multi-class classifier using the given embedder to encode the inputs

    Parameters:
    The embedding model of choice; called with one query text, returns its embedding -> func
    Local path where the trained model's parameters will be saved -> str
    Dataset whose rows provide a "text" query and a "label" -> datasets.dataset

    Returns:
    Dictionary with two entries -> dict
        'eval_metrics': [per-epoch training loss (list[float]), test accuracy (float)]
        'compute_metrics': [seconds spent embedding the dataset (float), growth of the process RSS in MiB (float)]
    """

    #Define model parameters
    BATCH_SIZE = 32
    HIDDEN_DIM = 64
    OUTPUT_DIM = 77  # Number of classes
    NUM_EPOCHS = 10
    LEARNING_RATE = 0.001

    #Measure this process' resident memory (same metric as the Dataset 1 run) rather than
    #the system-wide usage percentage, which is affected by unrelated processes
    memory_usage_init = psutil.Process().memory_info().rss

    #Preprocess and encode dataset using the chosen embedder; only this step is timed
    start = time.time()
    encoded_data = [(example["label"], torch.tensor(embedder(example["text"]))) for example in dataset]
    end = time.time()
    inference_time = round(end-start,2)

    train_loader, test_loader = create_dataloaders(encoded_data, BATCH_SIZE)

    #Set the embedding dimension
    EMBEDDING_DIM = len(encoded_data[0][1])

    #Initialize model, criterion, optimizer
    classifier_model = TextClassifier(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(classifier_model.parameters(), lr=LEARNING_RATE)

    #Perform training
    train_loss = train(model=classifier_model, train_dataloader=train_loader, epoch_num=NUM_EPOCHS, optimizer=optimizer,
                       criterion=criterion, model_path=model_path)

    #Perform evaluation on a fresh model restored from the saved weights.
    #The restored model must live on `device`, because the loaders deliver their batches there.
    classifier_model = TextClassifier(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)
    classifier_model.load_state_dict(torch.load(model_path, map_location=device))

    eval_acc = evaluate(classifier_model, test_loader)

    memory_usage_final = psutil.Process().memory_info().rss

    #RSS difference divided by 1024**2 (MiB when RSS is in bytes)
    memory_usage = (memory_usage_final - memory_usage_init)/(1024 ** 2)

    #Release the model before the next embedder is benchmarked
    classifier_model.cpu()
    del classifier_model
    gc.collect()
    torch.cuda.empty_cache()

    #Report the memory figure as well, in the same list layout the Dataset 1 run uses
    return {'eval_metrics':[train_loss,eval_acc],'compute_metrics':[inference_time, memory_usage]}

### Using Nebula Embeddings Model

In [None]:
# Benchmark the Nebula embedder on Dataset 2 and store its metrics under 'nebula_d2'
print("Embedding Model: Nebula\n")

metrics = train_and_evaluate_d2(embedder=nebula_embedder, model_path='nebula_dataset2.pth')
performance_dict['nebula_d2'] = metrics

### Using Open AI Embeddings Model

In [None]:
# Benchmark the OpenAI embedder on Dataset 2 and store its metrics under 'open_ai_d2'
print("Embedding Model: Open AI\n")

metrics = train_and_evaluate_d2(embedder=openai_embedder, model_path='openai_dataset2.pth')
performance_dict['open_ai_d2'] = metrics

### Using Cohere Embeddings Model

In [None]:
# Benchmark the Cohere embedder on Dataset 2 and store its metrics under 'cohere_d2'
print("Embedding Model: Cohere\n")

metrics = train_and_evaluate_d2(embedder=cohere_embedder, model_path='cohere_dataset2.pth')
performance_dict['cohere_d2'] = metrics

## Dataset 3: Sentences Involving Compositional Knowledge (SICK)

In [None]:
# Load the "sick" dataset and keep the first 1000 rows of its test split
sick_dataset = load_dataset("sick")
test_set = sick_dataset['test'].select(range(1000))

In [None]:
def evaluate_d3(embedder,data=test_set):
    """
    Computes the Pearson's correlation coefficient between the true and predicted similarity scores for the given dataset

    The predicted score of a sentence pair is the cosine similarity of the two
    embeddings returned by the embedder.

    Parameters:
    The embedding model of choice; called with one sentence, returns its embedding -> func
    Dataset whose rows provide 'sentence_A', 'sentence_B' and 'relatedness_score' -> datasets.dataset

    Returns:
    Dictionary with two entries -> dict
        'eval_metrics': [Pearson's correlation coefficient (float)]
        'compute_metrics': [seconds spent embedding and scoring all pairs (float)]
    """

    #Initialize the arrays to store the true and predicted similarity scores
    true_similarity_scores = []
    pred_similarity_scores = []

    start = time.time()

    #Iterate over the dataset that was passed in, not the module-level test_set
    for record in data:

        sent1 = record['sentence_A']
        sent2 = record['sentence_B']
        #Extract the true relatedness score for the given pair of sentences
        true_similarity_scores.append(record['relatedness_score'])

        #Encode the pair of sentences using the chosen embedder and compute cosine similarity score
        encoded_sent1 = embedder(sent1)
        encoded_sent2 = embedder(sent2)
        cosine_score = cosine_similarity([encoded_sent1],[encoded_sent2])[0][0]
        pred_similarity_scores.append(cosine_score)

    end = time.time()

    inference_time = end - start

    pearson_corr, _ = pearsonr(true_similarity_scores, pred_similarity_scores)

    print(f"Pearson's Correaltion: {pearson_corr}")

    return {'eval_metrics':[pearson_corr],'compute_metrics':[inference_time]}

### Using Nebula Embeddings Model

In [None]:
# Score the Nebula embedder on Dataset 3 and store its metrics under 'nebula_d3'
print("Embedding Model: Nebula\n")

metrics = evaluate_d3(embedder=nebula_embedder)
performance_dict['nebula_d3'] = metrics

In [None]:
# Display the metrics recorded for the Nebula run on Dataset 3
performance_dict['nebula_d3']

### Using Open AI Embeddings Model

In [None]:
# Score the OpenAI embedder on Dataset 3 and store its metrics under 'open_ai_d3'
print("Embedding Model: Open AI\n")

metrics = evaluate_d3(embedder=openai_embedder)
performance_dict['open_ai_d3'] = metrics

### Using Cohere Embeddings Model

In [None]:
# Score the Cohere embedder on Dataset 3 and store its metrics under 'cohere_d3'
print("Embedding Model: Cohere\n")

metrics = evaluate_d3(embedder=cohere_embedder)
performance_dict['cohere_d3'] = metrics

In [None]:
# Display every metric recorded by the runs above
performance_dict