This code is based on Lab 5.4 Contextualized_Vectors with multiple adjustments.

In [52]:
import torch
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import os

In [37]:
# Read the translated sentence pairs from the CSV file into a DataFrame
file_path = 'translated_dataset_4.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [38]:
# Getting Lithuanian sentences by column position: the third column (index 2)
# is used as the dataset sentences, the fourth (index 3) as the CTranslate ones
lt_dataset_sentences, lt_ctranslate_sentences = data.iloc[:, 2], data.iloc[:, 3]

In [39]:
# Load the tokenizer and the encoder from the same pre-trained checkpoint name
model_name = 'xlm-roberta-base'
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)



In [61]:
# Generating sentence vectors
def generate_vectors(sentences):
    """Build one mean-pooled vector per sentence with the loaded transformer.

    Every sentence is tokenized on its own (truncated to at most 512 tokens)
    and run through the module-level ``model`` in evaluation mode with
    gradient tracking switched off. The model's last hidden state is averaged
    over dimension 1 to obtain a single vector for the sentence.

    Args:
        sentences: Iterable of sentences to embed.

    Returns:
        A NumPy array built from the per-sentence vectors, in input order
        (one row per sentence).
    """

    def embed(text):
        # Encode a single sentence and pool its final-layer hidden states.
        encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
        hidden_states = model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
        # squeeze() drops the size-1 batch axis left after pooling.
        return pooled.squeeze().numpy()

    model.eval()
    with torch.no_grad():
        embeddings = [embed(text) for text in sentences]

    return np.array(embeddings)

In [62]:
# Embed both sentence columns: dataset sentences first, CTranslate second
dataset_vectors, ctranslate_vectors = map(
    generate_vectors, (lt_dataset_sentences, lt_ctranslate_sentences)
)

In [63]:
# Computing cosine similarity for corresponding pairs
# The pair count comes from the vectors themselves instead of a hard-coded
# row count, so the cell neither fails on a shorter dataset nor silently
# ignores extra rows of a longer one.
n_pairs = len(dataset_vectors)
pair_similarities = []
for i, (dataset_vec, ctranslate_vec) in enumerate(zip(dataset_vectors, ctranslate_vectors)):
    # cosine_similarity expects 2-D inputs and returns a 1x1 matrix here.
    sim = cosine_similarity([dataset_vec], [ctranslate_vec])[0][0]
    # Dataset IDs run 1..n_pairs; CTranslate IDs continue from n_pairs + 1.
    pair_similarities.append((i + 1, i + 1 + n_pairs, sim))

In [64]:
# Put the (dataset ID, CTranslate ID, similarity) tuples into a labelled table
# and preview its first five rows
similarity_df = pd.DataFrame(
    data=pair_similarities,
    columns=["Dataset Sentence ID", "CTranslate Sentence ID", "Cosine Similarity"],
)
print(similarity_df.head(n=5))

   Dataset Sentence ID  CTranslate Sentence ID  Cosine Similarity
0                    1                     524           1.000000
1                    2                     525           1.000000
2                    3                     526           0.997474
3                    4                     527           1.000000
4                    5                     528           0.998462


In [65]:
# Write the similarity table to a CSV file (without the index column)
output_filename_csv = "similarity_scores.csv"
output_file_path_csv = os.path.join(
    "/Users/urtejakubauskaite/Desktop/Language as Data/Labs/Lab5/code/",
    output_filename_csv,
)
similarity_df.to_csv(output_file_path_csv, index=False)