# Helper Functions

In [None]:
import nltk
# Fetch the NLTK data packages this notebook relies on: the stop-word lists
# (read via stopwords.words('english')) and WordNet (presumably what
# WordNetLemmatizer needs at lemmatization time).
nltk.download('stopwords')
nltk.download('wordnet')


In [None]:
import numpy as np
from numpy import dot
from numpy.linalg import norm
import pandas as pd
import csv
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import time

def cos_similarity(vector_a, vector_b):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        vector_a: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        vector_b: Second vector, same length as ``vector_a``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm (the cosine is undefined
        there, and dividing would produce ``nan`` plus a RuntimeWarning).
    """
    norm_product = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)

    # A zero-length vector has no direction; report "no similarity" instead
    # of propagating nan into the output file.
    if norm_product == 0:
        return 0.0

    return np.dot(vector_a, vector_b) / norm_product


def process_csv(model, input_file, output_file):
    """Score every word pair in a CSV file and write the scores to another CSV.

    Each non-empty input row must start with ``id, word1, word2``. Any further
    columns (e.g. the gold similarity in the validation data) are ignored, so
    the same function handles both the test and the validation files.

    Args:
        model: Callable taking ``(word1, word2)`` and returning a similarity.
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write; one ``id, similarity`` row
            is written per scored input row.

    Raises:
        ValueError: If a non-empty row has fewer than three columns.

    Prints the elapsed wall-clock time in seconds when finished.
    """
    start_time = time.time()
    with open(input_file, 'r', newline='') as infile, open(output_file, 'w', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        for row in reader:
            # csv.reader yields an empty list for blank lines (e.g. a stray
            # trailing newline); there is nothing to score on those.
            if not row:
                continue

            # Only the first three columns are needed; `row_id` avoids
            # shadowing the builtin `id`.
            row_id, word1, word2 = row[:3]

            writer.writerow([row_id, model(word1, word2)])

    elapsed = time.time() - start_time
    print(elapsed)


# Shared NLTK resources, created once and reused by every preprocess_text call
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_text(text):
    """Turn raw text into a list of normalized tokens.

    The text is lower-cased and split on whitespace; English stop words are
    dropped, and each remaining token is lemmatized and then stemmed.

    Args:
        text: The string to preprocess.

    Returns:
        A list of processed tokens, in their original order.
    """
    kept = (token for token in text.lower().split() if token not in stop_words)
    return [stemmer.stem(lemmatizer.lemmatize(token)) for token in kept]

# Word2Vec

In [None]:
from gensim.models import Word2Vec

# Load the training corpus; the 'plot_synopsis' column supplies the raw text.
df = pd.read_csv('./data/Training-dataset.csv')

# One token list per synopsis, produced by preprocess_text.
tokenized_synopses = [preprocess_text(text) for text in df['plot_synopsis']]

# Train Word2Vec (using Skipgram, sg=1) with 300-dimensional vectors.
# window=1 limits the context to the immediately neighbouring tokens and
# min_count=1 keeps every token that appears in the corpus.
word2vec_model = Word2Vec(sentences=tokenized_synopses, vector_size=300, window=1, min_count=1, workers=4, sg=1)

# Special token for OOV words: one randomly drawn vector that stands in for
# every out-of-vocabulary word.
# NOTE(review): np.random is not seeded here, so this vector (and any score
# that involves an OOV word) will differ between runs.
oov_token = '<OOV>'
word2vec_model.wv[oov_token] = np.random.normal(size=word2vec_model.vector_size)

def calculate_similarity_w2v(text1, text2):
    """Cosine similarity of two terms using the trained Word2Vec model.

    Each term is preprocessed into tokens, every token is mapped to its
    Word2Vec vector (or to the shared OOV vector when the token is not in the
    vocabulary), and the token vectors are averaged so that multi-word terms
    get a single vector.

    Args:
        text1: First term.
        text2: Second term.

    Returns:
        The cosine similarity of the two averaged vectors, or ``0.0`` when
        either term has no tokens left after preprocessing or a ``KeyError``
        is raised during lookup.
    """
    def embed(tokens):
        # Trained vector when available, otherwise the OOV placeholder.
        return [
            word2vec_model.wv[token] if token in word2vec_model.wv
            else word2vec_model.wv[oov_token]
            for token in tokens
        ]

    try:
        vectors1 = embed(preprocess_text(text1))
        vectors2 = embed(preprocess_text(text2))

        # Nothing to average if preprocessing removed every token.
        if not (vectors1 and vectors2):
            return 0.0

        return cos_similarity(np.mean(vectors1, axis=0), np.mean(vectors2, axis=0))

    except KeyError:
        return 0.0

# Score the Task 1 test pairs with the Word2Vec model.
input_file_path = './data/Task-1-test-dataset1.csv'
# Relative path (no leading '/'): the results are written to the working
# directory, like the method-c output, rather than to the filesystem root,
# which is usually not writable.
output_file_path = '10728942-Task1-method-b.csv'
process_csv(calculate_similarity_w2v, input_file_path, output_file_path)


# BERT

In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Model output whose first element holds the token
            embeddings (summed over dimension 1, so presumably shaped
            ``(batch, tokens, hidden)``).
        attention_mask: Mask with one entry per token; it is broadcast over
            the embedding dimension and used as a 0/1 weight.

    Returns:
        A tensor with one pooled embedding per sequence.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    masked_sum = (hidden_states * mask).sum(dim=1)
    # Clamp so a fully masked sequence divides by a tiny number, not zero.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)

    return masked_sum / token_counts

def get_bert_embeddings(term, model, tokenizer):
    """Embed a term with a transformer model.

    The term is tokenized (with padding and truncation), passed through the
    model without gradient tracking, mean-pooled over its tokens using the
    attention mask, and L2-normalized.

    Args:
        term: Text to embed.
        model: Model called with the tokenizer's output as keyword arguments.
        tokenizer: Tokenizer callable that returns PyTorch tensors, including
            an ``'attention_mask'`` entry.

    Returns:
        The L2-normalized, mean-pooled embedding tensor.
    """
    encoded = tokenizer(term, padding=True, truncation=True, return_tensors="pt")

    # Inference only, so the forward pass does not need gradients.
    with torch.no_grad():
        model_output = model(**encoded)

    pooled = mean_pooling(model_output, encoded['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)

def calculate_similarity_bert(text1, text2):
    """Cosine similarity of two terms using transformer embeddings.

    Reads the module-level ``model`` and ``tokenizer`` (assigned in the
    ``__main__`` section of this file) to embed both terms.

    Args:
        text1: First term.
        text2: Second term.

    Returns:
        The cosine similarity of the two embeddings. If anything goes wrong,
        the error is printed and ``0.0`` is returned so that a single bad row
        does not abort the whole run.
    """
    try:
        # sklearn's cosine_similarity works on numpy arrays, not tensors.
        first_np, second_np = (
            get_bert_embeddings(text, model, tokenizer).cpu().numpy()
            for text in (text1, text2)
        )

        # The result is a 1x1 matrix; pull out the single score.
        return cosine_similarity(first_np, second_np)[0][0]

    except Exception as err:
        print(f"Error calculating similarity: {err}")
        return 0.0

if __name__ == "__main__":
    # Embedding model used for this method:
    # https://huggingface.co/jinaai/jina-embeddings-v2-base-en
    model_name = "jinaai/jina-embeddings-v2-base-en"
    # `tokenizer` and `model` become module-level names here;
    # calculate_similarity_bert reads them as globals.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # trust_remote_code=True lets transformers run the model code shipped in
    # the model repository. The model's `encode` method depends on it, but
    # this script does not call `encode`; it pools the raw model output
    # itself in get_bert_embeddings.
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

    # Score the Task 1 test pairs and write one `id, similarity` row per pair.
    input_csv_file = './data/Task-1-test-dataset1.csv'
    output_csv_file = '10728942-Task1-method-c.csv'

    process_csv(calculate_similarity_bert, input_csv_file, output_csv_file)



# Evaluation Script

In [None]:
%cd /content/drive/MyDrive/Colab Notebooks/NLP CW/Task 1


!python task1_eval_script_student_version.py '10728942-Task1-method-c-validation.csv' 'Task-1-validation-dataset.csv'