In [12]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
import os
import numpy
from scripts.utils import read_gold_data
import numpy as np
from collections import Counter

In [47]:
# Folder holding the released gold data -- point this at your local copy.
data_path_name = "data-release"

# read_gold_data (from scripts.utils) returns a container that is indexed
# below by "corpus" and by query type ("baseline" / "perspective"), each of
# which is further split into "train" and "dev".
data = read_gold_data(data_path_name)
corpus = data["corpus"]
baseline_queries_train, baseline_queries_dev = (
    data["baseline"]["train"],
    data["baseline"]["dev"],
)
perspective_queries_train, perspective_queries_dev = (
    data["perspective"]["train"],
    data["perspective"]["dev"],
)

# uncomment when using test set
# baseline_queries_test = data["baseline"]["test"]
# perspective_queries_test = data["perspective"]["test"]

In [48]:
# Model identifier handed to SentenceTransformer; the resulting encoder is used
# to embed both the corpus arguments and the query texts.
sbert_model_name = "paraphrase-multilingual-mpnet-base-v2"
sbert_encoder = SentenceTransformer(sbert_model_name)

In [49]:
# Encode every corpus argument, then attach the vectors to the corpus frame
# as one entry per row (the encoder output is split along its first axis).
argument_texts = corpus["argument"].values
corpus_embeddings = sbert_encoder.encode(argument_texts)
corpus["sbert_embeddings"] = [embedding for embedding in corpus_embeddings]

In [50]:
# Encode the baseline query texts of the train and dev splits with the same
# encoder that is used for the corpus.
train_query_texts = baseline_queries_train["text"].values
query_train_embeddings = sbert_encoder.encode(train_query_texts)
dev_query_texts = baseline_queries_dev["text"].values
query_dev_embeddings = sbert_encoder.encode(dev_query_texts)

# Store the vectors next to their queries, one entry per row.
baseline_queries_train["sbert_embeddings"] = [embedding for embedding in query_train_embeddings]
baseline_queries_dev["sbert_embeddings"] = [embedding for embedding in query_dev_embeddings]

In [61]:
# uncomment when using test set

# query_test_embeddings = sbert_encoder.encode(perspective_queries_test["text"].values)

# perspective_queries_test["sbert_embeddings"] = list(query_test_embeddings)

In [52]:
# Pairwise cosine similarity between each baseline query embedding and each
# corpus embedding. The result is indexed as [query position, corpus position]
# by the scoring loops further down.
corpus_vectors = list(corpus["sbert_embeddings"].values)
train_query_vectors = list(baseline_queries_train["sbert_embeddings"].values)
dev_query_vectors = list(baseline_queries_dev["sbert_embeddings"].values)

train_similarities = cosine_similarity(train_query_vectors, corpus_vectors)
dev_similarities = cosine_similarity(dev_query_vectors, corpus_vectors)

In [62]:
# uncomment when using test set

# test_similarities = cosine_similarity(
#     list(perspective_queries_test["sbert_embeddings"].values), list(corpus["sbert_embeddings"].values)
# )

In [54]:
# Build one record per baseline training query: its id, its full row of
# query-to-corpus cosine similarities, and a count of the topics found among
# the corpus arguments most similar to it.
top_k = 50  # how many of the most similar arguments feed the topic counts
train_query_ids = baseline_queries_train["query_id"].values
# Pull the topic column out once. `corpus.iloc[i]['topic']` would build a full
# row Series (embedding column included) for every single lookup.
corpus_topics = corpus["topic"].to_numpy()

train_scores = []
for i, query_id in enumerate(train_query_ids):
    row = train_similarities[i, :]
    # argsort is ascending, so the last `top_k` positions are the best matches,
    # ordered from least to most similar.
    top_indices = np.argsort(row)[-top_k:]
    # Counter keeps first-seen order, so the plain dict built from it has the
    # same keys, counts and key order as incrementing a dict by hand.
    topics = dict(Counter(corpus_topics[top_indices]))
    train_scores.append({
        'query_id': query_id,
        'similarity_scores': train_similarities[i],
        'topic': topics
    })

In [55]:
# Collect the per-query training records into a DataFrame: one row per record,
# with the record keys ('query_id', 'similarity_scores', 'topic') as columns.
train_scores_df = pd.DataFrame(train_scores)

In [56]:
# Build one record per baseline dev query: its id, its full row of
# query-to-corpus cosine similarities, and a count of the topics found among
# the corpus arguments most similar to it.
top_k = 50  # how many of the most similar arguments feed the topic counts
dev_query_ids = baseline_queries_dev["query_id"].values
# Pull the topic column out once. `corpus.iloc[i]['topic']` would build a full
# row Series (embedding column included) for every single lookup.
corpus_topics = corpus["topic"].to_numpy()

dev_scores = []
for i, query_id in enumerate(dev_query_ids):
    row = dev_similarities[i, :]
    # argsort is ascending, so the last `top_k` positions are the best matches,
    # ordered from least to most similar.
    top_indices = np.argsort(row)[-top_k:]
    # Counter keeps first-seen order, so the plain dict built from it has the
    # same keys, counts and key order as incrementing a dict by hand.
    topics = dict(Counter(corpus_topics[top_indices]))
    dev_scores.append({
        'query_id': query_id,
        'similarity_scores': dev_similarities[i],
        'topic': topics
    })

In [57]:
# Collect the per-query dev records into a DataFrame: one row per record,
# with the record keys ('query_id', 'similarity_scores', 'topic') as columns.
dev_scores_df = pd.DataFrame(dev_scores)

In [63]:
# uncomment when using test set
# test_scores = []
# for i in range(len(perspective_queries_test["query_id"].values)):
#     row = test_similarities[i, :]
#     top_indices = np.argsort(row)[-50:]
#     topics = {}
#     for argument_index in top_indices:
#         current_topic = corpus.iloc[argument_index]['topic']
#         if current_topic in topics:
#             topics[current_topic] += 1
#         else:
#             topics[current_topic] = 1
#     test_scores.append({
#         'query_id': perspective_queries_test["query_id"].values[i],
#         'similarity_scores': test_similarities[i],
#         'topic': topics
#     })

In [64]:
# test_scores_df = pd.DataFrame(test_scores)

In [65]:
destination_folder = "final-scores" # change destination folder name!

# to_json writes to the given path but does not create missing directories,
# so make sure the output folder exists before exporting.
os.makedirs(destination_folder, exist_ok=True)

# One JSON object per line (JSON Lines), one line per query record.
train_scores_df.to_json(f"{destination_folder}/baseline_train_similarity_scores.jsonl", orient="records", lines=True)
dev_scores_df.to_json(f"{destination_folder}/baseline_dev_similarity_scores.jsonl", orient="records", lines=True)

# uncomment when using test set
# os.makedirs(f"{destination_folder}/surprise", exist_ok=True)
# test_scores_df.to_json(f"{destination_folder}/surprise/perspective_similarity_scores.jsonl", orient="records", lines=True)