In [None]:
import pandas as pd
import numpy as np
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scripts.utils import read_gold_data

In [2]:
# Location of the gold data release handed to the project loader.
data_path_name = "data-release" # set data path name!

# The loader's result is indexed by "corpus" and "perspective" below; the
# corpus and the query splits are iterated with .iterrows() in later cells.
data = read_gold_data(data_path_name)
corpus = data["corpus"]
perspective_queries_train, perspective_queries_dev = (
    data["perspective"][split] for split in ("train", "dev")
)
# perspective_queries_test = data["perspective"]["test"]

In [3]:
# Vocabulary of every demographic value that occurs in any corpus profile.
all_demographic_properties = set()
for _, corpus_row in corpus.iterrows():
    for _, value in corpus_row["demographic_profile"].items():
        # A profile entry is either one value or a list of values; wrap the
        # single value so both cases go through the same update() call.
        values = value if isinstance(value, list) else [value]
        all_demographic_properties.update(values)

In [4]:
# Extend the vocabulary with every value requested by a train or dev query,
# so each query value is guaranteed to have a position later on.
for queries in (perspective_queries_train, perspective_queries_dev):
    for _, query_row in queries.iterrows():
        all_demographic_properties.update(
            value for _, value in query_row["demographic_property"].items()
        )

In [4]:
# uncomment for test data

# for index, query_data in perspective_queries_test.iterrows():
#     for prop_name, prop_value in query_data["demographic_property"].items():
#         all_demographic_properties.add(prop_value)

In [5]:
# Freeze the collected values into a list so that each value gets a fixed
# position, which later cells use as its vector dimension.
# The list is sorted because plain set iteration order for strings differs
# between interpreter runs (hash randomization), which would make the
# dimensions - and any "first matching dimension" lookup - irreproducible.
# key=str keeps the ordering well-defined even if the values mix types.
all_demographic_properties = sorted(all_demographic_properties, key=str)

In [6]:
# Position of every demographic value, built once so the loops below do a
# dict lookup instead of a linear list.index() scan for every profile value.
# setdefault keeps the first position of a value, exactly like list.index().
demographic_property_positions = {}
for position, known_value in enumerate(all_demographic_properties):
    demographic_property_positions.setdefault(known_value, position)

# One count vector per corpus row: entry i holds how many times value i
# occurs in that row's demographic profile.
corpus_demographics_embeddings = []
for index, corpus_data in corpus.iterrows():
    argument_demographics = [0] * len(all_demographic_properties)
    for prop_name, prop_value in corpus_data["demographic_profile"].items():
        # A profile entry is either a single value or a list of values.
        items = prop_value if isinstance(prop_value, list) else [prop_value]
        for item in items:
            # Raises KeyError if the value was never added to the vocabulary.
            argument_demographics[demographic_property_positions[item]] += 1
    corpus_demographics_embeddings.append(argument_demographics)

In [7]:
def build_query_demographics_embeddings(queries, properties):
    """Return one demographic vector per query row.

    Every vector starts at 0.5 in all dimensions; the dimension of each value
    found in the row's "demographic_property" mapping is raised by 0.5, so a
    value that is listed once ends up at exactly 1.

    Args:
        queries: object whose .iterrows() yields (index, row) pairs where
            row["demographic_property"] supports .items().
        properties: list of all known demographic values; the first position
            of a value in this list is its vector dimension.

    Returns:
        A list with one list of floats per query row, in iteration order.

    Raises:
        KeyError: if a query value does not occur in `properties`.
    """
    # Build the value -> position table once instead of scanning the list
    # with list.index() for every query value.
    positions = {}
    for position, known_value in enumerate(properties):
        positions.setdefault(known_value, position)

    embeddings = []
    for _, query_data in queries.iterrows():
        query_demographics = [0.5] * len(properties)
        for _, prop_value in query_data["demographic_property"].items():
            query_demographics[positions[prop_value]] += 0.5
        embeddings.append(query_demographics)
    return embeddings


queries_train_demographics_embeddings = build_query_demographics_embeddings(
    perspective_queries_train, all_demographic_properties
)

queries_dev_demographics_embeddings = build_query_demographics_embeddings(
    perspective_queries_dev, all_demographic_properties
)

In [7]:
# uncomment for test data

# queries_test_demographics_embeddings = []
# for index, query_data in perspective_queries_test.iterrows():
#     query_demographics = [0.5] * len(all_demographic_properties)
#     for prop_name, prop_value in query_data["demographic_property"].items():
#         query_demographics[all_demographic_properties.index(prop_value)] += 0.5
#     queries_test_demographics_embeddings.append(query_demographics)

In [8]:
def explicit_demographic_matches(query_embeddings, corpus_embeddings):
    """Score every corpus entry against every query with a 0/1 explicit match.

    For each query vector the queried dimension is the first entry equal to 1.
    A corpus entry scores 1 when its count in that dimension is at least 1 and
    0 otherwise.

    Args:
        query_embeddings: list of query vectors (lists of numbers).
        corpus_embeddings: list of corpus count vectors (lists of ints) with
            the same dimensions as the query vectors.

    Returns:
        A list with one list per query, holding one 0/1 score per corpus
        entry in corpus order.

    Raises:
        ValueError: if a query vector contains no entry equal to 1.
    """
    all_scores = []
    for query_vector in query_embeddings:
        # The queried dimension does not depend on the corpus entry, so it is
        # looked up once per query rather than once per (query, entry) pair.
        # NOTE(review): only the first dimension equal to 1 is used; any
        # further queried values of the same query are ignored - confirm
        # that queries carry a single demographic property.
        queried_position = query_vector.index(1)
        all_scores.append([
            # Corpus vectors are occurrence counts, so a value that appears
            # more than once in a profile has a count above 1 and must still
            # count as a match (a strict == 1 test would score it 0).
            1 if corpus_vector[queried_position] >= 1 else 0
            for corpus_vector in corpus_embeddings
        ])
    return all_scores


train_demographic_similarities = explicit_demographic_matches(
    queries_train_demographics_embeddings, corpus_demographics_embeddings
)

dev_demographic_similarities = explicit_demographic_matches(
    queries_dev_demographics_embeddings, corpus_demographics_embeddings
)

In [8]:
# uncomment for test data

# test_demographic_similarities = []
# for query_demographics in queries_test_demographics_embeddings:
#     query_argument_similarities = []
#     for corpus_demographics in corpus_demographics_embeddings:
#         if corpus_demographics[query_demographics.index(1)] == 1:
#             query_argument_similarities.append(1)
#         else:
#             query_argument_similarities.append(0)
#     test_demographic_similarities.append(query_argument_similarities)

In [12]:
# Pair each query id with its list of per-corpus-entry scores. The id array
# is positional, so entry i belongs to the i-th row of similarities.
train_query_ids = perspective_queries_train["query_id"].values
train_scores = [
    {'query_id': train_query_ids[position], 'demographic_scores': scores}
    for position, scores in enumerate(train_demographic_similarities)
]

dev_query_ids = perspective_queries_dev["query_id"].values
dev_scores = [
    {'query_id': dev_query_ids[position], 'demographic_scores': scores}
    for position, scores in enumerate(dev_demographic_similarities)
]

In [9]:
# uncomment for test data

# test_scores = []
# for i in range(len(test_demographic_similarities)):
#     row = test_demographic_similarities[i]
#     test_scores.append({
#         'query_id': perspective_queries_test["query_id"].values[i],
#         'demographic_scores': row
#     })

In [10]:
# Turn the per-query score records into frames with the columns
# 'query_id' and 'demographic_scores' (the keys of each record).
train_scores_df = pd.DataFrame(data=train_scores)
dev_scores_df = pd.DataFrame(data=dev_scores)

# uncomment for test data
# test_scores_df = pd.DataFrame(test_scores)

In [11]:
folder_name = "final-scores" # set folder name

# to_json does not create missing directories, so make sure the output
# folder exists before writing; exist_ok keeps re-runs of this cell safe.
os.makedirs(folder_name, exist_ok=True)

# One JSON object per line (JSON Lines), one line per query.
train_scores_df.to_json(f"{folder_name}/explicit_train_demographic_scores.jsonl", orient="records", lines=True)
dev_scores_df.to_json(f"{folder_name}/explicit_dev_demographic_scores.jsonl", orient="records", lines=True)

# Uncomment for test data; change the file name if needed to avoid overwriting an existing file.
# test_scores_df.to_json(f"{folder_name}/explicit_test_demographic_scores.jsonl", orient="records", lines=True)