In [332]:
import os
import pandas as pd
import numpy as np
from scripts.utils import read_gold_data

In [333]:
# Load the gold data bundle identified by "data-release" via the project helper.
# NOTE(review): the return value is indexed like a nested dict below; the full
# schema is not visible here — confirm against scripts.utils.read_gold_data.
data = read_gold_data("data-release")
# Handle on the "corpus" entry; not referenced again in the cells shown here.
corpus = data["corpus"]
# Handle on the "dev" split of the "perspective" entry; likewise not referenced
# again in the cells shown here.
perspective_queries_dev = data["perspective"]["dev"]

In [None]:
# Input files, one per score type. Each file is read further down with
# pd.read_json(..., lines=True), i.e. as JSON Lines, and must provide a
# "query_id" column plus one score column named after the variable prefix:
# "similarity_scores", "topic_scores", "demographic_scores", "rerank_scores"
# and "gold_scores" respectively.
# All five must be filled in before running the next cell.
similarity_scores_path = "" # set path here
topic_scores_path = "" # set path here
demographic_scores_path = "" # set path here
rerank_scores_path = "" # set path here
gold_scores_path = "" # set path here

In [334]:
def _load_scores(path):
    """Read one JSON Lines score file into a DataFrame (one record per line)."""
    return pd.read_json(path, lines=True)


# One frame per score source, each loaded from the path configured above.
similarity_scores_df = _load_scores(similarity_scores_path)
topic_scores_df = _load_scores(topic_scores_path)
demographic_scores_df = _load_scores(demographic_scores_path)
rerank_scores_df = _load_scores(rerank_scores_path)
gold_scores_df = _load_scores(gold_scores_path)

In [335]:
# Query ids that occur in every one of the five score files; only these can be
# combined across sources.
common_query_ids = (
    set(similarity_scores_df["query_id"].values)
    & set(topic_scores_df["query_id"].values)
    & set(demographic_scores_df["query_id"].values)
    & set(rerank_scores_df["query_id"].values)
    & set(gold_scores_df["query_id"].values)
)

In [337]:
def _align_to_common_queries(scores_df, query_ids):
    """Restrict a score frame to the shared queries and put it in a canonical order.

    Parameters
    ----------
    scores_df : pandas.DataFrame
        Frame with a "query_id" column.
    query_ids : collection
        Query ids to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows, sorted by "query_id" with a fresh 0..n-1 index.
    """
    kept = scores_df[scores_df["query_id"].isin(query_ids)]
    # A stable sort keeps the file order among rows that share a query_id.
    return kept.sort_values("query_id", kind="stable").reset_index(drop=True)


# The cells below add the score arrays element-wise and index them with the
# same positions, so row i must be the same query in every frame. Filtering
# alone does not guarantee that when the files are written in different
# orders; sorting every frame by query_id does.
similarity_scores_df = _align_to_common_queries(similarity_scores_df, common_query_ids)
topic_scores_df = _align_to_common_queries(topic_scores_df, common_query_ids)
demographic_scores_df = _align_to_common_queries(demographic_scores_df, common_query_ids)
rerank_scores_df = _align_to_common_queries(rerank_scores_df, common_query_ids)
gold_scores_df = _align_to_common_queries(gold_scores_df, common_query_ids)

# Fail loudly (e.g. on duplicated query ids in one file) instead of silently
# pairing scores that belong to different queries.
assert all(
    frame["query_id"].tolist() == similarity_scores_df["query_id"].tolist()
    for frame in (topic_scores_df, demographic_scores_df, rerank_scores_df, gold_scores_df)
), "score files do not line up row-for-row on query_id"

In [338]:
def _score_matrix(frame, column):
    """Convert one column of per-query score entries into a NumPy array."""
    return np.array(list(frame[column].values))


# One array per score source; rows follow the row order of the filtered frames.
similarity_scores = _score_matrix(similarity_scores_df, "similarity_scores")
topic_scores = _score_matrix(topic_scores_df, "topic_scores")
demographic_scores = _score_matrix(demographic_scores_df, "demographic_scores")
rerank_scores = _score_matrix(rerank_scores_df, "rerank_scores")
gold_scores = _score_matrix(gold_scores_df, "gold_scores")

In [339]:
# Unweighted element-wise sum of the four input scores; used in the next cell
# to choose which candidates per query are kept.
combined_scores = (
    similarity_scores
    + topic_scores
    + demographic_scores
    + rerank_scores
)

In [340]:
# Number of highest-scoring candidates kept per query; change this to the
# amount of arguments regarded per query.
TOP_K = 100

# Never request more candidates than a query row actually has.
top_k = min(TOP_K, combined_scores.shape[1])

# np.argpartition requires kth < row length. With kth = top_k - 1 the first
# top_k columns hold the indices of the top_k largest combined scores (in no
# particular order). The earlier kth=100 was out of bounds, and raised
# ValueError, whenever a query had exactly 100 candidates.
top_indices = np.argpartition(-combined_scores, top_k - 1, axis=1)[:, :top_k]

In [341]:
def _pick_top(scores):
    """Gather, for every query row, the entries of `scores` at the `top_indices` columns."""
    return np.take_along_axis(scores, top_indices, axis=1)


# Apply the same candidate selection to every score source so that the
# selected columns stay aligned across sources.
similarity_scores_100 = _pick_top(similarity_scores)
topic_scores_100 = _pick_top(topic_scores)
demographic_scores_100 = _pick_top(demographic_scores)
rerank_scores_100 = _pick_top(rerank_scores)
gold_scores_100 = _pick_top(gold_scores)


In [343]:
# Empty float accumulators that the following cell fills:
# one feature row per (query, candidate) pair, and one gold value per pair.
similarities_matrix = np.empty(shape=(0, 4), dtype=np.float64)  # change 4 to the number of input scores (gold score excluded)
gold_scores_matrix = np.empty(shape=(0,), dtype=np.float64)

In [None]:
# Flatten the per-query selections into one training table in a single
# vectorised step. The earlier nested loop grew both arrays with np.vstack /
# np.append, which copies everything accumulated so far for each
# (query, candidate) pair — quadratic in the number of pairs — and it
# hard-coded 100 candidates per query.
feature_blocks = (
    similarity_scores_100,
    topic_scores_100,
    demographic_scores_100,
    rerank_scores_100,
)

# Stacking on a new last axis puts the four scores of each pair side by side;
# the row-major reshape then yields one row per pair in the same order the
# nested loop produced (all candidates of query 0, then query 1, ...).
# float64 matches the dtype of the accumulators the loop used to append to.
similarities_matrix = (
    np.stack(feature_blocks, axis=-1)
    .reshape(-1, len(feature_blocks))
    .astype(np.float64, copy=False)
)
gold_scores_matrix = np.asarray(gold_scores_100).reshape(-1).astype(np.float64, copy=False)

# The last notebook cell still reads the per-pair scalars the loop used to
# leave behind; keep them bound to the final pair.
if len(similarities_matrix):
    similarity_score, topic_score, demographic_score, rerank_score = similarities_matrix[-1]
    gold_score = gold_scores_matrix[-1]

In [None]:
# Output locations for the flattened training data written by the next cell:
# `scores_path` receives the feature matrix, `truth_path` the gold values.
# Include the ".npy" suffix: np.save appends it when missing, but the np.load
# calls further down use these strings unchanged.
scores_path = "" # expects something like path/new_file_name.npy
truth_path = "" # expects something like path/new_file_name_2.npy

In [345]:
# Persist the feature matrix and the matching gold values so the model-fitting
# cells can be re-run without rebuilding them.
np.save(file=scores_path, arr=similarities_matrix)
np.save(file=truth_path, arr=gold_scores_matrix)

In [346]:
# Reload the persisted arrays: X holds one feature row per (query, candidate)
# pair, y the gold value of the same pair.
X = np.load(file=scores_path)
y = np.load(file=truth_path)

In [347]:
from sklearn.linear_model import LogisticRegression

In [348]:
# Fit a logistic regression of the gold values on the four input scores.
# fit() returns the fitted estimator itself, so `clf` is the trained model.
estimator = LogisticRegression(random_state=0)
clf = estimator.fit(X=X, y=y)

In [None]:
# Show the learned weights. The columns of X were assembled in the order
# similarity, topic, demographic, rerank, so the coefficients are listed in
# that same order.
print(clf.coef_)

In [None]:
# Echo the four per-pair scalars of the last (query, candidate) pair, in the
# same order as the feature columns and the coefficients printed above.
# This previously listed `demographic_scores` (the whole demographic score
# matrix) instead of the scalar `demographic_score`.
similarity_score, topic_score, demographic_score, rerank_score