In [101]:
import pandas as pd
import os
import re

def mask_text(object1, object2, text):
    """Replace mentions of two compared entities in ``text`` with placeholders.

    The text is split into word tokens (letters, digits, underscores and
    apostrophes) and the punctuation marks ``. , ! ? ; :``; any other
    characters are dropped by the tokenizer. A token is masked when it equals
    an entity name or the name followed by ``s`` (simple plural), compared
    case-insensitively so that names like "PHP" or "MySQL" are matched as
    written as well as in lower/upper case.

    Args:
        object1: Name of the first entity; matches become ``FIRST_ENTITY``.
        object2: Name of the second entity; matches become ``SECOND_ENTITY``.
        text: The text to mask.

    Returns:
        The tokens re-joined with single spaces, with the space in front of
        any non-word character (e.g. punctuation) removed.
    """
    tokens = re.findall(r"[\w']+|[.,!?;:]", text)

    first = object1.casefold()
    second = object2.casefold()
    # An empty entity name must not turn a bare "s" token into a placeholder.
    first_forms = {first, first + 's'} if first else set()
    second_forms = {second, second + 's'} if second else set()

    masked = []
    for token in tokens:
        key = token.casefold()
        # The first entity takes precedence when both names would match.
        if key in first_forms:
            masked.append('FIRST_ENTITY')
        elif key in second_forms:
            masked.append('SECOND_ENTITY')
        else:
            masked.append(token)

    out_text = " ".join(masked)
    # Re-attach punctuation to the preceding token.
    return re.sub(r' (?=\W)', '', out_text)

In [115]:
PATH_RUNS = "../data/tmp/"

# Mapping from textual stance labels to integer class ids.
replace_values = {'FIRST': 2, 'SECOND': 3, 'NO': 0, 'NEUTRAL': 1}

# Load the sample run and add an entity-masked copy of each document text.
run_df = pd.read_csv(os.path.join(PATH_RUNS, "sample-run.csv"))
run_df['text_masked'] = run_df.apply(
    lambda row: mask_text(row['object_first'], row['object_second'], row['text']),
    axis=1,
)

# Encode the stance labels and rename the key columns to "Topic" / "ID".
run_df = (
    run_df
    .replace({"stance_label": replace_values})
    .rename(columns={"qid": "Topic", "docno": "ID"})
)
run_df.head(1)

Unnamed: 0.1,Unnamed: 0,Topic,query,description,narrative,object_first,object_second,stance_label,ID,score,name,stance_value,text,rank,text_masked
0,24500,100,Should I learn Python or R for data analysis?,Wondering whether you should use Python or R f...,Relevant documents should compare two programm...,Python,R,3,clueweb12-1509wb-05-28610___12,92.317061,levirank_psuedo_relevance_feedback+voting,-1.0,"R is a free, open source statistics package wr...",0,"SECOND_ENTITY is a free, open source statistic..."


In [119]:
# List the contents of the qrels data directory (IPython shell escape).
!ls ../data/qrels/

README.md                        touche-task2-2022-relevance.qrels
touche-task2-2022-quality.qrels  touche-task2-2022-stance.qrels


In [132]:
import glob
import numpy as np

DIMENSION = "stance"  # not referenced by the code in this cell

#RESULT_PATH = "../../touche22-data/results/task"+str(TASK)+"/"
QREL_PATH = "../data/qrels/touche-task2-2022-stance.qrels"
#RUN_PATH = "../../touche22-data/runs/task"+str(TASK)+"/*/output/*"

# The qrels file is space-separated without a header row; name its four
# columns and discard the "Q0" column.
qrels = pd.read_csv(QREL_PATH, header=None, sep=" ")
qrels = qrels.rename(columns={0: "Topic", 1: "Q0", 2: "ID", 3: "Score"}).drop(columns="Q0")

# Attach the qrels score to every run row, then keep only rows that have one.
df_with_qrels = run_df.merge(qrels, on=["Topic", "ID"], how="left")
df_with_qrels = df_with_qrels.replace({"Score": replace_values}).dropna(subset=['Score'])
df_with_qrels.head(1)

Unnamed: 0.1,Unnamed: 0,Topic,query,description,narrative,object_first,object_second,stance_label,ID,score,name,stance_value,text,rank,text_masked,Score
0,24500,100,Should I learn Python or R for data analysis?,Wondering whether you should use Python or R f...,Relevant documents should compare two programm...,Python,R,3,clueweb12-1509wb-05-28610___12,92.317061,levirank_psuedo_relevance_feedback+voting,-1.0,"R is a free, open source statistics package wr...",0,"SECOND_ENTITY is a free, open source statistic...",1.0


In [None]:
%%time
from simpletransformers.classification import ClassificationModel
import numpy as np
import torch
from scipy.special import softmax
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

def transform_train(df):
    """Return a copy of ``df`` with the sentence-pair columns for the classifier.

    Three columns are added (or overwritten if already present):

    * ``text_a``: the constant prompt ``'FIRST_ENTITY is good'`` (the part
      before the [SEP] token),
    * ``text_b``: the ``text_masked`` column (the part after the [SEP] token),
    * ``labels``: the ``stance_label`` column.

    The input frame is left unmodified, so re-running the calling cell does not
    accumulate hidden state in the caller's DataFrame.

    Args:
        df: DataFrame with ``text_masked`` and ``stance_label`` columns.

    Returns:
        A new DataFrame containing all original columns plus the three above.
    """
    return df.assign(
        text_a='FIRST_ENTITY is good',
        # text_a=df["cleaned_question"] would take the whole question before the [SEP] token
        text_b=df["text_masked"],
        labels=df["stance_label"],
    )

# Model checkpoint directory (absolute, environment-specific path).
#PATH_MODEL = "/mnt/ceph/storage/data-in-progress/data-research/arguana/fair-ranking/checkpoints/checkpoints-roberta-o1-good-masked"
PATH_MODEL = "/mnt/ceph/storage/data-in-progress/data-research/arguana/fair-ranking/checkpoints/checkpoints-roberta-o1-good-masked/checkpoints"

# Load a 4-label RoBERTa classifier from PATH_MODEL; use_cuda=False is passed alongside cuda_device=0.
model = ClassificationModel("roberta", PATH_MODEL, num_labels=4, use_cuda=False, cuda_device=0)
# Adds text_a / text_b / labels; `labels` is copied from `stance_label`.
# NOTE(review): the qrels `Score` column merged into df_with_qrels is not used as the label — confirm intended.
pred_df = transform_train(df_with_qrels)
# Only the second element of eval_model's 3-tuple result is kept (presumably the raw model outputs — TODO confirm against the library docs).
_, raw_outputs, _ = model.eval_model(pred_df)
# Softmax over axis 1 for probabilities, and argmax over axis 1 for the predicted class index.
pred_probs = softmax(raw_outputs, axis=1)
pred_preds = np.argmax(raw_outputs, axis=1)

In [140]:
from collections import Counter
# Count how many rows of df_with_qrels carry each stance_label value.
Counter(df_with_qrels.stance_label)

Counter({3: 793, 2: 803, 0: 35, 1: 131})

In [136]:
# Per-class precision / recall / F1 of the model's argmax predictions.
# NOTE(review): y_true is `stance_label` (loaded from the run CSV), not the qrels
# `Score` column merged into df_with_qrels — confirm this is the intended reference.
print(classification_report(y_true = df_with_qrels.stance_label.tolist(), y_pred = pred_preds))

              precision    recall  f1-score   support

           0       0.03      0.31      0.06        35
           1       0.11      0.85      0.20       131
           2       0.59      0.18      0.27       803
           3       0.48      0.11      0.18       793

    accuracy                           0.20      1762
   macro avg       0.30      0.36      0.18      1762
weighted avg       0.49      0.20      0.22      1762



In [137]:
# Macro-averaged F1 of the model's argmax predictions.
# NOTE(review): y_true is `stance_label` (loaded from the run CSV), not the qrels
# `Score` column merged into df_with_qrels — confirm this is the intended reference.
f1_score(y_true = df_with_qrels.stance_label.tolist(), y_pred = pred_preds, average='macro')

0.1763330638967639