### This notebook was run in Colab

#### Install libraries

In [1]:
!pip install torch torchvision
!pip install transformers
!pip install tensorboardx
!pip install simpletransformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
from simpletransformers.classification import ClassificationModel
import numpy as np
import torch
from scipy.special import softmax
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

# mount the google drive
from google.colab import drive
drive.mount('/content/gdrive')

warnings.filterwarnings('ignore')

# Stance dataset used for fine-tuning is from the paper: Towards Understanding and Answering Comparative Questions
# Available for download from the Touché data: https://zenodo.org/record/6873567#.Y_XFghzMJhE

data_df = pd.read_csv('gdrive/MyDrive/stance-dataset.tsv', sep='\t', encoding='utf-8') # specify the stance dataset in the google drive
data_df = data_df[data_df['object_count'] == 2]

Mounted at /content/gdrive


In [3]:
len(data_df)

961

### Fine-tune RoBERTa

In [4]:
# Use sentiment-promted RoBERTa with masked comparison objects

data_df["answer_stance"] = [0 if l == 0 else 1 for l in data_df["answer_stance"]]  # binary stance

def transform_train(df):
    df["text_a"] = ['FIRST_ENTITY is good'] * len(df)
    df["text_b"] = df["masked_all"] # the part after the [SEP] token
    df["labels"] = df["answer_stance"]
    return df

def get_classification_model(model):
    #label_count = 4
    label_count = 2 # binary stance

    return ClassificationModel(model[0], model[1], num_labels=label_count, use_cuda=torch.cuda.is_available(), cuda_device=0)


# any available model at simpletransformers can be used for experiments

full_model = get_classification_model(['roberta', 'roberta-large'])
train_df = transform_train(data_df).sample(frac=1) #take the whole dataset

#args = {"overwrite_output_dir": True, "output_dir": "gdrive/MyDrive/checkpoints", "num_train_epochs": 10, "fp16": False, "train_batch_size": 4, "gradient_accumulation_steps": 4, "evaluate_during_training": False,
#        "learning_rate": 2e-5, "early_stopping_consider_epochs": True, "reprocess_input_data": True,
#        'save_eval_checkpoints': False, 'save_model_every_epoch' : False,	'save_optimizer_and_scheduler': False, 	'save_steps': -1, "weight": [5, 1, 1, 1]}

#binary
args = {"overwrite_output_dir": True, "output_dir": "gdrive/MyDrive/checkpoints", "num_train_epochs": 10, "fp16": False, "train_batch_size": 4, "gradient_accumulation_steps": 4, "evaluate_during_training": False,
        "learning_rate": 2e-5, "early_stopping_consider_epochs": True, "reprocess_input_data": True,
        'save_eval_checkpoints': False, 'save_model_every_epoch' : False,	'save_optimizer_and_scheduler': False, 	'save_steps': -1}
full_model.train_model(train_df, args=args)

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/961 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/241 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/241 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/241 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/241 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/241 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/241 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/241 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/241 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/241 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/241 [00:00<?, ?it/s]

(600, 0.19980526377927163)

### Mask the comparison objects in Touché documents

In [5]:
import pandas as pd
import os
import re

def mask_text(object1, object2, text):
    l = re.findall(r"[\w']+|[.,!?;:]", text)
    object1 = object1.capitalize()
    object2 = object2.capitalize()
    if object1 in l:
        indices = [i for i, x in enumerate(l) if x == object1]
        for i in indices:
            l[i] = 'FIRST_ENTITY'
    if object1+'s' in l:
        indices = [i for i, x in enumerate(l) if x == object1+'s']
        for i in indices:
            l[i] = 'FIRST_ENTITY'
    if object1.lower() in l:
        indices = [i for i, x in enumerate(l) if x == object1.lower()]
        for i in indices:
            l[i] = 'FIRST_ENTITY'
    if object1.lower()+'s' in l:
        indices = [i for i, x in enumerate(l) if x == object1.lower()+'s']
        for i in indices:
            l[i] = 'FIRST_ENTITY'
    if object2 in l:
        indices = [i for i, x in enumerate(l) if x == object2]
        for i in indices:
            l[i] = 'SECOND_ENTITY'
    if object2+'s' in l:
        indices = [i for i, x in enumerate(l) if x == object2+'s']
        for i in indices:
            l[i] = 'SECOND_ENTITY'
    if object2.lower() in l:
        indices = [i for i, x in enumerate(l) if x == object2.lower()]
        for i in indices:
            l[i] = 'SECOND_ENTITY'
    if object2.lower()+'s' in l:
        indices = [i for i, x in enumerate(l) if x == object2.lower()+'s']
        for i in indices:
            l[i] = 'SECOND_ENTITY'
    out_text = " ".join(l)
    return re.sub(r' (?=\W)', '', out_text)

In [6]:
# Available for download from the Touché data: https://zenodo.org/record/6873567#.Y_XFghzMJhE (original dataset is touche-task2-passages-version-002.jsonl.gz)
# We converted it to csv (in this repositore in ../data/touche-to-classify.csv)

PATH_RUNS ="gdrive/MyDrive/stance"

run_df = pd.read_csv(os.path.join(PATH_RUNS, "touche-to-classify.csv"))
run_df['text_masked'] = run_df.apply(lambda x: mask_text(x['object1'], x['object2'], x['text']), axis=1)
#replace_values = {'FIRST' : 2, 'SECOND' : 3, 'NO' : 0, 'NEUTRAL' : 1} # convert stance labels into numerical values
replace_values = {'FIRST' : 1, 'SECOND' : 1, 'NO' : 0, 'NEUTRAL' : 1} # binary convert stance labels into numerical values
run_df = run_df.replace({"Score": replace_values}).rename({"qid": "Topic", "docno": "ID"}, axis=1)
print(len(run_df))
run_df.head(1)

2107


Unnamed: 0,Topic,ID,Score,text,Topic.1,topic_text,description,narrative,object1,object2,text_masked
0,12,clueweb12-0002wb-18-34442___2,0,Divided by length ostensibly to fit the amount...,12,Train or plane? Which is the better choice?,A frequent traveler (domestic and internationa...,A highly relevant document will contrast the e...,Train,plane,Divided by length ostensibly to fit the amount...


### Predict the stance

In [7]:
def transform_df(df):
    df["text_a"] = ['FIRST_ENTITY is good'] * len(df)
    df["text_b"] = df["text_masked"] # the part after the [SEP] token
    df["labels"] = df["Score"]
    return df

pred_df = transform_df(run_df)
_, raw_outputs, _ = full_model.eval_model(pred_df)
pred_probs = softmax(raw_outputs, axis=1)
pred_preds = np.argmax(raw_outputs, axis=1)

  0%|          | 0/2107 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/264 [00:00<?, ?it/s]

In [8]:
# Save the prediction results

run_df['preds'] = pred_preds
run_df.to_csv(os.path.join(PATH_RUNS, "touche-to-classify-res-binary.csv"))

In [9]:
from sklearn.metrics import classification_report

print(classification_report(y_true=run_df["Score"], y_pred=run_df['preds']))

              precision    recall  f1-score   support

           0       0.56      0.67      0.61      1012
           1       0.63      0.52      0.57      1095

    accuracy                           0.59      2107
   macro avg       0.59      0.59      0.59      2107
weighted avg       0.59      0.59      0.59      2107

