In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
pd.set_option('display.max_colwidth', None)

In [2]:
%load_ext autoreload

In [3]:
%autoreload

import sys
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'
sys.path.insert(0, "../src-py")

import sbert_training

In [4]:
data_path = '/mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation'

In [5]:
training_df = pd.read_csv('../../data/training_df.csv')
valid_df = pd.read_csv('../../data/our_valid.csv')

In [6]:
train_kp_df     = pd.read_csv('../../KPA_2021_shared_task/kpm_data/key_points_train.csv')
train_arg_df    = pd.read_csv('../../KPA_2021_shared_task/kpm_data/arguments_train.csv')
train_labels_df = pd.read_csv('../../KPA_2021_shared_task/kpm_data/labels_train.csv')

dev_kp_df     = pd.read_csv('../../KPA_2021_shared_task/kpm_data/key_points_dev.csv')
dev_arg_df    = pd.read_csv('../../KPA_2021_shared_task/kpm_data/arguments_dev.csv')
dev_labels_df = pd.read_csv('../../KPA_2021_shared_task/kpm_data/labels_dev.csv')

full_train_kp_df = pd.concat([train_kp_df,dev_kp_df])
full_train_arg_df  = pd.concat([train_arg_df,dev_arg_df])
full_train_labels_df = pd.concat([train_labels_df,dev_labels_df])

test_kp_df     = pd.read_csv('../../KPA_2021_shared_task/test_data/key_points_test.csv')
test_arg_df    = pd.read_csv('../../KPA_2021_shared_task//test_data/arguments_test.csv')

In [32]:
train_df = train_labels_df.merge(train_arg_df, how='inner', left_on='arg_id', right_on='arg_id')
train_df = train_df.merge(train_kp_df[['key_point_id', 'key_point']], how='inner', left_on='key_point_id', right_on='key_point_id')

In [33]:
len(train_df)

20635

In [7]:
all_train_df = full_train_labels_df.merge(full_train_arg_df, how='inner', left_on='arg_id', right_on='arg_id')
all_train_df = all_train_df.merge(full_train_kp_df[['key_point_id', 'key_point']], how='inner', left_on='key_point_id', right_on='key_point_id')

In [8]:
all_train_df.head()

Unnamed: 0,arg_id,key_point_id,label,argument,topic,stance,key_point
0,arg_0_0,kp_0_0,0,`people reach their limit when it comes to their quality of life and should be able to end their suffering. this can be done with little or no suffering by assistance and the person is able to say good bye.,Assisted suicide should be a criminal offence,-1,Assisted suicide gives dignity to the person that wants to commit it
1,arg_0_1,kp_0_0,0,"A patient should be able to decide when they have had enough ""care"".",Assisted suicide should be a criminal offence,-1,Assisted suicide gives dignity to the person that wants to commit it
2,arg_0_2,kp_0_0,0,"a person has the right to end their suffering and if somebody takes pity on them and chooses to help, that person should not be punished.",Assisted suicide should be a criminal offence,-1,Assisted suicide gives dignity to the person that wants to commit it
3,arg_0_4,kp_0_0,0,a person should have the right to be able to choose if they want to live or die,Assisted suicide should be a criminal offence,-1,Assisted suicide gives dignity to the person that wants to commit it
4,arg_0_5,kp_0_0,0,a person should have the right to die on their own terms,Assisted suicide should be a criminal offence,-1,Assisted suicide gives dignity to the person that wants to commit it


In [9]:
test_arg_df.head()

Unnamed: 0,arg_id,argument,topic,stance
0,arg_0_0,Routine child vaccinations isn't mandatory since children don't spread the virus,Routine child vaccinations should be mandatory,-1
1,arg_0_1,Routine child vaccinations should not be mandatory because children may not bear the side effects of it.,Routine child vaccinations should be mandatory,-1
2,arg_0_2,Routine child vaccinations should not be necessary as children can't catch the disease,Routine child vaccinations should be mandatory,-1
3,arg_0_3,A vaccine that has not been sufficiently tested and without knowledge of side effects is not recommended for children,Routine child vaccinations should be mandatory,-1
4,arg_0_4,"As long as vaccines are not free of side effects, it cannot make them mandatory for our children.",Routine child vaccinations should be mandatory,-1


In [10]:
test_kp_df.head()

Unnamed: 0,key_point_id,key_point,topic,stance
0,kp_0_0,"Routine child vaccinations, or their side effects, are dangerous",Routine child vaccinations should be mandatory,-1
1,kp_0_1,Mandatory vaccination contradicts basic rights,Routine child vaccinations should be mandatory,-1
2,kp_0_2,The parents and not the state should decide,Routine child vaccinations should be mandatory,-1
3,kp_0_3,Routine child vaccinations are not necessary to keep children healthy,Routine child vaccinations should be mandatory,-1
4,kp_0_4,Routine child vaccinations are effective,Routine child vaccinations should be mandatory,1


In [11]:
def match_argument_with_keypoints(result, kp_dict, arg_dict):
    
    for arg, arg_embedding in arg_dict.items():
        result[arg] = {}
        for kp, kp_embedding in kp_dict.items():
            result[arg][kp] = util.pytorch_cos_sim(arg_embedding, kp_embedding).item()
        
        #Applying softmax
        kp_scores = list(result[arg].items())
        kp_ids, kp_scores = zip(*kp_scores)
        #print(kp_ids)
        #print(kp_scores)
        #kp_scores = torch.softmax(torch.Tensor(kp_scores), 0).tolist()
        #print(kp_scores)
        result[arg] = {kp_id:score for kp_id, score in zip(kp_ids, kp_scores)}
        

    return result

def predict(model, argument_df, keypoint_df, output_path, append_topic=False):
    argument_keypoints = {}
    for topic in argument_df.topic.unique():
        for stance in [-1, 1]:
            topic_keypoints_ids = keypoint_df[(keypoint_df.topic==topic) & (keypoint_df.stance==stance)]['key_point_id'].tolist()
            topic_keypoints = keypoint_df[(keypoint_df.topic==topic) & (keypoint_df.stance==stance)]['key_point'].tolist()
            if append_topic:
                topic_keypoints = [topic + ' <SEP> ' + x for x in topic_keypoints]
                
            topic_keypoints_embeddings = model.encode(topic_keypoints)
            topic_kp_embed = dict(zip(topic_keypoints_ids, topic_keypoints_embeddings))

            topic_arguments_ids = argument_df[(argument_df.topic==topic) & (argument_df.stance==stance)]['arg_id'].tolist()
            topic_arguments = argument_df[(argument_df.topic==topic) & (argument_df.stance==stance)]['argument'].tolist()
            topic_arguments_embeddings = model.encode(topic_arguments)
            topic_arg_embed= dict(zip(topic_arguments_ids, topic_arguments_embeddings))

            argument_keypoints = match_argument_with_keypoints(argument_keypoints, topic_kp_embed, topic_arg_embed)
    
    json.dump(argument_keypoints, open(output_path, 'w'))
    
    return argument_keypoints

def predict_and_evaluate(argument_df, keypoint_df, gold_data_dir, subset_name):
    pred_df = {}
    for model_path in models_list:
        append_topic= 'topic_added' in model_path
        #Predict
        model = SentenceTransformer(model_path)
        model_name = model_path.split('/')[-1]
        predictions_file = pred_output_path+model_name+ '-' + subset_name + '-preds.json'
        json_preds = predict(model, argument_df, keypoint_df, predictions_file, append_topic)

        #Evaluate
        arg_df, kp_df, labels_df = load_kpm_data(gold_data_dir, subset=subset_name)
        merged_df = get_predictions(predictions_file, labels_df, arg_df)
        print('Evaluating {}:'.format(model_name))
        evaluate_predictions(merged_df)
        
        pred_df[model_name] = merged_df

    return pred_df

def predict_models(argument_df, keypoint_df, gold_data_dir, subset_name):
    pred_df = {}
    for model_path in models_list:
        append_topic= 'topic_added' in model_path
        #Predict
        model = SentenceTransformer(model_path)
        model_name = model_path.split('/')[-1]
        predictions_file = pred_output_path+model_name+ '-' + subset_name + '-preds.json'
        json_preds = predict(model, argument_df, keypoint_df, predictions_file, append_topic)

        #Evaluate
        arg_df, kp_df, labels_df = load_kpm_data(gold_data_dir, subset=subset_name)
        merged_df = get_predictions(predictions_file, labels_df, arg_df)
        #print('Evaluating {}:'.format(model_name))
        #evaluate_predictions(merged_df)
        
        pred_df[model_name] = merged_df

    return pred_df

In [36]:
def ensamble_training(all_train_df, output_path):
    skf = GroupKFold(n_splits=5)
    fold = -1
    for train_index, test_index in skf.split(all_train_df, groups=all_train_df.topic):
        fold += 1
        tmp_train_df, tmp_test_df = all_train_df.iloc[train_index], all_train_df.iloc[test_index]

        df = tmp_train_df.copy()
        df['keypoint'] = df.apply(lambda x: x['topic'] + ' <SEP> ' + x['key_point'], axis=1)
        df['label'] = df.label.apply(lambda x: int(x))
        df[['argument', 'keypoint', 'label']].to_csv(data_path + '/keypoint-analysis-sharedtask/siamese-data/training_df_contrastive-fold-{}.csv'.format(fold))

        df = tmp_test_df.copy()
        df['keypoint'] = df.apply(lambda x: x['topic'] + ' <SEP> ' + x['key_point'], axis=1)
        df['label'] = df.label.apply(lambda x: int(x))
        df[['argument', 'keypoint', 'label']].to_csv(data_path + '/keypoint-analysis-sharedtask/siamese-data/valid_df_contrastive-fold-{}.csv'.format(fold))

        tmp_test_key_points_df = tmp_test_df[['key_point_id', 'key_point', 'topic', 'stance']].drop_duplicates()
        tmp_test_arguments_df = tmp_test_df[['arg_id', 'argument', 'topic', 'stance']].drop_duplicates()
        tmp_test_labels_df = tmp_test_df[['arg_id', 'key_point_id', 'label']]
        tmp_test_key_points_df.to_csv('../../data/cross-validation/key_points_test.csv')
        tmp_test_arguments_df.to_csv('../../data/cross-validation/arguments_test.csv')
        tmp_test_labels_df.to_csv('../../data/cross-validation/labels_test.csv')


        sbert_training.train_model(data_path + '/keypoint-analysis-sharedtask/siamese-data/',
                                '../../data/cross-validation/',
                                'test',
                                output_path,
                                'roberta-large',
                                model_suffix='final-model-fold-{}'.format(fold), 
                                data_file_suffix='contrastive-fold-{}'.format(fold), 
                                num_epochs=10, max_seq_length=70, add_special_token=True, train_batch_size=32, loss='ContrastiveLoss')

In [13]:
#ensamble_training(all_train_df, data_path + '/keypoint-analysis-sharedtask/final-experiment/') #train on training and dev df

In [None]:
ensamble_training(train_df, data_path + '/keypoint-analysis-sharedtask/final-experiment-on-training-data/') #train on only the training df

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2023-07-05 15:10:58 - Use pytorch device: cuda
2023-07-05 15:10:58 - Read Triplet train dataset


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/509 [00:00<?, ?it/s]

2023-07-05 15:13:20 - TripletEvaluator: Evaluating the model on dev dataset in epoch 0 after 500 steps:
mAP strict= 0.813615255802711 ; mAP relaxed = 0.909790865791593
2023-07-05 15:13:21 - mAP strict:   	81.36
2023-07-05 15:13:21 - mAP relaxed:   	90.98
2023-07-05 15:13:21 - Save model to /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment-on-training-data/roberta-large-final-model-fold-0-2023-07-05_15-10-46
2023-07-05 15:13:23 - TripletEvaluator: Evaluating the model on dev dataset after epoch 0:
mAP strict= 0.803426516935683 ; mAP relaxed = 0.9220588871127353
2023-07-05 15:13:25 - mAP strict:   	80.34
2023-07-05 15:13:25 - mAP relaxed:   	92.21
2023-07-05 15:13:25 - Save model to /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment-on-training-data/roberta-large-final-model-fold-0-2023-07-05_15-10-46


Iteration:   0%|          | 0/509 [00:00<?, ?it/s]

2023-07-05 15:14:36 - TripletEvaluator: Evaluating the model on dev dataset in epoch 1 after 500 steps:
mAP strict= 0.8839568800525848 ; mAP relaxed = 0.9518063634749213
2023-07-05 15:14:38 - mAP strict:   	88.40
2023-07-05 15:14:38 - mAP relaxed:   	95.18
2023-07-05 15:14:38 - Save model to /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment-on-training-data/roberta-large-final-model-fold-0-2023-07-05_15-10-46
2023-07-05 15:14:40 - TripletEvaluator: Evaluating the model on dev dataset after epoch 1:
mAP strict= 0.8920986973698117 ; mAP relaxed = 0.959683400592996
2023-07-05 15:14:41 - mAP strict:   	89.21
2023-07-05 15:14:41 - mAP relaxed:   	95.97
2023-07-05 15:14:41 - Save model to /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment-on-training-data/roberta-large-final-model-fold-0-2023-07-05_15-10-46


Iteration:   0%|          | 0/509 [00:00<?, ?it/s]

2023-07-05 15:15:53 - TripletEvaluator: Evaluating the model on dev dataset in epoch 2 after 500 steps:
mAP strict= 0.9102662215154933 ; mAP relaxed = 0.9664335333899432
2023-07-05 15:15:54 - mAP strict:   	91.03
2023-07-05 15:15:54 - mAP relaxed:   	96.64
2023-07-05 15:15:54 - Save model to /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment-on-training-data/roberta-large-final-model-fold-0-2023-07-05_15-10-46
2023-07-05 15:15:57 - TripletEvaluator: Evaluating the model on dev dataset after epoch 2:
mAP strict= 0.9079224545216688 ; mAP relaxed = 0.9679078947221074
2023-07-05 15:15:58 - mAP strict:   	90.79
2023-07-05 15:15:58 - mAP relaxed:   	96.79


Iteration:   0%|          | 0/509 [00:00<?, ?it/s]

2023-07-05 15:17:09 - TripletEvaluator: Evaluating the model on dev dataset in epoch 3 after 500 steps:
mAP strict= 0.8963748408878406 ; mAP relaxed = 0.9531792403111503
2023-07-05 15:17:10 - mAP strict:   	89.64
2023-07-05 15:17:10 - mAP relaxed:   	95.32
2023-07-05 15:17:11 - TripletEvaluator: Evaluating the model on dev dataset after epoch 3:
mAP strict= 0.9011174844208716 ; mAP relaxed = 0.9539262564015809
2023-07-05 15:17:12 - mAP strict:   	90.11
2023-07-05 15:17:12 - mAP relaxed:   	95.39


Iteration:   0%|          | 0/509 [00:00<?, ?it/s]

2023-07-05 15:18:23 - TripletEvaluator: Evaluating the model on dev dataset in epoch 4 after 500 steps:
mAP strict= 0.9111058492649764 ; mAP relaxed = 0.9637008379471735
2023-07-05 15:18:24 - mAP strict:   	91.11
2023-07-05 15:18:24 - mAP relaxed:   	96.37
2023-07-05 15:18:25 - TripletEvaluator: Evaluating the model on dev dataset after epoch 4:
mAP strict= 0.9119080324362787 ; mAP relaxed = 0.9606603954118315
2023-07-05 15:18:26 - mAP strict:   	91.19
2023-07-05 15:18:26 - mAP relaxed:   	96.07


Iteration:   0%|          | 0/509 [00:00<?, ?it/s]

2023-07-05 15:19:37 - TripletEvaluator: Evaluating the model on dev dataset in epoch 5 after 500 steps:
mAP strict= 0.9058037598134792 ; mAP relaxed = 0.9636197220574767
2023-07-05 15:19:38 - mAP strict:   	90.58
2023-07-05 15:19:38 - mAP relaxed:   	96.36
2023-07-05 15:19:39 - TripletEvaluator: Evaluating the model on dev dataset after epoch 5:
mAP strict= 0.9116562688135061 ; mAP relaxed = 0.9682348142226977
2023-07-05 15:19:40 - mAP strict:   	91.17
2023-07-05 15:19:40 - mAP relaxed:   	96.82
2023-07-05 15:19:40 - Save model to /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment-on-training-data/roberta-large-final-model-fold-0-2023-07-05_15-10-46


Iteration:   0%|          | 0/509 [00:00<?, ?it/s]

2023-07-05 15:20:52 - TripletEvaluator: Evaluating the model on dev dataset in epoch 6 after 500 steps:
mAP strict= 0.9012131803602553 ; mAP relaxed = 0.9659202605199448
2023-07-05 15:20:53 - mAP strict:   	90.12
2023-07-05 15:20:53 - mAP relaxed:   	96.59
2023-07-05 15:20:54 - TripletEvaluator: Evaluating the model on dev dataset after epoch 6:
mAP strict= 0.9024928803456639 ; mAP relaxed = 0.9664991927589555
2023-07-05 15:20:55 - mAP strict:   	90.25
2023-07-05 15:20:55 - mAP relaxed:   	96.65


Iteration:   0%|          | 0/509 [00:00<?, ?it/s]

2023-07-05 15:22:06 - TripletEvaluator: Evaluating the model on dev dataset in epoch 7 after 500 steps:
mAP strict= 0.9207939225418749 ; mAP relaxed = 0.9730516828383852
2023-07-05 15:22:07 - mAP strict:   	92.08
2023-07-05 15:22:07 - mAP relaxed:   	97.31
2023-07-05 15:22:07 - Save model to /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment-on-training-data/roberta-large-final-model-fold-0-2023-07-05_15-10-46
2023-07-05 15:22:09 - TripletEvaluator: Evaluating the model on dev dataset after epoch 7:
mAP strict= 0.9193774410861497 ; mAP relaxed = 0.9731269969287369
2023-07-05 15:22:11 - mAP strict:   	91.94
2023-07-05 15:22:11 - mAP relaxed:   	97.31


Iteration:   0%|          | 0/509 [00:00<?, ?it/s]

2023-07-05 15:23:21 - TripletEvaluator: Evaluating the model on dev dataset in epoch 8 after 500 steps:
mAP strict= 0.9173016133334644 ; mAP relaxed = 0.9732520261023871
2023-07-05 15:23:22 - mAP strict:   	91.73
2023-07-05 15:23:22 - mAP relaxed:   	97.33
2023-07-05 15:23:23 - TripletEvaluator: Evaluating the model on dev dataset after epoch 8:
mAP strict= 0.9170725680261181 ; mAP relaxed = 0.9730804154005035
2023-07-05 15:23:25 - mAP strict:   	91.71
2023-07-05 15:23:25 - mAP relaxed:   	97.31


Iteration:   0%|          | 0/509 [00:00<?, ?it/s]

2023-07-05 15:24:35 - TripletEvaluator: Evaluating the model on dev dataset in epoch 9 after 500 steps:
mAP strict= 0.9197727316639425 ; mAP relaxed = 0.9754032208752408
2023-07-05 15:24:36 - mAP strict:   	91.98
2023-07-05 15:24:36 - mAP relaxed:   	97.54
2023-07-05 15:24:36 - Save model to /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment-on-training-data/roberta-large-final-model-fold-0-2023-07-05_15-10-46
2023-07-05 15:24:39 - TripletEvaluator: Evaluating the model on dev dataset after epoch 9:
mAP strict= 0.9198276172545423 ; mAP relaxed = 0.9754032208752408
2023-07-05 15:24:40 - mAP strict:   	91.98
2023-07-05 15:24:40 - mAP relaxed:   	97.54
2023-07-05 15:24:40 - Save model to /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment-on-training-data/roberta-large-final-model-fold-0-2023-07-05_15-10-46


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2023-07-05 15:24:48 - Use pytorch device: cuda
2023-07-05 15:24:48 - Read Triplet train dataset


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/519 [00:00<?, ?it/s]

2023-07-05 15:25:58 - TripletEvaluator: Evaluating the model on dev dataset in epoch 0 after 500 steps:
mAP strict= 0.8418132616014364 ; mAP relaxed = 0.9700350526273638
2023-07-05 15:26:00 - mAP strict:   	84.18
2023-07-05 15:26:00 - mAP relaxed:   	97.00
2023-07-05 15:26:00 - Save model to /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment-on-training-data/roberta-large-final-model-fold-1-2023-07-05_15-24-42
2023-07-05 15:26:04 - TripletEvaluator: Evaluating the model on dev dataset after epoch 0:
mAP strict= 0.8294343305342927 ; mAP relaxed = 0.9552328874849959
2023-07-05 15:26:05 - mAP strict:   	82.94
2023-07-05 15:26:05 - mAP relaxed:   	95.52


Iteration:   0%|          | 0/519 [00:00<?, ?it/s]

2023-07-05 15:27:15 - TripletEvaluator: Evaluating the model on dev dataset in epoch 1 after 500 steps:
mAP strict= 0.7888991201683473 ; mAP relaxed = 0.9204343667742974
2023-07-05 15:27:16 - mAP strict:   	78.89
2023-07-05 15:27:16 - mAP relaxed:   	92.04
2023-07-05 15:27:19 - TripletEvaluator: Evaluating the model on dev dataset after epoch 1:
mAP strict= 0.7975542958175607 ; mAP relaxed = 0.9286520240687176
2023-07-05 15:27:20 - mAP strict:   	79.76
2023-07-05 15:27:20 - mAP relaxed:   	92.87


Iteration:   0%|          | 0/519 [00:00<?, ?it/s]

2023-07-05 15:28:30 - TripletEvaluator: Evaluating the model on dev dataset in epoch 2 after 500 steps:
mAP strict= 0.7901546463727549 ; mAP relaxed = 0.9172256739657507
2023-07-05 15:28:32 - mAP strict:   	79.02
2023-07-05 15:28:32 - mAP relaxed:   	91.72
2023-07-05 15:28:34 - TripletEvaluator: Evaluating the model on dev dataset after epoch 2:
mAP strict= 0.7950148295283774 ; mAP relaxed = 0.9283130110639737
2023-07-05 15:28:36 - mAP strict:   	79.50
2023-07-05 15:28:36 - mAP relaxed:   	92.83


Iteration:   0%|          | 0/519 [00:00<?, ?it/s]

2023-07-05 15:29:46 - TripletEvaluator: Evaluating the model on dev dataset in epoch 3 after 500 steps:
mAP strict= 0.8528222718627827 ; mAP relaxed = 0.9477863498132157
2023-07-05 15:29:47 - mAP strict:   	85.28
2023-07-05 15:29:47 - mAP relaxed:   	94.78
2023-07-05 15:29:50 - TripletEvaluator: Evaluating the model on dev dataset after epoch 3:
mAP strict= 0.8452806415122964 ; mAP relaxed = 0.9486908091294207
2023-07-05 15:29:51 - mAP strict:   	84.53
2023-07-05 15:29:51 - mAP relaxed:   	94.87


Iteration:   0%|          | 0/519 [00:00<?, ?it/s]

2023-07-05 15:31:02 - TripletEvaluator: Evaluating the model on dev dataset in epoch 4 after 500 steps:
mAP strict= 0.8304611331750753 ; mAP relaxed = 0.9359027782877003
2023-07-05 15:31:03 - mAP strict:   	83.05
2023-07-05 15:31:03 - mAP relaxed:   	93.59
2023-07-05 15:31:06 - TripletEvaluator: Evaluating the model on dev dataset after epoch 4:
mAP strict= 0.8332152026373233 ; mAP relaxed = 0.940929534754787
2023-07-05 15:31:07 - mAP strict:   	83.32
2023-07-05 15:31:07 - mAP relaxed:   	94.09


Iteration:   0%|          | 0/519 [00:00<?, ?it/s]

2023-07-05 15:32:17 - TripletEvaluator: Evaluating the model on dev dataset in epoch 5 after 500 steps:
mAP strict= 0.841552263524677 ; mAP relaxed = 0.940960833242562
2023-07-05 15:32:19 - mAP strict:   	84.16
2023-07-05 15:32:19 - mAP relaxed:   	94.10
2023-07-05 15:32:21 - TripletEvaluator: Evaluating the model on dev dataset after epoch 5:
mAP strict= 0.8434935219757269 ; mAP relaxed = 0.9431003213500997
2023-07-05 15:32:22 - mAP strict:   	84.35
2023-07-05 15:32:22 - mAP relaxed:   	94.31


Iteration:   0%|          | 0/519 [00:00<?, ?it/s]

In [14]:
ls /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment/

[0m[01;34mroberta-large-final-model-fold-0-2023-07-03_14-50-42[0m/
[01;34mroberta-large-final-model-fold-1-2023-07-03_15-07-56[0m/
[01;34mroberta-large-final-model-fold-2-2023-07-03_15-22-51[0m/
[01;34mroberta-large-final-model-fold-3-2023-07-03_15-37-30[0m/
[01;34mroberta-large-final-model-fold-4-2023-07-03_15-52-27[0m/


In [15]:
models_list = [
     data_path + '/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-0-2023-07-03_14-50-42',
     data_path + '/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-1-2023-07-03_15-07-56',
     data_path + '/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-2-2023-07-03_15-22-51',
     data_path + '/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-3-2023-07-03_15-37-30',
     data_path + '/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-4-2023-07-03_15-52-27',
]

pred_output_path = data_path + '/keypoint-analysis-sharedtask/siamese-data/preds/'

In [16]:
from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler, losses, models, util
import torch
from track_1_kp_matching import *
# testing wheter prediction work on the dev data
test_keypoints_df = pd.read_csv('../../KPA_2021_shared_task/kpm_data/key_points_dev.csv')
test_arguments_df = pd.read_csv('../../KPA_2021_shared_task/kpm_data/arguments_dev.csv')
preds_df = predict_and_evaluate(test_arguments_df, test_keypoints_df,  '../../KPA_2021_shared_task/kpm_data', 'dev')

2023-07-05 14:31:27 - Load pretrained SentenceTransformer: /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-0-2023-07-03_14-50-42
2023-07-05 14:31:38 - Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

loaded predictions for 932 arguments
Evaluating roberta-large-final-model-fold-0-2023-07-03_14-50-42:
mAP strict= 0.8908228837841844 ; mAP relaxed = 0.9808871827254642
2023-07-05 14:32:00 - Load pretrained SentenceTransformer: /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-1-2023-07-03_15-07-56
2023-07-05 14:32:09 - Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

loaded predictions for 932 arguments
Evaluating roberta-large-final-model-fold-1-2023-07-03_15-07-56:
mAP strict= 0.8966189731994281 ; mAP relaxed = 0.9707501481994847
2023-07-05 14:32:10 - Load pretrained SentenceTransformer: /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-2-2023-07-03_15-22-51
2023-07-05 14:32:18 - Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

loaded predictions for 932 arguments
Evaluating roberta-large-final-model-fold-2-2023-07-03_15-22-51:
mAP strict= 0.9329552696118272 ; mAP relaxed = 0.9961648676018134
2023-07-05 14:32:20 - Load pretrained SentenceTransformer: /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-3-2023-07-03_15-37-30
2023-07-05 14:32:28 - Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

loaded predictions for 932 arguments
Evaluating roberta-large-final-model-fold-3-2023-07-03_15-37-30:
mAP strict= 0.9024235520682247 ; mAP relaxed = 0.9828698992734068
2023-07-05 14:32:30 - Load pretrained SentenceTransformer: /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-4-2023-07-03_15-52-27
2023-07-05 14:32:38 - Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

loaded predictions for 932 arguments
Evaluating roberta-large-final-model-fold-4-2023-07-03_15-52-27:
mAP strict= 0.8848682333231876 ; mAP relaxed = 0.9932914042757078


In [23]:
final_pred_df = preds_df[models_list[0].split('/')[-1]].copy()
final_pred_df['score1'] = preds_df[models_list[1].split('/')[-1]]['score']
final_pred_df['score2'] = preds_df[models_list[2].split('/')[-1]]['score']
final_pred_df['score3'] = preds_df[models_list[3].split('/')[-1]]['score']
final_pred_df['score4'] = preds_df[models_list[3].split('/')[-1]]['score']
final_pred_df['score']   = final_pred_df.apply(lambda row: np.mean([row['score'], row['score1'], row['score2'], row['score3'], row['score4']]), axis=1)

In [24]:
final_pred_df.head()

Unnamed: 0,arg_id,topic,stance,key_point_id,score,label,label_strict,label_relaxed,score1,score2,score3,score4
0,arg_4_0,We should abandon the use of school uniform,-1,kp_4_1,0.912448,1.0,1.0,1.0,0.986386,0.900745,0.918817,0.918817
1,arg_4_1,We should abandon the use of school uniform,-1,kp_4_2,0.836688,1.0,1.0,1.0,0.962031,0.910586,0.773339,0.773339
2,arg_4_2,We should abandon the use of school uniform,-1,kp_4_3,0.905298,1.0,1.0,1.0,0.978076,0.84965,0.904299,0.904299
3,arg_4_3,We should abandon the use of school uniform,-1,kp_4_3,0.949089,1.0,1.0,1.0,0.982908,0.918987,0.955536,0.955536
4,arg_4_4,We should abandon the use of school uniform,-1,kp_4_2,0.93226,1.0,1.0,1.0,0.978586,0.910103,0.95923,0.95923


In [26]:
final_pred_df.to_pickle('../../data/dev_argument_keypoint_predicted_scores.pkl')

In [27]:
from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler, losses, models, util
import torch
from track_1_kp_matching import *
# predicting the test data
test_keypoints_df = pd.read_csv('../../KPA_2021_shared_task/test_data/key_points_test.csv')
test_arguments_df = pd.read_csv('../../KPA_2021_shared_task/test_data/arguments_test.csv')
preds_df = predict_models(test_arguments_df, test_keypoints_df,  '../../data/cross-validation', 'test')

2023-07-05 14:47:00 - Load pretrained SentenceTransformer: /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-0-2023-07-03_14-50-42
2023-07-05 14:47:02 - Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

loaded predictions for 723 arguments
2023-07-05 14:47:03 - Load pretrained SentenceTransformer: /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-1-2023-07-03_15-07-56
2023-07-05 14:47:05 - Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

loaded predictions for 723 arguments
2023-07-05 14:47:06 - Load pretrained SentenceTransformer: /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-2-2023-07-03_15-22-51
2023-07-05 14:47:08 - Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

loaded predictions for 723 arguments
2023-07-05 14:47:09 - Load pretrained SentenceTransformer: /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-3-2023-07-03_15-37-30
2023-07-05 14:47:11 - Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

loaded predictions for 723 arguments
2023-07-05 14:47:12 - Load pretrained SentenceTransformer: /mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-4-2023-07-03_15-52-27
2023-07-05 14:47:14 - Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

loaded predictions for 723 arguments


In [28]:
final_pred_df = preds_df[models_list[0].split('/')[-1]].copy()
final_pred_df['score1'] = preds_df[models_list[1].split('/')[-1]]['score']
final_pred_df['score2'] = preds_df[models_list[2].split('/')[-1]]['score']
final_pred_df['score3'] = preds_df[models_list[3].split('/')[-1]]['score']
final_pred_df['score4'] = preds_df[models_list[3].split('/')[-1]]['score']
final_pred_df['score']   = final_pred_df.apply(lambda row: np.mean([row['score'], row['score1'], row['score2'], row['score3'], row['score4']]), axis=1)

In [29]:
final_pred_df.head()

Unnamed: 0.1,arg_id,topic,stance,key_point_id,score,Unnamed: 0,label,label_strict,label_relaxed,score1,score2,score3,score4
0,arg_1_0,Homeschooling should be banned,-1,kp_1_3,0.746209,1357.0,1.0,1.0,1.0,0.868329,0.763999,0.682272,0.682272
1,arg_1_1,Homeschooling should be banned,-1,kp_1_1,0.936784,1141.0,1.0,1.0,1.0,0.955544,0.942282,0.923475,0.923475
2,arg_1_2,Homeschooling should be banned,-1,kp_1_2,0.969901,1234.0,0.0,0.0,0.0,0.992001,0.975318,0.977233,0.977233
3,arg_1_3,Homeschooling should be banned,-1,kp_1_2,0.965106,1235.0,0.0,0.0,0.0,0.993555,0.966211,0.983325,0.983325
4,arg_1_4,Homeschooling should be banned,-1,kp_1_2,0.796182,1236.0,0.0,0.0,0.0,0.876498,0.669439,0.805731,0.805731


In [30]:
final_pred_df.to_pickle('../../data/test_argument_keypoint_predicted_scores.pkl')