In [1]:
# Colab-only setup: mount Google Drive, then change into the notebook's
# directory so the relative paths used in later cells ('../src-py',
# '../../data/...') resolve from there.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/NLP/argmine/code/src-ipynb

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/NLP/argmine/code/src-ipynb


In [2]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Show the full text of each DataFrame cell instead of truncating long strings.
pd.set_option('display.max_colwidth', None)

In [None]:
!pip install sentence_transformers;

### Predicting:

In [4]:
import os
import sys

# Put ../src-py first on the import path so the project's modules there can be
# imported; the path is relative to the current working directory.
sys.path.insert(0, "../src-py")

In [5]:
from track_1_kp_matching import *

In [6]:
from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler, losses, models, util
import torch

In [7]:
# Validation inputs. valid_df uses its first CSV column as the index; the key
# points and arguments come from the dev files under kpm_data.
# NOTE(review): valid_df is not referenced in the cells visible here.
valid_df = pd.read_csv('../../data/valid_df.csv', index_col=0)
valid_keypoints_df = pd.read_csv('../../data/kpm_data/key_points_dev.csv')
valid_arguments_df = pd.read_csv('../../data/kpm_data/arguments_dev.csv')

# Alternative key-point / argument files (currently disabled):
# valid_keypoints_df = pd.read_csv('../../data/key_points_our_valid.csv')
# valid_arguments_df = pd.read_csv('../../data/arguments_our_valid.csv')

In [8]:
def match_argument_with_keypoints(result, kp_dict, arg_dict):
    """Score every argument against every key point by cosine similarity.

    Args:
        result: Dict that is updated in place. For each argument id in
            ``arg_dict`` it receives a ``{key_point_id: score}`` mapping; an
            existing entry for the same argument id is overwritten.
        kp_dict: Mapping of key-point id to that key point's embedding.
        arg_dict: Mapping of argument id to that argument's embedding.

    Returns:
        The same ``result`` object that was passed in.
    """
    for arg, arg_embedding in arg_dict.items():
        # The stored scores are the raw cosine similarities as Python floats;
        # no softmax or other normalisation is applied. An empty kp_dict
        # simply yields an empty mapping for the argument.
        result[arg] = {
            kp: util.pytorch_cos_sim(arg_embedding, kp_embedding).item()
            for kp, kp_embedding in kp_dict.items()
        }

    return result

def predict(model, argument_df, keypoint_df, output_path, append_topic=False):
    """Score arguments against key points per (topic, stance) and save as JSON.

    For every topic found in ``argument_df`` and for each stance in ``[-1, 1]``,
    the matching key points and arguments are encoded with ``model`` and every
    argument is scored against every key point of the same topic and stance.

    Args:
        model: Object with an ``encode(list_of_str)`` method returning one
            embedding per input text.
        argument_df: DataFrame with ``arg_id``, ``argument``, ``topic`` and
            ``stance`` columns.
        keypoint_df: DataFrame with ``key_point_id``, ``key_point``, ``topic``
            and ``stance`` columns.
        output_path: File the resulting dict is written to as JSON.
        append_topic: When true, each key point is encoded as
            ``topic + ' <SEP> ' + key_point``. Arguments are always encoded
            as-is.

    Returns:
        Dict mapping each argument id to ``{key_point_id: score}``.
    """
    argument_keypoints = {}
    for topic in argument_df.topic.unique():
        for stance in [-1, 1]:
            # Filter each frame once per (topic, stance) and reuse the rows for
            # both the id column and the text column.
            kp_rows = keypoint_df[(keypoint_df.topic == topic) & (keypoint_df.stance == stance)]
            arg_rows = argument_df[(argument_df.topic == topic) & (argument_df.stance == stance)]
            if arg_rows.empty:
                # No arguments for this group, so there is nothing to score.
                continue

            topic_keypoints_ids = kp_rows['key_point_id'].tolist()
            topic_keypoints = kp_rows['key_point'].tolist()
            if append_topic:
                topic_keypoints = [topic + ' <SEP> ' + x for x in topic_keypoints]

            topic_keypoints_embeddings = model.encode(topic_keypoints)
            topic_kp_embed = dict(zip(topic_keypoints_ids, topic_keypoints_embeddings))

            topic_arguments_ids = arg_rows['arg_id'].tolist()
            topic_arguments = arg_rows['argument'].tolist()
            topic_arguments_embeddings = model.encode(topic_arguments)
            topic_arg_embed = dict(zip(topic_arguments_ids, topic_arguments_embeddings))

            argument_keypoints = match_argument_with_keypoints(argument_keypoints, topic_kp_embed, topic_arg_embed)

    # The context manager guarantees the file is flushed and closed, even if
    # serialisation fails part-way.
    with open(output_path, 'w') as out_file:
        json.dump(argument_keypoints, out_file)

    return argument_keypoints

In [9]:
# Paths of the trained models to run; predict_and_evaluate loads each entry
# with SentenceTransformer and enables topic-appending only for paths that
# contain 'topic_added'.
models_list = [
    '../../data/siamese-models/roberta-base-contrastive-10-epochs-2022-01-26_10-26-13',
    # '../../data/siamese-models/',
]

# Prefix for prediction files. File names are appended to it by plain string
# concatenation, so the trailing slash is required.
pred_output_path = '../../data/predictions/'

In [10]:
def predict_and_evaluate(argument_df, keypoint_df, gold_data_dir, subset_name,
                         model_paths=None, output_dir=None):
    """Predict with each model, evaluate against gold labels, collect results.

    Args:
        argument_df: Arguments to score (passed through to ``predict``).
        keypoint_df: Key points to score against (passed through to ``predict``).
        gold_data_dir: Directory handed to ``load_kpm_data`` for the gold data.
        subset_name: Subset handed to ``load_kpm_data``; also part of the
            prediction file name.
        model_paths: Optional iterable of model paths. Defaults to the
            notebook-level ``models_list``.
        output_dir: Optional directory for the prediction files. Defaults to
            the notebook-level ``pred_output_path``.

    Returns:
        Dict mapping each model name (last path component) to the DataFrame
        returned by ``get_predictions`` for that model.
    """
    if model_paths is None:
        model_paths = models_list
    if output_dir is None:
        output_dir = pred_output_path

    pred_df = {}
    for model_path in model_paths:
        # Models trained with the topic prepended are recognised by their path.
        append_topic = 'topic_added' in model_path

        # Predict
        model = SentenceTransformer(model_path)
        # Strip trailing slashes first: otherwise a path such as 'models/x/'
        # yields an empty name, and results from different models collide.
        model_name = model_path.rstrip('/').split('/')[-1]
        predictions_file = os.path.join(output_dir, model_name + '-' + subset_name + '-preds.json')
        predict(model, argument_df, keypoint_df, predictions_file, append_topic)

        # Evaluate
        # NOTE(review): the gold data is reloaded per model because it is not
        # visible here whether get_predictions modifies its inputs.
        arg_df, _kp_df, labels_df = load_kpm_data(gold_data_dir, subset=subset_name)
        merged_df = get_predictions(predictions_file, labels_df, arg_df)
        print('Evaluating {}:'.format(model_name))
        evaluate_predictions(merged_df)

        pred_df[model_name] = merged_df

    return pred_df

In [11]:
# Predict and evaluate on the 'dev' subset for every model in models_list;
# pred_dfs maps each model name to its merged prediction DataFrame.
pred_dfs = predict_and_evaluate(valid_arguments_df, valid_keypoints_df,  '../../data/kpm_data', 'dev')

loaded predictions for 932 arguments
Evaluating roberta-base-contrastive-10-epochs-2022-01-26_10-26-13:
mAP strict= 0.8373949261891496 ; mAP relaxed = 0.9608110703052493


---------

### Predicting on the final test set:

In [12]:
# Alternative argument / key-point files (currently disabled):
# test_arg_df = pd.read_csv('../../data/arguments_our_test.csv')
# test_keypoints_df = pd.read_csv('../../data/key_points_our_test.csv')

# Arguments and key points read from the test_data directory.
test_arg_df = pd.read_csv('../../data/test_data/arguments_test.csv')
test_keypoints_df = pd.read_csv('../../data/test_data/key_points_test.csv')

# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source. test_pred_keypoints_df is not referenced
# in the cells visible here.
test_pred_keypoints_df = pd.read_pickle('../../data/pagerank-test-keypoints.pkl')
# test_pred_keypoints_df = pd.read_csv('../../data/pagerank-generated-test-keypoints.csv')

In [13]:
# NOTE(review): this frame is not referenced in the cells visible here;
# confirm it is still needed.
test_pred_keypoints_aspect_df = pd.read_csv('../../data/test_split_with_aspects.csv')

In [14]:
# Score the test arguments against the test key points with the first
# configured model and write the result to '<pred_output_path>preds'.
model = SentenceTransformer(models_list[0])
# Derive append_topic from the model path, exactly as predict_and_evaluate
# does, so the test inputs are formatted the same way as during validation
# (it was previously hard-coded to True regardless of the model).
json_preds = predict(model, test_arg_df, test_keypoints_df, pred_output_path+"preds", 'topic_added' in models_list[0])

-----------