In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
pd.set_option('display.max_colwidth', None)

In [None]:
# Load IPython's autoreload extension so the `%autoreload` magic used in the next cell is available.
%load_ext autoreload

In [None]:
# Bare `%autoreload` (no mode argument) triggers a reload of tracked modules at this point.
%autoreload

import sys
import os

# Expose only CUDA device 2 to this process; set before `sbert_training` is imported below.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
# Put the project's Python sources first on the import path so `sbert_training` resolves.
sys.path.insert(0, "../src-py")

import sbert_training

In [None]:
# Load the training and validation CSVs from ../../data.
# NOTE(review): neither frame is referenced again in the cells shown in this notebook.
training_df = pd.read_csv('../../data/training_df.csv')
valid_df = pd.read_csv('../../data/our_valid.csv')

In [None]:
# Shared-task train split: key points, arguments and (arg_id, key_point_id) labels.
train_kp_df     = pd.read_csv('../../KPA_2021_shared_task/kpm_data/key_points_train.csv')
train_arg_df    = pd.read_csv('../../KPA_2021_shared_task/kpm_data/arguments_train.csv')
train_labels_df = pd.read_csv('../../KPA_2021_shared_task/kpm_data/labels_train.csv')

# Shared-task dev split, same three files.
dev_kp_df     = pd.read_csv('../../KPA_2021_shared_task/kpm_data/key_points_dev.csv')
dev_arg_df    = pd.read_csv('../../KPA_2021_shared_task/kpm_data/arguments_dev.csv')
dev_labels_df = pd.read_csv('../../KPA_2021_shared_task/kpm_data/labels_dev.csv')

# Train and dev are stacked so that cross-validation can draw folds from both.
# The original row indices are kept (no ignore_index), so index values repeat.
full_train_kp_df = pd.concat([train_kp_df,dev_kp_df])
full_train_arg_df  = pd.concat([train_arg_df,dev_arg_df])
full_train_labels_df = pd.concat([train_labels_df,dev_labels_df])

# Test split: only key points and arguments are loaded here (no labels file).
test_kp_df     = pd.read_csv('../../KPA_2021_shared_task/test_data/key_points_test.csv')
test_arg_df    = pd.read_csv('../../KPA_2021_shared_task/test_data/arguments_test.csv')

In [None]:
# Join every labelled (arg_id, key_point_id) pair with its argument row and
# with the text of its key point; inner joins drop pairs missing on either side.
all_train_df = (
    full_train_labels_df
    .merge(full_train_arg_df, how='inner', on='arg_id')
    .merge(full_train_kp_df[['key_point_id', 'key_point']], how='inner', on='key_point_id')
)

In [None]:
# Preview the first rows of the merged label/argument/key-point frame.
all_train_df.head()

In [None]:
# Preview the first rows of the test arguments.
test_arg_df.head()

In [None]:
# Preview the first rows of the test key points.
test_kp_df.head()

In [None]:
def match_argument_with_keypoints(result, kp_dict, arg_dict):
    """Score every argument against every key point with cosine similarity.

    Args:
        result: dict that is updated in place. For each argument id in
            ``arg_dict`` the entry ``result[arg_id]`` is (re)assigned to a
            ``{kp_id: score}`` dict; entries for other ids are left alone.
        kp_dict: mapping of key point id -> embedding.
        arg_dict: mapping of argument id -> embedding.

    Returns:
        The same ``result`` object that was passed in.

    The scores are the raw cosine similarities (no softmax is applied).
    An empty ``kp_dict`` yields an empty score dict per argument instead of
    raising while unpacking an empty sequence.
    """
    for arg_id, arg_embedding in arg_dict.items():
        # `util` is sentence_transformers.util, resolved from the notebook
        # namespace when the function is called.
        result[arg_id] = {
            kp_id: util.pytorch_cos_sim(arg_embedding, kp_embedding).item()
            for kp_id, kp_embedding in kp_dict.items()
        }

    return result

def predict(model, argument_df, keypoint_df, output_path, append_topic=False):
    """Score all arguments against the key points of the same topic and stance.

    Args:
        model: object with an ``encode(list_of_texts)`` method returning one
            embedding per text.
        argument_df: frame with ``arg_id``, ``argument``, ``topic`` and
            ``stance`` columns.
        keypoint_df: frame with ``key_point_id``, ``key_point``, ``topic`` and
            ``stance`` columns.
        output_path: file the predictions are written to as JSON.
        append_topic: when true, each key point is encoded as
            ``'<topic> <SEP> <key point>'`` instead of the bare key point.

    Returns:
        ``{arg_id: {key_point_id: score}}`` as built by
        ``match_argument_with_keypoints``; the same structure is dumped to
        ``output_path``.
    """
    argument_keypoints = {}
    for topic in argument_df.topic.unique():
        # Only the two stance values -1 and 1 are considered.
        for stance in [-1, 1]:
            arg_rows = argument_df[(argument_df.topic == topic) & (argument_df.stance == stance)]
            if arg_rows.empty:
                # Nothing to match for this topic/stance, so skip the encoding work.
                continue
            kp_rows = keypoint_df[(keypoint_df.topic == topic) & (keypoint_df.stance == stance)]

            kp_texts = kp_rows['key_point'].tolist()
            if append_topic:
                kp_texts = [topic + ' <SEP> ' + text for text in kp_texts]
            kp_embed = dict(zip(kp_rows['key_point_id'].tolist(), model.encode(kp_texts)))

            arg_embed = dict(zip(arg_rows['arg_id'].tolist(),
                                 model.encode(arg_rows['argument'].tolist())))

            argument_keypoints = match_argument_with_keypoints(argument_keypoints, kp_embed, arg_embed)

    # Context manager guarantees the file is flushed and closed before returning.
    with open(output_path, 'w') as out_file:
        json.dump(argument_keypoints, out_file)

    return argument_keypoints

def _predict_with_models(argument_df, keypoint_df, gold_data_dir, subset_name, evaluate):
    """Run every model in ``models_list`` and collect its merged predictions.

    Shared implementation behind ``predict_and_evaluate`` and
    ``predict_models``. ``models_list``, ``pred_output_path``,
    ``SentenceTransformer``, ``load_kpm_data``, ``get_predictions`` and
    ``evaluate_predictions`` are looked up in the notebook namespace at call
    time.

    Returns:
        dict mapping model name (last component of the model path) to the
        frame returned by ``get_predictions`` for that model.
    """
    pred_df = {}
    for model_path in models_list:
        # Models trained with the topic prepended carry 'topic_added' in their path.
        append_topic = 'topic_added' in model_path
        # normpath drops a trailing slash so the name is never the empty string.
        model_name = os.path.basename(os.path.normpath(model_path))
        predictions_file = os.path.join(pred_output_path,
                                        model_name + '-' + subset_name + '-preds.json')

        # Predict: writes the JSON predictions file consumed below.
        model = SentenceTransformer(model_path)
        predict(model, argument_df, keypoint_df, predictions_file, append_topic)

        # Merge the predictions with the gold data of the requested subset.
        arg_df, _, labels_df = load_kpm_data(gold_data_dir, subset=subset_name)
        merged_df = get_predictions(predictions_file, labels_df, arg_df)
        if evaluate:
            print('Evaluating {}:'.format(model_name))
            evaluate_predictions(merged_df)

        pred_df[model_name] = merged_df

    return pred_df


def predict_and_evaluate(argument_df, keypoint_df, gold_data_dir, subset_name):
    """Predict with every model in ``models_list`` and print its evaluation.

    Args:
        argument_df: arguments to score (passed through to ``predict``).
        keypoint_df: key points to score against (passed through to ``predict``).
        gold_data_dir: directory handed to ``load_kpm_data``.
        subset_name: subset handed to ``load_kpm_data``; also part of the
            predictions file name.

    Returns:
        dict of model name -> merged predictions frame.
    """
    return _predict_with_models(argument_df, keypoint_df, gold_data_dir, subset_name, evaluate=True)


def predict_models(argument_df, keypoint_df, gold_data_dir, subset_name):
    """Predict with every model in ``models_list`` without evaluating.

    Takes the same arguments and returns the same dict as
    ``predict_and_evaluate``; only the evaluation printout is skipped.
    """
    return _predict_with_models(argument_df, keypoint_df, gold_data_dir, subset_name, evaluate=False)

In [None]:
# Directories used by every fold (values unchanged from the inline literals).
siamese_data_dir = '/workspace/ceph_data/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/siamese-data/'
siamese_models_dir = '/workspace/ceph_data/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/siamese-models/'
cv_data_dir = '../../data/cross-validation/'


def _write_contrastive_csv(split_df, csv_path):
    """Write the argument / '<topic> <SEP> <key point>' / integer label columns of split_df to csv_path."""
    out_df = split_df[['argument']].copy()
    out_df['keypoint'] = split_df['topic'] + ' <SEP> ' + split_df['key_point']
    out_df['label'] = split_df['label'].astype(int)
    # The frame index is written as the first CSV column, as before.
    out_df.to_csv(csv_path)


# Grouping by topic keeps all rows of a topic on the same side of each split.
group_kfold = GroupKFold(n_splits=5)
for fold, (train_index, test_index) in enumerate(group_kfold.split(all_train_df, groups=all_train_df.topic)):
    tmp_train_df, tmp_test_df = all_train_df.iloc[train_index], all_train_df.iloc[test_index]

    _write_contrastive_csv(tmp_train_df, siamese_data_dir + 'training_df_contrastive-fold-{}.csv'.format(fold))
    _write_contrastive_csv(tmp_test_df, siamese_data_dir + 'valid_df_contrastive-fold-{}.csv'.format(fold))

    # Held-out fold in the shared-task file layout. The same three file names
    # are overwritten on every fold, so after the loop they hold the last fold.
    tmp_test_key_points_df = tmp_test_df[['key_point_id', 'key_point', 'topic', 'stance']].drop_duplicates()
    tmp_test_arguments_df = tmp_test_df[['arg_id', 'argument', 'topic', 'stance']].drop_duplicates()
    tmp_test_labels_df = tmp_test_df[['arg_id', 'key_point_id', 'label']]
    tmp_test_key_points_df.to_csv(cv_data_dir + 'key_points_test.csv')
    tmp_test_arguments_df.to_csv(cv_data_dir + 'arguments_test.csv')
    tmp_test_labels_df.to_csv(cv_data_dir + 'labels_test.csv')

    # NOTE(review): the suffix says '10-epochs' but num_epochs=1, and the base
    # model here is 'roberta-large' while the saved models listed in a later
    # cell are named 'roberta-base-...' — confirm which settings are intended.
    sbert_training.train_model(siamese_data_dir,
                            cv_data_dir,
                            'test',
                            siamese_models_dir,
                            'roberta-large',
                            model_suffix='contrastive-10-epochs-fold-{}'.format(fold),
                            data_file_suffix='contrastive-fold-{}'.format(fold),
                            num_epochs=1, max_seq_length=70, add_special_token=True, train_batch_size=32, loss='ContrastiveLoss')


In [None]:
# Root shared by the trained models and the prediction output directory.
_sharedtask_root = '/workspace/ceph_data/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/'

# One trained model directory per cross-validation fold.
_fold_model_dirs = (
    'roberta-base-contrastive-10-epochs-fold-fold-0-2021-06-24_14-14-22',
    'roberta-base-contrastive-10-epochs-fold-fold-1-2021-06-24_14-18-39',
    'roberta-base-contrastive-10-epochs-fold-fold-2-2021-06-24_14-22-54',
    'roberta-base-contrastive-10-epochs-fold-fold-3-2021-06-24_14-26-51',
    'roberta-base-contrastive-10-epochs-fold-4-2021-06-24_14-56-42',
)

models_list = [_sharedtask_root + 'siamese-models/' + dir_name for dir_name in _fold_model_dirs]

# Directory (with trailing slash) that receives the per-model prediction JSON files.
pred_output_path = _sharedtask_root + 'siamese-data/preds/'


In [None]:
# These imports provide `SentenceTransformer` and `util`, which the prediction
# functions defined earlier look up in the notebook namespace when called.
from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler, losses, models, util
import torch
# NOTE(review): presumably load_kpm_data, get_predictions and evaluate_predictions
# come from this star import — confirm against track_1_kp_matching.
from track_1_kp_matching import *
# Testing whether prediction works on the dev data.
test_keypoints_df = pd.read_csv('/workspace/project_git/keypoint-analysis-sharedtask/KPA_2021_shared_task/kpm_data/key_points_dev.csv')
test_arguments_df = pd.read_csv('/workspace/project_git/keypoint-analysis-sharedtask/KPA_2021_shared_task/kpm_data/arguments_dev.csv')
# Runs every model in models_list on the dev split and prints its evaluation.
preds_df = predict_and_evaluate(test_arguments_df, test_keypoints_df,  '/workspace/project_git/keypoint-analysis-sharedtask/KPA_2021_shared_task/kpm_data', 'dev')

In [None]:
# Same imports as the dev-prediction cell, repeated so this cell can run on its own.
from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler, losses, models, util
import torch
from track_1_kp_matching import *
# Predicting the test data (no evaluation printout).
# Rebinds test_keypoints_df / test_arguments_df, which the dev cell also assigns.
test_keypoints_df = pd.read_csv('/workspace/project_git/keypoint-analysis-sharedtask/KPA_2021_shared_task/test_data/key_points_test.csv')
test_arguments_df = pd.read_csv('/workspace/project_git/keypoint-analysis-sharedtask/KPA_2021_shared_task/test_data/arguments_test.csv')
# NOTE(review): the gold directory '../../data/cross-validation' holds the *_test.csv files that
# the cross-validation loop overwrites on every fold, not the shared-task test files read above —
# confirm that merging the test predictions against those files is intended.
preds_df = predict_models(test_arguments_df, test_keypoints_df,  '../../data/cross-validation', 'test')

In [None]:
# Number of rows in the merged predictions frame of the fold-0 model.
len(preds_df['roberta-base-contrastive-10-epochs-fold-fold-0-2021-06-24_14-14-22'])

In [None]:
# One `score` column per model; preds_df is keyed by the last component of each model path.
scores = [preds_df[model_path.split('/')[-1]].score for model_path in models_list]

In [None]:
# Average the collected scores across models (axis 0 runs over the list entries)
# and show how many averaged values result.
# NOTE(review): assumes every model's score column has the same length and row order — confirm.
len(np.mean(scores,axis=0))