In [None]:
import sys
import os
# Configure CUDA before any CUDA-using library is imported:
#   - CUDA_VISIBLE_DEVICES='0' exposes only the first GPU to this process.
#   - CUDA_LAUNCH_BLOCKING='1' makes kernel launches synchronous, so errors are
#     reported at the failing call (slower; a debugging aid).
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '0',
    'CUDA_LAUNCH_BLOCKING': '1',
})

In [None]:
import re
import matplotlib.pyplot as plt
import transformers
import datasets
import glob

import torch
import json
from pathlib import Path
from datasets import load_dataset, Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, default_data_collator,
                          DebertaV2ForSequenceClassification, DebertaV2Tokenizer,
                          TrainingArguments, Trainer, AutoConfig)
from transformers.modeling_outputs import SequenceClassifierOutput

from torch.utils.data import DataLoader

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, precision_score, recall_score

from sklearn.model_selection import train_test_split

from imblearn.over_sampling import RandomOverSampler
from tabulate import tabulate


# Run on the GPU whenever one is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Never truncate long text cells when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
import random

# Single seed shared by every random number generator used in this notebook.
SEED = 123

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Oversampler seeded with the same value.
ros = RandomOverSampler(random_state=SEED)

In [None]:
# Name of the pretrained checkpoint. It is interpolated into the local tokenizer/config
# path and into the trained-model directories used further down.
# Alternative checkpoint (the code below has a dedicated "deberta-v3" branch):
#model_name = "deberta-v3-large"
model_name = "longformer-base-4096"

In [None]:
def load_ds(ds_path, model_name):
    """Load a pickled annotation frame and prepare it for turn-label prediction.

    Label strings in ``exp_act_label`` and ``dlg_act_label`` are rewritten so the
    5-levels and ELI5 corpora share one naming scheme, and a
    ``turn_text_with_topic`` column is added whose ``text`` is the topic
    (underscores replaced by spaces), the tokenizer's separator token, and the
    turn text.

    Args:
        ds_path: Path to a pickle file readable by ``pd.read_pickle``.
        model_name: Directory name of the tokenizer under the local model
            store; only its ``sep_token`` is used here.

    Returns:
        The loaded DataFrame with relabelled columns and the added
        ``turn_text_with_topic`` column (a dict with ``author`` and ``text``).
    """
    df = pd.read_pickle(ds_path)

    # (column, old label, new label) rewrites, applied in exactly this order.
    # NOTE(review): an earlier comment described the two explanation-act rewrites
    # in the opposite direction ("(E10) Other -> (E09) Other"); the behaviour of
    # the code is what is kept here -- confirm which direction is intended.
    relabel_rules = [
        ('exp_act_label', '(E09) Other', '(E10) Other'),
        ('exp_act_label', '(E10) Introducing Extraneous Information', '(E09) Introducing Extraneous Information'),
        ('dlg_act_label', '(D10) Other', '(D09) Other'),
        ('dlg_act_label', '(D09) To provide informing statement', '(D10) To provide informing statement'),
        ('dlg_act_label', '(D06) To answer - Other', '(D06) Answer - Other'),
        ('dlg_act_label', '(D07) To provide agreement statement', '(D07) Agreement'),
        ('dlg_act_label', '(D08) To provide disagreement statement', '(D08) Disagreement'),
    ]
    for column, old_label, new_label in relabel_rules:
        df[column] = df[column].apply(
            lambda x, old=old_label, new=new_label: new if x == old else x
        )

    # The tokenizer is only needed for its separator token.
    sep_token = AutoTokenizer.from_pretrained(f"/bigwork/nhwpficl/hf_models/{model_name}").sep_token

    def _prefix_topic(row):
        """Build the model input for one row: '<topic> <sep> <turn text>'."""
        turn = row['turn_text']
        return {
            'author': turn['author'],
            'text': row['topic'].replace('_', ' ') + f' {sep_token} ' + turn['text'],
        }

    df['turn_text_with_topic'] = df.apply(_prefix_topic, axis=1)

    return df

In [None]:
# Load both annotated corpora (the ELI5 frame comes from the *_training pickle).
fivelvls_annotation_df = load_ds(
    '../../data/five_levels_ds/annotation-results/MACE-measure/final_mace_predictions.pkl',
    model_name,
)
eli5_annotation_df = load_ds(
    '../../data/eli5_ds/annotation-results/MACE-measure/final_mace_predictions_training.pkl',
    model_name,
)

In [None]:
# Tag every row with its corpus of origin, then stack both corpora into one frame.
# The original row indexes are kept, so index values may repeat in dlgs_df.
fivelvls_annotation_df['ds'] = '5lvls'
eli5_annotation_df['ds'] = 'eli5'
dlgs_df = pd.concat([fivelvls_annotation_df, eli5_annotation_df])

In [None]:
# Number of rows contributed by each corpus.
dlgs_df.ds.value_counts()

ds
eli5     2728
5lvls    1550
Name: count, dtype: int64

In [None]:
# Preview the first five rows of the combined frame.
dlgs_df.head(5)

### Experiments:

- Train three models — one on eli5, one on 5lvls, and one on both — and evaluate each of them on three test settings (eli5, 5lvls, and both), covering in-domain and out-of-domain evaluation, using 5-fold cross-validation.

#### Prepare Folds:

In [None]:
# Number of distinct topics in the 5lvls corpus.
flvls_topic_counts = dlgs_df.loc[dlgs_df.ds == '5lvls', 'topic'].value_counts()
print(len(flvls_topic_counts))

13


In [None]:
# Rows per topic in the 5lvls corpus.
dlgs_df.loc[dlgs_df.ds == '5lvls', 'topic'].value_counts()

topic
dimensions          244
gravity             179
music_harmony       150
sleep_scientist     147
lasers              142
machine_learning    119
origani             112
hacking             100
nano_technology      92
blockchain           91
blackhole            70
connectome           60
virtual_reality      44
Name: count, dtype: int64

In [None]:
# Number of distinct topics in the eli5 corpus.
eli5_topic_counts = dlgs_df.loc[dlgs_df.ds == 'eli5', 'topic'].value_counts()
print(len(eli5_topic_counts))

154


In [None]:
# Rows per topic in the eli5 corpus.
dlgs_df.loc[dlgs_df.ds == 'eli5', 'topic'].value_counts()

In [14]:
from sklearn.model_selection import KFold

In [None]:
# Build the folds over *topics* (not rows), separately for each corpus, so that
# a topic never appears in both the training and the test part of a fold.
eli5_topics  = dlgs_df.loc[dlgs_df.ds == 'eli5', 'topic'].unique()
flvls_topics = dlgs_df.loc[dlgs_df.ds == '5lvls', 'topic'].unique()

# KFold without shuffling: the split depends only on the order of the topics.
kfold = KFold(n_splits=5)
flvls_folds = [
    (flvls_topics[train_idx], flvls_topics[test_idx])
    for train_idx, test_idx in kfold.split(flvls_topics)
]
eli5_folds = [
    (eli5_topics[train_idx], eli5_topics[test_idx])
    for train_idx, test_idx in kfold.split(eli5_topics)
]

# Transpose [(train, test), ...] into (train, ...) and (test, ...).
eli5_training_folds, eli5_test_folds = zip(*eli5_folds)
flvls_training_folds, flvls_test_folds = zip(*flvls_folds)

In [None]:
# JSON-serialisable view of the folds: split -> corpus -> one topic list per fold.
all_folds = {
    'train': {
        '5lvls': [fold.tolist() for fold in flvls_training_folds],
        'eli5': [fold.tolist() for fold in eli5_training_folds],
    },
    'test': {
        '5lvls': [fold.tolist() for fold in flvls_test_folds],
        'eli5': [fold.tolist() for fold in eli5_test_folds],
    },
}

In [None]:
# Persist the topic folds. The context manager closes the file deterministically,
# so the JSON is fully flushed to disk before anything reads it back.
with open('../../data/topic_folds.json', 'w') as folds_file:
    json.dump(all_folds, folds_file)

### Training models:

Now we train the models from the command line by running turn_label_prediction_experiment_with_bert.

The code for training the models is in turn_label_prediction_experiments_with_bert and with_bert_seq.

### Prediction using the trained models:

In [None]:
def majority_class(df):
    """Add leave-one-topic-out majority-class baseline predictions to ``df``.

    For every topic, the most frequent value of each label column is computed
    over all rows of the *other* topics and written to the held-out topic's
    rows in the matching ``*_maj_pred`` column.

    Args:
        df: DataFrame with columns ``topic``, ``topic_func_label``,
            ``dlg_act_label`` and ``exp_act_label``. It is modified in place.

    Returns:
        The same DataFrame object, with the columns ``topic_func_maj_pred``,
        ``dlg_act_maj_pred`` and ``exp_act_maj_pred`` filled in.
    """
    label_to_pred = (
        ('topic_func_label', 'topic_func_maj_pred'),
        ('dlg_act_label', 'dlg_act_maj_pred'),
        ('exp_act_label', 'exp_act_maj_pred'),
    )

    # Create the prediction columns up front so the masked assignments below
    # always target an existing object-typed column.
    for _, pred_clm in label_to_pred:
        if pred_clm not in df:
            df[pred_clm] = None

    for topic in df.topic.unique():
        held_out = df.topic == topic
        training_df = df[~held_out]
        for label_clm, pred_clm in label_to_pred:
            # Series.mode() returns a Series (sorted, possibly with several tied
            # values); the prediction is a single label, so take the first one.
            modes = training_df[label_clm].mode()
            # With only one topic there is no training data and hence no mode.
            df.loc[held_out, pred_clm] = modes.iloc[0] if len(modes) else None

    return df

def eval_preds(df, models_names, gt_clms, pred_clms):
    """Compute per-class and macro precision/recall/F1 for several prediction columns.

    Args:
        df: DataFrame holding both ground-truth and prediction columns.
        models_names: One model name per evaluated column pair.
        gt_clms: Names of the ground-truth columns.
        pred_clms: Names of the prediction columns, aligned with ``gt_clms``.

    Returns:
        A list with one ``[model_name, gt_column, scores]`` entry per column
        pair. ``scores`` maps every class occurring in the ground-truth column
        (plus ``'Macro AVG.'``) to a dict with the keys ``'prec'``,
        ``'recall'`` and ``'f1'``, each rounded to two decimals.
    """
    results_table = []
    for gt_clm, pred_clm, model_name in zip(gt_clms, pred_clms, models_names):
        ground_truths = df[gt_clm].tolist()
        predictions   = df[pred_clm].tolist()

        # Only classes present in the ground truth are scored.
        class_names = df[gt_clm].unique()

        prc_scores = precision_score(ground_truths, predictions, average=None, labels=class_names)
        rec_scores = recall_score(ground_truths, predictions, average=None, labels=class_names)
        f1_scores  = f1_score(ground_truths, predictions, average=None, labels=class_names)

        # sklearn metrics take (y_true, y_pred). Passing them the other way round
        # silently exchanges precision and recall, so the order matters here.
        macro_prc = precision_score(ground_truths, predictions, average='macro', labels=class_names)
        macro_rec = recall_score(ground_truths, predictions, average='macro', labels=class_names)
        macro_f1  = f1_score(ground_truths, predictions, average='macro', labels=class_names)

        scores = {
            c: {'prec': round(prc_scores[i], 2), 'recall': round(rec_scores[i], 2), 'f1': round(f1_scores[i], 2)}
            for i, c in enumerate(class_names)
        }
        scores['Macro AVG.'] = {'prec': round(macro_prc, 2), 'recall': round(macro_rec, 2), 'f1': round(macro_f1, 2)}

        results_table.append([model_name, gt_clm, scores])

    return results_table

def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score from raw model outputs.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; ``predictions`` holds
            per-class scores along the last axis and ``labels`` the gold
            class indices.

    Returns:
        ``{'f1-score': <macro F1>}``.
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(-1)
    # sklearn expects (y_true, y_pred); macro-F1 is unchanged by the order, but
    # the conventional order keeps this consistent with the other metrics.
    return {'f1-score': f1_score(labels, predicted_ids, average='macro')}

def evaluate_fold(fold_path, df, fold_idx, input_clm, tokenizer, label_clm='exp_act_label', ds_names=()):
    """Evaluate the best model of one cross-validation fold on that fold's test topics.

    Relies on the notebook globals ``model_name`` (selects config and model
    class), ``device`` and ``folds_dict`` (test topics per corpus and fold).

    Args:
        fold_path: Directory of the fold; the model is loaded from
            ``fold_path + 'best_model'``.
        df: DataFrame with a ``topic`` column, the input column and the label column.
        fold_idx: Index of the fold inside ``folds_dict['test'][<corpus>]``.
        input_clm: Column whose entries are dicts with a ``'text'`` key.
        tokenizer: Tokenizer matching the model.
        label_clm: Label column; characters 2-3 of each label string are parsed
            as a 1-based class number.
        ds_names: Corpora (``'eli5'`` and/or ``'5lvls'``) whose test topics
            make up the evaluation set.

    Returns:
        Macro-averaged F1 of the model's predictions on the selected rows.
    """
    print(fold_path)
    config = AutoConfig.from_pretrained(f"/bigwork/nhwpficl/hf_models/{model_name}")
    if "deberta-v3" in model_name:
        model = DebertaV2ForSequenceClassification.from_pretrained(fold_path + 'best_model').to(device)
        # NOTE(review): twice the configured position count is used for DeBERTa-v3;
        # presumably this mirrors the training setup -- confirm.
        max_length = config.max_position_embeddings * 2
    else:
        model = AutoModelForSequenceClassification.from_pretrained(fold_path + 'best_model').to(device)
        max_length = config.max_position_embeddings - 2

    model.eval()

    # Collect the held-out topics of this fold for the requested corpora.
    fold_topics = []
    for ds_name in ('eli5', '5lvls'):
        if ds_name in ds_names:
            fold_topics += folds_dict['test'][ds_name][fold_idx]

    # Work on a copy so adding the label column does not write into a slice of `df`.
    test_df = df[df.topic.isin(fold_topics)].copy()
    test_df['labels'] = test_df[label_clm].apply(lambda x: int(x[2:4])-1)

    eval_dataset = Dataset.from_pandas(test_df)
    # Keep only the tokenizer outputs. Dropping every source column also removes
    # the pandas index column when from_pandas added one.
    # NOTE(review): inputs longer than max_length are not truncated here.
    eval_dataset = eval_dataset.map(
        lambda examples: tokenizer([x['text'] for x in examples[input_clm]], padding='max_length', max_length=max_length),
        batched=True,
        remove_columns=eval_dataset.column_names,
    )

    eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator)
    all_predictions = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for batch in eval_dataloader:
            # Move inputs to the model's device (works on CPU-only machines too).
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            logits = model(**batch).logits
            all_predictions.extend(logits.argmax(dim=-1).tolist())

    # sklearn order is (y_true, y_pred).
    return f1_score(test_df['labels'].tolist(), all_predictions, average='macro')

def evaluate_model(models_path, label_clm, label_model, model_name='bert-base-uncased'):
    """Average the per-fold macro-F1 of the trained models on three test settings.

    For each of the settings eli5-only, 5lvls-only and both corpora, the five
    fold models under ``<models_path>/<label_model>/all_models/model/fold-<k>/``
    are evaluated with ``evaluate_fold`` on a copy of the global ``dlgs_df``,
    and the five scores are averaged.

    Note: ``model_name`` selects the tokenizer here, while ``evaluate_fold``
    reads the notebook-global ``model_name`` for the config and model class.

    Args:
        models_path: Root directory of the trained models.
        label_clm: Label column to evaluate against.
        label_model: Sub-directory name of the label-specific models.
        model_name: Tokenizer directory name; also the key of the result.

    Returns:
        ``{model_name: [f1_eli5, f1_5lvls, f1_both]}``, each rounded to two decimals.
    """
    tokenizer_dir = f"/bigwork/nhwpficl/hf_models/{model_name}"
    tokenizer_cls = DebertaV2Tokenizer if "deberta-v3" in model_name else AutoTokenizer
    tokenizer = tokenizer_cls.from_pretrained(tokenizer_dir)

    f1_scores = {}
    for model_type in ['all_models']:
        model_scores = []
        for ds_names in [['eli5'], ['5lvls'], ['5lvls', 'eli5']]:
            fold_f1s = []
            for fold in range(5):
                fold_dir = '{}/{}/{}/model/fold-{}/'.format(models_path, label_model, model_type, fold)
                fold_f1s.append(
                    evaluate_fold(fold_dir, dlgs_df.copy(), fold, 'turn_text_with_topic',
                                  tokenizer, label_clm, ds_names=ds_names)
                )
            model_scores.append(round(np.mean(fold_f1s), 2))
        f1_scores[model_name] = model_scores

    return f1_scores

In [None]:
# Reload the topic folds written earlier; the context manager closes the file handle.
with open('../../data/topic_folds.json') as folds_file:
    folds_dict = json.load(folds_file)

In [None]:
# Fold-averaged macro-F1 of the dlg_act_label models on eli5, 5lvls and both.
dlg_act_f1_scores = evaluate_model(
    f'../../data/turn-label-models/{model_name}/',
    'dlg_act_label',
    'dlg_act_label_prediction',
    model_name=model_name,
)

In [None]:
# Fold-averaged macro-F1 of the exp_act_label models on eli5, 5lvls and both.
exp_act_f1_scores = evaluate_model(
    f'../../data/turn-label-models/{model_name}/',
    'exp_act_label',
    'exp_act_label_prediction',
    model_name=model_name,
)

In [None]:
# Fold-averaged macro-F1 of the topic_func_label models on eli5, 5lvls and both.
topic_func_f1_scores = evaluate_model(
    f'../../data/turn-label-models/{model_name}/',
    'topic_func_label',
    'topic_func_label_prediction',
    model_name=model_name,
)

In [None]:
# One row per approach. The three score groups are, in order: exp_act, dlg_act
# and topic_func, each reported for the ELI5, 5lvls and combined test settings.
score_rows = [
    [approach] + exp_scores + dlg_scores + topic_scores
    for (approach, exp_scores), (_dlg_name, dlg_scores), (_topic_name, topic_scores)
    in zip(exp_act_f1_scores.items(), dlg_act_f1_scores.items(), topic_func_f1_scores.items())
]
print(tabulate(score_rows,
               headers=['Approach', 'ELI5', '5lvls', 'ALL', 'ELI5', '5lvls', 'ALL', 'ELI5', '5lvls', 'ALL']))

Approach                ELI5    5lvls    ALL    ELI5    5lvls    ALL    ELI5    5lvls    ALL
--------------------  ------  -------  -----  ------  -------  -----  ------  -------  -----
longformer-base-4096    0.37     0.38   0.41    0.38     0.47   0.48    0.38     0.56    0.5


--------

### Predicting on the test set:

- Best Performing Models:
    - Explanation Moves: ELI-5 trained BERT-Seq
    - Dialogue Acts: Both trained RoBERTa
    - Topic Func: ELI-5 trained RoBERTa

In [None]:
def load_ds(ds_path, model_name):
    """Load an annotated corpus or the user-study chats and prepare them for prediction.

    If ``ds_path`` contains ``"pkl"`` it is read as a pickled DataFrame.
    Otherwise it is treated as a directory holding ``chat_per_user.json`` and
    ``setup_per_user.json``; every chat turn becomes one row, enriched with
    ``task_id`` (the user id), ``topic`` (the user's ``explanandum``) and
    ``system_prompt`` (the user's ``setting``).

    Label strings in ``exp_act_label`` / ``dlg_act_label`` are rewritten to a
    shared naming scheme; each column is only touched when it is present, so
    unlabeled data (or data carrying just one of the two columns) loads fine.
    Finally a ``turn_text_with_topic`` column is added whose ``text`` is the
    topic (underscores replaced by spaces), the tokenizer's separator token,
    and the turn text.

    Args:
        ds_path: Pickle file path, or directory with the two JSON files.
        model_name: Directory name of the tokenizer under the local model
            store; only its ``sep_token`` is used here.

    Returns:
        The prepared DataFrame.
    """
    if "pkl" in ds_path:
        # NOTE: unpickling can execute arbitrary code -- only load trusted files.
        df = pd.read_pickle(ds_path)
    else:
        with open(f"{ds_path}/chat_per_user.json", "r") as file:
            chat_per_user = json.load(file)
        with open(f"{ds_path}/setup_per_user.json", "r") as file:
            setup_per_user = json.load(file)

        turns = []
        for user_id, user_turns in chat_per_user.items():
            user_setup = setup_per_user[user_id]
            for turn in user_turns:
                turn["task_id"] = user_id
                turn["topic"] = user_setup["explanandum"]
                turn["system_prompt"] = user_setup["setting"]
                turns.append(turn)
        df = pd.DataFrame.from_dict(turns)

    # Per-column (old label, new label) rewrites, applied in order.
    # NOTE(review): an earlier comment described the two explanation-act rewrites
    # in the opposite direction ("(E10) Other -> (E09) Other"); the behaviour of
    # the code is what is kept here -- confirm which direction is intended.
    relabel_rules = {
        'exp_act_label': [
            ('(E09) Other', '(E10) Other'),
            ('(E10) Introducing Extraneous Information', '(E09) Introducing Extraneous Information'),
        ],
        'dlg_act_label': [
            ('(D10) Other', '(D09) Other'),
            ('(D09) To provide informing statement', '(D10) To provide informing statement'),
            ('(D06) To answer - Other', '(D06) Answer - Other'),
            ('(D07) To provide agreement statement', '(D07) Agreement'),
            ('(D08) To provide disagreement statement', '(D08) Disagreement'),
        ],
    }
    for column, rules in relabel_rules.items():
        # Guard each column on its own: previously the dialogue-act rewrites
        # were conditioned on the presence of the explanation-act column.
        if column not in df:
            continue
        for old_label, new_label in rules:
            df[column] = df[column].apply(
                lambda x, old=old_label, new=new_label: new if x == old else x
            )

    # The tokenizer is only needed for its separator token.
    tokenizer = AutoTokenizer.from_pretrained(f"/bigwork/nhwpficl/hf_models/{model_name}")
    sep_token = tokenizer.sep_token

    df['turn_text_with_topic'] = df.apply(lambda row: {
                                        'author': row['turn_text']['author'],
                                        'text'  : row['topic'].replace('_', ' ') + f' {sep_token} ' +  row['turn_text']['text']
                                       } ,axis=1)

    return df

In [None]:
# Load and prepare both annotated corpora (here the ELI5 frame comes from
# final_mace_predictions.pkl, not the *_training pickle used earlier).
model_name = "longformer-base-4096"
fivelvls_annotation_df = load_ds(
    '../../data/five_levels_ds/annotation-results/MACE-measure/final_mace_predictions.pkl',
    model_name,
)
eli5_annotation_df = load_ds(
    '../../data/eli5_ds/annotation-results/MACE-measure/final_mace_predictions.pkl',
    model_name,
)

# Tag each row with its corpus of origin and stack the two frames.
fivelvls_annotation_df['ds'] = '5lvls'
eli5_annotation_df['ds'] = 'eli5'
dlgs_df = pd.concat([fivelvls_annotation_df, eli5_annotation_df])

# Disabled: topic-level train/test split of the annotated corpora (kept for reference).
#split into train test split
#train_test_topics = {"test": {}}
#for dataset in ['eli5', '5lvls']:
#    topics = dlgs_df[dlgs_df.ds == dataset].topic.unique()
#    _, valid_topics = train_test_split(topics, shuffle=False, test_size=0.2, random_state=0)
#    train_test_topics["test"][dataset] = list(valid_topics)
#test_df  = dlgs_df[dlgs_df.topic.isin(train_test_topics['test']['5lvls'] + train_test_topics['test']['eli5'])]
#test_df.head()

In [None]:
# The data to label comes from the user-study directory (JSON branch of load_ds).
test_df = load_ds(ds_path='../../../evaluation/user_study_data', model_name=model_name)
test_df.head(5)

In [None]:
def ensemble_generate(models_path, test_df, input_clm, label_clm, label_model, model_name):
    """Label every row of ``test_df`` with the final model trained for ``label_clm``.

    The model is loaded from
    ``<models_path>/<label_model>/all_models/model/best_model``. Predicted
    class indices are mapped back to label strings using the labels found in
    the notebook-global ``dlgs_df[label_clm]``; the global ``device`` decides
    where inference runs.

    Args:
        models_path: Root directory of the trained models.
        test_df: DataFrame to label; it is modified in place.
        input_clm: Column whose entries are dicts with a ``'text'`` key.
        label_clm: Label column name; predictions are stored in
            ``label_clm + '_predictions'``.
        label_model: Sub-directory name of the label-specific model.
        model_name: Directory name of the pretrained config/tokenizer; also
            selects the DeBERTa-v3 specific classes.

    Returns:
        ``test_df`` with the added prediction column.

    Raises:
        KeyError: If the model predicts a class index for which no label
            exists in ``dlgs_df[label_clm]``.
    """
    fold_path = '{}/{}/{}/model/'.format(models_path, label_model, 'all_models')
    print(fold_path)
    # Characters 2-3 of a label hold its 1-based class number; missing labels
    # are skipped because they cannot be parsed.
    label_dictionary = {int(l[2:4])-1 : l for l in dlgs_df[label_clm].dropna().unique()}
    print(label_dictionary)

    config = AutoConfig.from_pretrained(f"/bigwork/nhwpficl/hf_models/{model_name}")
    if "deberta-v3" in model_name:
        model = DebertaV2ForSequenceClassification.from_pretrained(fold_path + 'best_model').to(device)
        # NOTE(review): twice the configured position count is used for DeBERTa-v3;
        # presumably this mirrors the training setup -- confirm.
        max_length = config.max_position_embeddings * 2
        tokenizer = DebertaV2Tokenizer.from_pretrained(f"/bigwork/nhwpficl/hf_models/{model_name}")
    else:
        model = AutoModelForSequenceClassification.from_pretrained(fold_path + 'best_model').to(device)
        max_length = config.max_position_embeddings - 2
        tokenizer = AutoTokenizer.from_pretrained(f"/bigwork/nhwpficl/hf_models/{model_name}")

    # Explicit evaluation mode, consistent with evaluate_fold.
    model.eval()

    eval_dataset = Dataset.from_pandas(test_df)
    # Keep only the tokenizer outputs: dropping every source column also removes
    # the pandas index column when from_pandas added one, so no stray field is
    # passed to the model. Truncation guarantees equal-length rows, which the
    # batched collation below requires.
    eval_dataset = eval_dataset.map(
        lambda examples: tokenizer([str(x['text']) for x in examples[input_clm]],
                                   padding='max_length', truncation=True, max_length=max_length),
        batched=True,
        remove_columns=eval_dataset.column_names,
    )

    eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=4)
    all_predictions = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for batch in eval_dataloader:
            # Move inputs to the model's device (works on CPU-only machines too).
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            logits = model(**batch).logits
            all_predictions.extend(logits.argmax(dim=-1).tolist())

    test_df[label_clm + '_predictions'] = [label_dictionary[x] for x in all_predictions]
    return test_df

In [None]:
# Adds the 'dlg_act_label_predictions' column.
test_df = ensemble_generate(
    f'../../data/final-turn-label-models/{model_name}/',
    test_df,
    'turn_text_with_topic',
    'dlg_act_label',
    'dlg_act_label_prediction',
    model_name=model_name,
)

In [None]:
# Adds the 'exp_act_label_predictions' column.
test_df = ensemble_generate(
    f'../../data/final-turn-label-models/{model_name}/',
    test_df,
    'turn_text_with_topic',
    'exp_act_label',
    'exp_act_label_prediction',
    model_name=model_name,
)

In [None]:
# Adds the 'topic_func_label_predictions' column.
test_df = ensemble_generate(
    f'../../data/final-turn-label-models/{model_name}/',
    test_df,
    'turn_text_with_topic',
    'topic_func_label',
    'topic_func_label_prediction',
    model_name=model_name,
)

In [None]:
# Preview the first five rows with the three prediction columns added.
test_df.head(5)

In [None]:
# Persist the labelled frame; the file name carries the model that produced it.
predictions_path = f'../../data/final_mace_predictions_{model_name}.pkl'
test_df.to_pickle(predictions_path)