In [None]:
# Mount Google Drive so the notebook can reach files under /content/drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!pip install transformers

In [None]:
import os
import re
import random
import pandas as pd
import numpy as np
import csv
import tensorflow as tf
import torch
from sklearn.model_selection import train_test_split
from google.colab import drive
import textwrap
import progressbar
import keras
from keras_preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import json
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

def flat_accuracy(preds, labels):
    """Return the fraction of examples whose arg-max class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example; the
            predicted class is the arg-max along axis 1.
        labels: array of integer class labels; it is flattened before the
            element-wise comparison.

    Returns:
        Number of matching positions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    hits = predicted_classes == expected_classes
    return np.sum(hits) / len(expected_classes)
    
def metrics_calculator(preds, test_labels, num_classes=2):
    """Compute macro/micro precision, recall and F1 from predicted class ids.

    Args:
        preds: 1-D sequence of predicted class ids.
        test_labels: 1-D sequence of true class ids, same length as ``preds``.
        num_classes: number of classes; class ids are taken to be
            ``0 .. num_classes - 1``. Defaults to 2 (the binary task).

    Returns:
        Tuple ``(macro_precision, macro_recall, macro_f1,
        micro_precision, micro_recall, micro_f1)``.
        ``macro_f1`` is the harmonic mean of macro precision and macro recall
        (not the mean of per-class F1 scores). Any ratio whose denominator is
        zero is reported as 0.0 instead of NaN.
    """
    class_ids = list(range(num_classes))
    # Pin the label set so the matrix is always num_classes x num_classes.
    # Without it, inputs that contain a single class produce a 1x1 matrix and
    # the per-class indexing below fails.
    cm = np.asarray(confusion_matrix(test_labels, preds, labels=class_ids))

    # Row index = true class, column index = predicted class.
    TP = np.diag(cm)
    FN = cm.sum(axis=1) - TP  # true class i predicted as something else
    FP = cm.sum(axis=0) - TP  # something else predicted as class i

    def _ratio(numerator, denominator):
        """Divide, returning 0.0 when the denominator is zero."""
        return float(numerator) / float(denominator) if denominator else 0.0

    precision = [_ratio(tp, tp + fp) for tp, fp in zip(TP, FP)]
    recall = [_ratio(tp, tp + fn) for tp, fn in zip(TP, FN)]

    macro_precision = sum(precision) / num_classes
    macro_recall = sum(recall) / num_classes
    micro_precision = _ratio(TP.sum(), TP.sum() + FP.sum())
    micro_recall = _ratio(TP.sum(), TP.sum() + FN.sum())
    micro_f1 = _ratio(2 * micro_precision * micro_recall, micro_precision + micro_recall)
    macro_f1 = _ratio(2 * macro_precision * macro_recall, macro_precision + macro_recall)
    return macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1

# Seed every random number generator the notebook imports so that
# Restart & Run All gives repeatable results.
seed_val = 2212
random.seed(seed_val)                 # Python stdlib RNG
np.random.seed(seed_val)              # NumPy global RNG
torch.manual_seed(seed_val)           # PyTorch RNG
torch.cuda.manual_seed_all(seed_val)  # PyTorch CUDA RNGs

In [None]:
# load all models and select roberta
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig

# Registry: architecture key -> (model class, tokenizer class, config class).
MODEL_CLASSES = dict(
    bert=(BertForSequenceClassification, BertTokenizer, BertConfig),
    xlnet=(XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
    xlm=(XLMForSequenceClassification, XLMTokenizer, XLMConfig),
    roberta=(RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    distilbert=(DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig),
)

# Change `model_type` to select another architecture from the registry.
# `model_name` (the checkpoint identifier) is set independently of
# `model_type`, so keep the two in sync when switching.
model_type = 'roberta'
model_name = 'roberta-base'
model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]

In [None]:
# Locations on Google Drive of the fine-tuned checkpoint and the CSV files.
_PROJECT_ROOT = '/content/drive/MyDrive/Colab Notebooks/SemEval2023/CJPE-main'
_FINETUNED_DIR = _PROJECT_ROOT + '/Models/finetuned'
_DATA_DIR = _PROJECT_ROOT + '/Data/trainData'

# Checkpoint directory the model is restored from (trailing slash kept).
#output_dir     = _FINETUNED_DIR + '/roberta_trained_on_single_10eps/'
#pretrained_dir = _FINETUNED_DIR + '/roberta_trained_on_single_preprocessed_2_10eps/'
pretrained_dir = _FINETUNED_DIR + '/roberta_trained_on_single_10eps/'

# CSV that holds the train and dev splits.
#input_file = _DATA_DIR + '/ILDC_single_train_dev_preprocessed_2.csv'
input_file = _DATA_DIR + '/ILDC_single_train_dev.csv'

# Test CSV and the file that predictions are written to.
#test_file  = _DATA_DIR + '/ILDC_single_test_explanation_preprocessed_2.csv'
#pred_file  = _DATA_DIR + '/ILDC_single_test_explanation_preprocessed_2_preds.csv'
test_file = _DATA_DIR + '/ILDC_single_test_explanation.csv'
pred_file = _DATA_DIR + '/ILDC_single_test_explanation_preds.csv'

#test_exp_file  = _DATA_DIR + '/ILDC_single_test_explanation.csv'
#pred_exp_file  = _DATA_DIR + '/ILDC_single_test_explanation_pred.csv'



In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build config, tokenizer and model through the classes and checkpoint name
# chosen in the model-selection cell, so changing `model_type` / `model_name`
# there takes effect here as well.
# `output_attentions=True` is required by the explanation cell, which reads
# `outputs.attentions`.
config = config_class.from_pretrained(model_name, output_hidden_states=True, output_attentions=True, num_labels=2)
tokenizer = tokenizer_class.from_pretrained(model_name)

if pretrained_dir is None:
  # No fine-tuned checkpoint configured: start from the base weights.
  print('\n\nno pretrained model')
  model = model_class.from_pretrained(model_name, config=config)
else:
  model = model_class.from_pretrained(pretrained_dir, config=config)
  print('\n\nload from ', pretrained_dir)

# Assign the result so the cell does not echo the full module repr.
model = model.to(device)

In [None]:
# Load the train/dev CSV configured in `input_file` and clean its text cells.
df = pd.read_csv(input_file)


def _strip_and_unwrap(cell):
  """Trim a string cell and undo hard line wraps; return other values as-is.

  Surrounding whitespace is stripped, a hyphen followed by a newline is
  removed, and every remaining newline is removed.
  """
  if isinstance(cell, str):
    return cell.strip().replace('-\n', '').replace('\n', '')
  return cell


# `DataFrame.map` is the element-wise apply in newer pandas, where `applymap`
# is deprecated; fall back to `applymap` where `map` does not exist.
if hasattr(pd.DataFrame, 'map'):
  df = df.map(_strip_and_unwrap)
else:
  df = df.applymap(_strip_and_unwrap)

#assert all(not '\n' in df.iloc[i]['text'] for i in range(len(df)))

# Take explicit copies so columns added to a split later do not act on a
# slice of `df` (which triggers pandas' SettingWithCopyWarning).
train_set = df.query(" split=='train' ").copy()
validation_set = df.query(" split=='dev' ").copy()
#validation_set = pd.read_csv(test_file)
#test_set = pd.read_csv(test_file)

In [None]:
# Attach a positional id (0 .. n-1) to every validation row. `assign` builds a
# new frame rather than writing a column into a slice of `df`.
validation_set = validation_set.assign(id=range(len(validation_set)))
# Show the first rows as the cell output instead of printing the whole frame.
validation_set.head()

In [None]:
# Display the cleaned DataFrame loaded above as this cell's output.
df

# Preprocessing

# Preparing data

In [None]:
def input_id_maker(dataf, tokenizer):
  """Tokenise ``dataf['text']`` into fixed-width rows of 512 token ids.

  Each text is tokenised, cut down to its last 510 tokens when longer,
  wrapped in the tokenizer's CLS and SEP tokens and converted to ids. The
  rows are then right-padded with id 0 to a width of 512.

  Args:
    dataf: DataFrame with a 'text' column.
    tokenizer: tokenizer providing ``tokenize``, ``convert_tokens_to_ids``,
      ``cls_token`` and ``sep_token``.

  Returns:
    ``(input_ids, lengths)``: the padded id array and a list with the
    unpadded length of every row.

  NOTE(review): padding uses the literal id 0 rather than the tokenizer's own
  pad token id - confirm this matches how the checkpoint was trained.
  """
  max_body_tokens = 510  # 512 minus the CLS and SEP slots
  texts = dataf['text']
  input_ids, lengths = [], []

  for row in progressbar.progressbar(range(len(texts))):
    pieces = tokenizer.tokenize(texts.iloc[row], add_prefix_space=True)

    # Over-long documents keep only their tail.
    # (Alternative once tried: first 255 + last 255 tokens.)
    if len(pieces) > max_body_tokens:
      pieces = pieces[-max_body_tokens:]

    row_ids = tokenizer.convert_tokens_to_ids([tokenizer.cls_token] + pieces + [tokenizer.sep_token])
    input_ids.append(row_ids)
    lengths.append(len(row_ids))

  input_ids = pad_sequences(input_ids, maxlen=512, value=0, dtype="long", truncating="pre", padding="post")
  return input_ids, lengths

In [None]:
# Tokenise both splits into padded id arrays; the second value of each pair is
# the list of unpadded row lengths returned by input_id_maker.
train_input_ids, train_lengths = input_id_maker(train_set, tokenizer)
validation_input_ids, validation_lengths = input_id_maker(validation_set, tokenizer)

In [None]:
def att_masking(input_ids):
  """Build 0/1 attention masks for rows of token ids.

  A position gets 1 when its token id is greater than 0 and 0 otherwise.

  Args:
    input_ids: iterable of rows, each an iterable of integer token ids.

  Returns:
    A list with one list of ints (0 or 1) per input row.

  NOTE(review): every id equal to 0 is treated as padding. If the tokenizer's
  CLS token also has id 0, the leading CLS position is masked out too - confirm
  against the tokenizer's vocabulary.
  """
  return [[1 if token_id > 0 else 0 for token_id in row] for row in input_ids]

In [None]:
# Attention masks derived from the padded id arrays (1 where id > 0).
train_attention_masks = att_masking(train_input_ids)
validation_attention_masks = att_masking(validation_input_ids)

# Integer class labels taken from the 'label' column of each split.
train_labels = train_set['label'].to_numpy().astype('int')
validation_labels = validation_set['label'].to_numpy().astype('int')
# Alternative: carry the row 'id' instead of the label through the loader.
# NOTE(review): the prediction cells that assert `id2 == id` appear to expect
# this variant - confirm which one is intended.
#validation_labels = validation_set['id'].to_numpy().astype('int')

In [None]:
# Convert ids, masks and labels of both splits to torch tensors.
train_inputs = torch.tensor(train_input_ids)
train_masks = torch.tensor(train_attention_masks)
train_labels = torch.tensor(train_labels)

validation_inputs = torch.tensor(validation_input_ids)
validation_masks = torch.tensor(validation_attention_masks)
validation_labels = torch.tensor(validation_labels)

In [None]:
def _build_loader(inputs, masks, labels, sampler_cls, batch_size):
  """Wrap three tensors in a TensorDataset and a DataLoader using sampler_cls.

  Returns the dataset, the sampler and the loader, in that order.
  """
  dataset = TensorDataset(inputs, masks, labels)
  sampler = sampler_cls(dataset)
  loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
  return dataset, sampler, loader


batch_size = 6

# Training batches are drawn in random order, validation batches in row order.
train_data, train_sampler, train_dataloader = _build_loader(
    train_inputs, train_masks, train_labels, RandomSampler, batch_size)
validation_data, validation_sampler, validation_dataloader = _build_loader(
    validation_inputs, validation_masks, validation_labels, SequentialSampler, batch_size)

# Training

In [None]:
# Disabled training loop: the whole cell is one triple-quoted string, so none
# of the code below runs (the cell only evaluates to that string).
# The text fine-tunes the model for `epochs` epochs, validates every 5th epoch
# and saves model + tokenizer to `output_dir` every 10th epoch and once more
# after the loop.
# NOTE(review): re-enabling it needs `output_dir`, whose only assignment in the
# paths cell is commented out.
'''
# max batch size should be 6 due to colab limits
batch_size = 6
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size = batch_size)

lr = 2e-6
max_grad_norm = 1.0
epochs = 10
num_total_steps = len(train_dataloader)*epochs
num_warmup_steps = 1000
warmup_proportion = float(num_warmup_steps) / float(num_total_steps)  # 0.1
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=True)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = num_warmup_steps, num_training_steps = num_total_steps)

loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):
        if step % 40 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}. '.format(step, len(train_dataloader)))

        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)            
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))

    if (epoch_i+1)%10 == 0:
    # Create output directory if needed
      if not os.path.exists(output_dir):
          os.makedirs(output_dir)

      print("Saving model to %s (epoch %d)" % (output_dir, epoch_i+1))
      model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
      model_to_save.save_pretrained(output_dir)
      tokenizer.save_pretrained(output_dir)


    if (epoch_i+1)%5 == 0:
        
      print("")
      print("Running Validation...")

      t0 = time.time()

      model.eval()

      eval_loss, eval_accuracy = 0, 0
      nb_eval_steps, nb_eval_examples = 0, 0

      for batch in validation_dataloader:
          batch = tuple(t.to(device) for t in batch)
          b_input_ids, b_input_mask, b_labels = batch
          
          with torch.no_grad():        
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      
          logits = outputs[0]

          logits = logits.detach().cpu().numpy()
          label_ids = b_labels.to('cpu').numpy()
          
          tmp_eval_accuracy = flat_accuracy(logits, label_ids)
          eval_accuracy += tmp_eval_accuracy

          nb_eval_steps += 1

      # Report the final accuracy for this validation run.
      print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))

print("")
print("Training complete!")


if not os.path.exists(output_dir):
  os.makedirs(output_dir)

print("Saving model to %s (epoch %d)" % (output_dir, epoch_i+1))
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


'''

# Predicting

## Evaluation on training set

Predicting labels for 4,982 test sentences...
0.9207145724608591
0.925262014040807 0.9064129216309111 0.9157404833006237 0.9207145724608591 0.9207145724608591 0.9207145724608591
    DONE.

In [None]:
# Disabled cell: the code is wrapped in a triple-quoted string and does not run.
# The text scores the model on the training tensors and prints flat_accuracy
# followed by the tuple returned by metrics_calculator.
# NOTE(review): `prediction_data` is only used for the printed count and
# `prediction_sampler` is never used; the loader iterates `train_data`.
'''batch_size = 6

prediction_data = TensorDataset(train_inputs, train_masks, train_labels)
prediction_sampler = SequentialSampler(train_data)
prediction_dataloader = DataLoader(train_data, shuffle=False, sampler=None, batch_size = batch_size)

print('Predicting labels for {:,} test sentences...'.format(len(prediction_data)))
model.eval()

predictions , true_labels = [], []
explanations = []

for (step, batch) in enumerate(prediction_dataloader):
  batch = tuple(t.to(device) for t in batch)
  
  b_input_ids, b_input_mask, b_labels = batch
  #print(b_input_ids)

  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

      logits = outputs.logits

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)
pred_flat = np.argmax(predictions, axis=1).flatten()
labels_flat = true_labels.flatten()

print(flat_accuracy(predictions,true_labels))

macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1 = metrics_calculator(pred_flat, labels_flat)
print(macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1)

print('    DONE.')'''

## Evaluation on dev set

Predicting labels for 994 test sentences...
0.6056338028169014
0.6907555653032131 0.6056338028169015 0.6454001091325866 0.6056338028169014 0.6056338028169014 0.6056338028169014
    DONE.

In [None]:
# Disabled cell: the code is wrapped in a triple-quoted string and does not run.
# The text scores the model on the validation tensors and prints flat_accuracy
# followed by the tuple returned by metrics_calculator.
# NOTE(review): the loop reads `attentions` without assigning it anywhere in
# this cell, so re-enabling it as-is depends on a value left over from another
# cell (or fails with NameError on a fresh kernel).
'''batch_size = 1

# Create the DataLoader.
prediction_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, shuffle=False, sampler=None, batch_size = batch_size)

print('Predicting labels for {:,} test sentences...'.format(len(prediction_data)))
model.eval()

predictions , true_labels = [], []
explanations = []

for (step, batch) in enumerate(prediction_dataloader):
  batch = tuple(t.to(device) for t in batch)
  
  b_input_ids, b_input_mask, b_labels = batch
  #print(b_input_ids)

  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      logits = outputs.logits

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  attentions = attentions.detach().cpu().numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)
pred_flat = np.argmax(predictions, axis=1).flatten()
labels_flat = true_labels.flatten()

print(flat_accuracy(predictions,true_labels))

macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1 = metrics_calculator(pred_flat, labels_flat)
print(macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1)

print('    DONE.')'''

## Prediction on binary classification test set

In [None]:
# Disabled cell: the code is wrapped in a triple-quoted string and does not run.
# The text predicts a class for every validation row and writes the frame,
# with added 'id2' and 'prediction' columns, to `pred_file`.
# NOTE(review): 'id2' is filled from the third tensor of each batch and then
# asserted equal to the 'id' column, so this only holds when
# `validation_labels` carries row ids rather than class labels.
'''batch_size = 1

# Create the DataLoader.
prediction_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, shuffle=False, sampler=None, batch_size = batch_size)

print('Predicting labels for {:,} test sentences...'.format(len(prediction_data)))
model.eval()

predictions , true_labels = [], []
explanations = []

for (step, batch) in enumerate(prediction_dataloader):
  batch = tuple(t.to(device) for t in batch)
  
  b_input_ids, b_input_mask, b_labels = batch
  #print(b_input_ids)

  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

      logits = outputs.logits
      
  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)
pred_flat = np.argmax(predictions, axis=1).flatten()
labels_flat = true_labels.flatten()


print('    DONE.')

validation_set['id2'] = labels_flat
validation_set['prediction'] = pred_flat
assert all(validation_set['id2'] == validation_set['id'])
validation_set.to_csv(pred_file, index=False,)'''

## Prediction on explanation test set

In [None]:
def _attention_window_text(token_ids, centre, window_size, tokenizer):
  """Decode the window of about ``window_size`` tokens centred on ``centre``.

  Args:
    token_ids: 1-D tensor of token ids for one example.
    centre: integer position the window is centred on.
    window_size: total window width in tokens.
    tokenizer: tokenizer used to turn ids back into text.

  Returns:
    The decoded text of the window with special tokens dropped.
  """
  half = window_size // 2
  # Clamp the start: a negative start index would wrap around to the end of
  # the row and produce an empty slice for positions near the beginning.
  start = max(0, centre - half)
  end = centre + half  # slicing past the end is safe
  window_ids = token_ids[start:end].tolist()
  # Skip special ids so that padding and CLS/SEP markers do not end up in the
  # explanation text.
  tokens = tokenizer.convert_ids_to_tokens(window_ids, skip_special_tokens=True)
  return tokenizer.convert_tokens_to_string(tokens)


batch_size = 1
window_size = 128
num_windows = 3  # number of highest-attention positions turned into text windows

# Create the DataLoader. Row positions travel with every example so the output
# order can be verified afterwards, whatever `validation_labels` contains.
row_positions = torch.arange(len(validation_inputs))
prediction_data = TensorDataset(validation_inputs, validation_masks, validation_labels, row_positions)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size = batch_size)

print('Predicting labels for {:,} test sentences...'.format(len(prediction_data)))
model.eval()

predictions , true_labels = [], []
seen_positions = []
explanations = []

for batch in prediction_dataloader:
  b_input_ids, b_input_mask, b_labels, b_positions = (t.to(device) for t in batch)

  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

  logits = outputs.logits.detach().cpu().numpy()
  # Last layer, last head, attention from position 0 to every position.
  # Assumes each attention tensor is (batch, heads, seq, seq) - TODO confirm.
  first_token_attention = outputs.attentions[-1][:, -1, 0, :].detach().cpu().numpy()
  batch_token_ids = b_input_ids.cpu()

  # One explanation per example, so this also works for batch_size > 1.
  for token_ids, attention_row in zip(batch_token_ids, first_token_attention):
    top_positions = np.argpartition(attention_row, -num_windows)[-num_windows:]
    windows = [_attention_window_text(token_ids, int(pos), window_size, tokenizer)
               for pos in top_positions]
    explanations.append(" ".join(windows))

  # Store predictions, true labels and row positions
  predictions.append(logits)
  true_labels.append(b_labels.cpu().numpy())
  seen_positions.append(b_positions.cpu().numpy())

predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)
seen_positions = np.concatenate(seen_positions, axis=0)
pred_flat = np.argmax(predictions, axis=1).flatten()
labels_flat = true_labels.flatten()

print('    DONE.')

# Predictions must line up one-to-one, in order, with the rows of validation_set.
assert np.array_equal(seen_positions, np.arange(len(validation_set))), 'prediction order does not match validation_set rows'
assert len(explanations) == len(validation_set), 'expected one explanation per row'

decisions = ['Accepted' if predicted == 1 else 'Denied' for predicted in pred_flat]

# `assign` returns a new frame instead of writing columns into a slice of `df`.
validation_set = validation_set.assign(
    id2=seen_positions,
    prediction=pred_flat,
    decision=decisions,
    explanation=explanations,
)
# NOTE(review): this needs a 'uid' column in validation_set - confirm that the
# loaded CSV provides one.
validation_set.to_csv(pred_file, index=False, columns=['uid','decision','explanation'] )