### This notebook jointly trains the BART-v2 model to generate both the conclusion and the counter-argument

In [1]:
import os
import sys
# Ask CUDA to enumerate GPUs by PCI bus id so that the index chosen below is unambiguous.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# Expose only GPU index 1 to this process. Both variables are set here, before
# torch / transformers are imported in the following cells.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
# Add the project's source folder to the import path; the `utils` and `mt_bart_v2`
# modules imported in the next cell are presumably resolved from here (path is
# relative to the notebook's working directory).
sys.path.append('../src-py')

In [3]:
import transformers
import datasets
# NOTE(review): star imports hide where names come from. Later cells use
# `BartModelV2`, `BartModel` and `compute_metrics` without importing them
# explicitly, so they presumably arrive through these two lines — confirm
# before narrowing these imports.
from utils import *
from mt_bart_v2 import *

# Record the library versions this run was executed with.
print(f"Running on transformers v{transformers.__version__} and datasets v{datasets.__version__}")

Running on transformers v4.18.0 and datasets v2.1.0


In [8]:
import torch
import json

import nltk
import numpy as np
import pandas as pd

from pathlib import Path
from datasets import load_dataset, load_metric, Dataset

from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import BartTokenizer, BartForConditionalGeneration

In [9]:
pd.set_option('display.max_colwidth', None)

In [10]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [11]:
device

device(type='cuda')

In [12]:
# Single tokenizer shared by every cell below.
# NOTE(review): it is loaded from 'facebook/bart-large' while some cells load
# 'facebook/bart-base' weights — presumably both checkpoints share a vocabulary; confirm.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

In [13]:
#This function builds a model for trying out different weighting schemes of the two loss functions:
#the conclusion-generation loss and the counter-generation loss.
def get_model(params):
    """Build a fresh multi-task BART model plus its data collator.

    Args:
        params: ``None`` to weight both losses equally (0.5 / 0.5), or a
            mapping providing the keys ``'conc_loss_weight'`` and
            ``'counter_loss_weight'``.

    Returns:
        tuple: ``(data_collator, model)``. The model has been moved to the
        notebook-level ``device`` and its ``conclusion_decoder`` has been
        initialised from the decoder of the pretrained 'facebook/bart-base'.

    Raises:
        KeyError: if ``params`` is given but lacks one of the two weight keys.
    """
    compute_dynamic_weights = False
    # `is None` avoids surprises with objects that overload `==`.
    conc_loss_weight = 0.5 if params is None else params['conc_loss_weight']
    counter_loss_weight = 0.5 if params is None else params['counter_loss_weight']
    attention_to_conc = False
    conc_decoder = True

    model = BartModelV2.from_pretrained('facebook/bart-base',
                                        compute_dynamic_weights=compute_dynamic_weights,
                                        conc_loss_weight=conc_loss_weight,
                                        counter_loss_weight=counter_loss_weight,
                                        attention_to_conc=attention_to_conc,
                                        conc_decoder=conc_decoder).to(device)

    # The stock BART model is only a weight donor, so it is kept on the CPU;
    # load_state_dict copies the tensors onto whatever device `model` lives on.
    original_bart_model = BartModel.from_pretrained('facebook/bart-base')

    # Initialise the additional conclusion decoder with the pretrained decoder weights.
    model.conclusion_decoder.load_state_dict(original_bart_model.decoder.state_dict())

    data_collator = DataCollatorForSeq2Seq(tokenizer, model)

    return data_collator, model

In [14]:
# Root folder of the datasets; the pickle paths in later cells are built by appending to it.
data_fold = '../../../data-ceph/arguana/arg-generation/multi-taks-counter-argument-generation/'

In [15]:
#Take the unique posts from the valid dataset and sample only 1500 instances
# valid_df = pd.read_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/valid_conclusion_comp_remove_75sem_perc.pkl')
# valid_unique_df = valid_df.drop_duplicates('post_id')
# valid_sample_df = valid_unique_df.sample(1500)
# valid_sample_df.to_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/valid_conclusion_comp_remove_75sem_perc_sample.pkl')

# test_df = pd.read_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/test_concusion_comp_remove_75sem_perc.pkl')
# test_unique_df = test_df.drop_duplicates('post_id')
# test_unique_df.to_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/test_concusion_comp_remove_75sem_perc_sample.pkl')

#Take the unique posts from the valid dataset and sample only 1500 instances
# valid_df = pd.read_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/valid_conclusion_all.pkl')
# valid_unique_df = valid_df.drop_duplicates('post_id')
# valid_sample_df = valid_unique_df.sample(1500)
# valid_sample_df.to_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/valid_conclusion_all_sample.pkl')

# test_df = pd.read_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/test_concusion_all.pkl')
# test_unique_df = test_df.drop_duplicates('post_id')
# test_unique_df = test_df.sample(2500)
# test_unique_df.to_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/test_concusion_all_sample.pkl')

In [16]:
#Encoding function for joint generation of conclusion and counter
def preprocess_function(examples, tokenizer, premises_clm, counter_clm, conclusion_clm, 
                        max_input_length=512, max_conc_length=100, max_counter_length=200):
    """Tokenise one batch for joint conclusion / counter generation.

    Args:
        examples: batch mapping a column name to the list of values in that column.
        tokenizer: callable tokenizer that also provides ``as_target_tokenizer()``.
        premises_clm: name of the column holding the model input.
        counter_clm: name of the column holding the counter target.
        conclusion_clm: name of the column holding the conclusion target.
        max_input_length: truncation / padding length of the input.
        max_conc_length: truncation / padding length of the conclusion target.
        max_counter_length: truncation / padding length of the counter target.

    Returns:
        The encoding produced for the premises, extended with the keys
        ``conclusion_labels``, ``conclusion_decoder_attention_mask``,
        ``labels`` (the counter ids) and ``decoder_attention_mask``.
    """
    def _as_text(column):
        # Only the first row is inspected: if it is a list, every row is
        # treated as a list of pieces and joined with single spaces.
        if isinstance(column[0], list):
            return [' '.join(pieces) for pieces in column]
        return column

    raw_premises = examples[premises_clm]
    raw_conclusions = examples[conclusion_clm]
    raw_counters = examples[counter_clm]

    source_texts = _as_text(raw_premises)
    counter_texts = _as_text(raw_counters)
    conclusion_texts = _as_text(raw_conclusions)

    # Every sequence is truncated and padded to its fixed maximum length.
    fixed_length = dict(truncation=True, padding='max_length')

    encoded = tokenizer(source_texts, max_length=max_input_length, **fixed_length)

    # Both targets are encoded in target-tokenizer mode.
    with tokenizer.as_target_tokenizer():
        encoded_counter = tokenizer(counter_texts, max_length=max_counter_length, **fixed_length)
        encoded_conclusion = tokenizer(conclusion_texts, max_length=max_conc_length, **fixed_length)

    # NOTE(review): because targets are padded to max_length, the label lists
    # still contain padding ids; whether the model's loss ignores them is not
    # visible from this notebook — confirm.
    target_fields = (
        ("conclusion_labels", encoded_conclusion["input_ids"]),
        ("conclusion_decoder_attention_mask", encoded_conclusion['attention_mask']),
        ("labels", encoded_counter["input_ids"]),
        ("decoder_attention_mask", encoded_counter['attention_mask']),
    )
    for field_name, field_value in target_fields:
        encoded[field_name] = field_value

    return encoded

In [17]:
#downsample the training dataset
#tmp_ds = train_ds.train_test_split(0.005)
#train_ds = tmp_ds['test']

In [18]:
# Directory under which the baseline and dynamic-weighting runs below write their checkpoints.
output_models_path = '/var/tmp/sile2804/ca-final-models/'

#### Fine-tune model using simple weighting scheme:

In [19]:
# Load the training and validation frames from their pickles.
# NOTE(review): unpickling executes arbitrary code — only load files from a trusted source.
train_df = pd.read_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/preprocessed_train_conclusion_all.pkl')
valid_df = pd.read_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/valid_conclusion_all.pkl')

In [20]:
train_df.columns

Index(['post_id', 'split', 'comment_id', 'title', 'post', 'n_sentences',
       'counter', 'bot_comment', 'counter_conclusion', 'counter_conclusions'],
      dtype='object')

In [20]:
# Keep one row per post and draw a 1000-row validation sample. The fixed seed
# makes the sample — and the pickle written below, which a later cell reloads —
# identical on every re-run of the notebook.
valid_df = valid_df.drop_duplicates('post_id').sample(1000, random_state=42)
valid_df.to_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/valid_conclusion_all_sample.pkl')

In [21]:
# One example per post in both splits. preserve_index=False stops the pandas
# index from being carried into the dataset as an extra `__index_level_0__` column.
train_ds = Dataset.from_pandas(train_df.drop_duplicates('post_id'), preserve_index=False)
valid_ds = Dataset.from_pandas(valid_df.drop_duplicates('post_id'), preserve_index=False)

In [22]:
# Tokenise both splits batch-wise: 'post' is the input, 'counter' the counter
# target and 'title' the conclusion target.
encode_kwargs = dict(tokenizer=tokenizer, premises_clm='post', counter_clm='counter', conclusion_clm='title')
train_tokenized_ds = train_ds.map(preprocess_function, batched=True, fn_kwargs=encode_kwargs)
valid_tokenized_ds = valid_ds.map(preprocess_function, batched=True, fn_kwargs=encode_kwargs)

  0%|          | 0/26 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [24]:
# Baseline run: both losses get a fixed weight of 0.5 (no dynamic weighting).
model = BartModelV2.from_pretrained('facebook/bart-large', compute_dynamic_weights=False, conc_loss_weight = 0.5, counter_loss_weight=0.5, conc_decoder=True).to(device)

# The stock BART model is only a weight donor for the extra conclusion decoder,
# so it stays on the CPU instead of occupying GPU memory during training;
# load_state_dict copies the tensors onto the device `model` already lives on.
original_bart_model = BartModel.from_pretrained('facebook/bart-large')
model.conclusion_decoder.load_state_dict(original_bart_model.decoder.state_dict())
data_collator= DataCollatorForSeq2Seq(tokenizer, model)

args = Seq2SeqTrainingArguments(
    output_models_path + "mt-model-baseline-weighting-scheme",
    evaluation_strategy = "steps",
    eval_steps=500,                  # evaluate and checkpoint on the same 500-step cadence
    save_steps=500,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=5,
    num_train_epochs=6,
    load_best_model_at_end=True,
    predict_with_generate=True,
    metric_for_best_model='loss',    # best checkpoint is chosen by validation loss
    label_names=['labels', 'conclusion_labels']   # the two target columns built by preprocess_function
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_tokenized_ds,
    eval_dataset=valid_tokenized_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=lambda x : compute_metrics(x, tokenizer)
)

trainer.train()
trainer.save_model()

Some weights of BartModelV2 were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: ['conclusion_decoder.layernorm_embedding.weight', 'conclusion_decoder.embed_tokens.weight', 'conclusion_decoder.layernorm_embedding.bias', 'conclusion_decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following columns in the training set  don't have a corresponding argument in `BartModelV2.forward` and have been ignored: bot_comment, title, post_id, post, counter, n_sentences, split, comment_id, __index_level_0__.
***** Running training *****
  Num examples = 25704
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9639


Step,Training Loss,Validation Loss,Bleu Scores,Bert-fscores,Bert-fscore,Bleu
500,1.9179,1.195952,[],[],0.042958,0.000385
1000,1.5505,1.140719,[],[],0.051198,0.000623
1500,1.4936,1.104544,[],[],0.042712,0.000558
2000,1.48,1.085377,[],[],0.041616,0.000334
2500,1.4523,1.072465,[],[],0.041587,0.000387
3000,1.4218,1.052911,[],[],0.035724,0.000402
3500,1.3392,1.048096,[],[],0.021071,0.000367
4000,1.2905,1.038212,[],[],0.040583,0.000453
4500,1.2874,1.032002,[],[],0.031088,0.000378
5000,1.2898,1.027369,[],[],0.022899,0.000349


The following columns in the evaluation set  don't have a corresponding argument in `BartModelV2.forward` and have been ignored: bot_comment, title, post_id, post, counter, n_sentences, split, comment_id, __index_level_0__.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-large/resolve/main/config.json from cache at /mnt/ceph/storage/data-tmp/2021//sile2804/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
 

In [25]:
print('Done')

Done


#### Train a dynamic weighting model:

In [26]:
# Dynamic-weighting run: the model is built with compute_dynamic_weights=True
# instead of fixed loss weights.
model = BartModelV2.from_pretrained('facebook/bart-large', compute_dynamic_weights=True, conc_decoder=True).to(device)

# The stock BART model is only a weight donor for the extra conclusion decoder,
# so it stays on the CPU instead of occupying GPU memory during training;
# load_state_dict copies the tensors onto the device `model` already lives on.
original_bart_model = BartModel.from_pretrained('facebook/bart-large')
model.conclusion_decoder.load_state_dict(original_bart_model.decoder.state_dict())
data_collator= DataCollatorForSeq2Seq(tokenizer, model)

args = Seq2SeqTrainingArguments(
    output_models_path + "mt-model-dynamic-weighting-scheme",
    evaluation_strategy = "steps",
    eval_steps=500,                        # evaluate and checkpoint on the same 500-step cadence
    save_steps=500,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=5,
    num_train_epochs=3,
    load_best_model_at_end=True,
    predict_with_generate=True,
    metric_for_best_model='bert-fscore',   # best checkpoint is chosen by the 'bert-fscore' metric
    label_names=['labels', 'conclusion_labels']   # the two target columns built by preprocess_function
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_tokenized_ds,
    eval_dataset=valid_tokenized_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=lambda x : compute_metrics(x, tokenizer)
)

trainer.train()
trainer.save_model()

loading configuration file https://huggingface.co/facebook/bart-large/resolve/main/config.json from cache at /mnt/ceph/storage/data-tmp/2021//sile2804/.cache/huggingface/transformers/3f12fb71b844fcb7d591fdd4e55027da90d7b5dd6aa5430ad00ec6d76585f26c.bc22f15dc7ba074ee0a60bdd34c5f2fe3b6d746f89e765303376c51aff04e260
Model config BartConfig {
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 

Step,Training Loss,Validation Loss,Bleu Scores,Bert-fscores,Bert-fscore,Bleu
500,1.9132,1.182969,[],[],0.042776,0.00037
1000,1.5108,1.108315,[],[],0.040106,0.000407
1500,1.4347,1.067844,[],[],0.044828,0.000896
2000,1.4053,1.036797,[],[],0.0433,0.0003
2500,1.3639,1.012464,[],[],0.036081,0.000455
3000,1.3254,0.991186,[],[],0.035286,0.000452
3500,1.2382,0.982157,[],[],0.023371,0.000374
4000,1.1865,0.968586,[],[],0.039315,0.000665
4500,1.1749,0.957093,[],[],0.040442,0.000416
5000,1.1704,0.947326,[],[],0.022037,0.000356


The following columns in the evaluation set  don't have a corresponding argument in `BartModelV2.forward` and have been ignored: bot_comment, title, post_id, post, counter, n_sentences, split, comment_id, __index_level_0__.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Trainer is attempting to log a value of "[]" of type <class 'list'> for key "eval/bleu_scores" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[]" of type <class 'list'> for key "eval/bert-fscores" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to /var/tmp/sile2804/ca-final-models/mt-model-dynamic-weighting-scheme/checkpoint-500
Configuration saved in /var/tmp/sile2804/ca-final-models/mt-model-dynamic-weighting-scheme/checkpoint-500/config.json
Model weights saved in /var/tmp/sile2804/ca-final-models/mt-model-dyn

-----------------

### Finding best training parameters:

In [14]:
# Alternative data files, kept for reference:
#train_ds = Dataset.from_pandas(pd.read_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/train_conclusion_comp_remove_75sem_perc.pkl'))
#valid_ds = Dataset.from_pandas(pd.read_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/valid_conclusion_comp_remove_75sem_perc_sample.pkl'))

# The sweep uses the training pickle as stored (no de-duplication by post here)
# and the validation sample pickle that the earlier sampling cell writes.
train_ds = Dataset.from_pandas(pd.read_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/train_conclusion_all.pkl'))
valid_ds = Dataset.from_pandas(pd.read_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/valid_conclusion_all_sample.pkl'))

In [15]:
# Tokenise both splits batch-wise: 'post' is the input, 'counter' the counter
# target and 'title' the conclusion target.
encode_kwargs = dict(tokenizer=tokenizer, premises_clm='post', counter_clm='counter', conclusion_clm='title')
train_tokenized_ds = train_ds.map(preprocess_function, batched=True, fn_kwargs=encode_kwargs)
valid_tokenized_ds = valid_ds.map(preprocess_function, batched=True, fn_kwargs=encode_kwargs)

  0%|          | 0/296 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# Learning-rate sweep: one single-epoch run per learning rate, each written to its own directory.
batch_size = 32   # train and eval batch size; also recorded in the output directory name

# Loop-invariant: the stock BART model is only a weight donor for the conclusion
# decoder, so it is loaded once and kept on the CPU; load_state_dict copies the
# tensors onto the device each fresh `model` lives on.
original_bart_model = BartModel.from_pretrained('facebook/bart-base')

for lr in [2e-5, 3e-5, 5e-5]:
    # Fresh model per learning rate so the runs do not share trained weights.
    model = BartModelV2.from_pretrained('facebook/bart-base', compute_dynamic_weights=True, conc_decoder=True).to(device)
    model.conclusion_decoder.load_state_dict(original_bart_model.decoder.state_dict())
    data_collator= DataCollatorForSeq2Seq(tokenizer, model)

    args = Seq2SeqTrainingArguments(
        "../data/output/valid-ft-all/mt-model-{}-{}".format(lr, batch_size),
        evaluation_strategy = "steps",
        eval_steps=500,                        # evaluate and checkpoint on the same 500-step cadence
        save_steps=500,
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        save_total_limit=5,
        num_train_epochs=1,
        load_best_model_at_end=True,
        predict_with_generate=True,
        metric_for_best_model='bert-fscore',   # best checkpoint is chosen by the 'bert-fscore' metric
        label_names=['labels', 'conclusion_labels']   # the two target columns built by preprocess_function
    )

    trainer = Seq2SeqTrainer(
        model,
        args,
        train_dataset=train_tokenized_ds,
        eval_dataset=valid_tokenized_ds,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=lambda x : compute_metrics(x, tokenizer)
    )

    trainer.train()
    trainer.save_model()

Some weights of BartModelV2 were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['conclusion_decoder.layernorm_embedding.weight', 'conclusion_decoder.embed_tokens.weight', 'conclusion_decoder.embed_positions.weight', 'conclusion_decoder.layernorm_embedding.bias', 'log_vars']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following columns in the training set  don't have a corresponding argument in `BartModelV2.forward` and have been ignored: post_id, post, comment_id, counter, n_sentences, split, __index_level_0__, title.
***** Running training *****
  Num examples = 295914
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 9248


Step,Training Loss,Validation Loss,Bleu Scores,Bert-fscores,Bert-fscore,Bleu
500,2.0896,1.214435,[],[],0.022454,0.000629


The following columns in the evaluation set  don't have a corresponding argument in `BartModelV2.forward` and have been ignored: post_id, post, comment_id, counter, n_sentences, split, __index_level_0__, title.
***** Running Evaluation *****
  Num examples = 1500
  Batch size = 32
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-large/resolve/main/config.json from cache at /mnt/ceph/storage/data-tmp/2021//sile2804/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373
Model config RobertaConfig {
  "architectures": [
   