### This notebook jointly trains the BART-v2 model to generate both the conclusion and the counter-argument

In [1]:
import os
import sys
# Select which CUDA device this kernel may see: enumerate devices in PCI bus
# order and expose only the device with index 1.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",   # see issue #152
    CUDA_VISIBLE_DEVICES="1",
)

In [2]:
# Make the modules in ../src-py importable (presumably where `utils` and
# `mt_bart_v2` live — confirm). The membership check keeps this cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
if '../src-py' not in sys.path:
    sys.path.append('../src-py')

In [3]:
import transformers
import datasets
# NOTE(review): the two star imports below are presumably what bring
# `BartModelV2`, `BartModel` and `compute_metrics` into scope — none of them is
# imported by name in the visible cells. Confirm against ../src-py and prefer
# explicit imports so the origin of each name is clear.
from utils import *
from mt_bart_v2 import *

# Record the library versions this run was executed with.
print(f"Running on transformers v{transformers.__version__} and datasets v{datasets.__version__}")

Running on transformers v4.9.1 and datasets v1.10.2


In [4]:
import torch
import json

import nltk
import numpy as np
import pandas as pd

from pathlib import Path
from datasets import load_dataset, load_metric, Dataset

from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import BartTokenizer, BartForConditionalGeneration

import ray
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.schedulers import PopulationBasedTraining
from ray import tune
from ray.tune import CLIReporter

In [5]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
device  # echo the selected device as this cell's output

device(type='cuda')

In [7]:
# Tokenizer of the `facebook/bart-base` checkpoint. The same instance is used
# for preprocessing, by the data collator, and by the trainer further down.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

In [8]:
def get_model(params):
    """Build a fixed-weight ``BartModelV2`` and the matching data collator.

    The model is created from ``facebook/bart-base`` with
    ``compute_dynamic_weights=False``, ``attention_to_conc=False`` and
    ``conc_decoder=True``. Its ``conclusion_decoder`` and ``counter_decoder``
    are then both initialised from the decoder of a stock ``BartModel``.

    Args:
        params: ``None`` to weight both losses with 0.5, or a mapping that
            provides the keys ``'conc_loss_weight'`` and
            ``'counter_loss_weight'``.

    Returns:
        A ``(data_collator, model)`` tuple, where ``model`` lives on the
        module-level ``device`` and ``data_collator`` is a
        ``DataCollatorForSeq2Seq`` built from the module-level ``tokenizer``
        and that model.

    Raises:
        KeyError: if ``params`` is given but lacks one of the two weight keys.
    """
    # Identity check instead of `== None`: it is the idiomatic test and cannot
    # be affected by a custom `__eq__` on the params object.
    if params is None:
        conc_loss_weight = counter_loss_weight = 0.5
    else:
        conc_loss_weight = params['conc_loss_weight']
        counter_loss_weight = params['counter_loss_weight']

    # Fixed configuration of this variant; the flag is now actually passed on
    # instead of being shadowed by a hard-coded literal.
    compute_dynamic_weights = False
    attention_to_conc = False
    conc_decoder = True

    model = BartModelV2.from_pretrained(
        'facebook/bart-base',
        compute_dynamic_weights=compute_dynamic_weights,
        conc_loss_weight=conc_loss_weight,
        counter_loss_weight=counter_loss_weight,
        attention_to_conc=attention_to_conc,
        conc_decoder=conc_decoder,
    ).to(device)

    # The reference model is only a source of weights, so it stays on the CPU:
    # `load_state_dict` copies the values into the target's own parameters, so
    # no second model has to be allocated on the GPU. The state dict is built
    # once and reused for both decoders.
    pretrained_decoder_state = BartModel.from_pretrained('facebook/bart-base').decoder.state_dict()

    # load the weights of the two decoders
    model.conclusion_decoder.load_state_dict(pretrained_decoder_state)
    model.counter_decoder.load_state_dict(pretrained_decoder_state)

    data_collator = DataCollatorForSeq2Seq(tokenizer, model)

    return data_collator, model

In [9]:
# Root directory of the experiment data, relative to this notebook. Dataset
# paths below are built by string concatenation onto this value (which already
# ends in '/', so the joined paths contain a harmless '//').
# NOTE(review): 'multi-taks' looks like a typo but is part of the on-disk path —
# do not "fix" it here without renaming the directory.
data_fold = '../../../data-ceph/arguana/arg-generation/multi-taks-counter-argument-generation/'

In [10]:
# Take the unique posts from the validation dataset and sample only 1500 instances
# valid_df = pd.read_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/valid_conclusion_comp_remove_75sem_perc.pkl')
# valid_unique_df = valid_df.drop_duplicates('post_id')
# valid_sample_df = valid_unique_df.sample(1500)
# valid_sample_df.to_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/valid_conclusion_comp_remove_75sem_perc_sample.pkl')

In [11]:
# Read the pickled train / validation DataFrames (in that order) and wrap each
# one in a `datasets.Dataset`.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
train_ds, valid_ds = (
    Dataset.from_pandas(pd.read_pickle(data_fold + relative_pkl))
    for relative_pkl in (
        '/reddit_data/conclusion_and_ca_generation/train_conclusion_comp_remove_75sem_perc.pkl',
        '/reddit_data/conclusion_and_ca_generation/valid_conclusion_comp_remove_75sem_perc_sample.pkl',
    )
)

In [12]:
#Encoding function for joint generation of conclusion and counter
def preprocess_function(examples, tokenizer, premises_clm, counter_clm, conclusion_clm, max_input_length=512, max_conc_length=100, max_counter_length=200):
    """Tokenize one batch for joint conclusion / counter generation.

    Args:
        examples: Batch mapping column names to sequences of values.
        tokenizer: Callable tokenizer that also provides the
            ``as_target_tokenizer()`` context manager.
        premises_clm: Column whose text becomes the encoder input.
        counter_clm: Column whose text becomes ``labels``.
        conclusion_clm: Column whose text becomes ``conclusion_labels``.
        max_input_length: ``max_length`` used for the premises.
        max_conc_length: ``max_length`` used for the conclusions.
        max_counter_length: ``max_length`` used for the counters.

    Returns:
        The tokenizer output for the premises, extended with the keys
        ``conclusion_labels``, ``conclusion_decoder_attention_mask``,
        ``labels`` and ``decoder_attention_mask``. Everything is tokenized
        with ``truncation=True`` and ``padding='max_length'``.
    """
    def _as_texts(column):
        # A column whose entries are lists of strings is flattened to one
        # space-joined string per example; anything else is passed through.
        # The emptiness check comes first so an empty batch no longer raises
        # IndexError on the `column[0]` probe.
        if len(column) > 0 and isinstance(column[0], list):
            return [' '.join(parts) for parts in column]
        return column

    premises = _as_texts(examples[premises_clm])
    counters = _as_texts(examples[counter_clm])
    conclusions = _as_texts(examples[conclusion_clm])

    model_inputs = tokenizer(premises, max_length=max_input_length, truncation=True, padding='max_length')

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        counter_labels = tokenizer(counters, max_length=max_counter_length, truncation=True, padding='max_length')
        conclusion_labels = tokenizer(conclusions, max_length=max_conc_length, truncation=True, padding='max_length')

    # The counter is stored under the default `labels` / `decoder_attention_mask`
    # keys; the conclusion gets its own prefixed keys.
    model_inputs["conclusion_labels"] = conclusion_labels["input_ids"]
    model_inputs["conclusion_decoder_attention_mask"] = conclusion_labels['attention_mask']
    model_inputs["labels"] = counter_labels["input_ids"]
    model_inputs["decoder_attention_mask"] = counter_labels['attention_mask']

    return model_inputs

In [13]:
#downsample the training dataset
#tmp_ds = train_ds.train_test_split(0.005)
#train_ds = tmp_ds['test']

In [14]:
len(train_ds)  # number of training examples

92397

In [15]:
len(valid_ds)  # number of validation examples

1500

In [16]:
def _encode_batch(batch):
    """Tokenize one batch: premises from `masked_premises`, counter from `counter`, conclusion from `title`."""
    return preprocess_function(
        batch,
        tokenizer,
        premises_clm='masked_premises',
        counter_clm='counter',
        conclusion_clm='title',
    )


# Apply the same batched encoding to both splits.
train_tokenized_ds = train_ds.map(_encode_batch, batched=True)
valid_tokenized_ds = valid_ds.map(_encode_batch, batched=True)

  0%|          | 0/93 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [17]:
# #Train the model for different config
# batch_size = 32
# for conc_loss_weight, count_loss_weight in [(0.0, 1.0), (0.8, 0.2), (0.6, 0.4)]:
#         data_collator, model = get_model({'conc_loss_weight': conc_loss_weight, 'counter_loss_weight': count_loss_weight})
#         args = Seq2SeqTrainingArguments(
#             "../data/output/joint-con-counter-bart-model-no-attention-finetuned/{}-{}".format(str(conc_loss_weight).replace('.','-'), str(count_loss_weight).replace('.','-')),
#             evaluation_strategy = "steps",
#             learning_rate=2e-5,
#             per_device_train_batch_size=batch_size,
#             per_device_eval_batch_size=batch_size,
#             weight_decay=0.01,
#             save_total_limit=5,
#             num_train_epochs=3,
#             load_best_model_at_end=True,
#             predict_with_generate=True,
#             metric_for_best_model='bert-fscore',
#             label_names=['conclusion_labels', 'counter_labels']
#         )

#         trainer = Seq2TwoSeqTrainer(
#             model,
#             args,
#             train_dataset=train_tokenized_ds,
#             eval_dataset=valid_tokenized_ds,
#             data_collator=data_collator,
#             tokenizer=tokenizer,
#             compute_metrics=lambda x : compute_metrics(x, tokenizer)
#         )
        
#         trainer.train()
#         trainer.save_model()

#### Train a dynamic weighting model:

In [18]:
batch_size = 32  # used as both the per-device train and eval batch size below

In [19]:
# Dynamic-weight variant: `compute_dynamic_weights=True` is passed and no fixed
# conclusion / counter loss weights are given.
# NOTE(review): the `log_vars` entry in the load warning below is presumably the
# learnable weighting parameter — confirm in mt_bart_v2.
model = BartModelV2.from_pretrained('facebook/bart-base', compute_dynamic_weights=True, conc_decoder=True).to(device)

Some weights of BartModelV2 were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['conclusion_decoder.layernorm_embedding.bias', 'log_vars', 'conclusion_decoder.embed_tokens.weight', 'conclusion_decoder.embed_positions.weight', 'conclusion_decoder.layernorm_embedding.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Load a stock `facebook/bart-base` model as the source of pretrained decoder weights.
original_bart_model = BartModel.from_pretrained('facebook/bart-base').to(device)
# Initialise the conclusion decoder from the pretrained BART decoder.
# NOTE(review): unlike get_model(), `counter_decoder` is NOT loaded here — the
# load warning above lists only `conclusion_decoder.*` (and `log_vars`) as newly
# initialised, so this is presumably intentional; confirm.
model.conclusion_decoder.load_state_dict(original_bart_model.decoder.state_dict())

<All keys matched successfully>

In [24]:
# Collator built from the shared tokenizer and the dynamic-weight model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model)

# Training configuration for the dynamic-weight run.
args = Seq2SeqTrainingArguments(
    output_dir="../data/output/joint-con-counter-bart-model-no-attention-finetuned/dynamic-weight",
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluation and checkpoint selection
    evaluation_strategy="steps",
    eval_steps=500,
    predict_with_generate=True,
    metric_for_best_model='bert-fscore',
    load_best_model_at_end=True,
    save_total_limit=5,
    # both label columns produced by preprocess_function
    label_names=['labels', 'conclusion_labels'],
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_ds,
    eval_dataset=valid_tokenized_ds,
    tokenizer=tokenizer,
    # bind the tokenizer so the metric function receives it with the predictions
    compute_metrics=lambda eval_preds: compute_metrics(eval_preds, tokenizer),
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [26]:
# Run fine-tuning, then save the trainer's model.
# NOTE(review): with `load_best_model_at_end=True` the saved model is presumably
# the best checkpoint by `bert-fscore` rather than the last step — confirm.
trainer.train()
trainer.save_model()

The following columns in the training set  don't have a corresponding argument in `BartModelV2.forward` and have been ignored: premises_with_conclusion, masked_premises, post_id, split, bart_conclusion, comment_id, __index_level_0__, num_cand_conc, n_sentences, title, counter, post.
***** Running training *****
  Num examples = 92397
  Num Epochs = 6
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 17328


Step,Training Loss,Validation Loss,Bleu,Precisions,Brevity Penalty,Length Ratio,Translation Length,Reference Length,Bert-fscore
500,1.617,1.15773,0.000831,"[0.37190149374243037, 0.06617963042544048, 0.016628387689480936, 0.004736063147508633]",0.022264,0.208125,24770,119015,0.04
1000,1.4336,1.090654,0.000786,"[0.3658013041189097, 0.06731059462722608, 0.01590521414411507, 0.00420979644396018]",0.021924,0.207461,24691,119015,0.04
1500,1.3532,1.05783,0.00078,"[0.3710699488179381, 0.06730686045505667, 0.016051438615968174, 0.004225072074758922]",0.021613,0.206848,24618,119015,0.04
2000,1.2944,1.03637,0.000591,"[0.330952674137011, 0.05493592266981586, 0.012451434723587512, 0.0030206917384080953]",0.020546,0.204705,24363,119015,0.0
2500,1.2463,1.021561,0.000878,"[0.3682814349263355, 0.07012128852758336, 0.019439148025856858, 0.005949107633854922]",0.021128,0.205882,24503,119015,0.03
3000,1.2061,1.011362,0.000729,"[0.3534889444968618, 0.061590243475980244, 0.015343593581887074, 0.004678774462947125]",0.020604,0.204823,24377,119015,0.02
3500,1.1584,1.00504,0.000824,"[0.36414394587765414, 0.06728306637148934, 0.01741189580721549, 0.005290213105754354]",0.021271,0.206167,24537,119015,0.03
4000,1.144,0.997085,0.000853,"[0.37282471369768105, 0.06901940356817293, 0.018294098528114408, 0.005489843788990368]",0.021271,0.206167,24537,119015,0.04
4500,1.1142,0.993813,0.000824,"[0.37040194884287453, 0.06683960224816256, 0.01701340730466944, 0.004967709885742673]",0.021664,0.206949,24630,119015,0.04
5000,1.091,0.989948,0.000804,"[0.3647768002606712, 0.0635953496442825, 0.016889383815887157, 0.005136644723718333]",0.021334,0.206293,24552,119015,0.03


The following columns in the evaluation set  don't have a corresponding argument in `BartModelV2.forward` and have been ignored: premises_with_conclusion, masked_premises, post_id, split, comment_id, __index_level_0__, num_cand_conc, n_sentences, title, counter, post.
***** Running Evaluation *****
  Num examples = 1500
  Batch size = 32
Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-large/resolve/main/config.json from cache at /mnt/ceph/storage/data-tmp/2021//sile2804/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_