In [1]:
import nltk

# Fetch the NLTK "punkt" resource; the value returned by the call is what the
# cell displays as its output.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\worac\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import math
import os
import pprint
import logging

import datasets
import nltk
import numpy as np
import torch
from tqdm.auto import tqdm

import transformers
from accelerate import Accelerator
from filelock import FileLock
from transformers import AdamW, get_scheduler, set_seed

from transformers.file_utils import is_offline_mode
from transformers.utils.versions import require_version

# from args import parse_args
# from data_loader import raw_data_loader, data_processor
# from model_loader import model_loader
from rouge_s import py_rouge_scores
from utils import label_smoothed_nll_loss, postprocess_text

In [3]:
# Configure the root logger: INFO level, timestamped "time - level - name - message" records.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)

# Logger for this notebook/module; its records are handled by the root configuration above.
logger = logging.getLogger(__name__)

In [4]:
from transformers import (
    MODEL_MAPPING,
    SchedulerType,
)

# Config classes registered in MODEL_MAPPING and the `model_type` string each one declares;
# MODEL_TYPES is what `--model_type` accepts as choices.
# You should update this to your particular problem to have better documentation of `model_type`.
MODEL_CONFIG_CLASSES = [*MODEL_MAPPING.keys()]
MODEL_TYPES = tuple(config_class.model_type for config_class in MODEL_CONFIG_CLASSES)

In [5]:
import argparse


def _str2bool(value):
    """Convert a command-line string such as "true"/"false" into a real bool.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy, so the flag could never be
    switched off from the command line.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ("yes", "true", "t", "y", "1", "on"):
        return True
    if normalized in ("no", "false", "f", "n", "0", "off"):
        return False
    raise argparse.ArgumentTypeError(f"Boolean value expected, got '{value}'.")


# Command-line interface for the BART summarization run. In this notebook the
# parser is only used to materialise the defaults below.
arg_parser = argparse.ArgumentParser(description="BART")
arg_parser.add_argument("--len_input", dest="len_input", type=str, default=None, help="Use the ctrlen model or not", choices=('no', 'real', 'predict', 'surface'))
arg_parser.add_argument("--len_output", dest="len_output", default=None, help="Use the ctrlen model or not", choices=('no', 'real'))
arg_parser.add_argument("--output_dir", dest="output_dir", type=str, default="./output/1", help="default")
arg_parser.add_argument("--train_file", dest="train_file", type=str, default=None, help="A csv or a json file containing the training data.")
arg_parser.add_argument("--validation_file", dest="validation_file", type=str, default=None, help="A csv or a json file containing the validation data.")
arg_parser.add_argument("--test_file", dest="test_file", type=str, default=None, help="A csv or a json file containing the test data.")
# Parsed with _str2bool so that "--ignore_pad_token_for_loss false" really yields False.
arg_parser.add_argument("--ignore_pad_token_for_loss", dest="ignore_pad_token_for_loss", type=_str2bool, default=True, help="Whether to ignore the tokens corresponding to " "padded labels in the loss computation or not.",)
arg_parser.add_argument("--text_column", dest="text_column", type=str, default="dialogue", help="The name of the column in the datasets containing the full texts (for summarization).")
arg_parser.add_argument("--summary_column", dest="summary_column", type=str, default="summary", help="The name of the column in the datasets containing the summaries (for summarization).")
arg_parser.add_argument("--model_name_or_path", dest="model_name_or_path", type=str, default="facebook/bart-large", help="Path to pretrained model or model identifier from huggingface.co/models.")
arg_parser.add_argument("--model_type", dest="model_type", type=str, default="bart", help="Model type to use if training from scratch.", choices=MODEL_TYPES)
arg_parser.add_argument("--max_source_length", dest="max_source_length", type=int, default=1024, help="default")
arg_parser.add_argument("--source_prefix", dest="source_prefix", type=str, default=None, help="A prefix to add before every source text " "(useful for T5 models).")
arg_parser.add_argument("--preprocessing_num_workers", type=int, default=None, help="The number of processes to use for the preprocessing.")
# Parsed with _str2bool (see above); the default stays None, which downstream code treats as falsy.
arg_parser.add_argument("--overwrite_cache", dest="overwrite_cache", type=_str2bool, default=None, help="Overwrite the cached training and evaluation sets")
arg_parser.add_argument("--min_target_length", dest="min_target_length", type=int, default=1, help="The minimal total sequence length for target text")
arg_parser.add_argument("--max_target_length", dest="max_target_length", type=int, default=128, help="The maximum total sequence length for target text after "
        "tokenization. Sequences longer than this will be truncated, sequences shorter will be padded."
        "during ``evaluate`` and ``predict``.")
arg_parser.add_argument("--num_beams", dest="num_beams", type=int, default=4, help="Number of beams to use for evaluation. This argument will be "
        "passed to ``model.generate``, which is used during ``evaluate`` and ``predict``.")
arg_parser.add_argument("--learning_rate", dest="learning_rate", type=float, default=5e-5, help="Initial learning rate (after the potential warmup period) to use.")
arg_parser.add_argument("--pad_to_max_length", action="store_true", help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",)
arg_parser.add_argument("--weight_decay", dest="weight_decay", type=float, default=1e-3, help="Weight decay to use.")
arg_parser.add_argument("--label_smoothing", dest="label_smoothing", type=float, default=0.1, help="hyperparameter for label smoothing.")
arg_parser.add_argument("--length_penalty", dest="length_penalty", type=float, default=1.0, help="large - longer sequence, small - shorter sequence")
arg_parser.add_argument("--num_train_epochs", dest="num_train_epochs", type=int, default=15, help="Total number of training epochs to perform.")
arg_parser.add_argument("--per_device_train_batch_size", dest="per_device_train_batch_size", type=int, default=8, help="Batch size (per device) for the training dataloader.")
arg_parser.add_argument("--gradient_accumulation_steps", dest="gradient_accumulation_steps", type=int, default=64, help="Number of updates steps to accumulate before performing a backward/update pass.")
arg_parser.add_argument("--per_device_eval_batch_size", dest="per_device_eval_batch_size", type=int, default=8, help="Batch size (per device) for the evaluation dataloader.")
arg_parser.add_argument("--per_device_test_batch_size", dest="per_device_test_batch_size", type=int, default=8, help="Batch size (per device) for the test dataloader.")
arg_parser.add_argument("--num_warmup_steps", dest="num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler.")
arg_parser.add_argument("--cache_dir", dest="cache_dir", type=str, default="./output/cache", help="default")
arg_parser.add_argument("--seed", dest="seed", type=int, default=12345, help="default")
# arg_parser.add_argument("-f", required=False) #important
arg_parser.add_argument("--config_name", type=str, default=None, help="Pretrained config name or path if not the same as model_name")
arg_parser.add_argument("--tokenizer_name", type=str, default=None, help="Pretrained tokenizer name or path if not the same as model_name")
arg_parser.add_argument("--use_slow_tokenizer", dest="use_slow_tokenizer", action="store_true", help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).")
arg_parser.add_argument("--max_train_steps", type=int, default=None, help="Total number of training steps to perform. If provided, overrides num_train_epochs.")
arg_parser.add_argument("--lr_scheduler_type", type=SchedulerType, default="linear", help="The scheduler type to use.", choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"])
arg_parser.add_argument("--ctrlen_model", action='store_true', default=False, help="Use the ctrlen model or not")
arg_parser.add_argument("--sim_window_size", type=int, default=5, help="window size for computing loss.")
arg_parser.add_argument("--sim_loss", type=float, default=0, help="the loss weight for similarity scores.")
arg_parser.add_argument("--special_len_token_init", type=str, default=None, help="ways to initialize special token for length (random, zero, token_embs)")
arg_parser.add_argument("--embedding_lr", type=float, default=5e-5, help="Initial learning rate for embedding layers.")
arg_parser.add_argument("--len_start", type=int, default=1, help="start length.")
arg_parser.add_argument("--len_end", type=int, default=100, help="end length.")
arg_parser.add_argument("--data_aug",action='store_true',default=False,help="whether to perform data augmentation or not")
arg_parser.add_argument("--pred_len", action='store_true', default=False, help="whether to use the golden length or predicted length")
arg_parser.add_argument("--shuffle", action='store_true', default=False, help="whether to shuffle the dataset to balance train/validation/test")
arg_parser.add_argument("--debug", action='store_true', default=False, help="Use the debug mode or not")

_StoreTrueAction(option_strings=['--debug'], dest='debug', nargs=0, const=True, default=False, type=None, choices=None, required=False, help='Use the debug mode or not', metavar=None)

In [6]:
# Parse an explicitly empty argument list so every option takes its declared
# default instead of argparse reading the process's own sys.argv.
args = arg_parser.parse_args([])

In [7]:
# Notebook-specific run configuration, written onto the parsed namespace in one go.
vars(args).update(
    # length-control switches
    len_input="no",
    len_output="no",
    # data and output locations
    output_dir="./output/1",
    train_file="./data/dialogsum/dialogsum.train.jsonl",
    validation_file="./data/dialogsum/dialogsum.dev.jsonl",
    test_file="./data/dialogsum/dialogsum.test.jsonl",
    text_column="dialogue",
    summary_column="summary",
    cache_dir="./output/cache",
    overwrite_cache=True,
    # model
    model_name_or_path="facebook/bart-large",
    model_type="bart",
    # sequence lengths and generation
    max_source_length=1024,
    min_target_length=1,
    max_target_length=128,
    num_beams=4,
    length_penalty=1.0,
    # optimisation
    learning_rate=5e-5,
    weight_decay=1e-3,
    label_smoothing=0.1,
    num_train_epochs=15,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=64,
    per_device_eval_batch_size=8,
    per_device_test_batch_size=8,
    num_warmup_steps=0,
    seed=12345,
)

In [8]:
import json
import random
import utils

import datasets
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorForSeq2Seq


def load_from_dialogsum(args, file_path):
    '''Load a DialogSum ``.jsonl`` file into a ``Dataset``.

    Each non-blank line of the file is one JSON record. Two layouts are handled:

    * single reference: records carry ``summary`` and ``topic``; one output
      row per record, with ``fname`` used as the row id;
    * three references: records carry ``summary1..3`` and ``topic1..3``; every
      record yields three rows. Rows are grouped by reference (all ``_sum1``
      rows, then all ``_sum2``, then all ``_sum3``) and the id is ``fname``
      plus that suffix.

    The layout is decided from the first record only.

    Args:
        args: unused; kept so existing call sites keep working.
        file_path: path of the ``.jsonl`` file to read.

    Returns:
        ``Dataset`` with the columns ``id``, ``dialogue``, ``summary``, ``topic``.

    Raises:
        ValueError: if the file holds no records, or the first record has
            neither a ``summary`` nor a ``summary1`` field.
    '''
    # JSON text is UTF-8; relying on the platform default encoding (e.g. a
    # legacy code page on Windows) can fail on or mis-decode non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip blank lines (such as a trailing newline) instead of crashing in json.loads.
        data = [json.loads(line) for line in f if line.strip()]

    if not data:
        raise ValueError(f"No records found in '{file_path}'.")

    first_record = data[0]
    if 'summary' in first_record:
        # Single-reference layout: field names and ids carry no suffix.
        reference_suffixes = ('',)
    elif 'summary1' in first_record:
        reference_suffixes = ('1', '2', '3')
    else:
        raise ValueError(
            f"Unrecognised record layout in '{file_path}': expected a 'summary' or 'summary1' field."
        )

    id_list, dialogue_list, summary_list, topic_list = [], [], [], []
    # Reference-major order: finish every record for one reference before
    # moving on to the next reference.
    for suffix in reference_suffixes:
        id_tag = '_sum' + suffix if suffix else ''
        for sample in data:
            id_list.append(sample['fname'] + id_tag)
            dialogue_list.append(sample['dialogue'])
            summary_list.append(sample['summary' + suffix])
            topic_list.append(sample['topic' + suffix])

    data_dict = {'id': id_list,
                'dialogue': dialogue_list,
                'summary': summary_list,
                'topic': topic_list}

    return Dataset.from_dict(data_dict)

In [9]:
dialogsum_test = load_from_dialogsum(args, "./data/dialogsum/dialogsum.test.jsonl")

# Show one dialogue, then id/summary/topic for the rows at +0, +500 and +1000.
# NOTE(review): the loader stacks the three reference summaries one after another,
# so these offsets presumably pick the three references of the same dialogue,
# assuming the test file holds 500 dialogues - confirm.
num_sample = 0
print(
    f"\nDialogue:\n{dialogsum_test[num_sample]['dialogue']}\n"
    + "=" * 200
    + "\n"
    + "".join(
        f"ID:       {row['id']}\nsummary:  {row['summary']}\ntopic:    {row['topic']}  \n"
        for row in (dialogsum_test[num_sample + offset] for offset in (0, 500, 1000))
    )
)


Dialogue:
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to change their communication methods. I don't want any - one using Instant Messaging in this office. It wastes too much time! Now, please continue with t

In [10]:
def raw_data_loader(args):
    '''Load the train/validation/test splits named in ``args`` as one ``DatasetDict``.

    Each split is read with ``load_from_dialogsum`` and then passed through
    ``utils.len_adjust`` with the tags ``'train'``, ``'val'`` and ``'test'``.

    Args:
        args: parsed options; reads ``train_file``, ``validation_file`` and
            ``test_file`` and forwards the whole namespace to the helpers.

    Returns:
        ``datasets.DatasetDict`` with the keys ``"train"``, ``"validation"``
        and ``"test"``.

    Raises:
        ValueError: if any of the three file options is ``None``, or if
            ``train_file`` does not contain ``'dialogsum'`` (the only corpus
            this loader knows how to read).
    '''
    split_files = {
        "train": args.train_file,
        "validation": args.validation_file,
        "test": args.test_file,
    }

    # All three splits are loaded unconditionally below, so fail early and
    # clearly rather than deep inside open() or on an unbound local.
    missing_splits = [split for split, path in split_files.items() if path is None]
    if missing_splits:
        raise ValueError(f"Missing data file(s) for split(s): {', '.join(missing_splits)}")

    if 'dialogsum' not in args.train_file:
        raise ValueError(
            f"Unsupported dataset '{args.train_file}': only DialogSum files are handled."
        )

    train_dict = load_from_dialogsum(args, split_files["train"])
    val_dict   = load_from_dialogsum(args, split_files["validation"])
    test_dict  = load_from_dialogsum(args, split_files["test"])

    train_dict = utils.len_adjust(args, train_dict, 'train')
    val_dict   = utils.len_adjust(args, val_dict, 'val')
    test_dict  = utils.len_adjust(args, test_dict, 'test')

    raw_datasets = datasets.DatasetDict({"train":train_dict, "validation":val_dict, "test":test_dict})

    return raw_datasets

In [11]:
dialogsum = raw_data_loader(args)

# Same preview as for the raw test file, but read from the "test" split of the
# loaded DatasetDict: one dialogue, then id/summary/topic at offsets 0, 500 and 1000.
num_sample = 0
print(
    "".join(
        [
            f"\nDialogue:\n{dialogsum['test'][num_sample]['dialogue']}\n",
            "=" * 200,
            "\n",
        ]
        + [
            f"ID:       {row['id']}\nsummary:  {row['summary']}\ntopic:    {row['topic']}  \n"
            for row in (dialogsum["test"][num_sample + offset] for offset in (0, 500, 1000))
        ]
    )
)


Dialogue:
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to change their communication methods. I don't want any - one using Instant Messaging in this office. It wastes too much time! Now, please continue with t

In [12]:
def data_processor(logger, args, accelerator, raw_datasets, tokenizer, model):
    '''Tokenize the raw splits and wrap them in train/validation/test DataLoaders.

    Args:
        logger: used to log one randomly chosen processed training sample.
        args: parsed options. Reads ``source_prefix``, ``text_column``,
            ``summary_column``, ``max_source_length``, ``max_target_length``,
            ``pad_to_max_length``, ``ignore_pad_token_for_loss``,
            ``ctrlen_model``, ``overwrite_cache`` and the three
            ``per_device_*_batch_size`` values.
        accelerator: supplies ``main_process_first()`` (wrapped around the
            tokenization pass) and ``use_fp16`` (selects padding to a multiple of 8).
        raw_datasets: split container with a ``map`` method; the result must
            expose the ``"train"``, ``"validation"`` and ``"test"`` splits.
        tokenizer: callable tokenizer providing ``as_target_tokenizer()`` and
            ``pad_token_id``.
        model: forwarded to ``DataCollatorForSeq2Seq``.

    Returns:
        ``((train_dataloader, eval_dataloader, test_dataloader),
        (train_dataset, eval_dataset, test_dataset))``.

    Raises:
        ValueError: if ``args.text_column`` or ``args.summary_column`` is not a
            column of the training split.
    '''
    def preprocess_function(examples):
        '''Tokenize one batch of examples into model inputs and labels.'''

        # summary - target
        targets = examples[summary_column]
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)

        if args.ctrlen_model:
            # Count real tokens through the attention mask. Taking len() of the
            # mask would report max_target_length for every summary once
            # padding="max_length" is active; without padding both agree.
            gold_sum_len = [sum(mask) for mask in labels['attention_mask']]

        # dialogue - input
        # A predicted-length column, when present, takes precedence over the gold length.
        use_predicted_len = args.ctrlen_model and 'pred_len' in examples
        new_inputs = []
        for i, inp in enumerate(examples[text_column]):
            if args.ctrlen_model:
                target_len = examples['pred_len'][i] if use_predicted_len else gold_sum_len[i]
                # Length-control token goes right after the prefix, before the dialogue.
                new_inputs.append(prefix + "<len_{}> ".format(target_len) + inp)
            else:
                new_inputs.append(prefix + inp)

        model_inputs = tokenizer(new_inputs, max_length=args.max_source_length, padding=padding, truncation=True)

        # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
        # padding in the loss.
        if padding == "max_length" and args.ignore_pad_token_for_loss:
            labels["input_ids"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
            ]

        model_inputs["labels"] = labels["input_ids"]

        if args.ctrlen_model:
            model_inputs["gold_len"] = gold_sum_len

        return model_inputs

    prefix = args.source_prefix if args.source_prefix is not None else ""

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names

    # Get the column names for input/target.
    text_column = args.text_column
    if text_column not in column_names:
        raise ValueError(
            f"--text_column' value '{args.text_column}' needs to be one of: {', '.join(column_names)}"
        )

    summary_column = args.summary_column
    if summary_column not in column_names:
        raise ValueError(
            f"--summary_column' value '{args.summary_column}' needs to be one of: {', '.join(column_names)}"
        )

    max_target_length = args.max_target_length
    # Either pad everything up front, or leave padding to the collator (per batch).
    padding = "max_length" if args.pad_to_max_length else False

    with accelerator.main_process_first():
        processed_datasets = raw_datasets.map(
            preprocess_function,
            batched=True,
            batch_size=1000,
            # Drop the raw columns (taken from the training split) from the output.
            remove_columns=column_names,
            load_from_cache_file=not args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )

    train_dataset = processed_datasets["train"]
    eval_dataset  = processed_datasets["validation"]
    test_dataset  = processed_datasets["test"]

    # Log one random sample from the training set (none when the split is empty,
    # where random.sample would otherwise raise).
    for index in random.sample(range(len(train_dataset)), min(1, len(train_dataset))):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    label_pad_token_id = -100 if args.ignore_pad_token_for_loss else tokenizer.pad_token_id
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=8 if accelerator.use_fp16 else None,
    )

    # Only the training loader shuffles.
    train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)
    test_dataloader = DataLoader(test_dataset, collate_fn=data_collator, batch_size=args.per_device_test_batch_size)

    return (train_dataloader, eval_dataloader, test_dataloader), (train_dataset, eval_dataset, test_dataset)