In [1]:
import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="3"


In [2]:
import pandas as pd
import numpy as np
import nltk
from datasets import Dataset, DatasetDict, load_metric, load_dataset
from transformers import BartTokenizer, BartForConditionalGeneration, TrainingArguments
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from evaluate import load
import argparse

metric_rouge = load("rouge")
metric_bertscore = load("bertscore")
metric_sari = load("sari")

def wandb_hp_space(trial):
    return {
        "method": "random",
        "metric": {"name": "objective", "goal": "minimize"},
        "parameters": {
            "learning_rate": {"distribution": "uniform", "min": 1e-6, "max": 1e-4},
            "per_device_train_batch_size": {"values": [4, 8, 16]},
            "num_train_epochs": {"values": [5, 10, 15, 20]}
        },
    }
MODEL = 'flant5'
checkpoint = None

# def remove_none(lst):
#     return list(filter(lambda item: item is not None, lst))

def compute_metrics(eval_pred):
    predictions, labels, sources = eval_pred

    if isinstance(predictions, tuple):
        predictions = predictions[0]
        print("preds again", predictions)
        
    # Replace -100 in the labels and sources as we can't decode them.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        
    sources = np.where(sources != -100, sources, tokenizer.pad_token_id)
    decoded_inputs = tokenizer.batch_decode(sources, skip_special_tokens=True)
    
    # Tokenize and clean
    decoded_preds_newln = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_preds_space = [ " ".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_label_newln = ["\n".join(nltk.sent_tokenize(labl.strip())) for labl in decoded_labels]
    decoded_label_space = [ " ".join(nltk.sent_tokenize(labl.strip())) for labl in decoded_labels]
    decoded_input_space = [ " ".join(nltk.sent_tokenize(inpt.strip())) for inpt in decoded_inputs]
    
    # sources=["About 95 species are currently accepted.","About 95 species are currently accepted."]
    # predictions=["About 95 you now get in.","About 95 you now get in."]
    # references=[["About 95 species are currently known.","About 95 species are now accepted.","95 species are now accepted."],
    #             ["About 95 species are currently known.","About 95 species are now accepted.","95 species are now accepted."]]

    result_rouge = metric_rouge.compute(predictions=decoded_preds_newln, references=decoded_label_newln, use_stemmer=True)
    result_berts = metric_bertscore.compute(predictions=decoded_preds_space, references=decoded_label_space, lang="en")
    result_sari  = metric_sari.compute(sources=decoded_input_space, predictions=decoded_preds_space, references=[[i] for i in decoded_label_space])

    # Extract results
    result = result_rouge # {key: value.mid.fmeasure * 100 for key, value in result_rouge.items()}
    result['bert_score'] = np.mean(result_berts['f1'])
    result['sari']       = result_sari['sari']
    prediction_lens      = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"]    = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [3]:
# Get dataset from arguments
# parser = argparse.ArgumentParser()
# parser.add_argument("dataset")
# args = parser.parse_args()
# print(f"Using dataset: {args.dataset}")

DATASET_NAME    = 'turkcorpus' # args.dataset 
dataset         = load_dataset('json', data_files=f'data/{DATASET_NAME}.json', field='train')
dataset['test'] = load_dataset('json', data_files=f'data/{DATASET_NAME}.json', field='test')['train']


Using custom data configuration default-1eeb42a59953c14f
Reusing dataset json (/home/lily/lyf6/.cache/huggingface/datasets/json/default-1eeb42a59953c14f/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-b244e89f5dbf8140
Reusing dataset json (/home/lily/lyf6/.cache/huggingface/datasets/json/default-b244e89f5dbf8140/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Load in the model and tokenizer, for this we're using BART, which is good at generation tasks
if MODEL == 'bart':
    MODEL_NAME = "BART"
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large" if checkpoint == None else checkpoint)
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
elif MODEL == 'flant5':
    MODEL_NAME = 'FLANT5'
    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large" if checkpoint == None else checkpoint)
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
else:
    assert False

def preprocess_function(examples):
    """This function takes a batch of samples, and tokenizes them into IDs for the model
       It does this by adding new arguments to the Dataset dictionary, namely
       - input_ids:      tokenized IDs of the findings
       - attention_mask: mask that tells us which tokens are words and which are padding
       - labels:         tokenized IDs of the impressions
    Args:
        examples (Dataset): {'Findings':[<list of findings texts>],
                             'Impressions':[[<list of impressions texts>] per item]}

    Returns:
        model_inputs (Dataset): {'Findings':      [<list of findings texts>],
                                 'Impressions':   [<list of impressions texts>],
                                 'input_ids':     list of lists with impressions IDs,
                                 'attention_mask':list of lists with impressions IDs masks,
                                 'labels':        list of lists with findings IDs}
    """
    # Tokenize the Findings (the input)
    model_inputs = tokenizer(examples["input"], max_length=512, padding=True, truncation=True)
    # Tokenize the Impressions (the output)
    labels = tokenizer([lst[0] for lst in examples["labels"]], max_length=512, padding=True, truncation=True)
    # Set the label as the token ids (i.e. the vocab IDs) of the findings
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# We apply the function to all the examples in our train and test datasets
dataset['train'] = dataset['train'].map(preprocess_function, batched=True)
dataset['test']  = dataset['test'].map(preprocess_function, batched=True)

# Remove the original columns
dataset['train'] = dataset['train'].remove_columns(["input"])
dataset['test']  = dataset['test'].remove_columns(["input"])


  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [5]:
# Write out the arguments
MODEL_NAME = f"{MODEL_NAME}_{DATASET_NAME}"

args = Seq2SeqTrainingArguments(
    f"models/{MODEL_NAME}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    include_inputs_for_metrics=True,
    report_to="wandb"
)

# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
data_collator = DataCollatorForSeq2Seq(tokenizer)

# Create the Trainer and train
trainer = Seq2SeqTrainer(
    model=model, # None,
    args=args,
    # model_init=model_init,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [11]:
model(dataset['train'][:5])

AttributeError: 'dict' object has no attribute 'size'

In [None]:
# best_trial = trainer.hyperparameter_search(
#     direction="minimize",
#     backend="wandb",
#     hp_space=wandb_hp_space,
#     n_trials=3,
# )
# print('########### BEST TRIAL ###########')
# print(best_trial)
# print('##################################')

trainer.train()

# Run evaluate to obtain the model's performance on the test dataset 
# trainer.evaluate()

In [9]:
# Run evaluate to obtain the model's performance on the test dataset 
trainer.evaluate()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mljyflores_team[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.13.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


{'eval_loss': 13.650123596191406,
 'eval_rouge1': 0.6202,
 'eval_rouge2': 0.4409,
 'eval_rougeL': 0.5768,
 'eval_rougeLsum': 0.5799,
 'eval_bert_score': 0.9275,
 'eval_sari': 37.5662,
 'eval_gen_len': 19.1588,
 'eval_runtime': 258.5198,
 'eval_samples_per_second': 13.887,
 'eval_steps_per_second': 0.87}

In [26]:
test_labels = torch.load('labels.pt')
labels = np.where(test_labels != -100, test_labels, tokenizer.pad_token_id)
tokenizer.batch_decode(labels)

In [9]:
# Use the model to generate outputs
test_output = trainer.predict(dataset_testfile['train'])
tokenizer.batch_decode(test_output.predictions)

['</s><s><s>Adjacent counties are Marin (to the south), Mendocino (to</s>',
 '</s><s>A Georgian inscription around the drum attests his name.</s><pad><pad><pad><pad><pad><pad>',
 '</s><s>They would later return to the revived series in the 2008 Christmas Special "The Next Doctor</s>',
 "</s><s>Jameson's autobiography, How to Make Love Like a Porn Star: A Caution</s>",
 '</s><s>It is particularly famous for the cultivation of kiwifruit.</s><pad><pad><pad><pad>',
 '</s><s>Singles competition (2002 – 2003) After years in the tag team division, Hardy</s>',
 '</s><s>Many of the churches work together for town-wide projects under the banner of "Ch</s>',
 '</s><s>Word processing templates enable the ability to bypass the initial setup and configuration time necessary to create</s>',
 '</s><s>Rollins retired in 1962 and opted to become a coach.</s><pad><pad><pad><pad><pad>',
 "</s><s>History Landsberg prison, which is in the town's western outskirts, was completed in</s>",
 '</s><s>A hunting d