In [27]:
import os
import json
import glob
from transformers import pipeline

In [None]:
# Hugging Face login
from huggingface_hub import HfApi

api = HfApi()

checkpoint = "philschmid/bart-large-cnn-samsum"

# Hugging Face repository id
repository_id = f"{checkpoint.split('/')[1]}-acsi-ami"


# Establishing a baseline with bart-large-cnn-samsum 
- Here, we are creating a summary using the pretrained model.

In [5]:
def baseline_summary(path_to_source, model, min_length, max_length):
    """
    :param path_to_source Path to raw .txt files.
    Creates a summary and writes in the result of the summary in .json format
    """
    summarizer = pipeline("summarization", model, truncation=True)

    if not os.path.exists("./baseline_sum"):
        os.mkdir("./baseline_sum")
    for filename in glob.glob(f"{path_to_source}*.txt"):
        txt_raw = filename.split("/")[2].split(".")[0]
        result_dict = {}
        with open(filename, encoding="unicode_escape") as f:
            read_data = f.read()
            result_dict["filename"] = filename
            result_dict["transcript"] = read_data
            result_dict["summary"] = summarizer(read_data,min_length = min_length,max_length=max_length)
        with open(f"./baseline_sum/{txt_raw}.json", "w") as fp:
            json.dump(result_dict, fp)
    


In [None]:
baseline_summary("./data/", "philschmid/bart-large-cnn-samsum", 50, 300)

## Creating JSON Dataset out of the ICSI corpus

In [21]:
def reference_list(reference_path):
    result = []
    for filename in glob.glob(f"{reference_path}*.txt"):
        result_dict = {}
        txt_raw = filename.split("/")[2].split(".")[0]
        with open(filename, encoding="unicode_escape") as f:
            summary = f.read()
            result_dict["meeting_id"] = txt_raw
            result_dict["summary"] = summary
            result.append(result_dict)
    return result

test_list = reference_list("./reference_txt/")

In [23]:
def parse_data(original_path, reference_path):
    references = reference_list(reference_path)
    if not os.path.exists("./acsi_data"):
        os.mkdir("./acsi_data")
    for filename in glob.glob(f"{original_path}*.txt"):
        txt_raw = filename.split("/")[2].split(".")[0]
        with open(filename, encoding="UTF-8") as f:
            read_data = f.read()
            for meeting in references:
                if meeting["meeting_id"] == txt_raw:
                    meeting["dialogue"] = read_data
    return references

comp_list = parse_data("./formatted_data/", "./reference_txt/")


with open("./result.json", "w") as f:
    json.dump(comp_list, f)
        
        
            

In [1]:
from datasets import load_dataset

data_files = {"train" : "result.json"}
my_data = load_dataset("json", data_files=data_files)

my_data_train_test = my_data["train"].train_test_split(test_size = 0.2)
my_data_train_test


Using custom data configuration default-f8f1a33d63c39846
Found cached dataset json (/Users/vincentmarklynn/.cache/huggingface/datasets/json/default-f8f1a33d63c39846/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['meeting_id', 'summary', 'dialogue'],
        num_rows: 156
    })
    test: Dataset({
        features: ['meeting_id', 'summary', 'dialogue'],
        num_rows: 40
    })
})

## Verify that dataset is loaded propery 

In [2]:
sample = my_data_train_test["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Text: {row['summary']}'")


'>>> Text: The group mainly talked about the extra deciding of the product at this meeting including the presentation of the prototype assessment, the discussion of the requirements and trends of marketing and the product cost and quotation. Firstly, the Industrial Designer introduced the prototype of the product. It had not only the round basic shape which was made of hard plastics and titanium using different colors, but also the buttons like channel and volumn. Later, the group discussed some details and changes of redesigning the logo, buttons and screens. Next, the Marketing Expert mentioned the exterior of the product, the material attraction, and how easy it was to learn or use the basic functions of the product. Besides, the group discussed some details of the cost of components and made the product cheaper by replacing the titanium by hard plastics with similar color. They finally got an estimate of fourteen point one Euros, which was above the budget. In the end, they discus

In [4]:
from datasets import concatenate_datasets

from transformers import AutoTokenizer,AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorWithPadding



tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([my_data_train_test["train"], my_data_train_test["test"]]).map(lambda x: tokenizer(x["dialogue"], truncation=True), batched=True, remove_columns=["dialogue", "summary"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([my_data_train_test["train"], my_data_train_test["test"]]).map(lambda x: tokenizer(x["summary"], truncation=True), batched=True, remove_columns=["dialogue", "summary"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

  0%|          | 0/1 [00:00<?, ?ba/s]

Max source length: 1024


  0%|          | 0/1 [00:00<?, ?ba/s]

Max target length: 201


# Fine-tuning Phase
- In this phase, we will be fine-tuning the pretrained model to get a more accurate summary. 

In [5]:
# Use a pre-trained tokenizer to tokenize the dataset because model needs tensor input.

def preprocess_function(sample, padding="max_length"):
    model_inputs = tokenizer(sample["dialogue"], max_length=max_source_length, padding=padding, truncation=True)

    labels = tokenizer(text_target=sample["summary"], max_length=max_target_length, padding=padding, truncation=True)
    
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = my_data_train_test.map(preprocess_function, batched=True, remove_columns=["dialogue", "summary", "meeting_id"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [7]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vincentmarklynn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Load existing model
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [9]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [11]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=5,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

In [12]:
trainer.train()

***** Running training *****
  Num examples = 156
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 100
  Number of trainable parameters = 406290432


  0%|          | 0/100 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 40
  Batch size = 8
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}



  0%|          | 0/5 [00:00<?, ?it/s]

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_s

{'eval_loss': 3.2094905376434326, 'eval_rouge1': 39.8174, 'eval_rouge2': 11.5559, 'eval_rougeL': 24.0296, 'eval_rougeLsum': 36.3048, 'eval_gen_len': 108.5, 'eval_runtime': 212.8556, 'eval_samples_per_second': 0.188, 'eval_steps_per_second': 0.023, 'epoch': 1.0}


Model weights saved in bart-large-cnn-samsum-acsi-ami/checkpoint-20/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 40
  Batch size = 8
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}



  0%|          | 0/5 [00:00<?, ?it/s]

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_s

{'eval_loss': 3.1361048221588135, 'eval_rouge1': 39.7563, 'eval_rouge2': 11.1286, 'eval_rougeL': 23.2632, 'eval_rougeLsum': 36.5664, 'eval_gen_len': 108.15, 'eval_runtime': 209.462, 'eval_samples_per_second': 0.191, 'eval_steps_per_second': 0.024, 'epoch': 2.0}


Model weights saved in bart-large-cnn-samsum-acsi-ami/checkpoint-40/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 40
  Batch size = 8
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}



  0%|          | 0/5 [00:00<?, ?it/s]

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_s

{'eval_loss': 3.1599366664886475, 'eval_rouge1': 41.79, 'eval_rouge2': 12.0967, 'eval_rougeL': 23.5336, 'eval_rougeLsum': 37.6859, 'eval_gen_len': 122.95, 'eval_runtime': 217.8522, 'eval_samples_per_second': 0.184, 'eval_steps_per_second': 0.023, 'epoch': 3.0}


Model weights saved in bart-large-cnn-samsum-acsi-ami/checkpoint-60/pytorch_model.bin
Deleting older checkpoint [bart-large-cnn-samsum-acsi-ami/checkpoint-20] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 40
  Batch size = 8
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}



  0%|          | 0/5 [00:00<?, ?it/s]

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_s

{'eval_loss': 3.28777813911438, 'eval_rouge1': 42.3161, 'eval_rouge2': 12.2801, 'eval_rougeL': 23.9352, 'eval_rougeLsum': 38.2391, 'eval_gen_len': 122.7, 'eval_runtime': 212.5424, 'eval_samples_per_second': 0.188, 'eval_steps_per_second': 0.024, 'epoch': 4.0}


Model weights saved in bart-large-cnn-samsum-acsi-ami/checkpoint-80/pytorch_model.bin
Deleting older checkpoint [bart-large-cnn-samsum-acsi-ami/checkpoint-60] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 40
  Batch size = 8
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}



  0%|          | 0/5 [00:00<?, ?it/s]

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_s

{'eval_loss': 3.3671374320983887, 'eval_rouge1': 40.7968, 'eval_rouge2': 10.7336, 'eval_rougeL': 22.9434, 'eval_rougeLsum': 36.4383, 'eval_gen_len': 129.225, 'eval_runtime': 216.6227, 'eval_samples_per_second': 0.185, 'eval_steps_per_second': 0.023, 'epoch': 5.0}


Model weights saved in bart-large-cnn-samsum-acsi-ami/checkpoint-100/pytorch_model.bin
Deleting older checkpoint [bart-large-cnn-samsum-acsi-ami/checkpoint-80] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from bart-large-cnn-samsum-acsi-ami/checkpoint-40 (score: 3.1361048221588135).


{'train_runtime': 3684.6075, 'train_samples_per_second': 0.212, 'train_steps_per_second': 0.027, 'train_loss': 2.40261474609375, 'epoch': 5.0}


TrainOutput(global_step=100, training_loss=2.40261474609375, metrics={'train_runtime': 3684.6075, 'train_samples_per_second': 0.212, 'train_steps_per_second': 0.027, 'train_loss': 2.40261474609375, 'epoch': 5.0})

In [13]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 40
  Batch size = 8
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}



  0%|          | 0/5 [00:00<?, ?it/s]

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_s

{'eval_loss': 3.1361048221588135,
 'eval_rouge1': 39.7563,
 'eval_rouge2': 11.1286,
 'eval_rougeL': 23.2632,
 'eval_rougeLsum': 36.5664,
 'eval_gen_len': 108.15,
 'eval_runtime': 210.5694,
 'eval_samples_per_second': 0.19,
 'eval_steps_per_second': 0.024,
 'epoch': 5.0}

In [None]:
trainer.push_to_hub()

In [None]:
from random import randrange
from transformers import pipeline

summarizer = pipeline("summarization", model="vmarklynn/bart-large-cnn-samsum-acsi-ami")
sample = my_data_train_test['test'][randrange(len(my_data_train_test["test"]))]
