# Fine tune single BART by combining all the subgroups together

# Setups

In [1]:
from IPython.display import clear_output

!pip install datasets transformers rouge_score rouge-score nltk
# rouge-score is the google version
!pip install pyarrow
!pip install -q sentencepiece

clear_output()

In [2]:
import os
import re
import time
from tqdm.notebook import trange, tqdm
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

# nlp stuff
import nltk
nltk.download('punkt')

# tf stuff
import tensorflow_datasets as tfds 
import tensorflow as tf
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration # pegasus
from transformers import BartTokenizer, TFBartForConditionalGeneration # bart

# pytorch dataset types
import datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_metric, load_dataset

# pytorch bart stuff
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

clear_output()

In [3]:
# sign into huggingface: https://huggingface.co/settings/tokens
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
#!apt install git-lfs

# Load data

In [4]:
# specify your path to the repo here:
repo_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization'

In [5]:
%%time
from google.colab import drive
drive.mount('/content/gdrive')
data_path = os.path.join(repo_path, 'data/reddit_parquet/train_test_split_v2')
os.chdir(data_path)
files = [i for i in os.listdir(data_path) if re.search("reddit", i)]

train = pd.read_parquet('reddit_train.parquet')
test = pd.read_parquet('reddit_test.parquet')
valid = pd.read_parquet('reddit_validation.parquet')

Mounted at /content/gdrive
CPU times: user 1.17 s, sys: 357 ms, total: 1.53 s
Wall time: 19 s


In [6]:
# Rows per subreddit group in each split; the last expression is shown as the cell output.
print("train", train['subreddit_group'].value_counts(), sep="\n")
print("\n\ntest:", test['subreddit_group'].value_counts(), sep="\n")

print("\n\nvalid:")
valid['subreddit_group'].value_counts()

train
advice/story              15000
gaming                    15000
media/lifestyle/sports    15000
other                     15000
Name: subreddit_group, dtype: int64


test:
advice/story              1000
gaming                    1000
media/lifestyle/sports    1000
other                     1000
Name: subreddit_group, dtype: int64


valid:


advice/story              1000
gaming                    1000
media/lifestyle/sports    1000
other                     1000
Name: subreddit_group, dtype: int64

# Modeling

In [7]:
# Candidate pretrained checkpoints. Exactly one `model_checkpoint` assignment
# should be active; the notes are the author's impressions of each candidate.

# bart checkpoints
# model_checkpoint = 'facebook/bart-base' # keeps returning the first sentence for me, extractive.
# model_checkpoint = 'facebook/bart-large-mnli' # same as above, only returns first sentences. extractive.
# model_checkpoint = 'sshleifer/distilbart-cnn-12-6' # works a bit better, but seems to produce extractive summaries still.
# model_checkpoint = 'sshleifer/distilbart-xsum-6-6' # was recommended. produces abstractive summaries pretty well. so far works the best of the above.
model_checkpoint = 'sshleifer/distilbart-xsum-6-6' # NOTE(review): original note said "trained on both xsum and cnn/dm", but this is the same xsum checkpoint as the line above - confirm which checkpoint was intended.

# pegasus checkpoints:
# model_checkpoint = "google/pegasus-xsum" # works really well
# model_checkpoint = 'google/pegasus-reddit_tifu' # also works really well

In [8]:
# ROUGE scorer used by compute_metrics.
metric = load_metric("rouge")

# Seq2seq model and its tokenizer, both taken from the checkpoint selected above.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Hide the download progress output.
clear_output()

In [9]:
# Wrap each pandas split in a `datasets.Dataset`, keeping only the text columns
# and the subreddit name, and collect the splits in one DatasetDict.
raw_datasets = DatasetDict({
    split_name: Dataset.from_dict({
        column: frame[column] for column in ('content', 'summary', 'subreddit')
    })
    for split_name, frame in (('train', train), ('test', test), ('valid', valid))
})

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['content', 'summary', 'subreddit'],
        num_rows: 60000
    })
    test: Dataset({
        features: ['content', 'summary', 'subreddit'],
        num_rows: 4000
    })
    valid: Dataset({
        features: ['content', 'summary', 'subreddit'],
        num_rows: 4000
    })
})

In [10]:
# tokenize everything
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of posts together with their reference summaries.

    Args:
        examples: batch mapping with "content" and "summary" entries, as
            supplied by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the "content" texts (truncated to
        `max_input_length`) with an extra "labels" entry holding the token ids
        of the "summary" texts (truncated to `max_target_length`).
    """
    documents = list(examples["content"])
    encoded = tokenizer(documents, max_length=max_input_length, truncation=True)

    # Summaries are tokenized inside the target-tokenizer context.
    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(
            examples["summary"], max_length=max_target_length, truncation=True
        )["input_ids"]

    encoded["labels"] = target_ids
    return encoded

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)



  0%|          | 0/60 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [11]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['content', 'summary', 'subreddit', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 60000
    })
    test: Dataset({
        features: ['content', 'summary', 'subreddit', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4000
    })
    valid: Dataset({
        features: ['content', 'summary', 'subreddit', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4000
    })
})

In [12]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        dict with one entry per key returned by ``metric.compute`` (mid
        f-measure scaled to 0-100) plus ``"gen_len"``, the mean number of
        non-padding tokens per prediction; all values rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker that cannot be decoded; swap it for the
    # pad token in BOTH arrays so decoding works and gen_len ignores it.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid f-measure of each ROUGE variant, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (padding excluded)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Train full dataset
- epochs=3, obs=60k, batch_size=8. Took 4:50 hrs

In [16]:
%%time
# note, batch size of 8 seems to almost max out the gpu
# so probs dont go any higher than this. 
args = Seq2SeqTrainingArguments(
    f"BART_reddit", 
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8, # 16
    per_device_eval_batch_size=8, #16
    weight_decay=0.01,
    save_total_limit=1, #3,
    num_train_epochs=3,
    predict_with_generate=True,
    # fp16=True,
    # push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# this should save the model to disk. changing wd so that it saves here:
fit_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_fitted_models/round2'
os.chdir(fit_path)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, subreddit, content. If summary, subreddit, content are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 60000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 22500


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.9579,3.445648,18.8989,5.0689,15.4851,16.3155,19.442
2,2.7437,3.479094,18.8818,4.9775,15.4084,16.2892,19.113
3,2.5666,3.520019,18.8941,4.9529,15.2998,16.1965,19.8208


Saving model checkpoint to BART_reddit/checkpoint-500
Configuration saved in BART_reddit/checkpoint-500/config.json
Model weights saved in BART_reddit/checkpoint-500/pytorch_model.bin
tokenizer config file saved in BART_reddit/checkpoint-500/tokenizer_config.json
Special tokens file saved in BART_reddit/checkpoint-500/special_tokens_map.json
Saving model checkpoint to BART_reddit/checkpoint-1000
Configuration saved in BART_reddit/checkpoint-1000/config.json
Model weights saved in BART_reddit/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in BART_reddit/checkpoint-1000/tokenizer_config.json
Special tokens file saved in BART_reddit/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [BART_reddit/checkpoint-500] due to args.save_total_limit
Saving model checkpoint to BART_reddit/checkpoint-1500
Configuration saved in BART_reddit/checkpoint-1500/config.json
Model weights saved in BART_reddit/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in BART_

CPU times: user 4h 34min 19s, sys: 10min 17s, total: 4h 44min 36s
Wall time: 4h 50min 4s


In [None]:
# continue training from checkpoint

# Relative output paths are resolved against the working directory, so move to
# the target folder before the args/trainer are built.
fit_path = os.path.join(repo_path, 'data/model_outputs/bart_fitted_models/round2')
os.chdir(fit_path)

# Kept in its own variable: `model_checkpoint` must keep naming the base
# pretrained checkpoint, which a later cell reloads as the "original" model.
resume_checkpoint = os.path.join(fit_path, 'BART_reddit', 'checkpoint-15500')

args = Seq2SeqTrainingArguments(
    "BART_reddit",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8, # 16
    per_device_eval_batch_size=8, #16
    weight_decay=0.01,
    save_total_limit=1, #3,
    num_train_epochs=3, #3
    predict_with_generate=True,
    # fp16=True,
    # push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train(resume_from_checkpoint=resume_checkpoint)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Loading model from /content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_fitted_models/round2/BART_reddit/checkpoint-15500.
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: subreddit, content, summary. If subreddit, content, summary are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 60000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 22500
  Co

  0%|          | 0/500 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss


Saving model checkpoint to BART_reddit/checkpoint-16000
Configuration saved in BART_reddit/checkpoint-16000/config.json
Model weights saved in BART_reddit/checkpoint-16000/pytorch_model.bin
tokenizer config file saved in BART_reddit/checkpoint-16000/tokenizer_config.json
Special tokens file saved in BART_reddit/checkpoint-16000/special_tokens_map.json
Deleting older checkpoint [BART_reddit/checkpoint-15000] due to args.save_total_limit
Deleting older checkpoint [BART_reddit/checkpoint-15500] due to args.save_total_limit
Saving model checkpoint to BART_reddit/checkpoint-16500
Configuration saved in BART_reddit/checkpoint-16500/config.json
Model weights saved in BART_reddit/checkpoint-16500/pytorch_model.bin
tokenizer config file saved in BART_reddit/checkpoint-16500/tokenizer_config.json
Special tokens file saved in BART_reddit/checkpoint-16500/special_tokens_map.json
Deleting older checkpoint [BART_reddit/checkpoint-16000] due to args.save_total_limit
Saving model checkpoint to BART_re

Epoch,Training Loss,Validation Loss


Saving model checkpoint to BART_reddit/checkpoint-20500
Configuration saved in BART_reddit/checkpoint-20500/config.json
Model weights saved in BART_reddit/checkpoint-20500/pytorch_model.bin
tokenizer config file saved in BART_reddit/checkpoint-20500/tokenizer_config.json
Special tokens file saved in BART_reddit/checkpoint-20500/special_tokens_map.json
Deleting older checkpoint [BART_reddit/checkpoint-20000] due to args.save_total_limit
Saving model checkpoint to BART_reddit/checkpoint-21000
Configuration saved in BART_reddit/checkpoint-21000/config.json
Model weights saved in BART_reddit/checkpoint-21000/pytorch_model.bin
tokenizer config file saved in BART_reddit/checkpoint-21000/tokenizer_config.json
Special tokens file saved in BART_reddit/checkpoint-21000/special_tokens_map.json


In [15]:
# continue training from checkpoint

# Relative output paths are resolved against the working directory, so move to
# the target folder before the args/trainer are built.
fit_path = os.path.join(repo_path, 'data/model_outputs/bart_fitted_models/round2')
os.chdir(fit_path)

# Kept in its own variable: `model_checkpoint` must keep naming the base
# pretrained checkpoint, which a later cell reloads as the "original" model.
# NOTE(review): this points at the output folder itself rather than one of its
# `checkpoint-N` subfolders - confirm that this is the intended resume point.
resume_checkpoint = os.path.join(fit_path, 'BART_reddit')

args = Seq2SeqTrainingArguments(
    "BART_reddit",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8, # 16
    per_device_eval_batch_size=8, #16
    weight_decay=0.01,
    save_total_limit=1, #3,
    num_train_epochs=3, #3
    predict_with_generate=True,
    # fp16=True,
    # push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train(resume_from_checkpoint=resume_checkpoint)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Loading model from /content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_fitted_models/round2/BART_reddit.
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, subreddit, content. If summary, subreddit, content are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 60000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 22500
Didn't find an RNG fi

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [None]:
%%time
# also save on huggingface
trainer.push_to_hub()
# fit_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_fitted_models/'
# os.chdir(fit_path)
# trainer.push_to_hub('trevorj/BART-reddit-advice_story')


# then load model back in
# model = AutoModelForSeq2SeqLM.from_pretrained("trevorj/model_name")

# push to hub seems to fail with this message:
# Dropping the following result as it does not have all the necessary fields:
# {'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}, 'metrics': [{'name': 'Rouge1', 'type': 'rouge', 'value': 20.9206}]}

# Compare model predictions

In [None]:
# true:
# The DatasetDict built in this notebook only has 'train', 'test' and 'valid'
# splits, so the first test example is read from 'test'.
print("Content:")
pprint(tokenized_datasets['test'][0]['content'])
print("\n\nTrue Summary")
pprint(tokenized_datasets['test'][0]['summary'])

Content:
('Living in the Sierra Nevadas, it gets very cold at night. \n'
 ' Took a shower after a cold night at work. Shower with nice hot water. Hot '
 'but not steaming hot. Felt so good and warm. \n'
 ' Drying myself, starting to get cold. \n'
 ' Got out of the shower, so damn cold. \n'
 ' Drying hair(on head) with the blow dryer and it felt so nice and warm.\n'
 'Balls were cold, penis was cold. All shrivled up, smaller than usual. \n'
 ' Blow dry my private area just as usual during a cold winter day, but since '
 'it was so cold I moved in a little closer. It felt so nice and warm. \n'
 'Thought "oh, fuck it, I\'ll move in a little closer since it was so damn '
 'cold today." Moved in too close, tip of penis touches the burning hot iron '
 'grill. Pull dryer away in shock. \n'
 " It hurts. It burns as I type this. Erections make it burn more. It's red "
 "where it got burned. Hoping it doesn't turn into a blister")


True Summary
('Too damn cold, showered, dry my hair, dry my pri

In [None]:
# results from my trained model above
# The input tensor must live on the same device as the model. Following
# model.device (instead of a hard-coded .cuda()) works whether the model is
# on the gpu or on the cpu.
# The DatasetDict built in this notebook only has 'train', 'test' and 'valid'
# splits, so the first test example is read from 'test'.
input_ids = torch.tensor([tokenized_datasets['test'][0]['input_ids']]).to(model.device)
output = model.generate(input_ids, num_beams=2, max_length=60, min_length=2, no_repeat_ngram_size=3)
output_decoded = tokenizer.decode(output.squeeze(), skip_special_tokens=True)
pprint(output_decoded)

'Got cold, penis burns.'


In [None]:
# read model from disk (same prediction. Good)
# note, using the latest checkpoint produced the same thing, so the weights are likely almost the same.
# final files in the base folder will get overwritten at very end with final weights.
# so we can just delete the older checkpoint folders

# NOTE(review): the training cells of this notebook write to ".../round2/BART_reddit";
# confirm that the "BART_reddit_advice_story" folder is the model meant to be loaded here.
checkpoint_disk = "/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_fitted_models/round2/BART_reddit_advice_story"
# model_disk = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_disk)
model_disk = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_disk, local_files_only=True) # optional to force looking locally

clear_output()
# The DatasetDict built in this notebook only has 'train', 'test' and 'valid'
# splits, so the first test example is read from 'test'.
output = model_disk.generate(torch.tensor([tokenized_datasets['test'][0]['input_ids']]), num_beams=2, max_length=60, min_length=2, no_repeat_ngram_size=3)
output_decoded = tokenizer.decode(output.squeeze(), skip_special_tokens=True)
pprint(output_decoded)

'Got cold, penis burns.'


In [None]:
# compare to original unfit model. (prediction is different, good).
# NOTE(review): this only loads the unfit model while `model_checkpoint` still
# names the base pretrained checkpoint - make sure no earlier cell overwrote it
# with a fine-tuned path.
model_original = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
clear_output()
# The DatasetDict built in this notebook only has 'train', 'test' and 'valid'
# splits, so the first test example is read from 'test'.
output = model_original.generate(torch.tensor([tokenized_datasets['test'][0]['input_ids']]), num_beams=2, max_length=60, min_length=2, no_repeat_ngram_size=3)
output_decoded = tokenizer.decode(output.squeeze(), skip_special_tokens=True)
pprint(output_decoded)

(' In a series of letters from African journalists, filmmaker and columnist '
 'Adelisa Fonseca looks back at her recent cold weather.')


In [None]:
# Load from huggingface (haven't configured yet)
# new_checkpoint = 'trevorj/BART-reddit-advice_story'
# model_advice_story = AutoModelForSeq2SeqLM.from_pretrained(new_checkpoint)
# tokenizer_advice_story = AutoTokenizer.from_pretrained(new_checkpoint)

# Train - media_lifestyle_sports
- 1:12 hrs

In [None]:
%%time

# Fine-tune on the media/lifestyle/sports subgroup and push the result to the hub.
# NOTE(review): the `tokenized_datasets` built earlier in this notebook only holds the
# 'train', 'test' and 'valid' splits, so the "train_media_lifestyle_sports" /
# "valid_media_lifestyle_sports" lookups below need a differently built DatasetDict -
# confirm where these splits come from.
# NOTE(review): `model` is the same object handed to the earlier trainers, so this run
# starts from whatever weights it currently holds, not from a freshly loaded
# checkpoint - confirm this is intended.
args = Seq2SeqTrainingArguments(
    f"BART_reddit_media_lifestyle_sports",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8, # 16
    per_device_eval_batch_size=8, #16
    weight_decay=0.01,
    save_total_limit=1, #3,
    num_train_epochs=3, # 1
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer_media = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train_media_lifestyle_sports"],
    eval_dataset=tokenized_datasets["valid_media_lifestyle_sports"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# this should save the model to disk. changing wd so that it saves here:
# NOTE(review): the working directory is changed only after the trainer was built;
# the relative output dir above is resolved against whatever directory is current
# at each point - confirm everything ends up under fit_path.
fit_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_fitted_models/round2'
os.chdir(fit_path)
trainer_media.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Cloning https://huggingface.co/trevorj/BART_reddit_media_lifestyle_sports into local empty directory.
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: subreddit_group, subreddit, content, summary. If subreddit_group, subreddit, content, summary are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimi

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.6542,3.548437,15.9704,4.2881,13.4621,13.9857,17.876


Saving model checkpoint to BART_reddit_media_lifestyle_sports/checkpoint-500
Configuration saved in BART_reddit_media_lifestyle_sports/checkpoint-500/config.json
Model weights saved in BART_reddit_media_lifestyle_sports/checkpoint-500/pytorch_model.bin
tokenizer config file saved in BART_reddit_media_lifestyle_sports/checkpoint-500/tokenizer_config.json
Special tokens file saved in BART_reddit_media_lifestyle_sports/checkpoint-500/special_tokens_map.json
tokenizer config file saved in BART_reddit_media_lifestyle_sports/tokenizer_config.json
Special tokens file saved in BART_reddit_media_lifestyle_sports/special_tokens_map.json
Saving model checkpoint to BART_reddit_media_lifestyle_sports/checkpoint-1000
Configuration saved in BART_reddit_media_lifestyle_sports/checkpoint-1000/config.json
Model weights saved in BART_reddit_media_lifestyle_sports/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in BART_reddit_media_lifestyle_sports/checkpoint-1000/tokenizer_config.json
Speci

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.6542,3.548437,15.9704,4.2881,13.4621,13.9857,17.876
2,3.2719,3.540086,15.9733,4.2271,13.4401,14.055,18.02
3,3.0301,3.562253,16.2463,4.3296,13.7725,14.3504,17.76


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: subreddit_group, subreddit, content, summary. If subreddit_group, subreddit, content, summary are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to BART_reddit_media_lifestyle_sports/checkpoint-4000
Configuration saved in BART_reddit_media_lifestyle_sports/checkpoint-4000/config.json
Model weights saved in BART_reddit_media_lifestyle_sports/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in BART_reddit_media_lifestyle_sports/checkpoint-4000/tokenizer_config.json
Special tokens file saved in BART_reddit_media_lifestyle_sports/checkpoint-4000/special_tokens_map.json
Deleting older checkpoint [BART_reddit_media_lifestyle_sports/checkpoint-3500] due to args.save_total_limit
Saving model checkpoin

CPU times: user 1h 5min 6s, sys: 4min 42s, total: 1h 9min 48s
Wall time: 1h 12min 49s


In [None]:
# create huggingface repo
trainer_media.push_to_hub()

# Train gaming
- 1:13 hrs

In [None]:
%%time

# Fine-tune on the gaming subgroup and push the result to the hub.
# NOTE(review): the `tokenized_datasets` built earlier in this notebook only holds the
# 'train', 'test' and 'valid' splits, so the "train_gaming" / "valid_gaming" lookups
# below need a differently built DatasetDict - confirm where these splits come from.
# NOTE(review): `model` is the same object handed to the earlier trainers, so this run
# starts from whatever weights it currently holds, not from a freshly loaded
# checkpoint - confirm this is intended.
args = Seq2SeqTrainingArguments(
    f"BART_reddit_gaming",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8, # 16
    per_device_eval_batch_size=8, #16
    weight_decay=0.01,
    save_total_limit=1, #3,
    num_train_epochs=3, # 1
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer_gaming = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train_gaming"],
    eval_dataset=tokenized_datasets["valid_gaming"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# this should save the model to disk. changing wd so that it saves here:
# NOTE(review): the working directory is changed only after the trainer was built;
# the relative output dir above is resolved against whatever directory is current
# at each point - confirm everything ends up under fit_path.
fit_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_fitted_models/round2'
os.chdir(fit_path)
trainer_gaming.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Cloning https://huggingface.co/trevorj/BART_reddit_gaming into local empty directory.
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: subreddit_group, content, summary, subreddit. If subreddit_group, content, summary, subreddit are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.864,3.775191,17.3754,4.51,14.6763,15.22,16.944
2,3.4755,3.72648,17.8066,4.4188,14.9432,15.5396,18.104
3,3.2629,3.737317,18.1202,4.6045,15.1273,15.7601,18.208


Saving model checkpoint to BART_reddit_gaming/checkpoint-500
Configuration saved in BART_reddit_gaming/checkpoint-500/config.json
Model weights saved in BART_reddit_gaming/checkpoint-500/pytorch_model.bin
tokenizer config file saved in BART_reddit_gaming/checkpoint-500/tokenizer_config.json
Special tokens file saved in BART_reddit_gaming/checkpoint-500/special_tokens_map.json
tokenizer config file saved in BART_reddit_gaming/tokenizer_config.json
Special tokens file saved in BART_reddit_gaming/special_tokens_map.json
Saving model checkpoint to BART_reddit_gaming/checkpoint-1000
Configuration saved in BART_reddit_gaming/checkpoint-1000/config.json
Model weights saved in BART_reddit_gaming/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in BART_reddit_gaming/checkpoint-1000/tokenizer_config.json
Special tokens file saved in BART_reddit_gaming/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [BART_reddit_gaming/checkpoint-500] due to args.save_total_limit
Sa

CPU times: user 1h 5min 19s, sys: 5min 48s, total: 1h 11min 8s
Wall time: 1h 13min 6s


In [None]:
# create huggingface repo
trainer_gaming.push_to_hub()

Saving model checkpoint to BART_reddit_gaming
Configuration saved in BART_reddit_gaming/config.json
Model weights saved in BART_reddit_gaming/pytorch_model.bin
tokenizer config file saved in BART_reddit_gaming/tokenizer_config.json
Special tokens file saved in BART_reddit_gaming/special_tokens_map.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}, 'metrics': [{'name': 'Rouge1', 'type': 'rouge', 'value': 18.1202}]}


# Train other
- 1:17 hrs

In [None]:
%%time

# Fine-tune on the "other" subgroup and push the result to the hub.
# NOTE(review): the `tokenized_datasets` built earlier in this notebook only holds the
# 'train', 'test' and 'valid' splits, so the "train_other" / "valid_other" lookups
# below need a differently built DatasetDict - confirm where these splits come from.
# NOTE(review): `model` is the same object handed to the earlier trainers, so this run
# starts from whatever weights it currently holds, not from a freshly loaded
# checkpoint - confirm this is intended.
args = Seq2SeqTrainingArguments(
    f"BART_reddit_other",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8, # 16
    per_device_eval_batch_size=8, #16
    weight_decay=0.01,
    save_total_limit=1, #3,
    num_train_epochs=3, # 1
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer_other = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train_other"],
    eval_dataset=tokenized_datasets["valid_other"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# this should save the model to disk. changing wd so that it saves here:
# NOTE(review): the working directory is changed only after the trainer was built;
# the relative output dir above is resolved against whatever directory is current
# at each point - confirm everything ends up under fit_path.
fit_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_fitted_models/round2'
os.chdir(fit_path)
trainer_other.train()

Cloning https://huggingface.co/trevorj/BART_reddit_other into local empty directory.
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: content, subreddit_group, subreddit, summary. If content, subreddit_group, subreddit, summary are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5625


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.7887,3.604443,18.4668,5.182,15.359,16.169,19.341
2,3.3816,3.562785,18.0998,4.8937,15.0179,15.7615,17.789
3,3.134,3.579218,18.5705,5.0107,15.2581,16.082,19.402


Saving model checkpoint to BART_reddit_other/checkpoint-500
Configuration saved in BART_reddit_other/checkpoint-500/config.json
Model weights saved in BART_reddit_other/checkpoint-500/pytorch_model.bin
tokenizer config file saved in BART_reddit_other/checkpoint-500/tokenizer_config.json
Special tokens file saved in BART_reddit_other/checkpoint-500/special_tokens_map.json
tokenizer config file saved in BART_reddit_other/tokenizer_config.json
Special tokens file saved in BART_reddit_other/special_tokens_map.json
Saving model checkpoint to BART_reddit_other/checkpoint-1000
Configuration saved in BART_reddit_other/checkpoint-1000/config.json
Model weights saved in BART_reddit_other/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in BART_reddit_other/checkpoint-1000/tokenizer_config.json
Special tokens file saved in BART_reddit_other/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [BART_reddit_other/checkpoint-500] due to args.save_total_limit
Saving model ch

CPU times: user 1h 11min 53s, sys: 5min 57s, total: 1h 17min 50s
Wall time: 1h 20min 4s


In [None]:
# create huggingface repo
trainer_other.push_to_hub()

Saving model checkpoint to BART_reddit_other
Configuration saved in BART_reddit_other/config.json
Model weights saved in BART_reddit_other/pytorch_model.bin
tokenizer config file saved in BART_reddit_other/tokenizer_config.json
Special tokens file saved in BART_reddit_other/special_tokens_map.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}, 'metrics': [{'name': 'Rouge1', 'type': 'rouge', 'value': 18.5705}]}
