# About

Fine tune BART for the 4 different subreddit groupings: 
1. advice_story
1. gaming
1. media_lifestyle_sports
1. other

# Setups

In [1]:
from IPython.display import clear_output

!pip install datasets transformers rouge_score rouge-score nltk
# rouge-score is the google version
!pip install pyarrow
!pip install -q sentencepiece

clear_output()

In [2]:
import os
import re
import time
from tqdm.notebook import trange, tqdm
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

# nlp stuff
import nltk
nltk.download('punkt')

# tf stuff
import tensorflow_datasets as tfds 
import tensorflow as tf
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration # pegasus
from transformers import BartTokenizer, TFBartForConditionalGeneration # bart

# pytorch dataset types
import datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_metric, load_dataset

# pytorch bart stuff
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

clear_output()

In [3]:
# sign into huggingface: https://huggingface.co/settings/tokens
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [4]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


# Load data

In [5]:
# specify your path to the repo here:
repo_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization'

In [6]:
%%time
from google.colab import drive
drive.mount('/content/gdrive')
data_path = os.path.join(repo_path, 'data/reddit_parquet/train_test_split_v2')
os.chdir(data_path)
files = [i for i in os.listdir(data_path) if re.search("reddit", i)]

train = pd.read_parquet('reddit_train.parquet')
test = pd.read_parquet('reddit_test.parquet')
valid = pd.read_parquet('reddit_validation.parquet')

Mounted at /content/gdrive
CPU times: user 1.23 s, sys: 384 ms, total: 1.61 s
Wall time: 21.1 s


In [8]:
print("train")
print(train['subreddit_group'].value_counts())

print("\n\ntest:")
print(test['subreddit_group'].value_counts())

print("\n\nvalid:")
valid['subreddit_group'].value_counts()

train
advice/story              15000
gaming                    15000
media/lifestyle/sports    15000
other                     15000
Name: subreddit_group, dtype: int64


test:
advice/story              1000
gaming                    1000
media/lifestyle/sports    1000
other                     1000
Name: subreddit_group, dtype: int64


valid:


advice/story              1000
gaming                    1000
media/lifestyle/sports    1000
other                     1000
Name: subreddit_group, dtype: int64

# Modeling

In [9]:
# bunch of diff checkpoints to consider

# bart checkpoints
# model_checkpoint = 'facebook/bart-base' # keep returning the first sentence for me, extractive.
# model_checkpoint = 'facebook/bart-large-mnli' # same as above, only returns first sentences. extractive.
# model_checkpoint = 'sshleifer/distilbart-cnn-12-6' # works a bit better, but seems to produce extractive summaries still. 
# model_checkpoint = 'sshleifer/distilbart-xsum-6-6' # was recommended. produces abstractive summaries p well. so far works the best of the above. 
model_checkpoint = 'sshleifer/distilbart-xsum-6-6' # trained on both xsum and cnn/dm

# pegasus checkpoints:
# model_checkpoint = "google/pegasus-xsum" # works really well
# model_checkpoint = 'google/pegasus-reddit_tifu' # also works really well

In [10]:
# load model, tokenizer, and rouge metric
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
metric = load_metric("rouge")

clear_output()

In [11]:
# convert data to torch Dataset. Also split up into 4 groups
raw_datasets = DatasetDict({
    # advice_story
    'train_advice_story': Dataset.from_dict({
        'content': train[train['subreddit_group'] == 'advice/story']['content'],
        'summary': train[train['subreddit_group'] == 'advice/story']['summary'],
        'subreddit': train[train['subreddit_group'] == 'advice/story']['subreddit'],
        'subreddit_group': train[train['subreddit_group'] == 'advice/story']['subreddit_group']
    }), 
    'test_advice_story': Dataset.from_dict({
        'content': test[test['subreddit_group'] == 'advice/story']['content'],
        'summary': test[test['subreddit_group'] == 'advice/story']['summary'],
        'subreddit': test[test['subreddit_group'] == 'advice/story']['subreddit'],
        'subreddit_group': test[test['subreddit_group'] == 'advice/story']['subreddit_group']
    }), 
    'valid_advice_story': Dataset.from_dict({
        'content': valid[valid['subreddit_group'] == 'advice/story']['content'],
        'summary': valid[valid['subreddit_group'] == 'advice/story']['summary'],
        'subreddit': valid[valid['subreddit_group'] == 'advice/story']['subreddit'],
        'subreddit_group': valid[valid['subreddit_group'] == 'advice/story']['subreddit_group']
    }),

    # media_lifestyle_sports
    'train_media_lifestyle_sports': Dataset.from_dict({
        'content': train[train['subreddit_group'] == 'media/lifestyle/sports']['content'],
        'summary': train[train['subreddit_group'] == 'media/lifestyle/sports']['summary'],
        'subreddit': train[train['subreddit_group'] == 'media/lifestyle/sports']['subreddit'],
        'subreddit_group': train[train['subreddit_group'] == 'media/lifestyle/sports']['subreddit_group']
    }), 
    'test_media_lifestyle_sports': Dataset.from_dict({
        'content': test[test['subreddit_group'] == 'media/lifestyle/sports']['content'],
        'summary': test[test['subreddit_group'] == 'media/lifestyle/sports']['summary'],
        'subreddit': test[test['subreddit_group'] == 'media/lifestyle/sports']['subreddit'],
        'subreddit_group': test[test['subreddit_group'] == 'media/lifestyle/sports']['subreddit_group']
    }), 
    'valid_media_lifestyle_sports': Dataset.from_dict({
        'content': valid[valid['subreddit_group'] == 'media/lifestyle/sports']['content'],
        'summary': valid[valid['subreddit_group'] == 'media/lifestyle/sports']['summary'],
        'subreddit': valid[valid['subreddit_group'] == 'media/lifestyle/sports']['subreddit'],
        'subreddit_group': valid[valid['subreddit_group'] == 'media/lifestyle/sports']['subreddit_group']
    }),
    
    # gaming
    'train_gaming': Dataset.from_dict({
        'content': train[train['subreddit_group'] == 'gaming']['content'],
        'summary': train[train['subreddit_group'] == 'gaming']['summary'],
        'subreddit': train[train['subreddit_group'] == 'gaming']['subreddit'],
        'subreddit_group': train[train['subreddit_group'] == 'gaming']['subreddit_group']
    }), 
    'test_gaming': Dataset.from_dict({
        'content': test[test['subreddit_group'] == 'gaming']['content'],
        'summary': test[test['subreddit_group'] == 'gaming']['summary'],
        'subreddit': test[test['subreddit_group'] == 'gaming']['subreddit'],
        'subreddit_group': test[test['subreddit_group'] == 'gaming']['subreddit_group']
    }), 
    'valid_gaming': Dataset.from_dict({
        'content': valid[valid['subreddit_group'] == 'gaming']['content'],
        'summary': valid[valid['subreddit_group'] == 'gaming']['summary'],
        'subreddit': valid[valid['subreddit_group'] == 'gaming']['subreddit'],
        'subreddit_group': valid[valid['subreddit_group'] == 'gaming']['subreddit_group']
    }),

    # other
    'train_other': Dataset.from_dict({
        'content': train[train['subreddit_group'] == 'other']['content'],
        'summary': train[train['subreddit_group'] == 'other']['summary'],
        'subreddit': train[train['subreddit_group'] == 'other']['subreddit'],
        'subreddit_group': train[train['subreddit_group'] == 'other']['subreddit_group']
    }), 
    'test_other': Dataset.from_dict({
        'content': test[test['subreddit_group'] == 'other']['content'],
        'summary': test[test['subreddit_group'] == 'other']['summary'],
        'subreddit': test[test['subreddit_group'] == 'other']['subreddit'],
        'subreddit_group': test[test['subreddit_group'] == 'other']['subreddit_group']
    }), 
    'valid_other': Dataset.from_dict({
        'content': valid[valid['subreddit_group'] == 'other']['content'],
        'summary': valid[valid['subreddit_group'] == 'other']['summary'],
        'subreddit': valid[valid['subreddit_group'] == 'other']['subreddit'],
        'subreddit_group': valid[valid['subreddit_group'] == 'other']['subreddit_group']
    })
})

raw_datasets

DatasetDict({
    train_advice_story: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 15000
    })
    test_advice_story: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 1000
    })
    valid_advice_story: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 1000
    })
    train_media_lifestyle_sports: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 15000
    })
    test_media_lifestyle_sports: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 1000
    })
    valid_media_lifestyle_sports: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 1000
    })
    train_gaming: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 15000
  

In [12]:
# tokenize everything
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    inputs = [doc for doc in examples["content"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)



  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [13]:
tokenized_datasets

DatasetDict({
    train_advice_story: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15000
    })
    test_advice_story: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    valid_advice_story: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    train_media_lifestyle_sports: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15000
    })
    test_media_lifestyle_sports: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    valid_media_lifestyle_sports: Dataset({
        features: ['content

In [14]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

# Train - advice_story

- 3 epochs took 2:02 hrs first go around on 20k obs, batch size=4. 
- 3 epochs 2nd go around, batch size=8. took: 
- https://huggingface.co/trevorj/BART-reddit-advice_story

In [None]:
%%time
# note, batch size of 8 seems to almost max out the gpu
# so probs dont go any higher than this. 
args = Seq2SeqTrainingArguments(
    f"BART_reddit_advice_story", 
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8, # 16
    per_device_eval_batch_size=8, #16
    weight_decay=0.01,
    save_total_limit=1, #3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train_advice_story"],
    eval_dataset=tokenized_datasets["valid_advice_story"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# this should save the model to disk. changing wd so that it saves here:
fit_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_fitted_models/round2'
os.chdir(fit_path)
trainer.train()

Cloning https://huggingface.co/trevorj/BART_reddit_advice_story into local empty directory.
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: subreddit_group, subreddit, content, summary. If subreddit_group, subreddit, content, summary are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5625


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.3743,3.278733,21.1275,5.9618,17.3772,18.317,20.447


Saving model checkpoint to BART_reddit_advice_story/checkpoint-500
Configuration saved in BART_reddit_advice_story/checkpoint-500/config.json
Model weights saved in BART_reddit_advice_story/checkpoint-500/pytorch_model.bin
tokenizer config file saved in BART_reddit_advice_story/checkpoint-500/tokenizer_config.json
Special tokens file saved in BART_reddit_advice_story/checkpoint-500/special_tokens_map.json
tokenizer config file saved in BART_reddit_advice_story/tokenizer_config.json
Special tokens file saved in BART_reddit_advice_story/special_tokens_map.json
Saving model checkpoint to BART_reddit_advice_story/checkpoint-1000
Configuration saved in BART_reddit_advice_story/checkpoint-1000/config.json
Model weights saved in BART_reddit_advice_story/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in BART_reddit_advice_story/checkpoint-1000/tokenizer_config.json
Special tokens file saved in BART_reddit_advice_story/checkpoint-1000/special_tokens_map.json
Deleting older checkp

In [None]:
%%time
# also save on huggingface
trainer.push_to_hub()
# fit_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_fitted_models/'
# os.chdir(fit_path)
# trainer.push_to_hub('trevorj/BART-reddit-advice_story')


# then load model back in
# model = AutoModelForSeq2SeqLM.from_pretrained("trevorj/model_name")

# push to hub seems to fail with this message:
# Dropping the following result as it does not have all the necessary fields:
# {'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}, 'metrics': [{'name': 'Rouge1', 'type': 'rouge', 'value': 20.9206}]}

Saving model checkpoint to BART-reddit-advice_story_v2
Configuration saved in BART-reddit-advice_story_v2/config.json
Model weights saved in BART-reddit-advice_story_v2/pytorch_model.bin
tokenizer config file saved in BART-reddit-advice_story_v2/tokenizer_config.json
Special tokens file saved in BART-reddit-advice_story_v2/special_tokens_map.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}, 'metrics': [{'name': 'Rouge1', 'type': 'rouge', 'value': 20.9206}]}


CPU times: user 1.19 s, sys: 1.57 s, total: 2.76 s
Wall time: 5.83 s


Compare Model predictions

In [None]:
# true:
print("Content:")
pprint(tokenized_datasets['test_advice_story']['content'][0])
print("\n\nTrue Summary")
pprint(tokenized_datasets['test_advice_story']['summary'][0])

('What?  No.  A few very large banks started giving out mortgages at sub-prime '
 'rates, to people who had no way to pay for the mortgages.  Then they took '
 "these 'toxic' mortgages and used them as leverage to give out more loans.  "
 'For every 1000 worth of mortgage IOUs the banks had, they were allowed to '
 'loan out 10 dollars (percentages fabricated) to someone else.  Then the '
 'second bank who had just been loaned 1000 dollars were allowed to loan out '
 '10 dollars more.  These loans were all based off of the core of the '
 'mortgages that the banks knew were extremely unlikely to ever be paid off.  '
 'Eventually it all collapsed under itself. \n'
 ' Blaming this on people who lost their money in housing is just wrong.')



('bad banking policy and a lack of governmental oversight allowed bad loans to '
 'be passed around ad infinitum until it collapsed like a house of cards.')


In [None]:
# results from my trained model above
# note, I had to ad .cuda() to the end of the input tensor to specify to use gpu i guess. 
# But dont do this for the original model. just your fine tuned one. 
output = model.generate(torch.tensor([tokenized_datasets['test_advice_story']['input_ids'][0]]).cuda(), num_beams=2, max_length=60, min_length=2, no_repeat_ngram_size=3)
output_decoded = tokenizer.decode(output.squeeze(), skip_special_tokens=True)
pprint(output_decoded)

'Banks gave out mortgages to people who had no way to pay for the mortgages, and then used them as leverage to get more loans.'

In [None]:
# read model from disk (same prediction. Good) 
# note, using the latest checkpoint produced the same thing, so the weights are likely almost the same. 
# final files in the base folder will get overwritten at very end with final weights. 
# so we can just delete the older checkpoint folders
checkpoint_disk = "/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_fitted_models/round2/BART-reddit-advice_story_v2"
model_disk = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_disk)
clear_output()
output = model_disk.generate(torch.tensor([tokenized_datasets['test_advice_story']['input_ids'][0]]), num_beams=2, max_length=60, min_length=2, no_repeat_ngram_size=3)
output_decoded = tokenizer.decode(output.squeeze(), skip_special_tokens=True)
pprint(output_decoded)

'Banks gave out mortgages to people who had no way to pay for the mortgages, and then used them as leverage to get more loans.'

In [None]:
# compare to original unfit model. (prediction is different, good). 
model_original = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
clear_output()
output = model_original.generate(torch.tensor([tokenized_datasets['test_advice_story']['input_ids'][0]]), num_beams=2, max_length=60, min_length=2, no_repeat_ngram_size=3)
output_decoded = tokenizer.decode(output.squeeze(), skip_special_tokens=True)
pprint(output_decoded)

' The BBC News website looks at what happened to the mortgage market in the UK in the early 1990s.'

In [None]:
# Load from huggingface (haven't configured yet)
# new_checkpoint = 'trevorj/BART-reddit-advice_story'
# model_advice_story = AutoModelForSeq2SeqLM.from_pretrained(new_checkpoint)
# tokenizer_advice_story = AutoTokenizer.from_pretrained(new_checkpoint)

# Train - media_lifestyle_sports

In [None]:
%%time

args = Seq2SeqTrainingArguments(
    f"BART_reddit_media_lifestyle_sports",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8, # 16
    per_device_eval_batch_size=8, #16
    weight_decay=0.01,
    save_total_limit=1, #3,
    num_train_epochs=3, # 1
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer_media = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train_media_lifestyle_sports"],
    eval_dataset=tokenized_datasets["valid_media_lifestyle_sports"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# this should save the model to disk. changing wd so that it saves here:
fit_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_fitted_models/round2'
os.chdir(fit_path)
trainer_media.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Cloning https://huggingface.co/trevorj/BART-reddit-media_lifestyle_sports into local empty directory.
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: subreddit_group, content, summary, subreddit. If subreddit_group, content, summary, subreddit are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6445
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimiz

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.6697,3.678965,16.3868,5.117,14.1532,14.7118,17.357
2,3.1785,3.682577,16.8822,4.8181,14.5005,15.0886,17.705
3,2.8537,3.745483,17.0178,4.7455,14.4903,15.1069,18.4236


Saving model checkpoint to BART-reddit-media_lifestyle_sports/checkpoint-500
Configuration saved in BART-reddit-media_lifestyle_sports/checkpoint-500/config.json
Model weights saved in BART-reddit-media_lifestyle_sports/checkpoint-500/pytorch_model.bin
tokenizer config file saved in BART-reddit-media_lifestyle_sports/checkpoint-500/tokenizer_config.json
Special tokens file saved in BART-reddit-media_lifestyle_sports/checkpoint-500/special_tokens_map.json
tokenizer config file saved in BART-reddit-media_lifestyle_sports/tokenizer_config.json
Special tokens file saved in BART-reddit-media_lifestyle_sports/special_tokens_map.json
Saving model checkpoint to BART-reddit-media_lifestyle_sports/checkpoint-1000
Configuration saved in BART-reddit-media_lifestyle_sports/checkpoint-1000/config.json
Model weights saved in BART-reddit-media_lifestyle_sports/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in BART-reddit-media_lifestyle_sports/checkpoint-1000/tokenizer_config.json
Speci

CPU times: user 28min 54s, sys: 1min 57s, total: 30min 51s
Wall time: 33min 31s


In [None]:
# create huggingface repo
trainer_media.push_to_hub()

# Train gaming

In [None]:
%%time

args = Seq2SeqTrainingArguments(
    f"BART_reddit_gaming",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4, # 16
    per_device_eval_batch_size=4, #16
    weight_decay=0.01,
    save_total_limit=1, #3,
    num_train_epochs=3, # 1
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer_gaming = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train_gaming"],
    eval_dataset=tokenized_datasets["valid_gaming"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# this should save the model to disk. changing wd so that it saves here:
fit_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_fitted_models/round2'
os.chdir(fit_path)
trainer_gaming.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Cloning https://huggingface.co/trevorj/BART-reddit-gaming into local empty directory.
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: subreddit_group, content, summary, subreddit. If subreddit_group, content, summary, subreddit are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6874
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 51

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.8011,3.769677,16.399,4.4595,13.9342,14.5098,17.4542
2,3.3039,3.759856,16.9715,4.5274,14.2581,14.8597,18.6121
3,3.0011,3.805855,17.1745,4.4307,14.254,15.0246,19.3357


Saving model checkpoint to BART-reddit-gaming/checkpoint-500
Configuration saved in BART-reddit-gaming/checkpoint-500/config.json
Model weights saved in BART-reddit-gaming/checkpoint-500/pytorch_model.bin
tokenizer config file saved in BART-reddit-gaming/checkpoint-500/tokenizer_config.json
Special tokens file saved in BART-reddit-gaming/checkpoint-500/special_tokens_map.json
tokenizer config file saved in BART-reddit-gaming/tokenizer_config.json
Special tokens file saved in BART-reddit-gaming/special_tokens_map.json
Saving model checkpoint to BART-reddit-gaming/checkpoint-1000
Configuration saved in BART-reddit-gaming/checkpoint-1000/config.json
Model weights saved in BART-reddit-gaming/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in BART-reddit-gaming/checkpoint-1000/tokenizer_config.json
Special tokens file saved in BART-reddit-gaming/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [BART-reddit-gaming/checkpoint-500] due to args.save_total_limit
Sa

CPU times: user 31min 19s, sys: 2min 8s, total: 33min 28s
Wall time: 36min 36s


In [None]:
# create huggingface repo
trainer_gaming.push_to_hub()

# Train other

In [None]:
%%time

args = Seq2SeqTrainingArguments(
    f"BART_reddit_other",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4, # 16
    per_device_eval_batch_size=4, #16
    weight_decay=0.01,
    save_total_limit=1, #3,
    num_train_epochs=3, # 1
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer_other = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train_other"],
    eval_dataset=tokenized_datasets["valid_other"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# this should save the model to disk. changing wd so that it saves here:
fit_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_fitted_models/round2'
os.chdir(fit_path)
trainer_other.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Cloning https://huggingface.co/trevorj/BART-reddit-other into local empty directory.
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: subreddit_group, content, summary, subreddit. If subreddit_group, content, summary, subreddit are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 16743
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 12

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.5823,3.49302,17.6627,5.1827,15.0209,15.6877,17.7175
2,3.1548,3.495256,18.2891,5.2005,15.2549,15.9532,18.3665
3,2.8294,3.544672,18.4646,5.1123,15.4322,16.1507,18.9035


Saving model checkpoint to BART-reddit-other/checkpoint-500
Configuration saved in BART-reddit-other/checkpoint-500/config.json
Model weights saved in BART-reddit-other/checkpoint-500/pytorch_model.bin
tokenizer config file saved in BART-reddit-other/checkpoint-500/tokenizer_config.json
Special tokens file saved in BART-reddit-other/checkpoint-500/special_tokens_map.json
tokenizer config file saved in BART-reddit-other/tokenizer_config.json
Special tokens file saved in BART-reddit-other/special_tokens_map.json
Saving model checkpoint to BART-reddit-other/checkpoint-1000
Configuration saved in BART-reddit-other/checkpoint-1000/config.json
Model weights saved in BART-reddit-other/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in BART-reddit-other/checkpoint-1000/tokenizer_config.json
Special tokens file saved in BART-reddit-other/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [BART-reddit-other/checkpoint-500] due to args.save_total_limit
Saving model ch

CPU times: user 1h 22min 1s, sys: 5min 20s, total: 1h 27min 22s
Wall time: 1h 33min 25s


In [None]:
# create huggingface repo
trainer_other.push_to_hub()