# Make predictions with the revised fine-tuned BART models for each subreddit genre

In [None]:
from IPython.display import clear_output

!pip install datasets transformers rouge_score rouge-score nltk
# rouge-score is the google version
!pip install pyarrow
!pip install -q sentencepiece

clear_output()

In [None]:
import os
import re
import time
from tqdm.notebook import trange, tqdm
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

# nlp stuff
import nltk
nltk.download('punkt')

# tf stuff
import tensorflow_datasets as tfds 
import tensorflow as tf
from transformers import BartTokenizer, TFBartForConditionalGeneration # bart

# pytorch dataset types
import datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_metric, load_dataset

# pytorch bart stuff
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

clear_output()

In [None]:
# sign into huggingface
# Interactive step: authenticates this session with the Hugging Face Hub.
# NOTE(review): presumably only needed if the checkpoints loaded below are
# private or if something is pushed to the Hub -- confirm before removing.
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
# !apt install git-lfs

# Load Data

In [None]:
%%time

from google.colab import drive

# specify your path to the repo here:
repo_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization'

# Mount Google Drive, then make the train/test split folder the working directory
drive.mount('/content/gdrive')
data_path = os.path.join(repo_path, 'data/reddit_parquet/train_test_split_v2')
os.chdir(data_path)

# parquet files available in the split folder (anything with "reddit" in its name)
files = [name for name in os.listdir(data_path) if re.search("reddit", name)]

# read the three splits; the names are relative to `data_path` (the current directory)
train, test, valid = (
    pd.read_parquet(parquet_name)
    for parquet_name in (
        'reddit_train.parquet',
        'reddit_test.parquet',
        'reddit_validation.parquet',
    )
)

Mounted at /content/gdrive
CPU times: user 1.66 s, sys: 565 ms, total: 2.22 s
Wall time: 23.9 s


In [None]:
# Convert the pandas frames to Hugging Face `Dataset` objects, one per
# (split, subreddit group) pair, collected in a single `DatasetDict`.
def _group_subset(frame, group):
    """Return the rows of `frame` whose `subreddit_group` equals `group` as a `Dataset`.

    Only the columns 'content', 'summary', 'subreddit' and 'subreddit_group'
    are carried over.
    """
    rows = frame[frame['subreddit_group'] == group]  # filter once, reuse for every column
    return Dataset.from_dict({
        column: rows[column]
        for column in ('content', 'summary', 'subreddit', 'subreddit_group')
    })


# Keys follow '<split>_<group suffix>', e.g. 'test_advice_story'. The outer loop
# runs over groups and the inner one over splits, which yields
# train/test/valid for each group in turn.
raw_datasets = DatasetDict({
    f'{split}_{suffix}': _group_subset(frame, group)
    for suffix, group in (
        ('advice_story', 'advice/story'),
        ('media_lifestyle_sports', 'media/lifestyle/sports'),
        ('gaming', 'gaming'),
        ('other', 'other'),
    )
    for split, frame in (('train', train), ('test', test), ('valid', valid))
})



In [None]:
# tokenize function
# truncation limits (in tokens) for the source posts and the target summaries
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples, tokenizer):
    """Tokenize a batch of posts and their reference summaries.

    Args:
        examples: batch mapping with a "content" and a "summary" entry.
        tokenizer: tokenizer used for both the inputs and the targets.

    Returns:
        The tokenizer output for the "content" texts, with the token ids of the
        "summary" texts stored under "labels".
    """
    documents = list(examples["content"])
    encoded = tokenizer(documents, truncation=True, max_length=max_input_length)

    # the summaries are encoded with the tokenizer switched to target mode
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(
            examples["summary"], truncation=True, max_length=max_target_length
        )

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded


# predict function
def model_predict(model, tokenizer, input_ids):
    """Generate a summary for a single tokenized post and return it as text.

    Args:
        model: seq2seq model exposing `generate`.
        tokenizer: tokenizer whose `decode` turns generated ids back into text.
        input_ids: token ids of one post (expected to be a flat sequence of ints;
            it is wrapped into a batch of size one here).

    Returns:
        The decoded summary string, with special tokens skipped.
    """
    # Build the batch on the model's own device so the call also works once the
    # model has been moved to a GPU; without a `device` attribute this falls
    # back to torch's default device, as before.
    batch = torch.tensor([input_ids], device=getattr(model, "device", None))
    output = model.generate(batch, num_beams=2, max_length=60, min_length=2, no_repeat_ngram_size=3)
    # Select the single sequence of the batch explicitly instead of squeezing
    # away every size-1 dimension.
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Folder with the locally saved fine-tuned checkpoints (used when loading from
# "disk"). Derived from `repo_path` so that changing the repo location in the
# data-loading cell is picked up here as well.
model_checkpoint_path = os.path.join(repo_path, 'data/model_outputs/bart_fitted_models/round2')

# Advice/Story Model

In [None]:
# choose disk or huggingface version
where_to_look = "huggingface" # or do "disk"

# any value other than 'huggingface' falls back to the checkpoint saved on disk
model_checkpoint_advice = (
    'trevorj/BART_reddit_advice_story'
    if where_to_look == 'huggingface'
    else os.path.join(model_checkpoint_path, 'BART_reddit_advice_story')
)

# load the tokenizer and the fine-tuned seq2seq model from the chosen checkpoint
tokenizer_advice = AutoTokenizer.from_pretrained(model_checkpoint_advice)
model_advice = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_advice)

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/957 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/877M [00:00<?, ?B/s]

In [None]:
# tokenize the advice/story test split in batches with the advice tokenizer
test_advice_tokenized = raw_datasets['test_advice_story'].map(
    lambda batch: preprocess_function(batch, tokenizer_advice),
    batched=True,
)



  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
%%time
# batch predict
tqdm.pandas()

# Resolve the output location under `repo_path` and create it up front, so a
# missing folder cannot make the final write fail after the long generation
# run (the recorded run took about 1h23m).
out_path = os.path.join(repo_path, "data/model_outputs/bart_preds/round2/")
os.makedirs(out_path, exist_ok=True)
f1 = os.path.join(out_path, "bart_preds_advice.parquet")

df_results_advice = pd.DataFrame({
    'content': test_advice_tokenized['content'],
    'y': test_advice_tokenized['summary'],
    'input_ids': test_advice_tokenized['input_ids']
})

# generate one summary per post, with a progress bar
df_results_advice['yhat'] = df_results_advice['input_ids'].progress_map(
    lambda x: model_predict(model=model_advice, tokenizer=tokenizer_advice, input_ids=x)
)
# keep only the text columns; the token ids are not written out
df_results_advice = df_results_advice[['content', 'y', 'yhat']]

# write results to disk
df_results_advice.to_parquet(f1)

  0%|          | 0/1000 [00:00<?, ?it/s]

CPU times: user 1h 22min 40s, sys: 36.4 s, total: 1h 23min 17s
Wall time: 1h 23min 5s


# Media/Lifestyle/Sports Model

In [None]:
# choose disk or huggingface version
where_to_look = "huggingface" # or do "disk"

# any value other than 'huggingface' falls back to the checkpoint saved on disk
model_checkpoint_media = (
    'trevorj/BART_reddit_media_lifestyle_sports'
    if where_to_look == 'huggingface'
    else os.path.join(model_checkpoint_path, 'BART_reddit_media_lifestyle_sports')
)

# load the tokenizer and the fine-tuned seq2seq model from the chosen checkpoint
tokenizer_media = AutoTokenizer.from_pretrained(model_checkpoint_media)
model_media = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_media)

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/957 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/877M [00:00<?, ?B/s]

In [None]:
# tokenize the media/lifestyle/sports test split in batches with the media tokenizer
test_media_tokenized = raw_datasets['test_media_lifestyle_sports'].map(
    lambda batch: preprocess_function(batch, tokenizer_media),
    batched=True,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [24]:
# batch predict and write to disk
tqdm.pandas()

# Resolve the output location under `repo_path` and create it up front, so a
# missing folder cannot make the final write fail after generation has finished.
out_path = os.path.join(repo_path, "data/model_outputs/bart_preds/round2/")
os.makedirs(out_path, exist_ok=True)
f1 = os.path.join(out_path, "bart_preds_media.parquet")

df_results_media = pd.DataFrame({
    'content': test_media_tokenized['content'],
    'y': test_media_tokenized['summary'],
    'input_ids': test_media_tokenized['input_ids']
})

# generate one summary per post, with a progress bar
df_results_media['yhat'] = df_results_media['input_ids'].progress_map(
    lambda x: model_predict(model=model_media, tokenizer=tokenizer_media, input_ids=x)
)
# keep only the text columns; the token ids are not written out
df_results_media = df_results_media[['content', 'y', 'yhat']]

df_results_media.to_parquet(f1)

  0%|          | 0/1000 [00:00<?, ?it/s]

# Gaming Model

In [None]:
%%time

# choose disk or huggingface version
where_to_look = "huggingface" # or do "disk"
if where_to_look == 'huggingface':
    model_checkpoint_gaming = 'trevorj/BART_reddit_gaming'
else:
    model_checkpoint_gaming = os.path.join(model_checkpoint_path, 'BART_reddit_gaming')

# Resolve the output location under `repo_path` and create it before any heavy
# work, so a missing folder cannot make the final write fail after generation.
out_path = os.path.join(repo_path, "data/model_outputs/bart_preds/round2/")
os.makedirs(out_path, exist_ok=True)
f1 = os.path.join(out_path, "bart_preds_gaming.parquet")

# load the tokenizer and the fine-tuned seq2seq model from the chosen checkpoint
tokenizer_gaming = AutoTokenizer.from_pretrained(model_checkpoint_gaming)
model_gaming = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_gaming)

# tokenize our data
test_gaming_tokenized = raw_datasets['test_gaming'].map(
    lambda x: preprocess_function(examples=x, tokenizer=tokenizer_gaming),
    batched=True,
)

# batch predict and write to disk
tqdm.pandas()
df_results_gaming = pd.DataFrame({
    'content': test_gaming_tokenized['content'],
    'y': test_gaming_tokenized['summary'],
    'input_ids': test_gaming_tokenized['input_ids']
})

# generate one summary per post, with a progress bar
df_results_gaming['yhat'] = df_results_gaming['input_ids'].progress_map(
    lambda x: model_predict(model=model_gaming, tokenizer=tokenizer_gaming, input_ids=x)
)
# keep only the text columns; the token ids are not written out
df_results_gaming = df_results_gaming[['content', 'y', 'yhat']]

df_results_gaming.to_parquet(f1)

# Other Model

In [None]:
%%time

# choose disk or huggingface version
where_to_look = "huggingface" # or do "disk"
if where_to_look == 'huggingface':
    model_checkpoint_other = 'trevorj/BART_reddit_other'
else:
    model_checkpoint_other = os.path.join(model_checkpoint_path, 'BART_reddit_other')

# Resolve the output location under `repo_path` and create it before any heavy
# work, so a missing folder cannot make the final write fail after generation.
out_path = os.path.join(repo_path, "data/model_outputs/bart_preds/round2/")
os.makedirs(out_path, exist_ok=True)
f1 = os.path.join(out_path, "bart_preds_other.parquet")

# load the tokenizer and the fine-tuned seq2seq model from the chosen checkpoint
tokenizer_other = AutoTokenizer.from_pretrained(model_checkpoint_other)
model_other = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_other)

# tokenize our data
test_other_tokenized = raw_datasets['test_other'].map(
    lambda x: preprocess_function(examples=x, tokenizer=tokenizer_other),
    batched=True,
)

# batch predict and write to disk
tqdm.pandas()
df_results_other = pd.DataFrame({
    'content': test_other_tokenized['content'],
    'y': test_other_tokenized['summary'],
    'input_ids': test_other_tokenized['input_ids']
})

# generate one summary per post, with a progress bar
df_results_other['yhat'] = df_results_other['input_ids'].progress_map(
    lambda x: model_predict(model=model_other, tokenizer=tokenizer_other, input_ids=x)
)
# keep only the text columns; the token ids are not written out
df_results_other = df_results_other[['content', 'y', 'yhat']]

df_results_other.to_parquet(f1)