# Make predictions using the revised fine-tuned BART model on the full test dataset

In [1]:
# Install the third-party packages with pip, then clear the cell output so the
# installer log does not stay in the notebook.
from IPython.display import clear_output

!pip install datasets transformers rouge_score rouge-score nltk
# rouge-score is the google version
# NOTE(review): both `rouge_score` and `rouge-score` are requested above; they
# look like two spellings of the same package -- confirm and drop one.
!pip install pyarrow
!pip install -q sentencepiece

# Remove everything pip printed above from the cell output.
clear_output()

In [2]:
import os
import re
import time
from tqdm.notebook import trange, tqdm
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

# nlp stuff
import nltk
nltk.download('punkt')

# tf stuff
import tensorflow_datasets as tfds 
import tensorflow as tf
from transformers import BartTokenizer, TFBartForConditionalGeneration # bart

# pytorch dataset types
import datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_metric, load_dataset

# pytorch bart stuff
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

clear_output()

In [4]:
# Sign in to the Hugging Face Hub. notebook_login() is called without
# arguments, so no access token is written into this notebook's code.
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [5]:
# !apt install git-lfs

# Load Data

In [6]:
%%time

# specify your path to the repo here:
repo_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization'

# Mount Google Drive so the repo folder above becomes reachable.
from google.colab import drive
drive.mount('/content/gdrive')

# Switch into the split directory; the parquet reads below use bare file names
# and therefore rely on this working directory.
data_path = os.path.join(repo_path, 'data/reddit_parquet/train_test_split_v2')
os.chdir(data_path)
files = [entry for entry in os.listdir(data_path) if re.search("reddit", entry)]

# One DataFrame per split, read in train / test / validation order.
train, test, valid = map(
    pd.read_parquet,
    ('reddit_train.parquet', 'reddit_test.parquet', 'reddit_validation.parquet'),
)

Mounted at /content/gdrive
CPU times: user 2.11 s, sys: 691 ms, total: 2.81 s
Wall time: 23.6 s


In [7]:
# Wrap each pandas split in a `datasets.Dataset` and collect the three splits
# in a single DatasetDict keyed by split name.
kept_columns = ('content', 'summary', 'subreddit', 'subreddit_group')
split_frames = (('train', train), ('test', test), ('valid', valid))

raw_datasets = DatasetDict({
    split_name: Dataset.from_dict({column: frame[column] for column in kept_columns})
    for split_name, frame in split_frames
})

# Display the resulting splits and their row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 60000
    })
    test: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 4000
    })
    valid: Dataset({
        features: ['content', 'summary', 'subreddit', 'subreddit_group'],
        num_rows: 4000
    })
})

In [11]:
# Token budgets used when tokenizing: source documents and target summaries.
max_input_length = 1024
max_target_length = 128


def preprocess_function(examples, tokenizer):
    """Tokenize a batch of posts and attach the tokenized summaries as labels.

    Args:
        examples: Batch mapping with a "content" entry (source documents) and a
            "summary" entry (reference summaries).
        tokenizer: Callable tokenizer that also provides the
            ``as_target_tokenizer()`` context manager.

    Returns:
        The tokenizer output for the documents, with an added "labels" entry
        holding the ``input_ids`` of the tokenized summaries.
    """
    documents = list(examples["content"])
    encoded = tokenizer(documents, max_length=max_input_length, truncation=True)

    # Summaries are tokenized while the tokenizer is in target mode.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(
            examples["summary"],
            max_length=max_target_length,
            truncation=True,
        )

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded


# predict function
def model_predict(model, tokenizer, input_ids):
    """Generate a summary for one tokenized document and decode it to text.

    Args:
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer whose ``decode`` turns generated ids into a string.
        input_ids: Flat sequence of token ids for a single document.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Build the batch-of-one on the model's own device. A tensor left on the
    # CPU breaks generation as soon as the model has been moved to a GPU.
    # Models without a `device` attribute fall back to the default device.
    device = getattr(model, "device", None)
    batch = torch.tensor([input_ids], device=device)

    output = model.generate(
        batch,
        num_beams=2,
        max_length=60,
        min_length=2,
        no_repeat_ngram_size=3,
    )
    # squeeze() drops the batch dimension so decode receives a single sequence.
    return tokenizer.decode(output.squeeze(), skip_special_tokens=True)

# Model

In [12]:
# Pick where the fine-tuned checkpoint comes from: the Hugging Face Hub copy
# or the copy saved on Google Drive.
where_to_look = "huggingface" # or do "disk"
if where_to_look != 'huggingface':
    model_checkpoint_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_fitted_models/round2'
    model_checkpoint = os.path.join(model_checkpoint_path, 'BART_reddit')
else:
    model_checkpoint = 'trevorj/BART_reddit'

# Instantiate the tokenizer and the seq2seq model from the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/957 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/877M [00:00<?, ?B/s]

In [13]:
# tokenize our data
test_tokenized = raw_datasets['test'].map(lambda x: preprocess_function(examples=x, tokenizer=tokenizer), batched=True)



  0%|          | 0/4 [00:00<?, ?ba/s]

# Make Predictions in Batches

In [14]:
%%time

tqdm.pandas()

# Resolve the output location first and make sure the folder exists, so a
# missing directory fails here rather than at the final write, after the
# long generation run.
out_path ="/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_preds/round2/"
os.makedirs(out_path, exist_ok=True)
f1 = os.path.join(out_path, "bart_full_preds_pt1.parquet")

# Rows of the tokenized test split handled by this cell. One named slice keeps
# every column aligned on the same rows.
batch_rows = slice(0, 1000)
df_results = pd.DataFrame({
    'content': test_tokenized['content'][batch_rows],
    'y': test_tokenized['summary'][batch_rows],
    'input_ids': test_tokenized['input_ids'][batch_rows],
    'subreddit_group': test_tokenized['subreddit_group'][batch_rows]
})

# Generate one summary per row; progress_map shows a tqdm progress bar.
df_results['yhat'] = df_results['input_ids'].progress_map(lambda x: model_predict(model=model, tokenizer=tokenizer, input_ids=x))
# Only the text columns are persisted; input_ids and subreddit_group are dropped.
df_results = df_results[['content', 'y', 'yhat']]

# write results to disk
df_results.to_parquet(f1)

  0%|          | 0/1000 [00:00<?, ?it/s]

CPU times: user 1h 17min 37s, sys: 26.8 s, total: 1h 18min 3s
Wall time: 1h 17min 50s


In [None]:
%%time

tqdm.pandas()

# Resolve the output location first and make sure the folder exists, so a
# missing directory fails here rather than at the final write, after the
# long generation run.
out_path ="/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_preds/round2/"
os.makedirs(out_path, exist_ok=True)
f1 = os.path.join(out_path, "bart_full_preds_pt2.parquet")

# Rows of the tokenized test split handled by this cell. One named slice keeps
# every column aligned on the same rows.
batch_rows = slice(1000, 2000)
df_results = pd.DataFrame({
    'content': test_tokenized['content'][batch_rows],
    'y': test_tokenized['summary'][batch_rows],
    'input_ids': test_tokenized['input_ids'][batch_rows],
    # Fixed: this column was previously filled from 'input_ids' by mistake.
    'subreddit_group': test_tokenized['subreddit_group'][batch_rows]
})

# Generate one summary per row; progress_map shows a tqdm progress bar.
df_results['yhat'] = df_results['input_ids'].progress_map(lambda x: model_predict(model=model, tokenizer=tokenizer, input_ids=x))
# Only the text columns are persisted; input_ids and subreddit_group are dropped.
df_results = df_results[['content', 'y', 'yhat']]

# write results to disk
df_results.to_parquet(f1)

In [None]:
%%time

tqdm.pandas()

# Resolve the output location first and make sure the folder exists, so a
# missing directory fails here rather than at the final write, after the
# long generation run.
out_path ="/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_preds/round2/"
os.makedirs(out_path, exist_ok=True)
f1 = os.path.join(out_path, "bart_full_preds_pt3.parquet")

# Rows of the tokenized test split handled by this cell. One named slice keeps
# every column aligned on the same rows.
batch_rows = slice(2000, 3000)
df_results = pd.DataFrame({
    'content': test_tokenized['content'][batch_rows],
    'y': test_tokenized['summary'][batch_rows],
    'input_ids': test_tokenized['input_ids'][batch_rows],
    # Fixed: this column was previously filled from 'input_ids' by mistake.
    'subreddit_group': test_tokenized['subreddit_group'][batch_rows]
})

# Generate one summary per row; progress_map shows a tqdm progress bar.
df_results['yhat'] = df_results['input_ids'].progress_map(lambda x: model_predict(model=model, tokenizer=tokenizer, input_ids=x))
# Only the text columns are persisted; input_ids and subreddit_group are dropped.
df_results = df_results[['content', 'y', 'yhat']]

# write results to disk
df_results.to_parquet(f1)

In [None]:
%%time

tqdm.pandas()

# Resolve the output location first and make sure the folder exists, so a
# missing directory fails here rather than at the final write, after the
# long generation run.
out_path ="/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_preds/round2/"
os.makedirs(out_path, exist_ok=True)
f1 = os.path.join(out_path, "bart_full_preds_pt4.parquet")

# Rows of the tokenized test split handled by this cell: everything from row
# 3000 to the end. One named slice keeps every column aligned on the same rows.
batch_rows = slice(3000, None)
df_results = pd.DataFrame({
    'content': test_tokenized['content'][batch_rows],
    'y': test_tokenized['summary'][batch_rows],
    'input_ids': test_tokenized['input_ids'][batch_rows],
    # Fixed: this column was previously filled from 'input_ids' by mistake.
    'subreddit_group': test_tokenized['subreddit_group'][batch_rows]
})

# Generate one summary per row; progress_map shows a tqdm progress bar.
df_results['yhat'] = df_results['input_ids'].progress_map(lambda x: model_predict(model=model, tokenizer=tokenizer, input_ids=x))
# Only the text columns are persisted; input_ids and subreddit_group are dropped.
df_results = df_results[['content', 'y', 'yhat']]

# write results to disk
df_results.to_parquet(f1)