In [1]:
from IPython.display import clear_output

!pip install datasets transformers rouge_score rouge-score nltk
# rouge-score is the google version
!pip install pyarrow
!pip install -q sentencepiece

clear_output()

In [2]:
import os
import re
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

# nlp stuff
import nltk
nltk.download('punkt')

# tf stuff
import tensorflow_datasets as tfds 
import tensorflow as tf
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration # pegasus
from transformers import BartTokenizer, TFBartForConditionalGeneration # bart

# pytorch dataset types
import datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_metric, load_dataset

# pytorch bart stuff
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

clear_output()

In [3]:
# Sign into the Hugging Face Hub from inside the notebook.
# NOTE(review): per the captured output, a successful login stores the access
# token on disk for later Hub operations — nothing in this cell uses it directly.
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [4]:
# Install the git-lfs system package with apt.
# NOTE(review): presumably required for pushing large model files to the
# Hugging Face Hub — confirm; nothing visible in this notebook section uses it.
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


# Load Data

In [6]:
%%time

# specify your path to the repo here:
repo_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization'

from google.colab import drive
drive.mount('/content/gdrive')
data_path = os.path.join(repo_path, 'data/reddit_parquet/train_test_split')
os.chdir(data_path)
files = [i for i in os.listdir(data_path) if re.search("reddit", i)]

train = pd.read_parquet('reddit_train.parquet')
test = pd.read_parquet('reddit_test.parquet')
valid = pd.read_parquet('reddit_validation.parquet')

Mounted at /content/gdrive
CPU times: user 1.99 s, sys: 617 ms, total: 2.6 s
Wall time: 20.7 s


In [16]:
# Convert the pandas splits into Hugging Face `datasets.Dataset` objects, one
# per (split, subreddit group) pair, collected in a DatasetDict keyed as
# '<split>_<group key>' (e.g. 'train_advice_story', 'valid_other').

# Columns carried over from the DataFrames into every Dataset.
_DATASET_COLUMNS = ['content', 'summary', 'subreddit', 'subreddit_group']

# Split name used in the DatasetDict key -> source DataFrame.
_SPLIT_FRAMES = {'train': train, 'test': test, 'valid': valid}

# Key suffix used in the DatasetDict -> value of the `subreddit_group` column.
_SUBREDDIT_GROUPS = {
    'advice_story': 'advice/story',
    'media_lifestyle_sports': 'media/lifestyle/sports',
    'gaming': 'gaming',
    'other': 'other',
}


def _subset_as_dataset(frame, group_label):
    """Build a Dataset from the rows of `frame` belonging to one subreddit group.

    Args:
        frame: DataFrame holding at least the columns in `_DATASET_COLUMNS`.
        group_label: value of `subreddit_group` selecting the rows to keep.

    Returns:
        A `datasets.Dataset` with the columns listed in `_DATASET_COLUMNS`.
    """
    # Evaluate the row filter once instead of once per column.
    subset = frame.loc[frame['subreddit_group'] == group_label, _DATASET_COLUMNS]
    return Dataset.from_dict({column: subset[column] for column in _DATASET_COLUMNS})


# Groups form the outer loop and splits the inner loop, so keys appear as
# train/test/valid for each group in turn.
raw_datasets = DatasetDict({
    f'{split_name}_{group_key}': _subset_as_dataset(frame, group_label)
    for group_key, group_label in _SUBREDDIT_GROUPS.items()
    for split_name, frame in _SPLIT_FRAMES.items()
})



In [65]:
# tokenize function
max_input_length = 1024  # `max_length` passed when tokenizing post content
max_target_length = 128  # `max_length` passed when tokenizing summaries


def preprocess_function(examples, tokenizer):
    """Tokenize a batch of posts and attach the tokenized summaries as labels.

    Args:
        examples: mapping with a "content" entry (iterable of source texts)
            and a "summary" entry (the matching target texts).
        tokenizer: callable tokenizer that also provides the
            `as_target_tokenizer()` context manager.

    Returns:
        The tokenizer output for the content, with an extra "labels" entry
        holding the "input_ids" produced for the summaries.
    """
    documents = list(examples["content"])
    encoded = tokenizer(documents, max_length=max_input_length, truncation=True)

    # Summaries are encoded while the tokenizer is in target mode.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            examples["summary"],
            max_length=max_target_length,
            truncation=True,
        )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded


# predict function
def model_predict(model, tokenizer, input_ids):
    """Generate and decode a summary for a single tokenized document.

    Args:
        model: object exposing `generate(...)`; if it has a `device`
            attribute, the input tensor is created on that device.
        tokenizer: object exposing `decode(token_ids, skip_special_tokens=...)`.
        input_ids: token ids of one document (a flat sequence of ints).

    Returns:
        The decoded text of the generated sequence, special tokens removed.
    """
    # Wrap the ids as a batch of one, placed on the model's device when it
    # advertises one, so the call also works after `model.to("cuda")`.
    batch = torch.tensor([input_ids], device=getattr(model, "device", None))
    output = model.generate(
        batch,
        num_beams=2,
        max_length=60,
        min_length=2,
        no_repeat_ngram_size=3,
    )
    # The batch holds exactly one sequence; select it explicitly.
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Advice/Story Model

In [24]:
# For now, load the fine-tuned checkpoint from disk. The path is derived from
# `repo_path` so that changing the repo location in one place is enough.
model_checkpoint_advice = os.path.join(
    repo_path,
    'data/model_outputs/bart_fitted_models/BART-reddit-advice_story_v2',
)

# Load the tokenizer and the seq2seq model stored in that checkpoint.
tokenizer_advice = AutoTokenizer.from_pretrained(model_checkpoint_advice)
model_advice = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_advice)

In [47]:
# Tokenize the advice/story test split in batches using this model's tokenizer.
advice_test_split = raw_datasets['test_advice_story']
test_advice_tokenized = advice_test_split.map(
    lambda batch: preprocess_function(examples=batch, tokenizer=tokenizer_advice),
    batched=True,
)
test_advice_tokenized

  0%|          | 0/2 [00:00<?, ?ba/s]

Dataset({
    features: ['content', 'summary', 'subreddit', 'subreddit_group', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1946
})

In [None]:
%%time

# start at 4:36pm
from tqdm.notebook import trange, tqdm

# batch predict and write to disk
tqdm.pandas()
df_results_advice = pd.DataFrame({
    'content': test_advice_tokenized['content'],
    'y': test_advice_tokenized['summary'],
    'input_ids': test_advice_tokenized['input_ids']
})

# df_results_advice['yhat'] = df_results_advice['input_ids'].map(lambda x: model_predict(model=model_advice, tokenizer=tokenizer_advice, input_ids=x))
df_results_advice['yhat'] = df_results_advice['input_ids'].progress_map(lambda x: model_predict(model=model_advice, tokenizer=tokenizer_advice, input_ids=x))
df_results_advice = df_results_advice[['content', 'y', 'yhat']]

  0%|          | 0/1946 [00:00<?, ?it/s]

In [None]:
%%time
# write results to disk
out_path ="/content/gdrive/MyDrive/w266/w266_reddit_summarization/data/model_outputs/bart_preds/"
f1 = os.path.join(out_path, "bart_preds_advice.parquet")
df_results_advice.to_parquet(f1)