# Compare metrics between the models

In [None]:
from IPython.display import clear_output

# Install the third-party packages this notebook imports below.
!pip install datasets transformers rouge_score rouge-score nltk
# rouge-score is the google version
# NOTE(review): `rouge_score` and `rouge-score` look like two spellings of the
# same package -- confirm and drop one of them.
# presumably pyarrow is the parquet engine for the pd.read_parquet calls -- verify
!pip install pyarrow
!pip install -q sentencepiece

# wipe the pip output from the cell once the installs finish
clear_output()

In [None]:
# general-purpose utilities
import os
import re
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

# nlp stuff
import nltk
# fetch the 'punkt' resource at import time
# NOTE(review): nothing in the cells shown here uses nltk directly -- confirm it is still needed
nltk.download('punkt')

# tf stuff
import tensorflow_datasets as tfds 
import tensorflow as tf
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration # pegasus
from transformers import BartTokenizer, TFBartForConditionalGeneration # bart

# Hugging Face `datasets` types and loaders (load_metric is what provides the evaluation metric)
import datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_metric, load_dataset

# pytorch bart stuff
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

# wipe the import/download chatter from the cell output
clear_output()

In [None]:
%%time

# specify your path to the repo here:
repo_path = '/content/gdrive/MyDrive/w266/w266_reddit_summarization'

from google.colab import drive
drive.mount('/content/gdrive')

# baseline bart
baseline_preds = pd.read_parquet(os.path.join(repo_path, 'data/model_outputs/bart_preds/round2/bart_baseline_preds.parquet'))

# finetuned bart
finetuned_preds = pd.concat([
  pd.read_parquet(os.path.join(repo_path, 'data/model_outputs/bart_preds/round2/bart_preds_advice.parquet')),
  pd.read_parquet(os.path.join(repo_path, 'data/model_outputs/bart_preds/round2/bart_preds_media.parquet')),
  pd.read_parquet(os.path.join(repo_path, 'data/model_outputs/bart_preds/round2/bart_preds_gaming.parquet')),
  pd.read_parquet(os.path.join(repo_path, 'data/model_outputs/bart_preds/round2/bart_preds_other.parquet'))], ignore_index=True)


In [None]:
# join them
# Both frames are relabelled positionally so they share the same key columns
# and differ only in the name of the prediction column.
# NOTE(review): assumes each parquet file has exactly three columns in the
# order (content, reference summary, prediction) -- confirm against the writers.
join_keys = ['content', 'y']
baseline_preds.columns = join_keys + ['yhat_baseline']
finetuned_preds.columns = join_keys + ['yhat_finetune']

# Left join: keep every finetuned row and attach the baseline prediction
# for the same (content, y) pair.
all_preds = finetuned_preds.merge(baseline_preds, how='left', on=join_keys)

In [None]:
# compute metrics
# `metric` was used here without ever being defined earlier in the notebook,
# so a fresh top-to-bottom run failed with NameError. Load it explicitly.
# NOTE(review): ROUGE is assumed because the install cell pulls in rouge_score
# -- confirm this is the metric the reported numbers were produced with.
metric = load_metric('rouge')

# Both models are scored against the same reference summaries.
references = all_preds['y'].tolist()

baseline_metrics = metric.compute(
    predictions=all_preds['yhat_baseline'].tolist(),
    references=references,
)
finetune_metrics = metric.compute(
    predictions=all_preds['yhat_finetune'].tolist(),
    references=references,
)

print("Baseline:")
print(baseline_metrics)

# The original printed "Baseline:" a second time above the finetuned scores.
print("\n\nFinetuned:")
print(finetune_metrics)

In [None]:
# compute metrics for each genre