In [None]:
# Notebook setup: load the IPython extensions used throughout this notebook.
# autoreload (mode 2) picks up edits to imported project modules without a kernel restart.
%load_ext autoreload
%autoreload 2
# jupyter_black: code-formatting extension for notebook cells.
%load_ext jupyter_black
%matplotlib inline
# dotenv: loads environment variables from a .env file
# (presumably the API credentials needed by the OpenAI pipelines - TODO confirm).
%load_ext dotenv
%dotenv

In [None]:
# To compare the models on our sentiment analysis task, we need three things:
# 1. A test dataset
# 2. A set of models to compare (as pipelines, so we can call them on a list of texts and get a list of labels back)
# 3. A function that takes the models and test data and returns accuracy metrics

## Data

In [None]:
# Since our dataset is pretty small at 300 examples, 20% of it is set aside for testing.
# The same split is used for all of our models.
# The dataset is the one from the previous notebook.
from src.config import DATASET_PATH
from src.data.make_dataset import load_dataset_from_file

dataset = load_dataset_from_file(DATASET_PATH)
test_dataset = dataset["test"]

# Sanity-check the split size. The message reports the actual size so that a
# mismatch is diagnosable instead of surfacing as a bare AssertionError.
assert len(test_dataset) == 60, (
    f"expected 60 test examples (20% of 300), got {len(test_dataset)}"
)

In [None]:
# Baseline on the test set: the "annotated_sentiment" column is scored as the
# prediction against "sentiment_output", which is passed as y_true here and in
# every model evaluation further down.
from sklearn.metrics import classification_report

baseline_report = classification_report(
    y_true=test_dataset["sentiment_output"],
    y_pred=test_dataset["annotated_sentiment"],
)
print(baseline_report)

In [None]:
# The baseline's 72% accuracy is the number to beat.
# Every report is collected in a single list of (title, report) pairs,
# starting with the baseline.
title_reports: list[tuple[str, str]] = [("baseline", baseline_report)]

# and use this function to print them all at the end
def print_reports(reports: list[tuple[str, str]]) -> None:
    """Write every report to stdout beneath its upper-cased title.

    Args:
        reports (list[tuple[str, str]]): (title, report) pairs, printed in
            the order they are given.
    """
    for entry in reports:
        heading, body = entry
        # One print call: the heading line, a newline, then the report text.
        print(f"{heading.upper()}:", body, sep="\n")

## Pipelines

In [None]:
# Sequential transfer learning is state-of-the-art for sentiment analysis.
# So we'll dive into using fine-tuned transformers from the HuggingFace model hub.
# We will also test [SetFit](https://arxiv.org/abs/2209.11055), a new few-shot fine-tuning method.
# And [GPT-3](https://platform.openai.com/examples/default-adv-tweet-classifier), because it's so hot right now.
#
# `pipes` is indexed by family and then by pipeline name in the cells below:
#   pipes["hf"]      -> mapping of name -> pipeline (iterated with .items())
#   pipes["setfit"]  -> has a "setfit" entry, called on the list of texts
#   pipes["openai"]  -> has "zero_shot" and "few_shot" entries (awaited when called)

from src.pipelines import get_all_pipelines

pipes = get_all_pipelines()

In [None]:
# Let's compare the HuggingFace transformer pipelines first
from sklearn.metrics import classification_report

hf_reports = []
for name, pipe in pipes["hf"].items():
    print(name)
    outputs = pipe(test_dataset["entry"])  # list of dicts (keys: "label", "score")
    preds = [output["label"] for output in outputs]  # keep only the predicted labels
    report = classification_report(test_dataset["sentiment_output"], preds)
    hf_reports.append((name, report))
    print(report)

# Keep this cell idempotent: re-running it must not duplicate the HF reports in
# the final summary, so entries recorded under the same titles by an earlier
# run are dropped before the fresh ones are added.
hf_titles = {title for title, _ in hf_reports}
title_reports = [entry for entry in title_reports if entry[0] not in hf_titles]
title_reports.extend(hf_reports)

In [None]:
# We can see the fine-tuned RoBERTa model from Hartmann et al. (2021) is the best performer
# at 75% accuracy, beating the baseline of 72% accuracy on the test set.
# It's a RoBERTa-based model fine-tuned on 5,304 manually annotated social media posts
# See https://journals.sagepub.com/doi/full/10.1177/00222437211037258 for more details

In [None]:
# Now the SetFit pipeline
preds = pipes["setfit"]["setfit"](test_dataset["entry"])
setfit_report = classification_report(test_dataset["sentiment_output"], preds)
# Replace any "setfit" entry left by an earlier run of this cell instead of
# appending a duplicate, so the final summary lists each model once.
title_reports = [entry for entry in title_reports if entry[0] != "setfit"]
title_reports.append(("setfit", setfit_report))
print(setfit_report)

In [None]:
# 80% accuracy is pretty good for a model trained on only 240 examples
# This is the best accuracy we've seen so far
# It's also VERY fast for both training and inference

In [None]:
# Finally let's try OpenAI's GPT-3 (text-davinci-003)
# with both zero-shot and few-shot prompts
# NOTE: OpenAI pipes are async
print("OpenAI: Zero-shot")
preds = await pipes["openai"]["zero_shot"](test_dataset["entry"])
zero_shot_report = classification_report(test_dataset["sentiment_output"], preds)
title_reports.append(("openai_zero_shot", zero_shot_report))
print(zero_shot_report)

print("OpenAI: Few-shot")
preds = await pipes["openai"]["few_shot"](
    zip(test_dataset["title"], test_dataset["entry"])
)
few_shot_report = classification_report(test_dataset["sentiment_output"], preds)
title_reports.append(("openai_few_shot", few_shot_report))
print(few_shot_report)

In [None]:
# Let's print all the reports collected in `title_reports`, in insertion order,
# each one under its upper-cased title
print_reports(title_reports)

In [None]:
# As we can see from the reports, the SetFit pipeline is the best performer at 80% accuracy