In [None]:
# Notebook setup: load the IPython extensions used throughout this notebook.
%load_ext autoreload
# NOTE(review): presumably auto-formats executed cells with Black — confirm.
%load_ext jupyter_black
# Autoreload mode 2, so edits to imported project modules are picked up
# without restarting the kernel.
%autoreload 2

In [None]:
# [GPT-3](https://platform.openai.com/examples/default-adv-tweet-classifier)

In [None]:
# We are going to compare two different approaches to text classification with GPT-3
# 1. Zero-shot classification
# 2. Few-shot classification
# Our working assumption is that few-shot will be more accurate, but it is more costly due to the extra prompt tokens.

In [None]:
# Both pipelines are compared on the same held-out data: the "test" split
# of the dataset file configured for this project.
from src.config import DATASET_PATH
from src.data.make_dataset import load_dataset_from_file

dataset = load_dataset_from_file(DATASET_PATH)
test_data = dataset["test"]

In [None]:
# Fetch the two OpenAI classification pipelines we want to compare.
# NOTE: Both pipelines are async callables, so their results must be
# retrieved with `await`.
from src.pipelines.openai import get_openai_pipelines

pipelines = get_openai_pipelines()
zero_shot_pipeline, few_shot_pipeline = (
    pipelines["zero_shot"],
    pipelines["few_shot"],
)

## Testing the Pipelines

In [None]:
# Let's try them out on an example from the test set before we compare them.
# The first test record is reused as the input for both single-example checks below.
example = test_data[0]
example  # bare last expression so the notebook displays the record

In [None]:
# Testing the zero-shot pipeline
# It is called with a list of entry texts, so the single example is wrapped in a list.
await zero_shot_pipeline([example["entry"]])

In [None]:
# The model correctly classifies the example as "NEUTRAL" which is a good sign
# What about our few-shot pipeline?
# NOTE: The few-shot pipeline takes a list of (title, text) tuples as input
await few_shot_pipeline([(example["title"], example["entry"])])

In [None]:
# Also correct! Let's just let both pipelines run on the entire test set and see how they do

## Zero vs. Few-Shot

In [None]:
# Run the zero-shot pipeline on the test set
zero_shot_labels = await zero_shot_pipeline([entry["entry"] for entry in test_data])

# Run the few-shot pipeline on the test set
few_shot_labels = await few_shot_pipeline(
    [(entry["title"], entry["entry"]) for entry in test_data]
)

assert len(zero_shot_labels) == len(few_shot_labels) == len(test_data)

In [None]:
# Print a per-class classification report for each pipeline, scoring its
# predicted labels against the test set's "sentiment_output" column.
from sklearn.metrics import classification_report

for heading, predicted_labels in (
    ("Zero-shot classification report:", zero_shot_labels),
    ("Few-shot classification report:", few_shot_labels),
):
    print(heading)
    print(classification_report(test_data["sentiment_output"], predicted_labels))

In [None]:
# Both pipelines are performing well compared to the baseline (which is ~0.69 accuracy on the entire dataset and ~0.72 on the test set)
# It's a good thing we checked our assumptions about zero-shot vs. few-shot:
# the few-shot pipeline turns out to be significantly less accurate when it comes to classifying "NEGATIVE" examples.
# Trying out different examples in the prompt (i.e. "prompt engineering") would be a good next step to
# improve the performance.