In [1]:
%load_ext autoreload
%load_ext jupyter_black
%autoreload 2

In [2]:
# [GPT-3](https://platform.openai.com/examples/default-adv-tweet-classifier)

In [3]:
# We are going to compare two different approaches to text classification with GPT-3
# 1. Zero-shot classification
# 2. Few-shot classification
# Few-shot will no doubt be more accurate, but it is more costly due to the extra prompt tokens.

In [4]:
# Load the test data that we will use for comparison
from src.data.make_dataset import load_dataset_from_file
from src.config import DATASET_PATH

dataset = load_dataset_from_file(DATASET_PATH)
test_data = dataset["test"]

In [5]:
# Load the pipelines
# NOTE: The pipelines are async callables, so we need to use `await` to get the results
from src.pipelines.openai import get_openai_pipelines

pipelines = get_openai_pipelines()
zero_shot_pipeline = pipelines["zero_shot"]
few_shot_pipeline = pipelines["few_shot"]

## Testing the Pipelines

In [6]:
# Let's try them out on an example from the test set before we compare them
example = test_data[0]
example

{'team_id': 988,
 'id': 2309258,
 'title': 'taking break last 15 min before shift is over',
 'entry': 'i’ve heard mixed opinions from my fellow coworkers on this topic. are you allowed to take your last 15 minute break in the last 15 min or so of when your break is over?',
 'data_source': 'Reddit',
 'sentiment_output': 'NEUTRAL',
 'annotated_sentiment': 'NEUTRAL',
 'correct?': 1,
 'label_str': 'NEUTRAL',
 'label': 1,
 '__index_level_0__': 203}

In [7]:
# Testing the zero-shot pipeline
await zero_shot_pipeline([example["entry"]])

['NEUTRAL']

In [8]:
# The model correctly classifies the example as "NEUTRAL" which is a good sign
# What about our few-shot pipeline?
# NOTE: The few-shot pipeline takes a list of (title, text) tuples as input
await few_shot_pipeline([(example["title"], example["entry"])])

['NEUTRAL']

In [9]:
# Also correct! Let's just let both pipelines run on the entire test set and see how they do

## Zero vs. Few-Shot

In [10]:
# Run the zero-shot pipeline on the test set
zero_shot_labels = await zero_shot_pipeline([entry["entry"] for entry in test_data])

# Run the few-shot pipeline on the test set
few_shot_labels = await few_shot_pipeline(
    [(entry["title"], entry["entry"]) for entry in test_data]
)

assert len(zero_shot_labels) == len(few_shot_labels) == len(test_data)

In [11]:
# Calculate the accuracy of each pipeline
from sklearn.metrics import classification_report

# Zero-shot
print("Zero-shot classification report:")
print(classification_report(test_data["sentiment_output"], zero_shot_labels))

# Few-shot
print("Few-shot classification report:")
print(classification_report(test_data["sentiment_output"], few_shot_labels))

Zero-shot classification report:
              precision    recall  f1-score   support

    NEGATIVE       0.88      0.70      0.78        30
     NEUTRAL       0.53      0.56      0.55        16
    POSITIVE       0.74      1.00      0.85        14

    accuracy                           0.73        60
   macro avg       0.71      0.75      0.72        60
weighted avg       0.75      0.73      0.73        60

Few-shot classification report:
              precision    recall  f1-score   support

    NEGATIVE       0.83      0.63      0.72        30
     NEUTRAL       0.52      0.69      0.59        16
    POSITIVE       0.88      1.00      0.93        14

    accuracy                           0.73        60
   macro avg       0.74      0.77      0.75        60
weighted avg       0.76      0.73      0.73        60



In [12]:
# Both pipelines are performing well compared to the baseline (which is ~0.69 accuracy on the entire dataset and ~0.72 on the test set)
# It's a good thing we checked our assumptions about zero-shot vs. few-shot
# It looks like the few-shot pipeline is significantly less accurate when it comes to classifying "NEGATIVE" examples
# Obviously, trying out different examples in the prompt (i.e. "prompt engineering") would be a good next step to
# improve the performance.