# Setup

In [None]:
import os
import pandas as pd

from pprint import pprint

from sklearn.metrics import accuracy_score

from skllm import MultiLabelZeroShotGPTClassifier
from skllm.config import SKLLMConfig

In [None]:
# See notes in INSTALL.md for how to set this. DO NOT HARD CODE YOUR API KEY HERE:
# if your repository is public, someone will steal the key and run up charges on your account.
openai_api_key = os.environ.get('OPENAI_API_KEY')
if not openai_api_key:
    # Fail right away with a clear message instead of handing None (or an
    # empty string) on to the library.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; see INSTALL.md for how to configure it."
    )
SKLLMConfig.set_openai_key(openai_api_key)

# Collect a Dataset

In [None]:
# Read the Reddit life tips CSV into a DataFrame
dataset_path = 'data/helpfulRedditPosts.csv'
data = pd.read_csv(dataset_path)

# Understand Your Data

In [None]:
# Preview the first few rows to see which columns and values the dataset holds
data.head()

In [None]:
# Summary statistics (pandas describe) for the dataset's columns
data.describe()

In [None]:
# The post titles are the text that will be handed to the classifier
X = data['postTitle']

In [None]:
# Display the titles that are about to be classified
X

# "Develop" a Model aka just use OpenAI's API

In [None]:
# The closed set of categories the model may assign to a post
candidate_labels = ["Work", "Family", "Other"]

# Zero-shot setup: no training texts are supplied (X is None), only the
# label set; the classifier is configured with max_labels=2
clf = MultiLabelZeroShotGPTClassifier(max_labels=2)
clf.fit(None, [candidate_labels])

In [None]:
# Ask the fitted zero-shot classifier for labels for every post title in X
labels = clf.predict(X)

In [None]:
# Attach the model's predictions as a new column, then write the labelled
# dataset to disk (without the DataFrame index)
data['ChatGPTLabel'] = labels
classified_path = 'data/classified_tips.csv'
data.to_csv(classified_path, index=False)

# Choose a Measure of Success, Choose an Evaluation Protocol, and Evaluate

In [None]:
# Remove some schmutz from the labels so both columns hold one plain string
# per row and can be compared directly.
# NOTE: run this cell only once per load -- re-running it on already-cleaned
# columns would find no regex match (HumanLabel) and would index into the
# label string itself (ChatGPTLabel).

# Capture the text between [' and '] (presumably HumanLabel is stored as text
# such as "['Work']" -- verify against the CSV). expand=False makes
# str.extract return a Series rather than a one-column DataFrame.
# NOTE(review): for a value like "['A', 'B']" this captures "A', 'B" -- confirm
# that HumanLabel always holds a single label.
data['HumanLabel'] = data['HumanLabel'].str.extract(r"\['(.*?)'\]", expand=False)

# Keep only the first predicted label; an empty prediction becomes None.
data['ChatGPTLabel'] = data['ChatGPTLabel'].apply(lambda x: x[0] if x else None)

In [None]:
# Show each title side by side with the model's label and the human's label
data[["postTitle","ChatGPTLabel","HumanLabel"]]

In [None]:
# Fraction of posts where the model's (first) label matches the human label.
# Rows without a human label cannot be scored, so they are left out.
scored = data.dropna(subset=["HumanLabel"])

# accuracy_score expects 1-D label arrays, so pass the columns as Series
# rather than single-column DataFrames.
y_true = scored["HumanLabel"]
# The label clean-up step maps an empty prediction to None; replace it with a
# sentinel so it counts as a miss instead of mixing None with string labels.
y_pred = scored["ChatGPTLabel"].fillna("<no prediction>")

accuracy = accuracy_score(y_true, y_pred)
print(accuracy)

# Skipped Steps
* Beat a baseline
* Overfit, regularize and tune
* Communicate with stakeholders
* Ship an inference model
* Monitor and maintain