# Setup

In [1]:
import os
import pandas as pd

from pprint import pprint

from sklearn.metrics import accuracy_score

from skllm import MultiLabelZeroShotGPTClassifier
from skllm.config import SKLLMConfig

In [2]:
# See INSTALL.md for how to set OPENAI_API_KEY. Never hard-code the key in this
# notebook: if the repository is public, the key can be harvested and used at
# your expense.
openai_api_key = os.environ.get('OPENAI_API_KEY')
if not openai_api_key:
    # Fail fast with a clear message instead of silently configuring the
    # client with None (or an empty string).
    raise RuntimeError(
        "OPENAI_API_KEY is not set; see INSTALL.md for setup instructions."
    )
SKLLMConfig.set_openai_key(openai_api_key)

# Collect a Dataset

In [3]:
# Read the aphorisms / life-tips CSV into a DataFrame
aphorisms_csv = 'data/myAphorisms1.csv'
data = pd.read_csv(aphorisms_csv)

# Understand Your Data

In [4]:
# Display the loaded DataFrame to inspect its columns and a sample of rows
data

Unnamed: 0,id,author,isOver18,postUrl,subreddit,postTitle,hasPostBody,postBody,score,numComments,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,HumanLabel
0,f6jt5e,w2555,False,https://reddit.com/r/LifeProTips/comments/f6jt5e/,LifeProTips,"""Hard work beats talent when talent doesn't wo...",True,I had a phone interview scheduled this morning...,147296,4730,,,,,,['Work']
1,lq1jn7,this1tyme,False,https://reddit.com/r/LifeProTips/comments/lq1jn7/,LifeProTips,"""Choose a job you love, and you will never hav...",False,,134320,4121,,,,,,['Other']
2,j2mm1b,raviji22,False,https://reddit.com/r/LifeProTips/comments/j2mm1b/,LifeProTips,"""Success is not final, failure is not fatal: I...",False,,129513,1971,,,,,,['Other']
3,fqkkke,[deleted],False,https://reddit.com/r/LifeProTips/comments/fqkkke/,LifeProTips,"""Opportunities don't happen. You create them.""...",True,[deleted],124219,2762,,,,,,['Family']
4,f6jt5e,w2555,False,https://reddit.com/r/LifeProTips/comments/f6jt5e/,LifeProTips,"""Family is not an important thing, it's everyt...",True,I had a phone interview scheduled this morning...,147296,4730,,,,,,['Work']
5,lq1jn7,this1tyme,False,https://reddit.com/r/LifeProTips/comments/lq1jn7/,LifeProTips,"""The love of a family is life's greatest bless...",False,,134320,4121,,,,,,['Other']
6,j2mm1b,raviji22,False,https://reddit.com/r/LifeProTips/comments/j2mm1b/,LifeProTips,"""Family is where life begins and love never en...",False,,129513,1971,,,,,,['Other']
7,fqkkke,[deleted],False,https://reddit.com/r/LifeProTips/comments/fqkkke/,LifeProTips,"""A happy family is but an earlier heaven."" - G...",True,[deleted],124219,2762,,,,,,['Family']
8,f6jt5e,w2555,False,https://reddit.com/r/LifeProTips/comments/f6jt5e/,LifeProTips,"""Life is what happens when you're busy making ...",True,I had a phone interview scheduled this morning...,147296,4730,,,,,,['Work']
9,lq1jn7,this1tyme,False,https://reddit.com/r/LifeProTips/comments/lq1jn7/,LifeProTips,"""Life is really simple, but we insist on makin...",False,,134320,4121,,,,,,['Other']


In [5]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
data.describe()

Unnamed: 0,score,numComments,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
count,12.0,12.0,0.0,0.0,0.0,0.0,0.0
mean,133837.0,3396.0,,,,,
std,8932.792549,1136.713204,,,,,
min,124219.0,1971.0,,,,,
25%,128189.5,2564.25,,,,,
50%,131916.5,3441.5,,,,,
75%,137564.0,4273.25,,,,,
max,147296.0,4730.0,,,,,


In [6]:
# The post titles hold the aphorism text; they are the classifier's only input
X = data.loc[:, 'postTitle']

In [7]:
# Inspect the text Series that will be passed to the classifier
X

0     "Hard work beats talent when talent doesn't wo...
1     "Choose a job you love, and you will never hav...
2     "Success is not final, failure is not fatal: I...
3     "Opportunities don't happen. You create them."...
4     "Family is not an important thing, it's everyt...
5     "The love of a family is life's greatest bless...
6     "Family is where life begins and love never en...
7     "A happy family is but an earlier heaven." - G...
8     "Life is what happens when you're busy making ...
9     "Life is really simple, but we insist on makin...
10    "Life is either a daring adventure or nothing ...
11    "Life is like riding a bicycle. To keep your b...
Name: postTitle, dtype: object

# "Develop" a Model (in practice, just call OpenAI's API)

In [8]:
# Categories the classifier is allowed to choose from
candidate_labels = ["Work", "Family", "Other"]

# Build the zero-shot classifier and fit it. No training texts are supplied
# (X is None); fit() is given only the candidate label set, wrapped in a list.
# NOTE(review): max_labels=2 presumably caps the labels returned per sample —
# confirm against the scikit-llm documentation.
clf = MultiLabelZeroShotGPTClassifier(max_labels=2)
clf.fit(None, [candidate_labels])

In [18]:
# Predict the labels for every aphorism in X.
# NOTE(review): the recorded progress bar shows roughly 6 s per item, so this
# presumably makes one remote API call per row — expect it to be slow and
# billable; confirm before re-running.
labels = clf.predict(X)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [01:13<00:00,  6.14s/it]


In [19]:
# Attach the model's predictions as a new column, then persist the labelled
# table (without the index) for later inspection
data['ChatGPTLabel'] = labels
classified_csv = 'data/classified_tips.csv'
data.to_csv(classified_csv, index=False)

# Choose a Measure of Success and an Evaluation Protocol, Then Evaluate

In [20]:
# Normalize both label columns to a single plain string per row so they can be
# compared directly. This cell is written to be safe to re-run: values that
# are already clean are left untouched.

def _first_label(value):
    """Return the first predicted label, or None when there is no label.

    Accepts a sequence of labels (as stored from the classifier output),
    an already-flattened string from a previous run of this cell, or None.
    """
    if value is None:
        return None
    if isinstance(value, str):
        # Already flattened by an earlier run; an empty string counts as "no label".
        return value or None
    return value[0] if len(value) else None


# HumanLabel is stored as text such as "['Work']"; pull out the text between
# "['" and "']". expand=False yields a Series rather than a one-column
# DataFrame. Rows that do not match the pattern (e.g. already cleaned on a
# previous run) keep their existing value instead of being turned into NaN.
human_extracted = data['HumanLabel'].str.extract(r"\['(.*?)'\]", expand=False)
data['HumanLabel'] = human_extracted.fillna(data['HumanLabel'])

# Only the first predicted label is kept for the comparison; any additional
# labels predicted for a row are ignored here.
data['ChatGPTLabel'] = data['ChatGPTLabel'].apply(_first_label)

In [21]:
# Side-by-side view of each aphorism with the model's label and the human label
comparison_columns = ["postTitle", "ChatGPTLabel", "HumanLabel"]
data[comparison_columns]

Unnamed: 0,postTitle,ChatGPTLabel,HumanLabel
0,"""Hard work beats talent when talent doesn't wo...",Work,Work
1,"""Choose a job you love, and you will never hav...",Work,Other
2,"""Success is not final, failure is not fatal: I...",Other,Other
3,"""Opportunities don't happen. You create them.""...",Work,Family
4,"""Family is not an important thing, it's everyt...",Family,Work
5,"""The love of a family is life's greatest bless...",Family,Other
6,"""Family is where life begins and love never en...",Family,Other
7,"""A happy family is but an earlier heaven."" - G...",Family,Family
8,"""Life is what happens when you're busy making ...",Other,Work
9,"""Life is really simple, but we insist on makin...",Other,Other


In [22]:
# Fraction of rows where the model's label equals the human label
human_labels = data[["HumanLabel"]]
model_labels = data[["ChatGPTLabel"]]
accuracy = accuracy_score(human_labels, model_labels)
print(accuracy)

0.4166666666666667


# Skipped Steps
* Beat a baseline
* Overfit, regularize and tune
* Communicate with stakeholders
* Ship an inference model
* Monitor and maintain