# Setup

## Install additional dependencies (do this in the Dockerfile to make it permanent)

In [None]:
import os
from pprint import pprint

import pandas as pd
from docx import Document
from sklearn.metrics import accuracy_score
from skllm import MultiLabelZeroShotGPTClassifier
from skllm import ZeroShotGPTClassifier
from skllm.config import SKLLMConfig

In [None]:
# Read the OpenAI API key from the environment (see INSTALL.md for how to set it).
# NEVER hard-code the key in this notebook: if the repository is public, the key
# will be harvested and someone else's usage will be billed to your account.
if not os.environ.get('OPENAI_API_KEY'):
    # os.environ.get() returns None when the variable is unset; stop here with
    # an actionable message rather than handing None to the library.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; see INSTALL.md for how to configure it."
    )
SKLLMConfig.set_openai_key(os.environ['OPENAI_API_KEY'])

# Collect a Dataset

In [None]:
# Folder that holds the Word documents to classify.
folder_path = '../data/word_docs'

# One string per document: the full text of that document.
X = []

# Visit the files in sorted order: os.listdir() returns entries in arbitrary
# order, and a stable order keeps row i of the dataset tied to the same
# document on every run.
for filename in sorted(os.listdir(folder_path)):
    # Only .docx files are read (extension matched case-insensitively).
    # Word's "~$name.docx" lock files, which exist while a document is open,
    # also end in .docx but are not real documents, so skip them.
    if not filename.lower().endswith('.docx') or filename.startswith('~$'):
        continue

    doc = Document(os.path.join(folder_path, filename))

    # Join the paragraphs with a newline so the last word of one paragraph
    # does not fuse with the first word of the next.
    X.append('\n'.join(para.text for para in doc.paragraphs))

# Single-column DataFrame; each row is one document's text.
data = pd.DataFrame(X, columns=['WritingSample'])



# Understand Your Data

In [None]:
# Preview the first rows of the dataset (pandas shows 5 by default) to check
# that the documents were loaded.
data.head()

In [None]:
# Summary statistics for the dataset. For a text column such as WritingSample,
# pandas reports count / unique / top / freq rather than numeric statistics.
data.describe()

In [None]:
# Rebind X from the raw Python list built while loading the documents to the
# "WritingSample" column of the DataFrame (a pandas Series).
X = data["WritingSample"]

In [None]:
# Display the samples that will be passed to the classifier.
X

# "Develop" a Model aka just use OpenAI's API

In [None]:
# Candidate labels the model may choose from (presumably letter grades for
# each writing sample).
candidate_labels = [
    "A",
    "B",
    "C",
    "D",
    "F",
]

# Each sample should receive exactly one label, so use the single-label
# zero-shot classifier. The multi-label variant yields a list of labels per
# sample, which cannot be compared one-to-one against a single human label
# when accuracy is computed later in this notebook.
clf = ZeroShotGPTClassifier()

# Zero-shot: there is no training data, so fit() receives None for X and only
# the list of candidate labels.
clf.fit(None, candidate_labels)

In [None]:
# Predict a label for every writing sample in X.
# NOTE(review): presumably every prediction is a request to the OpenAI API
# (the key is configured near the top of the notebook), so this cell may be
# slow and billable — confirm before re-running it on a large folder.
labels = clf.predict(X)

In [None]:
# Store the model's predictions in a new column next to the text they belong to.
data['ChatGPTLabel'] = labels

# Write the labelled dataset to disk; index=False keeps the row index out of
# the CSV file.
data.to_csv(
    path_or_buf='../data/classified_tips.csv',
    index=False,
)

# Choose a measure of success, Choose an evaluation protocol / evaluate

In [None]:
# Show each writing sample side by side with the label the model assigned.
data[["WritingSample","ChatGPTLabel"]]

In [None]:
# Accuracy = fraction of samples whose model label equals the human label.
# The visible part of this notebook never creates the human labels, so check
# for the column up front and explain what is missing instead of failing with
# pandas' generic "not in index" KeyError.
if "HumanLabel" not in data.columns:
    raise KeyError(
        "data has no 'HumanLabel' column; add the human-assigned label for "
        "each writing sample before computing accuracy."
    )

# Pass the columns as 1-D Series rather than single-column DataFrames:
# accuracy_score compares the two label sequences element by element.
accuracy = accuracy_score(data["HumanLabel"], data["ChatGPTLabel"])
print(accuracy)

# Skipped Steps
* Beat a baseline
* Overfit, regularize and tune
* Communicate with stakeholders
* Ship an inference model
* Monitor and maintain