In [None]:

import os

import openai
import wandb
import pandas as pd

# Prepare Data

In [None]:
# Load the training and validation splits (train first, then val) as DataFrames.
train_df, val_df = map(pd.read_csv, ('./data/train.csv', './data/val.csv'))


# We can create the prompt in two ways: instruction-based or raw text.
# An instruction prompt wraps the message in a human-readable textual instruction.
# A raw prompt is the raw text followed by a special separator between the prompt and the completion.

def create_instruction_prompt(text, all_labels):
    """Build a natural-language classification prompt for a single message.

    Args:
        text: The message to classify.
        all_labels: Iterable of category-name strings; they are joined with
            commas in iteration order.

    Returns:
        The prompt string. It ends with ``Category:`` so that the label can
        follow as the completion.
    """
    category_list = ','.join(all_labels)
    sections = (
        f' Classify the following messages into one of the following categories: {category_list}',
        f'Message: {text}',
        'Category:',
    )
    # Consecutive sections are separated by exactly one blank line.
    return '\n\n'.join(sections)

def create_raw_prompt(text, separator='\n\n###\n\n'):
    """Build a raw-text prompt: the message followed by an end-of-prompt separator.

    Args:
        text: The message to classify.
        separator: Marker appended after the text to show where the prompt ends
            and the completion begins. Defaults to two newlines, ``###``, two
            newlines. Use the same value when building prompts for inference.

    Returns:
        ``text``, a single space, then ``separator``.
    """
    # Newlines must be written as backslash escapes; spelling them '/n' would
    # insert literal slash characters into the prompt instead of line breaks.
    return f'{text} {separator}'

# For a classification task each completion should be a single token,
# and that token must be in the model vocabulary.
# GPT tokenization expects completion tokens to start with a whitespace character,
# so every label is prefixed with one space.

train_df['completion'] = train_df['label'].apply(lambda label: ' ' + label)
val_df['completion'] = val_df['label'].apply(lambda label: ' ' + label)

# Instruction-based prompt.
# The category list is taken from the raw `label` column (not `completion`, whose values
# carry the leading space) and is sorted: iterating a set of strings can yield a different
# order in each Python session, which would change the prompt text between runs.
all_labels = sorted(train_df['label'].unique())
train_df['prompt'] = train_df['text'].apply(lambda x: create_instruction_prompt(x, all_labels))
val_df['prompt'] = val_df['text'].apply(lambda x: create_instruction_prompt(x, all_labels))

# # Raw-text prompt (alternative to the instruction prompt; replaces the `prompt` column)
# train_df['prompt'] = train_df['text'].apply(create_raw_prompt)
# val_df['prompt'] = val_df['text'].apply(create_raw_prompt)

# Write one {"prompt": ..., "completion": ...} JSON object per line (JSON Lines).
train_df[['prompt', 'completion']].to_json("./data/train_hatespeech.jsonl", orient='records', lines=True)
val_df[['prompt', 'completion']].to_json("./data/val_hatespeech.jsonl", orient='records', lines=True)


# Finetune

In [None]:
# Run in terminal
# Use OpenAI API key https://platform.openai.com/account/api-keys

# openai -k YOUR_API_KEY api fine_tunes.create -t ./data/train_hatespeech.jsonl -v ./data/val_hatespeech.jsonl -m ada --compute_classification_metrics --classification_n_classes 3 --n_epochs 3
# openai -k YOUR_API_KEY api fine_tunes.create -t ./data/train_hatespeech.jsonl -v ./data/val_hatespeech.jsonl -m davinci --compute_classification_metrics --classification_n_classes 3 --n_epochs 3
# openai wandb sync

# Finetune report