In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

In [10]:
def get_result_table():
    """Return an empty DataFrame whose columns are the metrics reported per model.

    Rows are added later, one per evaluated model; the column order here is the
    order in which ``get_classification_report`` lists its values.
    """
    metric_columns = [
        'Model',
        'Accuracy',
        'precision',
        'recall',
        'f1-score',
        'hate f1',
        "non-hate f1",
        'hate support',
        'non-hate support',
    ]
    return pd.DataFrame(columns=metric_columns)

def get_classification_report(i, cr):
    """Flatten a classification-report dict into one results-table row.

    Args:
        i: Value for the first cell of the row (callers pass the model name).
        cr: Report dict with an ``'accuracy'`` entry plus ``'macro avg'``,
            ``'Hate'`` and ``'Non-Hate'`` sub-dicts.

    Returns:
        A list: ``i``, accuracy, macro precision / recall / f1-score,
        Hate f1, Non-Hate f1, Hate support, Non-Hate support.
    """
    class_labels = ('Hate', 'Non-Hate')

    row = [i, cr['accuracy']]
    macro_avg = cr['macro avg']
    row.extend(macro_avg[metric] for metric in ('precision', 'recall', 'f1-score'))
    row.extend(cr[label]['f1-score'] for label in class_labels)
    row.extend(cr[label]['support'] for label in class_labels)
    return row

def get_result_single(y_test, y_test_pred, model_name, result_table):
    """Score one model and append its metrics row to ``result_table`` in place.

    Args:
        y_test: True labels, expressed as "Hate" / "Non-Hate".
        y_test_pred: Predicted labels in the same form.
        model_name: Value stored in the row's first column.
        result_table: DataFrame to extend; the new row is written at index
            ``len(result_table)``.

    Returns:
        None. The table is modified in place.
    """
    report = classification_report(
        y_test,
        y_test_pred,
        labels=["Hate", "Non-Hate"],
        output_dict=True,
    )
    new_row = get_classification_report(model_name, report)
    result_table.loc[len(result_table)] = new_row


# GPT

- OpenAI fine-tunes API reference (for the `openai.File` / `openai.FineTune` calls used below): https://platform.openai.com/docs/api-reference/fine-tunes

In [1]:
import openai
import os
import time

## Creating

In [None]:
# dataset for gpt
# Pair each training text with its label as prompt/completion records and write
# them as JSON Lines (one record per line), then print the number of records.
# NOTE(review): x_train, y_train_binary, x_test, y_test_binary and dataset_name are
# not defined in any cell above this one — they must already exist in the kernel.
df_gpt = pd.DataFrame(zip(x_train,y_train_binary), columns = ['prompt', 'completion'])
df_gpt.to_json(f"Dataset/{dataset_name}/gpt_data_train.jsonl", orient='records', lines=True)
print(len(df_gpt))

# Same export for the test split. df_gpt is re-bound here, so after this cell it
# holds the test pairs, not the training pairs.
df_gpt = pd.DataFrame(zip(x_test,y_test_binary), columns = ['prompt', 'completion'])
df_gpt.to_json(f"Dataset/{dataset_name}/gpt_data_test.jsonl", orient='records', lines=True)
print(len(df_gpt))

In [None]:
# Print the OpenAI CLI commands that prepare each split for fine-tuning.
# Nothing is executed here: copy the printed commands into a terminal.
# NOTE(review): the upload cell reads *_prepared.jsonl files, presumably the
# output of these commands — confirm.
print(f"openai tools fine_tunes.prepare_data -f Dataset/{dataset_name}/gpt_data_train.jsonl")
print(f"openai tools fine_tunes.prepare_data -f Dataset/{dataset_name}/gpt_data_test.jsonl")

In [None]:
# Upload the prepared train and test files to OpenAI for fine-tuning.
# (The fine-tune job itself is created in the next cell, not here.)
# Each file is opened in a `with` block so the handle is closed as soon as the
# upload call returns, instead of staying open for the rest of the kernel session.
with open(f"Dataset/{dataset_name}/gpt_data_train_prepared.jsonl", "rb") as train_file:
    train_create_output = openai.File.create(file=train_file, purpose='fine-tune')
file_train_id = train_create_output.get('id')
print(file_train_id, train_create_output.get('status'))

with open(f"Dataset/{dataset_name}/gpt_data_test_prepared.jsonl", "rb") as test_file:
    test_create_output = openai.File.create(file=test_file, purpose='fine-tune')
file_test_id = test_create_output.get('id')
print(file_test_id, test_create_output.get('status'))


In [None]:
# create the fine tune job
# Start a fine-tune of the "ada" base model on the uploaded train file, with the
# uploaded test file as validation data and classification metrics requested.
# NOTE(review): the positive class is " 0", while get_y_result in this notebook maps
# the value 1 to "Hate" — so the API-side classification metrics presumably treat
# non-hate as the positive class. Confirm this is intended.
fine_tune_create_output = openai.FineTune.create(training_file =file_train_id,
                       validation_file=file_test_id,
                       model = "ada",
                       compute_classification_metrics = True,
                       classification_positive_class = " 0"
                       )
# Keep the job id so the status cells can look the job up.
fine_tune_id = fine_tune_create_output.get('id')
print(fine_tune_id)

## Checking

In [None]:
# fine tune list
# One printed line per fine-tune job: id, status and fine-tuned model name.
all_finetune = openai.FineTune.list()
all_finetune_data = all_finetune.get('data')
for job in all_finetune_data:
    print(job.get('id'), job.get('status'), job.get('fine_tuned_model'))

In [None]:
# model list
# Print every model whose owner is not one of the OpenAI-owned names listed below.
all_models = openai.Model.list()
all_models_data = all_models.get('data')
owned_by_list = ['openai','openai-dev', 'openai-internal']
for model in all_models_data:
    if model.get('owned_by') in owned_by_list:
        continue
    print(model)

In [None]:
# Hard-coded id of the fine-tune job to inspect in the following cells.
# This overwrites any fine_tune_id set by the job-creation cell.
fine_tune_id = "ft-j51edHpwX7ZfLBe3GRrXKnDT"

In [None]:
# Fetch the record for the fine-tune job and show it as the cell output.
retrieve_output = openai.FineTune.retrieve(id=fine_tune_id)
retrieve_output

In [None]:
# If the job has succeeded, store the fine-tuned model name in model_id and print it;
# otherwise only print the current status (model_id is not assigned in that case).
if retrieve_output.get("status") == "succeeded":
    model_id = retrieve_output.get('fine_tuned_model')
    print("succeeded", model_id)
else:
    print(retrieve_output.get("status"))

In [None]:
# List the events recorded for the fine-tune job (shown as the cell output).
openai.FineTune.list_events(id=fine_tune_id)

## GPT Model result

In [None]:
# Print the OpenAI CLI command that fetches the fine-tune results; nothing is
# executed here — run the printed command in a terminal.
# The shell redirection in the command saves the output to result.csv.
print(f"openai api fine_tunes.results -i {fine_tune_id} > result.csv")

In [None]:
# result view
# Load result.csv and show the last row that has a classification accuracy value.
results = pd.read_csv('result.csv')
results[results['classification/accuracy'].notnull()].tail(1)

## Predict

In [None]:
# Dataset folder and fine-tuned model used by the prediction cells.
# NOTE(review): model_id is hard-coded here and replaces any value taken from
# retrieve_output earlier in the notebook.
dataset_name = "Implicit_hate_corpus"
model_id = "ada:ft-personal-2023-08-04-20-03-44"

In [None]:
# Module-level placeholder. gpt_predict assigns a local variable of the same name,
# so this global is not updated by it.
predict_result = None

def gpt_complete_create(prompt_text):
    """Request a completion from the model named by the global ``model_id``.

    The request asks for a single token at temperature 0. ``prompt_text`` is
    passed through unchanged as the ``prompt`` argument (``gpt_predict`` passes
    a list of prompts).

    Returns:
        The response object returned by ``openai.Completion.create``.
    """
    return openai.Completion.create(
        model=model_id,
        prompt=prompt_text,
        max_tokens=1,
        temperature=0,
    )

def gpt_predict(start = 0, step = 10, max_s=10):
    """Classify one batch of prompts from the global ``df_gpt`` and store the predictions.

    Rows labelled ``start`` .. ``min(start + step, max_s) - 1`` are sent to the
    model in a single request. Each returned completion is parsed with ``int``
    and written to the ``predicted`` column of the matching row.

    Args:
        start: Index label of the first row in the batch.
        step: Maximum number of rows in the batch.
        max_s: Exclusive upper bound on row labels (callers pass ``len(df_gpt)``).

    Returns:
        ``(prompts_tosend, predict_result)``: the prompts that were sent and the
        raw API response. For an empty batch (``start >= max_s`` or ``step <= 0``)
        no request is made and ``([], None)`` is returned.

    Raises:
        ValueError: if a completion's text cannot be parsed by ``int``.
    """
    # Clamp the batch to max_s. The previous `i == max_s` check only stopped batches
    # that began below max_s; a start beyond it ran on into a KeyError, and a start
    # equal to it sent an empty prompt list to the API.
    stop = min(start + step, max_s)
    prompts_tosend = [df_gpt.loc[i, 'prompt'] for i in range(start, stop)]

    if not prompts_tosend:
        return prompts_tosend, None

    predict_result = gpt_complete_create(prompts_tosend)

    for choice in predict_result.get('choices'):
        # 'index' is used as the prompt's position within this request, so the
        # DataFrame row is start + index.
        df_gpt.loc[start + choice['index'], 'predicted'] = int(choice['text'])

    print(f"predicted {start} to {start+len(prompts_tosend)-1}")

    return prompts_tosend, predict_result

def loop_gpt(start, end, step, max_s, delay=50):
    """Run ``gpt_predict`` over rows ``start`` .. ``end`` in batches of ``step``.

    Args:
        start: Index label of the first row to predict.
        end: Exclusive end of the range of batch start positions.
        step: Batch size, and the stride between batch starts.
        max_s: Upper bound forwarded to ``gpt_predict`` for every batch.
        delay: Seconds to sleep after each batch. Defaults to 50, the pause that
            used to be hard-coded (presumably to stay under the API rate limit).

    Returns:
        The ``(prompts, response)`` pair of the last batch, or ``([], None)`` when
        the range is empty. Previously an empty range raised UnboundLocalError.
    """
    bb, cc = [], None
    for i in range(start, end, step):
        bb, cc = gpt_predict(i, step, max_s)
        time.sleep(delay)
    return bb, cc

In [None]:
# Load the prepared test split and add a `predicted` column filled with -1,
# the value a row keeps until gpt_predict writes a prediction into it.
filepath = f"Dataset/{dataset_name}/gpt_data_test_prepared.jsonl"
df_gpt = pd.read_json(filepath, lines=True)
df_gpt['predicted'] = -1
df_gpt.head()

In [None]:
# Number of rows in df_gpt; passed to loop_gpt as both the end of the range and
# the per-batch upper bound.
max_s = len(df_gpt)
max_s

In [None]:
# Predict rows 600 .. max_s-1 in batches of up to 500 prompts.
# NOTE(review): starting at 600 presumably resumes an earlier, interrupted run — confirm.
# Any exception is printed instead of raised, so a failed batch ends the loop without
# a traceback; p and r are only assigned when loop_gpt returns normally.
try:
    p, r = loop_gpt(600, max_s, 500, max_s)
except Exception as e:
    print(e)

## View result for prediction

In [18]:
def load_gpt_result(dataset_name):
    """Load the saved GPT test predictions of one dataset.

    Reads ``Dataset/<dataset_name>/gpt_data_test_result1.jsonl`` as JSON Lines
    and returns it as a DataFrame.
    """
    result_path = f"Dataset/{dataset_name}/gpt_data_test_result1.jsonl"
    return pd.read_json(result_path, orient='records', lines=True)

def get_y_result(df_gpt):
    """Turn the numeric label columns of ``df_gpt`` into string class labels.

    Reads the ``completion`` and ``predicted`` columns and maps the value 1 to
    "Hate" and every other value to "Non-Hate".

    NOTE(review): because only 1 maps to "Hate", a row whose ``predicted`` is
    still the -1 placeholder is reported as "Non-Hate" — confirm that result
    files contain no unpredicted rows.

    Returns:
        ``(y_test, y_test_pred)`` as NumPy arrays of label strings.
    """
    def _as_labels(column):
        values = df_gpt[column].to_numpy()
        return np.where(values == 1, "Hate", "Non-Hate")

    return _as_labels('completion'), _as_labels('predicted')

In [None]:
# Load and display the saved predictions of the "Balanced" dataset.
# NOTE(review): this is the same read that load_gpt_result(dataset_name) performs.
dataset_name = "Balanced"
df_gpt = pd.read_json(f"Dataset/{dataset_name}/gpt_data_test_result1.jsonl", orient='records', lines=True)
df_gpt

In [12]:
# NOTE(review): get_y_result is also defined, with the same logic, in an earlier
# cell of this notebook; one of the two definitions can be dropped.
def get_y_result(df_gpt):
    """Turn the numeric label columns of ``df_gpt`` into string class labels.

    Reads the ``completion`` and ``predicted`` columns and maps the value 1 to
    "Hate" and every other value (including a -1 placeholder) to "Non-Hate".

    Returns:
        ``(y_test, y_test_pred)`` as NumPy arrays of label strings.
    """
    def _as_labels(column):
        values = df_gpt[column].to_numpy()
        return np.where(values == 1, "Hate", "Non-Hate")

    return _as_labels('completion'), _as_labels('predicted')

In [33]:
# Start from an empty results table; get_result_single appends one row per call.
df_result = get_result_table()

In [34]:
# Score the saved GPT predictions of each dataset and append one row per dataset
# to df_result. Re-running this cell appends the same four rows again.
# NOTE(review): the "SE2019" label has no "_GPT" suffix, unlike the other three;
# it is kept exactly as it was.
for result_dataset, model_label in (
    ("Balanced", "Balanced_GPT"),
    ("GabHateCorpus", "GabHateCorpus_GPT"),
    ("Implicit_hate_corpus", "Implicit_hate_corpus_GPT"),
    ("SE2019", "SE2019"),
):
    df_gpt = load_gpt_result(result_dataset)
    y_test, y_test_pred = get_y_result(df_gpt)
    get_result_single(y_test, y_test_pred, model_label, df_result)

In [35]:
# Display the metrics table (one row per scored dataset).
df_result

Unnamed: 0,Model,Accuracy,precision,recall,f1-score,hate f1,non-hate f1,hate support,non-hate support
0,Balanced_GPT,0.768033,0.768028,0.768025,0.768026,0.769253,0.7668,3420.0,3387.0
1,GabHateCorpus_GPT,0.894815,0.74845,0.716789,0.730999,0.521079,0.940919,639.0,4761.0
2,Implicit_hate_corpus_GPT,0.806331,0.79751,0.784396,0.789644,0.730395,0.848892,1622.0,2674.0
3,SE2019,0.814333,0.809284,0.809436,0.809359,0.778565,0.840153,1128.0,1565.0
