In [1]:

import os

import openai
import pandas as pd

# Prefer the OPENAI_API_KEY environment variable so a real secret never has to be
# pasted into the notebook; the placeholder is kept only as a fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")

# Prepare Data

In [2]:
# Load the training and validation splits from the local data directory.
train_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/val.csv')
)


# We can create the prompt in 2 ways: as an instruction or as raw text.
# For an instruction prompt we use a human-readable textual instruction.
# For a raw prompt we use the raw text with a special separator between prompt and completion.

def create_instruction_prompt(text, all_labels):
    """Build the instruction-style classification prompt for one message.

    Args:
        text: The message to classify.
        all_labels: Iterable of candidate category names. Surrounding
            whitespace is removed and the names are sorted, so the prompt is
            identical no matter what container (list, set, array) or order
            the caller supplies.

    Returns:
        The prompt string; it lists the categories, then the message, and
        ends with ``Category:`` so the model's completion is the label.
    """
    # Sorting matters when a set is passed: set iteration order for strings is
    # not stable between interpreter runs, which would silently change the prompt.
    categories = ', '.join(sorted(str(label).strip() for label in all_labels))
    prompt = f''' Classify the following messages into one of the following categories: {categories}

Message: {text}

Category:'''
    return prompt

def create_raw_prompt(text):
    """Build the raw-text prompt: the message followed by the prompt/completion separator.

    Args:
        text: The message to classify.

    Returns:
        ``text`` with the separator (blank line, ``###``, blank line) appended.
    """
    # The separator must use real newline escapes ("\n"); the earlier "/n" form
    # emitted literal slash characters. The stray space before it is dropped too.
    prompt = f'''{text}\n\n###\n\n'''
    return prompt

# For a classification task we need a 1-token completion. The completion token must be in the model vocabulary.
# GPT tokenization requires completion tokens to start with a whitespace.

train_df['completion'] = train_df['label'].apply(lambda x: ' '+ x)
val_df['completion'] = val_df['label'].apply(lambda x: ' ' + x)

# instruction based prompt
# The category list is built from the plain labels (not the whitespace-prefixed
# completions) and sorted: a set of strings has no stable iteration order between
# kernel runs, so the prompt text would otherwise differ from run to run.
all_labels = sorted(train_df['label'].unique())
train_df['prompt'] = train_df['text'].apply(lambda x: create_instruction_prompt(x, all_labels))
val_df['prompt'] = val_df['text'].apply(lambda x: create_instruction_prompt(x, all_labels))

# # raw text based prompt (alternative; fills 'prompt', leaves 'completion' untouched)
# train_df['prompt'] = train_df['text'].apply(create_raw_prompt)
# val_df['prompt'] = val_df['text'].apply(create_raw_prompt)

# One JSON object per line with the two fields: prompt and completion.
train_df[['prompt', 'completion']].to_json("./data/train_hatespeech.jsonl", orient='records', lines=True)
val_df[['prompt', 'completion']].to_json("./data/val_hatespeech.jsonl", orient='records', lines=True)


# Finetune

In [None]:
# Run in terminal
# Use OpenAI API key https://platform.openai.com/account/api-keys

# openai -k YOUR_API_KEY api fine_tunes.create -t ./data/train_hatespeech.jsonl -v ./data/val_hatespeech.jsonl -m ada --compute_classification_metrics --classification_n_classes 3 --n_epochs 4
# openai -k YOUR_API_KEY api fine_tunes.create -t ./data/train_hatespeech.jsonl -v ./data/val_hatespeech.jsonl -m davinci --compute_classification_metrics --classification_n_classes 3 --n_epochs 4
# openai wandb sync

# Finetune report

https://api.wandb.ai/links/vetka925/tgmn8w11

# Validation of zero-shot ChatGPT (gpt-3.5-turbo)

In [10]:
%pip install --upgrade openai

Collecting openai
  Downloading openai-0.27.0-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.1/70.1 KB[0m [31m238.3 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 0.26.5
    Uninstalling openai-0.26.5:
      Successfully uninstalled openai-0.26.5
Successfully installed openai-0.27.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Reload the validation split from disk.
val_df = pd.read_csv('./data/val.csv')


def create_instruction_prompt(text, all_labels):
    """Build the instruction-style classification prompt for one message.

    Args:
        text: The message to classify.
        all_labels: Iterable of category name strings. They are sorted before
            being listed, so the prompt does not depend on the order (or
            container type) the caller supplies.

    Returns:
        The prompt string; it lists the categories, then the message, and
        ends with ``Category:``.
    """
    # A set of strings iterates in a different order on different interpreter
    # runs; sorting keeps the prompt (and the evaluation) reproducible.
    categories = ', '.join(sorted(all_labels))
    prompt = f''' Classify the following messages into one of the following categories: {categories}

Message: {text}

Category:'''
    return prompt

# Use a sorted list rather than a set: a set of strings iterates in a different
# order on each kernel start, which would change the prompt between runs.
all_labels = sorted(val_df['label'].unique())

val_df['prompt'] = val_df['text'].apply(lambda x: create_instruction_prompt(x, all_labels))



In [8]:
from tqdm.notebook import tqdm
import time

MODEL = "gpt-3.5-turbo"

# Ask the chat model once per validation prompt and keep the lower-cased reply text.
result = []
for prompt in tqdm(val_df['prompt']):
    time.sleep(0.1)  # short pause between consecutive API requests
    chat_messages = [
        {"role": "system", "content": "You are a hate speech, offensive language classifier."},
        {"role": "user", "content": prompt},
    ]
    response = openai.ChatCompletion.create(model=MODEL, messages=chat_messages, temperature=0.7)
    reply_text = response['choices'][0]['message']['content']
    result.append(reply_text.lower())


  0%|          | 0/200 [00:00<?, ?it/s]

In [10]:
# Peek at the first ten lower-cased model replies.
print(result[:10])

['hate', 'offensive', 'hate.', 'hate', 'neutral', 'neutral.', 'offensive', 'hate.', 'hate', 'offensive/hate.']


In [12]:
def gen_accuracy(true_labels, gens):
    """Return the share of generations that begin with their true label.

    A generation counts as correct when, after stripping surrounding
    whitespace, its first ``len(label)`` characters equal the label, ignoring
    case. Anything after that prefix (e.g. a trailing period) is ignored.

    Args:
        true_labels: Sequence of expected label strings.
        gens: Sequence of generated strings, aligned by position with
            ``true_labels``.

    Returns:
        The accuracy rounded to 3 decimals, or 0.0 when ``true_labels`` is empty.
    """
    total = len(true_labels)
    if total == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0
    correct = sum(
        true_labels[i].lower() == gens[i].strip()[:len(true_labels[i])].lower()
        for i in range(total)
    )
    return round(correct / total, 3)
        
    
# Score the model replies against the validation labels.
print(f"HATESPEECH DETECTION ACCURACY: {gen_accuracy(list(val_df['label']), result)}")

HATESPEECH DETECTION ACCURACY: 0.71


# Validate news categories

In [8]:
from datasets import load_dataset
import pandas as pd
import openai
from tqdm import tqdm
import time

news_dataset = load_dataset('heegyu/news-category-balanced-top10')

# Raw dataset category -> short label used in the prompt; rows whose category
# is not a key of this mapping are dropped below.
to_replace = {
    'BUSINESS': 'Business',
    'ENTERTAINMENT': 'Entertainment',
    'FOOD & DRINK': 'Food',
    'PARENTING': 'Parenting',
    'POLITICS': 'Politics',
    'STYLE & BEAUTY': 'Style',
    'TRAVEL': 'Travel',
}

# Keep only the mapped categories, draw a fixed 100-row sample, then rename the categories.
all_news = pd.DataFrame(news_dataset['train'])[['category', 'short_description']]
keep_mask = all_news['category'].isin(to_replace)
news_data = all_news[keep_mask].sample(100, random_state=22)
news_data['category'] = news_data['category'].replace(to_replace)

news_categories = news_data['category'].unique()

def create_instruction_prompt(text, all_labels):
    """Build the instruction-style classification prompt for one message.

    Args:
        text: The message to classify.
        all_labels: Iterable of category name strings, listed in the given order.

    Returns:
        The prompt string: the category list, a blank line, the message,
        a blank line, and a final ``Category:`` line.
    """
    label_list = ', '.join(all_labels)
    prompt_lines = [
        f' Classify the following messages into one of the following categories: {label_list}',
        '',
        f'Message: {text}',
        '',
        'Category:',
    ]
    return '\n'.join(prompt_lines)


# Only the first 150 characters of each description go into the prompt.
news_data['prompt'] = [
    create_instruction_prompt(description[:150], news_categories)
    for description in news_data['short_description']
]

news_data.head()

Found cached dataset json (/home/vetka/.cache/huggingface/datasets/heegyu___json/heegyu--news-category-balanced-top10-5f881f7cd497c7a8/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,short_description,prompt
47713,Politics,It's undetermined whether the FBI or the Justi...,Classify the following messages into one of t...
43809,Politics,The West Virginia senator's unwillingness to b...,Classify the following messages into one of t...
19830,Food,From fancy Spam crisps to fatty Spam sandwiche...,Classify the following messages into one of t...
38873,Politics,"Students walked out in protest, and say they'l...",Classify the following messages into one of t...
56862,Style,The University of Alabama is praised for its p...,Classify the following messages into one of t...


In [9]:
import os

MODEL = "gpt-3.5-turbo"
# Never store a real key in the notebook: read it from the environment instead.
# Any key that was ever saved in this file must be treated as leaked and revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Ask the chat model once per news prompt and keep the lower-cased reply text.
result = []
for prompt in tqdm(news_data['prompt']):
    time.sleep(0.1)  # short pause between consecutive API requests
    response = openai.ChatCompletion.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a news classifier."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.7,
    )
    result.append(response['choices'][0]['message']['content'].lower())

100%|██████████| 100/100 [01:38<00:00,  1.02it/s]


In [17]:
def gen_accuracy(true_labels, gens):
    """Return the share of generations that begin with their true label.

    A generation counts as correct when, after stripping surrounding
    whitespace, its first ``len(label)`` characters equal the label, ignoring
    case. Anything after that prefix (e.g. a trailing period) is ignored.

    Args:
        true_labels: Sequence of expected label strings.
        gens: Sequence of generated strings, aligned by position with
            ``true_labels``.

    Returns:
        The accuracy rounded to 3 decimals, or 0.0 when ``true_labels`` is empty.
    """
    total = len(true_labels)
    if total == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0
    correct = sum(
        true_labels[i].lower() == gens[i].strip()[:len(true_labels[i])].lower()
        for i in range(total)
    )
    return round(correct / total, 3)
        
    
# Score the model replies against the sampled news categories.
print(f"NEWS CATEGORIZING ACCURACY: {gen_accuracy(list(news_data['category']), result)}")

NEWS CATEGORIZING ACCURACY: 0.71
