In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Load the raw SemEval 2016 CSV into `data`; the file is decoded with Python's
# 'unicode_escape' codec.
# NOTE(review): 'unicode_escape' also interprets backslash escape sequences that
# appear in the text itself — presumably chosen to get past bytes that are not
# valid UTF-8; confirm this is intended for tweet text.
data = pd.read_csv("data/original/semeval_2016.csv", encoding='unicode_escape')

## Preparing training data for SemEval

In [2]:
# Seed shared by the train/test split and every shuffle below, so the CSV files
# written at the end of this cell are identical on each Restart & Run All.
RANDOM_SEED = 42

# Tweet text with the "#SemST" marker removed and surrounding whitespace trimmed.
data['prompt'] = data['Tweet'].str.replace("#SemST", "", regex=False).str.strip()

# Strip the target BEFORE taking its last word: splitting an unstripped value
# such as "Donald Trump " on spaces would yield an empty last token.
target_clean = data['Target'].str.strip()
data['completion'] = target_clean.str.split().str[-1] + ", " + data['Stance'].str.title()

# Keep only the columns used downstream and store the stripped target, so later
# exact-equality filters on "Target" match the same rows selected here.
labelled = data.assign(Target=target_clean)[['Target', 'prompt', 'completion']]
trump = labelled.loc[labelled['Target'] == "Donald Trump"]
clinton = labelled.loc[labelled['Target'] == "Hillary Clinton"]

# Train test split
trump_train, trump_test = train_test_split(trump, test_size=0.2, random_state=RANDOM_SEED)
clinton_train, clinton_test = train_test_split(clinton, test_size=0.2, random_state=RANDOM_SEED)

# Nested training subsets: the first 10 rows of train_all are 5 + 5 examples per
# target, the first 100 rows are 50 + 50, and the remainder follows. Each tier
# is shuffled on its own so the tiers stay contiguous inside train_all.
train_10 = pd.concat([trump_train[:5], clinton_train[:5]], axis=0).sample(frac=1, replace=False, random_state=RANDOM_SEED)
train_90 = pd.concat([trump_train[5:50], clinton_train[5:50]], axis=0).sample(frac=1, replace=False, random_state=RANDOM_SEED)
train_100 = pd.concat([train_10, train_90], axis=0)
train_rest = pd.concat([trump_train[50:], clinton_train[50:]],axis=0).sample(frac=1, replace=False, random_state=RANDOM_SEED)
train_all = pd.concat([train_100, train_rest], axis=0).reset_index(drop=True)

test = pd.concat([clinton_test, trump_test], axis=0).reset_index(drop=True)

# Persist only the text columns; "Target" stays in memory for later filtering.
train_all.loc[:,["prompt", "completion"]].to_csv("data/semeval_train.csv", index=False)
test.loc[:,["prompt", "completion"]].to_csv("data/semeval_test.csv", index=False)

# Few-shot example generation

In [7]:
# Number of exemplars drawn per target, and the seed that makes the draw
# repeatable across kernel restarts.
N_EXEMPLARS = 100
EXEMPLAR_SEED = 42

# Filter on the whitespace-stripped target: an exact match on a padded value
# would select no rows and make `.sample(N_EXEMPLARS)` raise.
exemplar_targets = train_all["Target"].str.strip()

train_exemplar_trump = (
    train_all.loc[exemplar_targets == "Donald Trump", :]
    .sample(N_EXEMPLARS, replace=False, random_state=EXEMPLAR_SEED)
    .reset_index(drop=True)
)
train_exemplar_trump['prompt_t'] = "Example:\n" + train_exemplar_trump['prompt'] + "\n" + train_exemplar_trump['completion'] + "\n"

train_exemplar_clinton = (
    train_all.loc[exemplar_targets == "Hillary Clinton", :]
    .sample(N_EXEMPLARS, replace=False, random_state=EXEMPLAR_SEED)
    .reset_index(drop=True)
)
train_exemplar_clinton['prompt_c'] = "Example:\n" + train_exemplar_clinton['prompt'] + "\n" + train_exemplar_clinton['completion'] + "\n"

# Side-by-side concat pairs row i of the Trump sample with row i of the Clinton
# sample (both were re-indexed from 0). Dropping "prompt" removes both raw
# prompt columns before the combined two-example prompt is stored under that name.
train_exemplar = pd.concat([train_exemplar_trump, train_exemplar_clinton],axis=1).drop(columns="prompt")
train_exemplar["prompt"] = train_exemplar["prompt_t"] + train_exemplar["prompt_c"]
train_exemplar["prompt"].to_csv("data/semeval_train_exemplar.csv", index=False)

## Fine-tuning data for GPT-3 Ada and Davinci

In [8]:
import openai

# Create the output directory for the OpenAI-formatted files. `exist_ok=True`
# makes the call a no-op when the directory is already there, without a
# separate existence check that could go stale before the create.
os.makedirs("data/openai", exist_ok=True)

def formatter(df: pd.DataFrame):
    """Return a copy of ``df`` with GPT-3 style prompt/completion text.

    The ``prompt`` column gets the suffix ``"\\nTarget, Stance:"``; the
    ``completion`` column gets a leading space and a trailing newline.
    Any other columns are carried over unchanged, and ``df`` itself is
    not modified.
    """
    prompt_with_cue = df['prompt'] + "\nTarget, Stance:"
    padded_completion = " " + df['completion'] + "\n"
    return df.assign(prompt=prompt_with_cue, completion=padded_completion)

### Reformat according to GPT-3 fine-tuning requirements

In [9]:
# Run both splits through `formatter`, then write only the prompt/completion
# columns of the formatted test split to CSV.
gpt3_columns = ["prompt", "completion"]
train_openai, test_openai = formatter(train_all), formatter(test)
test_openai[gpt3_columns].to_csv("data/openai/semeval_gpt3_test.csv", index=False)

In [10]:
# In-memory views of the formatted training data. The 10- and 100-row subsets
# are the leading rows of the training frame and keep "Target" for inspection.
semeval_gpt3 = train_openai.loc[:,["Target", "prompt", "completion"]]
semeval_gpt3_10 = semeval_gpt3[:10]
semeval_gpt3_100 = semeval_gpt3[:100]
semeval_gpt3_all = train_openai.loc[:,["prompt", "completion"]]

# Every fine-tuning file is written with exactly the prompt/completion keys.
# Selecting the columns at write time keeps the extra "Target" key out of the
# 10- and 100-example files, so all three files share one record schema.
jsonl_columns = ["prompt", "completion"]
semeval_gpt3_10.loc[:, jsonl_columns].to_json("data/openai/semeval_gpt3_10.jsonl", orient='records', lines=True)
semeval_gpt3_100.loc[:, jsonl_columns].to_json("data/openai/semeval_gpt3_100.jsonl", orient='records', lines=True)
semeval_gpt3_all.loc[:, jsonl_columns].to_json("data/openai/semeval_gpt3_all.jsonl", orient='records', lines=True)

In [None]:
# Upload data for fine-tuning
# Each JSONL file is opened inside a `with` block so its handle is closed once
# the upload call returns, instead of staying open for the life of the kernel.
# Binary mode sends the bytes exactly as written to disk.
for dataset_name in ("semeval_gpt3_10", "semeval_gpt3_100", "semeval_gpt3_all"):
    with open(f"data/openai/{dataset_name}.jsonl", "rb") as jsonl_file:
        openai.File.create(file=jsonl_file, user_provided_filename=dataset_name, purpose="fine-tune")

# Last expression of the cell: displays the files currently stored for the account.
openai.File.list()