In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Single-letter stance codes found in the "Clinton"/"Trump" columns and the
# label each one is expanded to.
# NOTE(review): presumably S/O/N abbreviate Support/Oppose/Neutral — confirm
# against the dataset's codebook.
mapping = {
    "S": "Favor",
    "O": "Against",
    "N": "None",
}

# The raw export is semicolon-delimited and is decoded with the
# 'unicode_escape' codec.
data = pd.read_csv(
    "data/original/facebook_comments.csv",
    sep=";",
    encoding='unicode_escape',
)

# Prepare training data from Facebook comments

In [17]:
# Fixed seed so the split and the shuffle below are reproducible across kernel
# restarts. Without it, every run writes a different fb_train.csv/fb_test.csv,
# and rows from a new test file may have been used to train an earlier model.
SEED = 42

# Keep only the columns used downstream; the comment text becomes the prompt.
data = data.loc[:, ["page", "comment_message", "Clinton", "Trump"]]
data = data.rename(columns={"comment_message": "prompt"})

# Expand the single-letter stance codes to their labels. regex=True makes this
# a substring replacement rather than a whole-cell match.
data['Clinton'] = data['Clinton'].replace(mapping, regex=True)
data['Trump'] = data['Trump'].replace(mapping, regex=True)

# Target string has the form "<Trump stance>, <Clinton stance>".
data['completion'] = data['Trump'].str.strip() + ", " + data['Clinton'].str.strip()
trump = data.loc[data['page'] == "Trump"]
clinton = data.loc[data['page'] == "Clinton"]

# Hold out 200 comments from each page as the test set.
clinton_train, clinton_test = train_test_split(
    clinton, test_size=200, shuffle=True, random_state=SEED
)
trump_train, trump_test = train_test_split(
    trump, test_size=200, shuffle=True, random_state=SEED
)

# Interleave the two pages in the training set; the test set keeps the
# Clinton rows first, followed by the Trump rows.
train = (
    pd.concat([clinton_train, trump_train], axis=0)
    .sample(frac=1, replace=False, random_state=SEED)
    .reset_index(drop=True)
)
test = pd.concat([clinton_test, trump_test], axis=0).reset_index(drop=True)

train.loc[:, ["prompt", "completion"]].to_csv("data/fb_train.csv", index=False)
test.loc[:, ["prompt", "completion"]].to_csv("data/fb_test.csv", index=False)

## Few-shot example generation

In [18]:
# Seed the draw so the same 100 exemplars are written on every run; an
# unseeded sample makes fb_train_exemplar.csv change with each execution.
EXEMPLAR_SEED = 42

train_exemplar = train.sample(100, replace=False, random_state=EXEMPLAR_SEED)

# Render each sampled row as a self-contained few-shot block:
#   Example:
#   <comment text>
#   Trump: <stance>, Clinton: <stance>
train_exemplar['prompt'] = (
    "Example:\n"
    + train_exemplar['prompt']
    + "\nTrump: " + train_exemplar["Trump"]
    + ", " + "Clinton: " + train_exemplar["Clinton"]
    + "\n"
)
train_exemplar.loc[:, "prompt"].to_csv("data/fb_train_exemplar.csv", index=False)

## Fine-tuning data for GPT-3 (Ada and Davinci)

In [19]:
import openai

# Create the output directory for the OpenAI files. exist_ok=True makes the
# cell safe to re-run and avoids the race between checking and creating.
os.makedirs("data/openai", exist_ok=True)

def formatter(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with prompt/completion text wrapped for fine-tuning.

    Args:
        df: Frame with string columns ``prompt`` and ``completion``. Any other
            columns are carried over unchanged.

    Returns:
        A new frame in which every ``prompt`` has ``"\\nStance:"`` appended and
        every ``completion`` is prefixed with a space and terminated with a
        newline. The input frame is not modified.
    """
    wrapped_prompt = df['prompt'] + "\nStance:"
    wrapped_completion = " " + df['completion'] + "\n"
    return df.assign(prompt=wrapped_prompt, completion=wrapped_completion)

### Reformat according to the GPT-3 fine-tuning requirements

In [20]:
# Apply the prompt/completion wrapping to both splits.
train_openai, test_openai = formatter(train), formatter(test)

# Save the formatted test split with only the two columns the model consumes.
test_openai[["prompt", "completion"]].to_csv(
    "data/openai/fb_gpt3_test.csv",
    index=False,
)

In [21]:
# Only the prompt/completion pair goes into the fine-tuning files.
fb_gpt3 = train_openai[['prompt', 'completion']]

# Nested training subsets: each one is the first n rows of the training frame.
fb_gpt3_10 = fb_gpt3.head(10)
fb_gpt3_100 = fb_gpt3.head(100)
fb_gpt3_1000 = fb_gpt3.head(1000)
fb_gpt3_all = fb_gpt3.copy()

# Write every subset as JSON Lines (one {"prompt", "completion"} record per line).
_jsonl_outputs = {
    "data/openai/fb_gpt3_10.jsonl": fb_gpt3_10,
    "data/openai/fb_gpt3_100.jsonl": fb_gpt3_100,
    "data/openai/fb_gpt3_1000.jsonl": fb_gpt3_1000,
    "data/openai/fb_gpt3_all.jsonl": fb_gpt3_all,
}
for _path, _subset in _jsonl_outputs.items():
    _subset.to_json(_path, orient='records', lines=True)

In [None]:
# Upload data for fine-tuning.
# Each file is opened in a `with` block so its handle is closed as soon as the
# upload call returns, instead of leaking four open handles in the kernel.
for _name in ("fb_gpt3_10", "fb_gpt3_100", "fb_gpt3_1000", "fb_gpt3_all"):
    with open(f"data/openai/{_name}.jsonl") as _fh:
        openai.File.create(file=_fh, user_provided_filename=_name, purpose="fine-tune")

# Last expression of the cell: show the files now registered with the API.
openai.File.list()