In [None]:
import os
import pandas as pd
import json

In [4]:
# Load the Reddit answers file; its fields are separated by semicolons.
answers_csv = '../../reddit_answers_big.csv'
df_answers = pd.read_csv(answers_csv, sep=';')

# Preview the first three rows.
df_answers.head(n=3)

Unnamed: 0.1,Unnamed: 0,q_id,text,votes
0,0,hvbvpz,Two pet ducks. You may be tempted to go for on...,2359.0
1,1,hvbvpz,Nice try Jeff Bezos,764.0
2,2,hvbvpz,A curved shower rod. Seriously. $10 for a tens...,1525.0


In [5]:
# For every question id, find the row label of its highest-voted answer.
# Rows without a vote count are removed before grouping: a question whose answers
# all lack votes would otherwise produce a NaN label and make the .loc lookup fail.
# dropna keeps the original index labels, so they still address rows of df_answers.
df_top_votes = (
    df_answers
    .dropna(subset=['votes'])
    .groupby('q_id')['votes']
    .idxmax()
)

# Select the winning rows and give the columns answer-specific names. rename()
# returns a new frame, so nothing is mutated in place on the .loc selection.
df_top_answers = df_answers.loc[df_top_votes].rename(
    columns={'text': 'answer', 'q_id': 'id', 'votes': 'answer_votes'}
)

df_top_answers.head(3)

Unnamed: 0.1,Unnamed: 0,id,answer,answer_votes
1817014,1875645,1001ag,Tell him to go to a hospital. I can't stress t...,30.0
1591462,1643710,10029x,NOTE: Detail may not sum to totals because of ...,3.0
96052,99426,1004g5,Blow Me Away by Breaking Benjamin http://www....,7.0


In [6]:
# Read the semicolon-separated questions file and, in the same expression, give
# the generic 'text' and 'votes' columns question-specific names.
df_questions = pd.read_csv('../../reddit_questions.csv', sep=';').rename(
    columns={'text': 'question', 'votes': 'question_votes'}
)

df_questions.head(3)

Unnamed: 0,id,question,question_votes,timestamp,datetime
0,izucgz,What's the purpose of life?,8,1601076000.0,Fri Sep 25 23:13:31 2020 UTC
1,9c784/,"I've tried to quit smoking, this is my seventh...",11,1250712000.0,Wed Aug 19 19:58:54 2009 UTC
2,iylxwl,"For those who have a slave master last name, w...",0,1600904000.0,Wed Sep 23 23:35:15 2020 UTC


In [11]:
# Attach each question's top answer via the shared 'id' column (merge's default
# inner join keeps only ids present in both frames), then discard the columns
# that are not needed for the training data.
merged_df = (
    df_questions
    .merge(df_top_answers, on='id')
    .drop(columns=['timestamp', 'datetime', 'Unnamed: 0'])
)

merged_df.head(3)

Unnamed: 0,id,question,question_votes,answer,answer_votes
0,izucgz,What's the purpose of life?,8,Breed and die.,5.0
1,9c784/,"I've tried to quit smoking, this is my seventh...",11,The secret to quitting smoking is to tell your...,4.0
2,iylxwl,"For those who have a slave master last name, w...",0,No. My last name sounds badass.,4.0


In [12]:
# Order the question/answer pairs from most to least answer votes and keep the
# first 1000 of them.
votes_descending = merged_df.sort_values('answer_votes', ascending=False)
merged_df = votes_descending.head(1000)

merged_df.head(3)

Unnamed: 0,id,question,question_votes,answer,answer_votes
75274,fkzaca,What is something that has aged well?,66093,The word cool,99398.0
167081,a0a4cd,What's the most amazing thing about the universe?,81862,"It must be true that either It didn't exist, ...",86042.0
140939,d0jjc2,The 2010's decade will be over in 4 months. Wh...,113254,The social media explosion,85936.0


In [13]:
# Pull out the two text columns that are paired up into training examples below.
questions = merged_df['question']
answers = merged_df['answer']

In [14]:
# System message shared by every training example.
SYSTEM_PROMPT = "Marv is a factual chatbot and a reddit expert who likes to answer with bullets."


def _chat_example(question_text, answer_text):
    """Wrap one question/answer pair as a chat example.

    The returned dict has a single "messages" key holding, in order, the system
    message, the user message (the question) and the assistant message (the answer).
    """
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": question_text},
            {"role": "assistant", "content": answer_text},
        ]
    }


# One chat example per (question, answer) pair, in the order of merged_df.
qa_openai_format = [_chat_example(q, a) for q, a in zip(questions, answers)]

qa_openai_format[2]

{'messages': [{'role': 'system',
   'content': 'Marv is a factual chatbot and a reddit expert who likes to answer with bullets.'},
  {'role': 'user',
   'content': "The 2010's decade will be over in 4 months. What do you think people will remember this decade for?"},
  {'role': 'assistant', 'content': 'The social media explosion'}]}

In [15]:
# Write the examples as JSON Lines: one JSON object per line.
# The encoding is given explicitly so the file does not depend on the platform's
# default text encoding and matches the UTF-8 reader used when the file is
# loaded again for validation.
with open('reddit_qa_finetuning.jsonl', 'w', encoding='utf-8') as f:
    for item in qa_openai_format:
        f.write(json.dumps(item) + '\n')

In [20]:
from collections import defaultdict

data_path = "reddit_qa_finetuning.jsonl"

# Load the dataset: one JSON document per line. Blank lines (for example an extra
# newline at the end of the file) are skipped instead of making json.loads raise.
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# Initial dataset stats
print("Num examples:", len(dataset))

# Format error checks: maps an error category to the number of times it occurred.
format_errors = defaultdict(int)

for ex in dataset:
    # Each example must be a JSON object.
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # Each example needs a non-empty list under "messages". A value of another
    # type (e.g. a string or dict) is reported here rather than iterated over.
    messages = ex.get("messages", None)
    if not messages or not isinstance(messages, list):
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        # A message that is not a JSON object cannot be inspected key by key;
        # count it and move on so the .get() calls below never raise.
        if not isinstance(message, dict):
            format_errors["data_type"] += 1
            continue

        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(k not in ("role", "content", "name", "function_call") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        # content must be a string; an empty string is accepted only when the
        # message carries a function_call.
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1

    # Every example needs at least one assistant message to learn from.
    if not any(
        isinstance(message, dict) and message.get("role", None) == "assistant"
        for message in messages
    ):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

Num examples: 1000
No errors found


In [21]:
from openai import OpenAI

# No api_key is passed here, so the OPENAI_API_KEY environment variable must be
# set before this cell runs; otherwise constructing the client raises OpenAIError.
client = OpenAI()

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [None]:
# Upload the training data for fine-tuning. The file is opened in a `with` block
# so the handle is closed as soon as the upload call returns, instead of staying
# open for the lifetime of the kernel.
with open('reddit_qa_finetuning.jsonl', 'rb') as training_fh:
    uploaded_file = client.files.create(
        file=training_fh,
        purpose='fine-tune'
    )

# Show the upload result; its id is what the fine-tuning job needs as training_file.
uploaded_file

In [None]:
# Start a fine-tuning job. The training file id is a placeholder that has to be
# filled in by hand with the id returned by the upload before this cell is run.
training_file_id = ''  # file id here from above
base_model = "gpt-3.5-turbo"

client.fine_tuning.jobs.create(
    training_file=training_file_id,
    model=base_model,
)

In [None]:
# List fine-tuning jobs, e.g. to look up the job started in the previous cell.
client.fine_tuning.jobs.list()