# Code to convert data into the proper format for GPT-4o mini fine-tuning

In [8]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

In [33]:
# Load the cleaned presidential transcript table that the cells below filter.
df = pd.read_csv(filepath_or_buffer='presidential_clean.csv')

In [34]:
# Rows whose dialogue has fewer than ten whitespace-separated words and whose
# Party is one of the two listed values.
# NOTE(review): `filtered_df` is not referenced by any later cell visible here,
# and its "< 10" cutoff is the complement of the ">= 10" filter used to build
# `df_filtered` -- confirm whether this cell is still needed.
filtered_df = df.loc[
    df['dialogue'].str.split().str.len().lt(10)
    & df['Party'].isin(['Republican', 'Democrat'])
]

In [35]:
# Add a per-row word count: split each dialogue on whitespace and count the pieces.
df['word_count'] = (
    df['dialogue']
    .str.split()
    .str.len()
)

In [36]:
# Keep only rows whose dialogue is at least ten words long.
df_filtered = df.loc[df['word_count'].ge(10)]


In [42]:
# One frame per party, restricted to rows dated after 2015-01-01.
# NOTE(review): the cutoff is compared against a string literal, so it orders
# chronologically only if `date` holds ISO 'YYYY-MM-DD' text -- confirm.
republican_df = df_filtered.loc[
    df_filtered['Party'].eq('Republican') & df_filtered['date'].gt('2015-01-01')
]
democrat_df = df_filtered.loc[
    df_filtered['Party'].eq('Democrat') & df_filtered['date'].gt('2015-01-01')
]

In [43]:
# Persist the two per-party subsets; the row index is not written to the files.
republican_df.to_csv(path_or_buf='2016_republican_data.csv', index=False)
democrat_df.to_csv(path_or_buf='2016_democrat_data.csv', index=False)

In [4]:
# Preview one of the processed question files.
# 'Unnamed: 0' is presumably an index column saved by an earlier to_csv call -- confirm.
dem_1976 = pd.read_csv('processed_data/dem_1976_questions_clean.csv')
dem_1976 = dem_1976.drop('Unnamed: 0', axis=1)
dem_1976.head()

Unnamed: 0,actor,dialogue,date,Party,word_count,gpt_4_mini_question
0,Ferraro,"Well, let me first say that I wasnt born at th...",1984-10-11,Democrat,383,What experiences do you believe best qualify y...
1,Ferraro,I dont. Let me say that Im not a believer in p...,1984-10-11,Democrat,132,What is your perspective on the significance o...
2,Ferraro,"I, I think what Im going to have to do is Im g...",1984-10-11,Democrat,161,What specific economic statistics or policies ...
3,Ferraro,With reference to the busing vote that I cast ...,1984-10-11,Democrat,395,What are your views on the role of federal gov...
4,Ferraro,I do not support the use of quotas. Both Mr. M...,1984-10-11,Democrat,130,"What is your stance on affirmative action, and..."


In [9]:
# (input CSV, training JSONL, validation JSONL) for each dataset tag.
# All three paths are derived from the tag, so adding a dataset is a one-word change.
files = [
    (
        f"processed_data/{tag}_questions_clean.csv",
        f"{tag}_train.jsonl",
        f"{tag}_val.jsonl",
    )
    for tag in ("dem_1976", "rep_1976", "dem_2016", "rep_2016")
]


In [10]:
def process_csv_to_jsonl_with_split(input_file, train_file, val_file):
    """Split a question/answer CSV 80/20 and write both parts as chat-format JSONL.

    Each output line is a JSON object with a "messages" list holding a fixed
    system prompt, the row's `gpt_4_mini_question` as the user turn and the
    row's `dialogue` as the assistant turn.

    Rows where either of those two columns is missing are skipped, because a
    missing value would otherwise be serialized as the bare token NaN, which
    is not valid JSON.

    Args:
        input_file: Path to a CSV with `gpt_4_mini_question` and `dialogue` columns.
        train_file: Destination path for the training split (overwritten).
        val_file: Destination path for the validation split (overwritten).

    Raises:
        KeyError: If either required column is absent from the CSV.
    """
    df = pd.read_csv(input_file)

    # read_csv turns empty cells into float NaN; drop incomplete pairs before
    # splitting so every line written below is parseable JSON. Inputs without
    # missing values are unaffected, so their split is the same as before.
    df = df.dropna(subset=['gpt_4_mini_question', 'dialogue'])

    # split into training and validation (fixed seed keeps the split reproducible)
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

    def write_jsonl(split_df, output_file):
        """Write one chat-format JSON object per row of `split_df` to `output_file`."""
        # json.dumps escapes non-ASCII characters by default; the explicit
        # encoding just keeps the output independent of the platform default.
        with open(output_file, "w", encoding="utf-8") as f:
            for question, answer in zip(split_df['gpt_4_mini_question'], split_df['dialogue']):
                jsonl_entry = {
                    "messages": [
                        {"role": "system", "content": "You are a chatbot that is a presidential candidate in a debate."},
                        {"role": "user", "content": question},
                        {"role": "assistant", "content": answer}
                    ]
                }
                f.write(json.dumps(jsonl_entry) + "\n")

    write_jsonl(train_df, train_file)
    write_jsonl(val_df, val_file)

    print(f"Processed {input_file} -> {train_file} (train) and {val_file} (validation)")

In [11]:
# Convert every dataset listed in `files`; each entry is an
# (input CSV, training JSONL, validation JSONL) tuple.
for dataset_paths in files:
    process_csv_to_jsonl_with_split(*dataset_paths)


Processed processed_data/dem_1976_questions_clean.csv -> dem_1976_train.jsonl (train) and dem_1976_val.jsonl (validation)
Processed processed_data/rep_1976_questions_clean.csv -> rep_1976_train.jsonl (train) and rep_1976_val.jsonl (validation)
Processed processed_data/dem_2016_questions_clean.csv -> dem_2016_train.jsonl (train) and dem_2016_val.jsonl (validation)
Processed processed_data/rep_2016_questions_clean.csv -> rep_2016_train.jsonl (train) and rep_2016_val.jsonl (validation)
