### Fine-tuning GPT using Compatible Pairs

This file fine-tunes GPT-2 using computed pairs from the matching algorithm that exceed a certain cosine similarity threshold.

In [None]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import ast

In [None]:
## Read the match table and the raw profiles from disk

# The 'Unnamed: 0' column of the matches CSV serves as the person identifier.
matches_df = pd.read_csv('data/okcupid_matches.csv').rename(
    columns={'Unnamed: 0': 'Person_id'}
)

# The profiles get their id from the row position: reset_index() exposes it
# as an 'index' column, which is then renamed to Person_id.
bios_text = (
    pd.read_csv('data/okcupid_profiles.csv')
    .reset_index()
    .rename(columns={'index': 'Person_id'})
)

## Concatenate the essay columns into one text per person

# Missing values become a single space so that str.join only ever sees strings.
bios_text = bios_text.fillna(' ')
essay_columns = [f'essay{i}' for i in range(9)]  # essay0 .. essay8
bios_text['allessays'] = bios_text[essay_columns].apply(' '.join, axis=1)
bios_text = bios_text[['Person_id', 'allessays']]

## Attach each person's combined essays to their match row via Person_id

finetune_df = matches_df.merge(bios_text, on='Person_id')

In [None]:
## Preparing matches for fine tuning

# Build the Person_id -> essay text lookup once, rather than filtering the
# whole frame with a boolean mask for every single match (which is quadratic
# in the number of rows). drop_duplicates keeps the first row per id, which is
# the same row the previous `.iloc[0]` selection returned.
essays_by_id = (
    finetune_df.drop_duplicates('Person_id')
    .set_index('Person_id')['allessays']
    .to_dict()
)

input_output_pairs = []

for essays, raw_matches in zip(finetune_df['allessays'], finetune_df['matches']):
    # 'matches' holds the string form of a Python list; '[]' means no match.
    if raw_matches == '[]':
        continue
    for match in ast.literal_eval(raw_matches):
        # The first element of each match entry is the matched Person_id.
        match_id = match[0]
        # A match id with no row in finetune_df raises KeyError naming that id.
        input_output_pairs.append(f"Input: {essays}\n")
        input_output_pairs.append(f"Output: {essays_by_id[match_id]}\n")

input_output_pairs = "".join(input_output_pairs)

# Write UTF-8 explicitly: the essays are free text, and the platform default
# encoding may be unable to represent every character they contain.
with open("input_output_pairs.txt", "w", encoding="utf-8") as f:
    f.write(input_output_pairs)

In [None]:
def fine_tune_gpt2(
    train_file: str,
    output_dir: str,
    *,
    model_name: str = "gpt2",
    block_size: int = 128,
    num_train_epochs: int = 3,
    per_device_train_batch_size: int = 4,
    save_steps: int = 10_000,
    save_total_limit: int = 2,
) -> None:
    """Fine-tune a pretrained GPT-2 checkpoint on a plain-text training file.

    Loads the model and tokenizer named by ``model_name``, trains with a
    causal language-modeling collator (``mlm=False``), and saves both the
    model and the tokenizer into ``output_dir``.

    Args:
        train_file: Path of the text file passed to ``TextDataset``.
        output_dir: Directory used for trainer checkpoints and for the final
            saved model and tokenizer. Existing contents may be overwritten
            (``overwrite_output_dir=True``).
        model_name: Checkpoint identifier handed to ``from_pretrained`` for
            both the model and the tokenizer.
        block_size: ``block_size`` forwarded to ``TextDataset``.
        num_train_epochs: Number of training epochs.
        per_device_train_batch_size: Training batch size per device.
        save_steps: Forwarded to ``TrainingArguments(save_steps=...)``.
        save_total_limit: Forwarded to
            ``TrainingArguments(save_total_limit=...)``.
    """
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # NOTE(review): TextDataset appears to be flagged as deprecated in recent
    # transformers releases - confirm against the installed version.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_file,
        block_size=block_size,
    )

    # mlm=False selects causal (next-token) language modeling rather than
    # masked language modeling.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        save_steps=save_steps,
        save_total_limit=save_total_limit,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()

    # Save the tokenizer next to the model so output_dir holds both.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

In [None]:
# Train on the generated Input/Output pairs file and save the result.
fine_tune_gpt2(
    train_file="input_output_pairs.txt",
    output_dir="fine_tuned_model",
)