### Fine Tuning GPT 3.5 Turbo with Tagged PLRs

In [1]:
import os
import openai
import tiktoken
from dotenv import load_dotenv, find_dotenv
import tiktoken
import fitz
import shutil, random, os
from pathlib import Path
import pandas as pd
import openpyxl
import json

# Locate the nearest .env file and load its variables into the process environment.
_ = load_dotenv(find_dotenv())

# Indexing os.environ raises KeyError right here if OPENAI_API_KEY is missing.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
# Run the helper notebook. Later cells call text_to_df, iterate_multiple_years,
# calculate_metrics and files_by_year without defining them, so they are presumably
# provided here — TODO confirm. `encoding` (used in get_plr_classification) is also
# not defined in any visible cell of this notebook.
%run functions.ipynb

In [3]:
# client = openai.OpenAI()
# encoding = tiktoken.encoding_for_model("gpt-3.5-turbo-1106")

#### Split into training and test set

In [4]:
# Set the path to your main folder containing PDF files
main_folder = "/Users/st414/Documents/PLR/elisa_plrs/train_set"

# Set the path to your training and test set folders
train_folder = os.path.join(main_folder, 'finetune_train')
test_folder = os.path.join(main_folder, 'finetune_test')

# shutil.copy does not create missing directories, so make sure both targets exist.
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

# Set the percentage of files to be copied to the test set
# NOTE(review): 85% goes to the TEST set, leaving 15% for training — confirm this is intended.
test_split_percentage = 85  # Adjust as needed

# Seed for the split. Together with the sorted file list below it makes the split
# reproducible, so re-running this cell cannot copy the same PDF into both folders.
split_seed = 42

# Get the list of PDF files in the main folder (sorted: os.listdir order is arbitrary)
pdf_files = sorted(file for file in os.listdir(main_folder) if file.endswith('.pdf'))

# Calculate the number of files for the test set
num_files_test = int(len(pdf_files) * (test_split_percentage / 100.0))

# Randomly select files for the test set
test_files = random.Random(split_seed).sample(pdf_files, num_files_test)
test_file_names = set(test_files)  # constant-time membership checks in the loop below

# Copy files to the respective folders
for file in pdf_files:
    source_path = os.path.join(main_folder, file)
    destination_folder = test_folder if file in test_file_names else train_folder
    shutil.copy(source_path, os.path.join(destination_folder, file))

print("Splitting complete. Check the 'train' and 'test' folders for your files.")


Splitting complete. Check the 'train' and 'test' folders for your files.


#### Create .jsonl format for training input to gpt

In [8]:
# text_to_df comes from functions.ipynb. Judging by the preview below it yields one
# row per PLR, with the PDF file name in `plr_number` and the extracted text in `text`.
pd_plr_text = text_to_df()
pd_plr_text.head()

Unnamed: 0,plr_number,text
0,201318003.pdf,Internal Revenue Service\nDepartment of the Tr...
1,201412013.pdf,Internal Revenue Service\nDepartment of the Tr...
2,200537019.pdf,\nInternal Revenue Service \nDepartment of th...
3,200206048.pdf,Internal Revenue Service\nNumber: 200206048\...
4,200623003.pdf,\n \n \nInternal Revenue Service \nDepartment...


In [21]:
# Keep only the part of the file name before the first '.', e.g. '201318003.pdf' -> '201318003'.
# NOTE: once the next cell has cast plr_number to int64 this cell can no longer be
# re-run, because the .str accessor only works on string columns.
pd_plr_text = pd_plr_text.assign(
    plr_number=lambda frame: frame['plr_number'].str.split('.').str[0]
)
pd_plr_text.head()

Unnamed: 0,plr_number,text
0,201318003,Internal Revenue Service\nDepartment of the Tr...
1,201412013,Internal Revenue Service\nDepartment of the Tr...
2,200537019,\nInternal Revenue Service \nDepartment of th...
3,200206048,Internal Revenue Service\nNumber: 200206048\...
4,200623003,\n \n \nInternal Revenue Service \nDepartment...


In [22]:
# Load the tagged reference set; it is joined onto the PLR texts in the next cell.
reference_set = pd.read_csv('/Users/st414/Documents/PLR/elisa_plrs/reference_set.csv')

# The join key must have the same dtype on both sides, so cast the PLR number
# (currently a string/object column) to int64.
pd_plr_text = pd_plr_text.assign(
    plr_number=lambda frame: frame['plr_number'].astype('int64')
)



In [39]:
# Inner join: PLRs without a matching row in the reference set are dropped.
plr_text_tag = pd_plr_text.merge(
    reference_set[['plr_number', 'tag']],
    how='inner',
    on='plr_number',
)
plr_text_tag.head()

Unnamed: 0,plr_number,text,tag
0,201318003,Internal Revenue Service\nDepartment of the Tr...,Non-Adverse
1,201412013,Internal Revenue Service\nDepartment of the Tr...,Non-Adverse
2,200537019,\nInternal Revenue Service \nDepartment of th...,Adverse
3,200623003,\n \n \nInternal Revenue Service \nDepartment...,Non-Adverse
4,199906015,INTERNAL REVENUE SERVICE\nIndex No.: 61.00-00...,Adverse


In [43]:
# System message placed first in every fine-tuning example built in the cells below.
system_prompt = 'You are a legal tax expert that, given a private letter ruling, classifies it as adverse or non-adverse.'

In [80]:
# One chat-format training example per tagged PLR:
# system prompt -> raw PLR text as the user turn -> tag as the assistant turn.
all_conversations = [
    {'messages': [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': plr_text},
        {'role': 'assistant', 'content': plr_tag},
    ]}
    for plr_text, plr_tag in zip(plr_text_tag['text'], plr_text_tag['tag'])
]


In [45]:
# get the first training data point and check format
all_conversations[0]

{'messages': [{'role': 'system',
   'content': 'You are a legal tax expert that, given a private letter ruling, classifies it as adverse or non-adverse.'},
  {'role': 'user',
   'content': 'Internal Revenue Service\nDepartment of the Treasury\nWashington, DC 20224\nNumber: 201318003\nRelease Date: 5/3/2013\nIndex Number:  1250.04-01\n------------------------------------------------------------\n------------------------\n---------------------------------\n------------------------------\n----------------------------------------------------\nThird Party Communication: None\nDate of Communication: Not Applicable\nPerson To Contact:\n------------------------, ID No. ------------------\n----------------------------------------------------\nTelephone Number:\n----------------------\nRefer Reply To:\nCC:ITA:B07\nPLR-132858-12\nDate:\nJanuary 22, 2013\nRe: Request for Private Letter Ruling under Sections 170, 291, and 1250\nLEGEND\nTaxpayer\n=\n----------------------------------------------\n--

In [71]:
len(all_conversations)

167

In [47]:
# Write the examples as JSON Lines: one JSON object per line.
with open('instances.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(conversation) + '\n' for conversation in all_conversations
    )

#### Fine tune the model with training data (.jsonl file)

Train on all 167 data points

In [48]:
# Create the OpenAI client used for the file upload, the fine-tuning jobs and the
# chat completions in the rest of the notebook (the upload itself is the next cell).
client = openai.OpenAI()

In [49]:
# Upload the training file; the returned FileObject carries the file ID used by the job.
with open('instances.jsonl', 'rb') as training_file:
    response = client.files.create(purpose='fine-tune', file=training_file)

In [50]:
response

FileObject(id='file-WRdrJJ6xGdIM7O5GPPkO0mBF', bytes=2796079, created_at=1714588808, filename='instances.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [52]:
# ID copied by hand from the FileObject printed above.
# NOTE(review): because it is hard-coded, re-running the upload cell does not update
# it; the later job would still train on this earlier upload.
file_id = 'file-WRdrJJ6xGdIM7O5GPPkO0mBF'

In [110]:
# Start a fine-tuning job on the uploaded file; no hyperparameters are passed.
# NOTE(review): the recorded output below reports model='gpt-3.5-turbo-0125' while
# this cell requests 'gpt-3.5-turbo-1106' — the cell appears to have been edited after
# that output was captured; confirm which base model the first job used.
response = client.fine_tuning.jobs.create(
    training_file = file_id,
    model = 'gpt-3.5-turbo-1106'
)

In [54]:
response

FineTuningJob(id='ftjob-uzs01qnxx167WQUaPmNKWANH', created_at=1714589355, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-iSAPcQOGLvWwrnZj238M19jr', result_files=[], status='validating_files', trained_tokens=None, training_file='file-WRdrJJ6xGdIM7O5GPPkO0mBF', validation_file=None, user_provided_suffix=None, seed=534799480, estimated_finish=None, integrations=[])

In [63]:
# Job ID copied by hand from the create() response above; retrieve() re-fetches the
# job so its status and fine_tuned_model can be inspected.
job_id = 'ftjob-uzs01qnxx167WQUaPmNKWANH'
client.fine_tuning.jobs.retrieve(job_id)

FineTuningJob(id='ftjob-uzs01qnxx167WQUaPmNKWANH', created_at=1714589355, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:personal::9K9dz3a5', finished_at=1714590657, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-iSAPcQOGLvWwrnZj238M19jr', result_files=['file-mCq3pMfTPRNNFWE96HgdhSK5'], status='succeeded', trained_tokens=1725354, training_file='file-WRdrJJ6xGdIM7O5GPPkO0mBF', validation_file=None, user_provided_suffix=None, seed=534799480, estimated_finish=None, integrations=[])

In [64]:
# Fine-tuned model name copied from the `fine_tuned_model` field of the job retrieved above.
model_id = 'ft:gpt-3.5-turbo-0125:personal::9K9dz3a5'

In [65]:
# try out the fine tuned model
client = openai.OpenAI()


def get_completion(prompt, model=model_id, system_message=None):
    """Send `prompt` to a chat model and return the text of its reply.

    Args:
        prompt: Text sent as the user message.
        model: Model name. The default is the value `model_id` had when this cell
            was executed; re-run the cell after changing `model_id`.
        system_message: Optional text sent as a system message before the user
            message. The fine-tuning examples in this notebook all start with a
            system message, so pass the same text here to query the model in the
            format it was trained on. Defaults to None, which sends only the user
            message (the original behaviour).

    Returns:
        The message content of the first choice in the API response.
    """
    messages = []
    if system_message is not None:
        messages.append({"role": "system", "content": system_message})
    messages.append({"role": "user", "content": prompt})
    response = client.chat.completions.create(
        model=model, messages=messages, temperature=0
    )
    return response.choices[0].message.content

PLR Number: 200224023

In [67]:
# convert PLR PDF to text
# The context manager closes the PDF once the text has been extracted (the original
# loop left the document open). Page texts are concatenated in page order.
# NOTE(review): this file is read from the train_set folder — confirm it was not
# among the examples the model was fine-tuned on.
with fitz.open("/Users/st414/Documents/PLR/elisa_plrs/train_set/200224023.pdf") as pdf_to_convert:
    plr_200224023 = "".join(page.get_text() for page in pdf_to_convert)

In [69]:
# The prompt is just the PLR text followed by a newline; no instruction is added.
prompt = plr_200224023 + "\n"

response = get_completion(prompt)
print(response)

cc:  Taxpayer
     Second Authorized Representative


The output is very off. We were expecting the output to be either adverse or non-adverse.

Retrain on a smaller subset of the training dataset with a new user prompt

In [77]:
# Second set of fine-tuning examples: same system prompt and tag as before, but the
# user turn now wraps the PLR text in an explicit classification instruction.
# NOTE: everything between the f''' quotes — including the leading indentation of
# each line — becomes part of the user message, so re-indenting this cell changes
# the training prompt.
all_conversations_2 = []

for idx, row in plr_text_tag.iterrows():
    all_conversations_2.append({'messages': [{'role': 'system', 'content': system_prompt},
                                           {'role': 'user', 'content': f''' 
                                            Here is the letter ruling, delimited by triple backticks, which has to be classified as Adverse or Non Adverse.

                                            Letter Ruling: ```{row['text']}```

                                            Provide your output as one of the two values: Adverse or Non-Adverse.'''},
                                           {'role': 'assistant', 'content': row['tag']}]})

In [78]:
all_conversations_2[0]

{'messages': [{'role': 'system',
   'content': 'You are a legal tax expert that, given a private letter ruling, classifies it as adverse or non-adverse.'},
  {'role': 'user',
   'content': ' Here is the letter ruling, delimited by triple backticks, which has to be classified as Adverse or Non Adverse.\n\n        Letter Ruling: ```Internal Revenue Service\nDepartment of the Treasury\nWashington, DC 20224\nNumber: 201318003\nRelease Date: 5/3/2013\nIndex Number:  1250.04-01\n------------------------------------------------------------\n------------------------\n---------------------------------\n------------------------------\n----------------------------------------------------\nThird Party Communication: None\nDate of Communication: Not Applicable\nPerson To Contact:\n------------------------, ID No. ------------------\n----------------------------------------------------\nTelephone Number:\n----------------------\nRefer Reply To:\nCC:ITA:B07\nPLR-132858-12\nDate:\nJanuary 22, 2013\nRe

Make a smaller subset of this to train on

In [79]:
# Take the first 10 examples in DataFrame order (not a random sample).
# NOTE(review): the Adverse / Non-Adverse balance of these 10 is not checked here.
training_subset_2 = all_conversations_2[:10]

In [81]:
# Write the 10-example subset as JSON Lines: one JSON object per line.
with open('instances_2.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(conversation) + '\n' for conversation in training_subset_2
    )

In [111]:
# Upload the subset file; the returned FileObject carries the file ID used by the job.
with open('instances_2.jsonl', 'rb') as training_file:
    response = client.files.create(purpose='fine-tune', file=training_file)

In [112]:
response

FileObject(id='file-opjCB81zi52MMOzpKfw8SxER', bytes=166566, created_at=1714670754, filename='instances_2.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [113]:
# ID copied by hand from the FileObject printed above (hard-coded: re-running the
# upload cell does not update it).
file_id = 'file-opjCB81zi52MMOzpKfw8SxER'

In [114]:
# Start the second fine-tuning job on the 10-example file; no hyperparameters are passed.
response = client.fine_tuning.jobs.create(
    training_file = file_id,
    model = 'gpt-3.5-turbo-1106'
)

In [115]:
response

FineTuningJob(id='ftjob-E6l8PgRADbR4a8PL6R5XMzWE', created_at=1714670777, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-1106', object='fine_tuning.job', organization_id='org-iSAPcQOGLvWwrnZj238M19jr', result_files=[], status='validating_files', trained_tokens=None, training_file='file-opjCB81zi52MMOzpKfw8SxER', validation_file=None, user_provided_suffix=None, seed=1396507463, estimated_finish=None, integrations=[])

In [119]:
# Job ID copied by hand from the create() response above; retrieve() re-fetches the
# job so its status and fine_tuned_model can be inspected.
job_id = 'ftjob-E6l8PgRADbR4a8PL6R5XMzWE'
client.fine_tuning.jobs.retrieve(job_id)

FineTuningJob(id='ftjob-E6l8PgRADbR4a8PL6R5XMzWE', created_at=1714670777, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-3.5-turbo-1106:personal::9KUZajlq', finished_at=1714671109, hyperparameters=Hyperparameters(n_epochs=10, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-1106', object='fine_tuning.job', organization_id='org-iSAPcQOGLvWwrnZj238M19jr', result_files=['file-cRTcaRLXaDphAjLa4Ap0pq2i'], status='succeeded', trained_tokens=349910, training_file='file-opjCB81zi52MMOzpKfw8SxER', validation_file=None, user_provided_suffix=None, seed=1396507463, estimated_finish=None, integrations=[])

In [120]:
# Fine-tuned model name copied from the `fine_tuned_model` field of the job retrieved above.
model_id = 'ft:gpt-3.5-turbo-1106:personal::9KUZajlq'

In [121]:
def get_completion(prompt, model=model_id, system_message=None):
    """Send `prompt` to a chat model and return the text of its reply.

    Redefined here so that the default `model` picks up the new `model_id`
    (default values are bound when the `def` is executed).

    Args:
        prompt: Text sent as the user message.
        model: Model name; defaults to `model_id` as of this cell's execution.
        system_message: Optional text sent as a system message before the user
            message. The fine-tuning examples in this notebook all start with a
            system message, so pass the same text here to query the model in the
            format it was trained on. Defaults to None, which sends only the user
            message (the original behaviour).

    Returns:
        The message content of the first choice in the API response.
    """
    messages = []
    if system_message is not None:
        messages.append({"role": "system", "content": system_message})
    messages.append({"role": "user", "content": prompt})
    response = client.chat.completions.create(
        model=model, messages=messages, temperature=0
    )
    return response.choices[0].message.content

PLR Number: 200224023

In [122]:
# Same instruction wording as the fine-tuning user turn, wrapped around the PLR text.
# The indentation inside the triple-quoted string is part of the prompt that is sent.
prompt = f"""
        Here is the letter ruling, delimited by triple backticks, which has to be classified as Adverse or Non Adverse.

        Letter Ruling: ```{plr_200224023}```

        Provide your output as one of the two values: Adverse or Non-Adverse.
"""

response = get_completion(prompt)
print(response)

Adverse


PLR Number: 200024009

In [123]:
# convert PLR PDF to text
# The context manager closes the PDF once the text has been extracted (the original
# loop left the document open). Page texts are concatenated in page order.
with fitz.open("/Users/st414/Documents/PLR/elisa_plrs/train_set/200024009.pdf") as pdf_to_convert:
    plr_200024009 = "".join(page.get_text() for page in pdf_to_convert)

In [124]:
# Same instruction wording as the fine-tuning user turn, wrapped around the PLR text.
# The indentation inside the triple-quoted string is part of the prompt that is sent.
prompt = f"""
        Here is the letter ruling, delimited by triple backticks, which has to be classified as Adverse or Non Adverse.

        Letter Ruling: ```{plr_200024009}```

        Provide your output as one of the two values: Adverse or Non-Adverse.
"""

response = get_completion(prompt)
print(response)

Non-Adverse


The results look better here and in the format we want. We can go ahead and test this model out on our test set and compare it with the result of the base 3.5 turbo model.

#### Comparing eval metrics between base GPT 3.5 Turbo (1106 and 0125) and the fine-tuned GPT 3.5 Turbo 1106 model

##### Base GPT 3.5 Turbo 1106

In [107]:
def get_completion(prompt, model="gpt-3.5-turbo-1106", system_message=None):
    """Send `prompt` to a chat model and return the text of its reply.

    This redefinition switches the default model to the base
    "gpt-3.5-turbo-1106" for the baseline evaluation.

    Args:
        prompt: Text sent as the user message.
        model: Model name; defaults to the base "gpt-3.5-turbo-1106".
        system_message: Optional text sent as a system message before the user
            message. Defaults to None, which sends only the user message (the
            original behaviour).

    Returns:
        The message content of the first choice in the API response.
    """
    messages = []
    if system_message is not None:
        messages.append({"role": "system", "content": system_message})
    messages.append({"role": "user", "content": prompt})
    response = client.chat.completions.create(
        model=model, messages=messages, temperature=0
    )
    return response.choices[0].message.content

In [100]:
# Evaluate the PLRs in the test folder for every year from 1999 through 2022.
# iterate_multiple_years is not defined in this notebook (presumably functions.ipynb);
# it presumably classifies each PLR through get_completion as currently defined — TODO confirm.
years = list(range(1999,2023))
folder_path = "/Users/st414/Documents/PLR/elisa_plrs/train_set/finetune_test"
base_test_eval = iterate_multiple_years(years, folder_path)

199906015.pdf
199937008.pdf
199923043.pdf
199914013.pdf
199941012.pdf
199932028.pdf
199929025.pdf
199937025.pdf
199917054.pdf
199915022.pdf
199915037.pdf
199915023.pdf
199947001.pdf
199933018.pdf
199952021.pdf
199921030.pdf
199928025.pdf
199949008.pdf
199952039.pdf
199938021.pdf
199936030.pdf
199936024.pdf
199915048.pdf
199952060.pdf
199922020.pdf
199920027.pdf
199951032.pdf
199935016.pdf
199911028.pdf
200013012.pdf
200001012.pdf
200022019.pdf
200006030.pdf
200027028.pdf
200051031.pdf
200014009.pdf
200020024.pdf
200013015.pdf
200041023.pdf
200018036.pdf
200046003.pdf
200036024.pdf
200049008.pdf
200026020.pdf
200052010.pdf
200042015.pdf
200017002.pdf
200005016.pdf
200047027.pdf
200021016.pdf
200042016.pdf
200019017.pdf
200010031.pdf
200052028.pdf
200024009.pdf
200040014.pdf
200037045.pdf
200023030.pdf
200017036.pdf
200003043.pdf
200025043.pdf
200031036.pdf
200009032.pdf
200047002.pdf
200026010.pdf
200039022.pdf
200029018.pdf
200030011.pdf
200029035.pdf
200027018.pdf
200046021.pdf
200025

In [103]:
calculate_metrics(base_test_eval, reference_set)

Accuracy: 94.84%
Recall: 78.67%
Precision: 81.94%


Let's test it out on a smaller subset as well - (due to budget constraints we can't test the fine tuned model on the entire test data set)

In [108]:
# Same evaluation restricted to the years 2017 through 2021.
# Which model is queried depends on the get_completion definition active when this runs.
years = list(range(2017,2022))
folder_path = "/Users/st414/Documents/PLR/elisa_plrs/train_set/finetune_test"
base_test_eval_subset = iterate_multiple_years(years, folder_path)

201748005.pdf
201740016.pdf
201740005.pdf
201722014.pdf
201751011.pdf
201706006.pdf
201741012.pdf
201722010.pdf
201816004.pdf
201828010.pdf
201815005.pdf
201819006.pdf
201825003.pdf
201811002.pdf
201825006.pdf
201951001.pdf
201926006.pdf
201927005.pdf
201927012.pdf
201943020.pdf
202005020.pdf
202016001.pdf
202014004.pdf
202022005.pdf
202014005.pdf
202014001.pdf
202014002.pdf
202014003.pdf
202138001.pdf
202114001.pdf
202125007.pdf
202144005.pdf
202118021.pdf


Eval metrics with base gpt 3.5 turbo 1106

In [109]:
calculate_metrics(base_test_eval_subset, reference_set)

Accuracy: 70.37%
Recall: 88.24%
Precision: 71.43%


Eval metrics with base gpt 3.5 turbo 0125

In [106]:
# NOTE(review): this call is identical to the one in the previous cell; the different
# numbers come from kernel state — presumably base_test_eval_subset had been computed
# with gpt-3.5-turbo-0125 when this cell ran (execution count 106) — confirm.
# No visible cell reproduces the 0125 run.
calculate_metrics(base_test_eval_subset, reference_set)

Accuracy: 74.07%
Recall: 64.71%
Precision: 91.67%


##### Fine tuned gpt 3.5 turbo 1106

In [127]:
def get_completion(prompt, model=model_id, system_message=None):
    """Send `prompt` to a chat model and return the text of its reply.

    Redefined here so that the default `model` points back at the fine-tuned
    `model_id` (default values are bound when the `def` is executed).

    Args:
        prompt: Text sent as the user message.
        model: Model name; defaults to `model_id` as of this cell's execution.
        system_message: Optional text sent as a system message before the user
            message. The fine-tuning examples in this notebook all start with a
            system message, so pass the same text here to query the model in the
            format it was trained on. Defaults to None, which sends only the user
            message (the original behaviour).

    Returns:
        The message content of the first choice in the API response.
    """
    messages = []
    if system_message is not None:
        messages.append({"role": "system", "content": system_message})
    messages.append({"role": "user", "content": prompt})
    response = client.chat.completions.create(
        model=model, messages=messages, temperature=0
    )
    return response.choices[0].message.content

In [128]:
def get_plr_classification(year, folder_path, max_prompt_tokens=13000):
    """Classify each PLR PDF for `year` in `folder_path` via `get_completion`.

    Args:
        year: Year passed to `files_by_year` to select the PLR files
            (`files_by_year` is defined outside this notebook; presumably it
            returns the PDF file names for that year — TODO confirm).
        folder_path: Folder containing the PLR PDF files.
        max_prompt_tokens: PLRs whose extracted text encodes to more tokens than
            this are skipped. Defaults to 13000, the limit previously hard-coded.

    Returns:
        A list of single-entry dicts `{plr_number (int): model response}`, one per
        PLR that was classified. Skipped (over-length) PLRs have no entry.
    """
    plr_classification_list = []
    for plr in files_by_year(year, folder_path):
        plr_filepath = os.path.join(folder_path, plr)
        # Close the PDF as soon as its text has been extracted.
        with fitz.open(plr_filepath) as pdf_to_convert:
            plr_text = "".join(page.get_text() for page in pdf_to_convert)

        # `encoding` is a notebook-level tokenizer defined outside this function.
        if len(encoding.encode(plr_text)) > max_prompt_tokens:
            # Skip over-length PLRs. Previously execution fell through to
            # get_completion(prompt) with `prompt` either undefined (first file) or
            # left over from the previous PLR, recording a wrong classification.
            continue

        print(plr)
        # The indentation inside the string is part of the prompt that is sent.
        prompt = f"""
            Here is the letter ruling, delimited by triple backticks, which has to be classified as Adverse or Non Adverse.

            Letter Ruling: ```{plr_text}```

            Provide your output as one of the two values: Adverse or Non-Adverse.
        """

        response = get_completion(prompt)
        # Key the result by the numeric part of the file name, e.g. '201748005.pdf' -> 201748005.
        plr_classification_list.append({int(plr.split(".")[0]): response})
    return plr_classification_list

In [129]:
# Re-run the 2017 through 2021 evaluation; get_completion now defaults to the
# fine-tuned model_id.
# Presumably iterate_multiple_years picks up the get_plr_classification defined in the
# previous cell — TODO confirm.
years = list(range(2017,2022))
folder_path = "/Users/st414/Documents/PLR/elisa_plrs/train_set/finetune_test"
finetuned_test_eval_subset = iterate_multiple_years(years, folder_path)

201748005.pdf
201740016.pdf
201740005.pdf
201722014.pdf
201751011.pdf
201706006.pdf
201741012.pdf
201722010.pdf
201816004.pdf
201828010.pdf
201815005.pdf
201819006.pdf
201825003.pdf
201811002.pdf
201825006.pdf
201951001.pdf
201926006.pdf
201927005.pdf
201927012.pdf
201943020.pdf
202005020.pdf
202016001.pdf
202014004.pdf
202022005.pdf
202014005.pdf
202014001.pdf
202014002.pdf
202014003.pdf
202138001.pdf
202114001.pdf
202125007.pdf
202144005.pdf
202118021.pdf


Eval metrics with fine-tuned gpt 3.5 turbo 1106

In [130]:
calculate_metrics(finetuned_test_eval_subset, reference_set)

Accuracy: 88.89%
Recall: 82.35%
Precision: 100.00%


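On the 2017–2021 subset, the fine-tuned model achieves higher accuracy (88.89% vs. 70.37%) and precision (100.00% vs. 71.43%) than the base gpt 3.5 turbo 1106 model, but lower recall (82.35% vs. 88.24%). Since we want our model to favour recall over precision, we will go ahead with the base gpt 3.5 turbo 1106 model. Note that this comparison rests on a small subset of the test set (33 files listed above), so the differences should be read as indicative rather than conclusive.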