In [90]:
import os
import openai
import tiktoken
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())  # read local .env file

openai.api_key = os.environ["OPENAI_API_KEY"]

In [91]:
import tiktoken
import fitz
import shutil, random, os
from pathlib import Path
import pandas as pd

In [92]:
client = openai.OpenAI()


def get_completion(prompt, model="gpt-3.5-turbo-1106"):
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model, messages=messages, temperature=0
    )
    return response.choices[0].message.content

In [93]:
# convert letter ruling context PDF to text
pdf_to_convert = fitz.open("/Users/st414/Documents/PLR/plr_literature.pdf")
letterRuling_context = ""
for page in pdf_to_convert:
    text = page.get_text()
    letterRuling_context += text

Function to iterate through multiple PDFs

In [94]:
# function to get a specific year's PDFs
# assign directory
# directory = "/Users/st414/Documents/PLR/sample_plrs"


# iterate over files in that directory
def files_by_year(year, folder_path):
    plr_list_by_year = []
    files = os.listdir(folder_path)
    for file in files:
        if len(file) == 11:
            if str(year)[-2:] == str(file)[0:2]:
                plr_list_by_year.append(file)
        if len(file) == 13 and "_" in file:
            if str(year)[-2:] == str(file)[0:2]:
                plr_list_by_year.append(file)
        if len(file) == 13:
            if str(year) == str(file)[0:4]:
                plr_list_by_year.append(file)
        if len(file) == 15 and "_" in file:
            if str(year) == str(file)[0:4]:
                plr_list_by_year.append(file)
    # print(file_list_by_year)
    return plr_list_by_year

In [186]:
# function to iterate through multiple PLRs and get classification
def get_plr_classification(year, folder_path):
    plr_classification_list = []
    plr_list_by_year = files_by_year(year, folder_path)
    for plr in plr_list_by_year:
        print(plr)
        plr_classification_dict = {}
        plr_filepath = os.path.join(folder_path, plr)
        pdf_to_convert = fitz.open(plr_filepath)
        plr_text = ""
        for page in pdf_to_convert:
            text = page.get_text()
            plr_text += text
        # get classification
        prompt = f"""
        Your task is to classify letter rulings as adverse or non-adverse by using
        knowledge and context from the literature provided to you below, delimited
        by triple dollar signs.

        Literature: $$${letterRuling_context}$$$

        Below is the letter ruling, delimited by triple backticks, which has to be classified as Adverse or Non Adverse.

        Letter Ruling: ```{plr_text}```

        Provide your output as one of the two values: Adverse or Non-Adverse.
        """

        response = get_completion(prompt)
        plr_classification_dict = {int(plr.split(".")[0]): response}
        plr_classification_list.append(plr_classification_dict)
    return plr_classification_list

Read and clean Tagged PLRs csv

In [139]:
tagged_plrs = pd.read_excel(
    "/Users/st414/Documents/PLR/tagged_plrs.xlsx",
    sheet_name="Sheet1",
    converters={
        "Adverse, Discretionary": int,
        "Partially Adverse": int,
        "Not Adverse": int,
        "Mandatory": int,
        "Employee not IC": int,
        "Revocation": int,
    },
)

In [140]:
# Reshaping the DataFrame using melt
tagged_plrs_melted = pd.melt(
    tagged_plrs, var_name="classification", value_name="plr_number"
)

# Remove rows with NA values
tagged_plrs_melted = tagged_plrs_melted.dropna(subset=["plr_number"])

In [143]:
# Create a column for adverse, non-adverse
# Creating a new column 'Tag' based on a condition
test_set = tagged_plrs_melted.copy()
test_set['tag'] = 'Other' # Initialize the 'Tag' column with 'Other' as the default value

# Using DataFrame.loc[] to set values based on the condition
test_set.loc[test_set['classification'] == 'Adverse, Discretionary', 'tag'] = 'Adverse'
test_set.loc[test_set['classification'] == 'Not Adverse', 'tag'] = 'Non-Adverse'
# tagged_plrs_melted.loc[tagged_plrs_melted['Classification'] == 'Partially Adverse', 'Tag'] = 'Partially-Adverse'

# Displaying the modified dataframe
test_set.sample(10)


Unnamed: 0,classification,plr_number,tag
1795,Not Adverse,200836032,Non-Adverse
2205,Mandatory,201738012,Other
1965,Not Adverse,201434030,Non-Adverse
1536,Not Adverse,200237026,Non-Adverse
1440,Not Adverse,200107018,Non-Adverse
1753,Not Adverse,200718038,Non-Adverse
2042,Mandatory,200546017,Other
1458,Not Adverse,200121023,Non-Adverse
674,Partially Adverse,200125072,Other
1769,Not Adverse,200801006,Non-Adverse


In [146]:
# Remove other tags from Tagged dataset and keep only adverse and non-adverse
reference_set = test_set[test_set['tag'] != "Other"]
reference_set.sample(10)

Unnamed: 0,classification,plr_number,tag
1651,Not Adverse,200514003,Non-Adverse
1821,Not Adverse,200919020,Non-Adverse
1436,Not Adverse,200101035,Non-Adverse
1701,Not Adverse,200627012,Non-Adverse
1624,Not Adverse,200427030,Non-Adverse
1989,Not Adverse,201951013,Non-Adverse
1913,Not Adverse,201242019,Non-Adverse
1754,Not Adverse,200722007,Non-Adverse
1801,Not Adverse,200841028,Non-Adverse
1413,Not Adverse,200032044,Non-Adverse


Isolate tagged PLRs from Elisa's PLR library for training and testing

In [100]:
# remove all the pdfs with _ or - in them
# Function to check if a filename contains "_" or "-"
def has_underscore_or_dash(filename):
    return "_" in filename or "-" in filename

# Source and destination directories
source_folder = "/Users/st414/Documents/PLR/elisa_plrs/files_definite_plr"
destination_folder = "/Users/st414/Documents/PLR/elisa_plrs/duplicates_and_errors"

# Create the destination folder if it doesn't exist
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

# Iterate through files in the source folder
for filename in os.listdir(source_folder):
    if filename.endswith(".pdf"):  # Check if it's a PDF file
        source_file_path = os.path.join(source_folder, filename)
        if has_underscore_or_dash(filename):
            # Move the file to the destination folder
            destination_file_path = os.path.join(destination_folder, filename)
            shutil.move(source_file_path, destination_file_path)
            print(f"Moved {filename} to {destination_folder}")

We're left with 28,135 PLRs after removing duplicates and error files.

Make train and test set

In [102]:
# Path to your source folder containing files
source_folder = "/Users/st414/Documents/PLR/elisa_plrs/files_definite_plr"

# Path to your destination folder where you want to copy the files
destination_folder_train = "/Users/st414/Documents/PLR/elisa_plrs/train_set"

# Iterate through the file numbers in the DataFrame
for file_number in tagged['PLR_Number']:
    source_file_path_org = os.path.join(source_folder, str(file_number) + ".pdf")
    # print(source_file_path_org)
    source_file_path_ = os.path.join(source_folder, str(file_number)[2:] + ".pdf")
    # print(source_file_path_)

    # Check if the file exists in the source folder
    if os.path.exists(source_file_path_org):
        # Copy the file to the destination folder
        destination_file_path = os.path.join(destination_folder_train, str(file_number) + ".pdf")
        shutil.copy(source_file_path_org, destination_file_path)
        print(f"File '{file_number}' copied to '{destination_folder_train}'")

    elif os.path.exists(source_file_path_):
        destination_file_path = os.path.join(destination_folder_train, str(file_number) + ".pdf")
        shutil.copy(source_file_path_, destination_file_path)
        print(f"File '{file_number}' copied to '{destination_folder_train}'")
print("File copying completed.")

File '199906015' copied to '/Users/st414/Documents/PLR/elisa_plrs/train_set'
File '199922020' copied to '/Users/st414/Documents/PLR/elisa_plrs/train_set'
File '199941012' copied to '/Users/st414/Documents/PLR/elisa_plrs/train_set'
File '200006016' copied to '/Users/st414/Documents/PLR/elisa_plrs/train_set'
File '200026020' copied to '/Users/st414/Documents/PLR/elisa_plrs/train_set'
File '200027028' copied to '/Users/st414/Documents/PLR/elisa_plrs/train_set'
File '200029018' copied to '/Users/st414/Documents/PLR/elisa_plrs/train_set'
File '200041023' copied to '/Users/st414/Documents/PLR/elisa_plrs/train_set'
File '200046021' copied to '/Users/st414/Documents/PLR/elisa_plrs/train_set'
File '200102015' copied to '/Users/st414/Documents/PLR/elisa_plrs/train_set'
File '200127027' copied to '/Users/st414/Documents/PLR/elisa_plrs/train_set'
File '200129021' copied to '/Users/st414/Documents/PLR/elisa_plrs/train_set'
File '200203034' copied to '/Users/st414/Documents/PLR/elisa_plrs/train_set'

Function to calculate accuracy, recall

In [104]:
x = get_plr_classification(2023, destination_folder_train)

In [131]:
def list_to_df(list_of_dicts):
    keys = []
    values = []

    for dict in list_of_dicts:
        for key, value in dict.items():
            keys.append(key)
            values.append(value)
    # create a dataframe
    df = pd.DataFrame({'plr_number': keys, 'tag': values})
    return df

In [187]:
def calculate_metrics(list_of_dicts, reference_set):
    # Convert list of dicts to DataFrame
    list_df = list_to_df(list_of_dicts)

    # Merge the DataFrame and list_df based on the PLR number
    merged_df = pd.merge(reference_set, list_df, on='plr_number', suffixes=('_ref', '_list'))
    # print(merged_df)

    # Calculate accuracy
    accuracy = sum(merged_df['tag_ref'] == merged_df['tag_list']) / len(merged_df)

    # Calculate recall
    true_positives = sum((merged_df['tag_ref'] == 'Adverse') & (merged_df['tag_list'] == 'Adverse'))
    false_negatives = sum((merged_df['tag_ref'] == 'Adverse') & (merged_df['tag_list'] != 'Adverse'))
    recall = true_positives / (true_positives + false_negatives)

    # Calculate precision
    false_positives = sum((merged_df['tag_ref'] != 'Adverse') & (merged_df['tag_list'] == 'Adverse'))
    precision = true_positives / (true_positives + false_positives)

    print(f"Accuracy: {accuracy*100:.2f}%")
    print(f"Recall: {recall*100:.2f}%")
    print(f"Precision: {precision*100:.2f}%")

In [156]:
calculate_metrics(x, reference_set)

           classification plr_number      tag_ref     tag_list
0  Adverse, Discretionary  202311001      Adverse      Adverse
1  Adverse, Discretionary  202319009      Adverse      Adverse
2             Not Adverse  202303011  Non-Adverse  Non-Adverse
3             Not Adverse  202321001  Non-Adverse  Non-Adverse


(1.0, 1.0, 1.0)

In [184]:
# iterate through multiple years
def iterate_multiple_years(years):
    years_list = []
    for y in years:
        years_list.extend(get_plr_classification(y, folder_path))
    return years_list

    

In [188]:
years = list(range(2015,2022))
folder_path = "/Users/st414/Documents/PLR/elisa_plrs/train_set"
l=iterate_multiple_years(years)

201552032.pdf
201521009.pdf
201528002.pdf
201551006.pdf
201543004.pdf
201503001.pdf
201532026.pdf
201549019.pdf
201538027.pdf


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo-1106 in organization org-iSAPcQOGLvWwrnZj238M19jr on tokens per min (TPM): Limit 40000, Used 25995, Requested 35734. Please try again in 32.593s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [None]:
calculate_metrics(l, reference_set)

           classification plr_number      tag_ref     tag_list
0  Adverse, Discretionary  202219006      Adverse      Adverse
1  Adverse, Discretionary  202219007      Adverse      Adverse
2  Adverse, Discretionary  202219008      Adverse      Adverse
3  Adverse, Discretionary  202311001      Adverse      Adverse
4  Adverse, Discretionary  202319009      Adverse      Adverse
5             Not Adverse  202206010  Non-Adverse      Adverse
6             Not Adverse  202303011  Non-Adverse  Non-Adverse
7             Not Adverse  202321001  Non-Adverse  Non-Adverse
Accuracy: 87.50%
Recall: 100.00%
Precision: 83.33%
