In [1]:
import os
import openai
import tiktoken
from dotenv import load_dotenv, find_dotenv
import tiktoken
import fitz
import shutil, random, os
from pathlib import Path
import pandas as pd

# Load variables from the .env file located by find_dotenv() into the environment.
_ = load_dotenv(find_dotenv())  # read local .env file

# Indexing os.environ raises KeyError when OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [3]:
# Module-level OpenAI client; get_completion sends its chat requests through it.
client = openai.OpenAI()


def get_completion(prompt, model="gpt-3.5-turbo-1106"):
    """Send ``prompt`` as a single user message and return the reply text.

    The request goes through the module-level ``client`` with
    ``temperature=0``; the content of the first returned choice is returned.
    """
    chat = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    first_choice = chat.choices[0]
    return first_choice.message.content

In [4]:
# convert letter ruling context PDF to text
# The context manager closes the PDF handle once every page has been read.
with fitz.open("/Users/st414/Documents/PLR/plr_literature.pdf") as pdf_to_convert:
    # Join the page texts in one pass instead of growing a string page by page.
    letterRuling_context = "".join(page.get_text() for page in pdf_to_convert)

### Function to iterate through multiple PDFs

In [5]:
# function to get a specific year's PDFs
# assign directory
# directory = "/Users/st414/Documents/PLR/sample_plrs"


# iterate over files in that directory
def files_by_year(year, folder_path):
    """Return the file names in ``folder_path`` whose name prefix matches ``year``.

    A file name is selected when either rule holds:

    * 2-digit rule: the name is 11 characters long, or 13 characters long and
      contains "_", and its first two characters equal the last two digits of
      ``year``.
    * 4-digit rule: the name is 13 characters long, or 15 characters long and
      contains "_", and its first four characters equal ``year``.

    Each name is returned at most once, even when both rules match it (for
    example a 13-character name containing "_" checked against year 2020).

    Args:
        year: the year to select; converted with ``str``.
        folder_path: directory that is listed with ``os.listdir``.

    Returns:
        list of matching file names, in ``os.listdir`` order.
    """
    year_text = str(year)
    short_year = year_text[-2:]

    plr_list_by_year = []
    for file in os.listdir(folder_path):
        name = str(file)
        name_length = len(name)
        has_suffix = "_" in name

        two_digit_layout = name_length == 11 or (name_length == 13 and has_suffix)
        four_digit_layout = name_length == 13 or (name_length == 15 and has_suffix)

        matches_short = two_digit_layout and name[0:2] == short_year
        matches_long = four_digit_layout and name[0:4] == year_text

        # A single append per file: the two rules overlap for 13-character
        # names containing "_", which previously produced duplicate entries.
        if matches_short or matches_long:
            plr_list_by_year.append(file)
    return plr_list_by_year

In [248]:
# # function to iterate through multiple PLRs and get classification
# def get_plr_classification(year, folder_path):
#     plr_classification_list = []
#     plr_list_by_year = files_by_year(year, folder_path)
#     for plr in plr_list_by_year:
#         print(plr)
#         plr_classification_dict = {}
#         plr_filepath = os.path.join(folder_path, plr)
#         pdf_to_convert = fitz.open(plr_filepath)
#         plr_text = ""
#         for page in pdf_to_convert:
#             text = page.get_text()
#             plr_text += text
#         # get classification
#         prompt = f"""
#         Your task is to classify letter rulings as adverse or non-adverse by using
#         knowledge and context from the literature provided to you below, delimited
#         by triple dollar signs.

#         Literature: $$${letterRuling_context}$$$

#         Below is the letter ruling, delimited by triple backticks, which has to be classified as Adverse or Non Adverse.

#         Letter Ruling: ```{plr_text}```

#         Provide your output as one of the two values: Adverse or Non-Adverse.
#         """

#         response = get_completion(prompt)
#         plr_classification_dict = {int(plr.split(".")[0]): response}
#         plr_classification_list.append(plr_classification_dict)
#     return plr_classification_list

In [6]:
# function to iterate through multiple PLRs and get classification
# Tokenizer looked up for the chat model; get_plr_classification uses it to count a ruling's tokens.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo-1106")

def get_plr_classification(year, folder_path, max_tokens=13000):
    """Classify each PLR PDF of ``year`` in ``folder_path`` via ``get_completion``.

    For every file returned by ``files_by_year`` the PDF text is extracted,
    embedded in a prompt together with ``letterRuling_context`` and sent to
    ``get_completion``.

    Args:
        year: year whose PLRs are classified (passed to ``files_by_year``).
        folder_path: directory containing the PLR PDFs.
        max_tokens: rulings whose text encodes to more than this many tokens
            (according to the module-level ``encoding``) are skipped and do
            not appear in the result.

    Returns:
        list of single-entry dicts ``{plr_number: response}``, where
        ``plr_number`` is ``int`` of the file name up to the first "." and
        ``response`` is the text returned by ``get_completion``.
    """
    plr_classification_list = []
    for plr in files_by_year(year, folder_path):
        plr_filepath = os.path.join(folder_path, plr)
        # The context manager closes each PDF as soon as its text is extracted,
        # so handles do not pile up while looping over many files.
        with fitz.open(plr_filepath) as pdf_to_convert:
            plr_text = "".join(page.get_text() for page in pdf_to_convert)

        # Skip over-long rulings entirely. Falling through here would either
        # fail on an undefined prompt or reuse the previous ruling's prompt and
        # store its answer under this ruling's number.
        if len(encoding.encode(plr_text)) > max_tokens:
            continue

        print(plr)
        prompt = f"""
        Your task is to classify letter rulings as adverse or non-adverse by using
        knowledge and context from the literature provided to you below, delimited
        by triple dollar signs.

        Literature: $$${letterRuling_context}$$$

        Below is the letter ruling, delimited by triple backticks, which has to be classified as Adverse or Non Adverse.

        Letter Ruling: ```{plr_text}```

        Provide your output as one of the two values: Adverse or Non-Adverse.
        """

        response = get_completion(prompt)
        plr_classification_list.append({int(plr.split(".")[0]): response})
    return plr_classification_list

### Read and clean the tagged PLRs spreadsheet

In [7]:
# Load the tagged PLR workbook; each classification column is converted with int.
tagged_plrs = pd.read_excel(
    "/Users/st414/Documents/PLR/tagged_plrs.xlsx",
    sheet_name="Sheet1",
    converters=dict.fromkeys(
        (
            "Adverse, Discretionary",
            "Partially Adverse",
            "Not Adverse",
            "Mandatory",
            "Employee not IC",
            "Revocation",
        ),
        int,
    ),
)

In [8]:
# Go from one column per classification to one row per
# (classification, plr_number) pair, then discard rows with no plr_number.
tagged_plrs_melted = tagged_plrs.melt(
    var_name="classification", value_name="plr_number"
).dropna(subset=["plr_number"])

In [9]:
# Derive a 'tag' column from 'classification':
#   'Adverse, Discretionary' -> 'Adverse'
#   'Not Adverse'            -> 'Non-Adverse'
#   anything else            -> 'Other' (this includes 'Partially Adverse')
test_set = tagged_plrs_melted.copy()
test_set['tag'] = (
    test_set['classification']
    .map({'Adverse, Discretionary': 'Adverse', 'Not Adverse': 'Non-Adverse'})
    .fillna('Other')  # classifications without a mapping fall back to 'Other'
)

# Show a random sample of the tagged rows
test_set.sample(10)


Unnamed: 0,classification,plr_number,tag
1714,Not Adverse,200644027,Non-Adverse
1391,Not Adverse,200014009,Non-Adverse
1796,Not Adverse,200837002,Non-Adverse
34,"Adverse, Discretionary",200804004,Adverse
1786,Not Adverse,200825052,Non-Adverse
2242,Mandatory,202036007,Other
1370,Not Adverse,199940034,Non-Adverse
2202,Mandatory,201718040,Other
1985,Not Adverse,201940010,Non-Adverse
1540,Not Adverse,200241045,Non-Adverse


In [10]:
# Keep only the rows tagged Adverse or Non-Adverse; rows tagged "Other" are dropped.
reference_set = test_set.loc[test_set['tag'].ne("Other")]
reference_set.sample(10)

Unnamed: 0,classification,plr_number,tag
1932,Not Adverse,201330006,Non-Adverse
1952,Not Adverse,201415015,Non-Adverse
1841,Not Adverse,200953030,Non-Adverse
1626,Not Adverse,200431009,Non-Adverse
1705,Not Adverse,200631025,Non-Adverse
1536,Not Adverse,200237026,Non-Adverse
1636,Not Adverse,200442003,Non-Adverse
1849,Not Adverse,201019028,Non-Adverse
1910,Not Adverse,201230033,Non-Adverse
1751,Not Adverse,200718018,Non-Adverse


### Isolate tagged PLRs from Elisa's PLR library for training and testing

In [100]:
# remove all the pdfs with _ or - in them
# Function to check if a filename contains "_" or "-"
def has_underscore_or_dash(filename):
    """Return True when ``filename`` contains an underscore or a hyphen."""
    return any(marker in filename for marker in ("_", "-"))

# Folder to clean up, and the folder that receives the PDFs taken out of it
source_folder = "/Users/st414/Documents/PLR/elisa_plrs/files_definite_plr"
destination_folder = "/Users/st414/Documents/PLR/elisa_plrs/duplicates_and_errors"

# Make sure the receiving folder is there before anything is moved
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

# Move every PDF whose name contains "_" or "-" out of the source folder
for filename in os.listdir(source_folder):
    if not filename.endswith(".pdf"):
        continue  # only PDF files are considered
    source_file_path = os.path.join(source_folder, filename)
    if not has_underscore_or_dash(filename):
        continue  # plain names stay where they are
    destination_file_path = os.path.join(destination_folder, filename)
    shutil.move(source_file_path, destination_file_path)
    print(f"Moved {filename} to {destination_folder}")

We're left with 28,135 PLRs after removing duplicates and error files.

### Make train and test set

In [None]:
# Path to your source folder containing files
source_folder = "/Users/st414/Documents/PLR/elisa_plrs/files_definite_plr"

# Path to your destination folder where you want to copy the files
destination_folder_train = "/Users/st414/Documents/PLR/elisa_plrs/train_set"

# shutil.copy fails when the target folder is missing, so create it up front.
os.makedirs(destination_folder_train, exist_ok=True)

# Iterate through the PLR numbers of the reference set. The column is named
# "plr_number" (the value_name given to pd.melt); "PLR_Number" raises KeyError.
for file_number in reference_set["plr_number"]:
    # A PLR may be stored under its full number or under the number without
    # its first two characters; the full name is tried first.
    candidate_paths = (
        os.path.join(source_folder, str(file_number) + ".pdf"),
        os.path.join(source_folder, str(file_number)[2:] + ".pdf"),
    )
    source_file_path = next(
        (path for path in candidate_paths if os.path.exists(path)), None
    )
    if source_file_path is None:
        continue  # not present in the source folder; nothing to copy

    # The copy is always named after the full PLR number.
    destination_file_path = os.path.join(destination_folder_train, str(file_number) + ".pdf")
    shutil.copy(source_file_path, destination_file_path)
    print(f"File '{file_number}' copied to '{destination_folder_train}'")
print("File copying completed.")

### Functions to calculate accuracy, recall, and precision

In [11]:
def list_to_df(list_of_dicts):
    """Flatten a list of dicts into a two-column DataFrame.

    Every key/value pair of every dict becomes one row, with the key in
    column 'plr_number' and the value in column 'tag'. Rows follow the order
    of the list and, within a dict, the order of its items.
    """
    pairs = [
        pair
        for mapping in list_of_dicts
        for pair in mapping.items()
    ]
    numbers = [number for number, _ in pairs]
    tags = [tag for _, tag in pairs]
    return pd.DataFrame({'plr_number': numbers, 'tag': tags})

In [16]:
def calculate_metrics(list_of_dicts, reference_set):
    """Print accuracy, recall and precision of predicted tags against reference tags.

    The predictions are converted with ``list_to_df`` and inner-joined to
    ``reference_set`` on 'plr_number'; only PLRs present in both are scored.
    'Adverse' is the positive class for recall and precision.

    Args:
        list_of_dicts: list of ``{plr_number: predicted_tag}`` dicts.
        reference_set: DataFrame with 'plr_number' and 'tag' columns.

    A metric whose denominator is zero (no merged rows, no reference
    'Adverse' rows, or no predicted 'Adverse' rows) is printed as ``nan``
    instead of raising ZeroDivisionError.
    """

    def ratio(numerator, denominator):
        # Undefined metrics are reported as NaN rather than crashing the cell.
        return numerator / denominator if denominator else float("nan")

    # Convert list of dicts to DataFrame
    list_df = list_to_df(list_of_dicts)

    # Merge the DataFrame and list_df based on the PLR number
    merged_df = pd.merge(reference_set, list_df, on='plr_number', suffixes=('_ref', '_list'))
    #print(merged_df.loc[(merged_df['tag_ref'] == 'Adverse') & (merged_df['tag_list'] == 'Non-Adverse')])

    ref_adverse = merged_df['tag_ref'] == 'Adverse'
    pred_adverse = merged_df['tag_list'] == 'Adverse'

    # Vectorised counts, converted to plain ints so the ratios are Python floats.
    correct = int((merged_df['tag_ref'] == merged_df['tag_list']).sum())
    true_positives = int((ref_adverse & pred_adverse).sum())
    false_negatives = int((ref_adverse & ~pred_adverse).sum())
    false_positives = int((~ref_adverse & pred_adverse).sum())

    accuracy = ratio(correct, len(merged_df))
    recall = ratio(true_positives, true_positives + false_negatives)
    precision = ratio(true_positives, true_positives + false_positives)

    print(f"Accuracy: {accuracy*100:.2f}%")
    print(f"Recall: {recall*100:.2f}%")
    print(f"Precision: {precision*100:.2f}%")

In [13]:
# iterate through multiple years
def iterate_multiple_years(years, folder_path):
    """Run ``get_plr_classification`` for each year and return one flat list.

    The per-year result lists are concatenated in the order of ``years``.
    """
    return [
        classification
        for y in years
        for classification in get_plr_classification(y, folder_path)
    ]

    

#### Calculate metrics from 2015 to 2021

In [95]:
# Classify the train-set PLRs for each year from 2015 through 2021.
years = list(range(2015,2022))
folder_path = "/Users/st414/Documents/PLR/elisa_plrs/train_set"
# `l` collects the {plr_number: model response} dicts returned for all years.
l=iterate_multiple_years(years, folder_path)

201552032.pdf
201521009.pdf
201528002.pdf
201551006.pdf
201543004.pdf
201503001.pdf
201532026.pdf
201549019.pdf
201538026.pdf
201625001.pdf
201623001.pdf
201619007.pdf
201610006.pdf
201633021.pdf
201628005.pdf
201628004.pdf
201628006.pdf
201622007.pdf
201616002.pdf
201748005.pdf
201740016.pdf
201740005.pdf
201722014.pdf
201751011.pdf
201706006.pdf
201741012.pdf
201722010.pdf
201816004.pdf
201828010.pdf
201815005.pdf
201819006.pdf
201825003.pdf
201811002.pdf
201825006.pdf
201951001.pdf
201926006.pdf
201926007.pdf
201927005.pdf
201927012.pdf
201943020.pdf
202005020.pdf
202016001.pdf
202014004.pdf
202022005.pdf
202014005.pdf
202014001.pdf
202014002.pdf
202014003.pdf
202138001.pdf
202114001.pdf
202125007.pdf
202144005.pdf
202118021.pdf


In [116]:
# Score the classifications held in `l` against the tagged reference set.
calculate_metrics(l, reference_set)

            classification plr_number  tag_ref     tag_list
5   Adverse, Discretionary  201610006  Adverse  Non-Adverse
8   Adverse, Discretionary  201622007  Adverse  Non-Adverse
17  Adverse, Discretionary  201741012  Adverse  Non-Adverse
24  Adverse, Discretionary  201943020  Adverse  Non-Adverse
Accuracy: 77.78%
Recall: 87.50%
Precision: 82.35%


#### Calculate metrics from 2000 to 2022

In [None]:
# Repeat the run for each year from 2000 through 2022, then score it.
years = list(range(2000,2023))
folder_path = "/Users/st414/Documents/PLR/elisa_plrs/train_set"
# Overwrites the `l` produced by the 2015-2021 run.
l=iterate_multiple_years(years, folder_path)
calculate_metrics(l, reference_set)

In [18]:
# Same call as the last line of the previous cell; prints the metrics for the current `l`.
calculate_metrics(l, reference_set)

Accuracy: 90.64%
Recall: 87.50%
Precision: 60.00%
