In [12]:
import os
import openai
import tiktoken
from dotenv import load_dotenv, find_dotenv

# Load variables from the .env file located by find_dotenv(); the return value
# of load_dotenv is deliberately discarded.
_ = load_dotenv(find_dotenv())  # read local .env file

# Indexing os.environ raises KeyError right here if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [13]:
import tiktoken
import fitz
import shutil, random, os
from pathlib import Path
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [14]:
# Module-level OpenAI client; get_completion reads this global.
client = openai.OpenAI()


def get_completion(prompt, model="gpt-3.5-turbo-1106"):
    """Send ``prompt`` as a single user message and return the reply text.

    The request goes through the module-level ``client`` with ``temperature=0``.

    Args:
        prompt: Text placed in the ``content`` of the one user message.
        model: Model name forwarded to the chat completions endpoint.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    user_turn = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_turn],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

Function to iterate through multiple PDFs

In [15]:
# Directory of PLR PDF files (presumably the sample set, per the folder name).
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable base directory so the notebook runs on other machines.
directory = "/Users/st414/Documents/PLR/sample_plrs"


# iterate over files in that directory
def files_by_year(year, folder_path):
    """Return the file names in ``folder_path`` whose prefix matches ``year``.

    A name is selected when either of these holds:

    * it is 11 characters long, or 13 characters long and contains ``"_"``,
      and its first two characters equal the last two characters of
      ``str(year)``;
    * it is 13 characters long, or 15 characters long and contains ``"_"``,
      and its first four characters equal ``str(year)``.

    (Presumably these lengths correspond to 7- and 9-digit PLR numbers plus a
    ``.pdf`` extension, optionally followed by ``_<digit>`` — TODO confirm.)

    Each matching name is returned exactly once, even if it satisfies both
    rules, and the result is sorted so the order does not depend on the
    arbitrary order of ``os.listdir``.

    Args:
        year: Year to select; converted with ``str``.
        folder_path: Directory to list.

    Returns:
        Sorted list of matching file names (names only, not full paths).

    Raises:
        OSError: propagated from ``os.listdir`` (e.g. missing directory).
    """
    year_full = str(year)
    year_short = year_full[-2:]

    def matches_year(name):
        # One predicate instead of four independent appends, so a name that
        # satisfies both the 2-character and 4-character rule is kept once.
        size = len(name)
        has_suffix = "_" in name
        if (size == 11 or (size == 13 and has_suffix)) and name[:2] == year_short:
            return True
        if (size == 13 or (size == 15 and has_suffix)) and name[:4] == year_full:
            return True
        return False

    return sorted(name for name in os.listdir(folder_path) if matches_year(name))

In [16]:
# function to iterate through multiple PLRs and get classification
def get_plr_classification(year, folder_path):
    """Classify every PLR PDF for ``year`` found in ``folder_path``.

    For each file name returned by ``files_by_year(year, folder_path)`` the
    text of all pages is extracted, embedded in a prompt together with the
    module-level ``letterRuling_context``, and sent to ``get_completion``.

    ``letterRuling_context`` is not defined in this function; it must exist as
    a global before the function is called.

    Args:
        year: Year forwarded to ``files_by_year`` to select file names.
        folder_path: Directory that is listed and from which PDFs are opened.

    Returns:
        A list with one single-entry dict per file, mapping
        ``int(<file name up to the first ".">)`` to the model's response.

    Raises:
        ValueError: if the part of a file name before the first ``"."``
            cannot be parsed by ``int``.
    """
    plr_classification_list = []
    # List the same folder the PDFs are opened from; listing the global
    # `directory` here would ignore the `folder_path` argument.
    for plr in files_by_year(year, folder_path):
        plr_filepath = os.path.join(folder_path, plr)
        # The context manager closes the document once its text is extracted.
        with fitz.open(plr_filepath) as pdf_to_convert:
            plr_text = "".join(page.get_text() for page in pdf_to_convert)
        # get classification
        prompt = f"""
        Your task is to classify letter rulings as adverse or non-adverse by using
        knowledge and context from the literature provided to you below, delimited
        by triple dollar signs.

        Literature: $$${letterRuling_context}$$$

        Below is the letter ruling, delimited by triple backticks, which has to be classified as Adverse or Non Adverse.

        Letter Ruling: ```{plr_text}```

        Provide your output as one of the two values: Adverse or Non-Adverse.
        """

        response = get_completion(prompt)
        # NOTE(review): int() accepts "_" between digits, so a name such as
        # "200505023_1.pdf" yields the key 2005050231 — confirm this is intended.
        plr_classification_list.append({int(plr.split(".")[0]): response})
    return plr_classification_list

Read and clean the tagged PLRs Excel workbook

In [17]:
# Read Sheet1 of the tagged-PLR workbook, converting the values of every
# listed column with int.
tagged_plrs = pd.read_excel(
    "/Users/st414/Documents/PLR/tagged_plrs.xlsx",
    sheet_name="Sheet1",
    converters={
        column: int
        for column in (
            "Adverse, Discretionary",
            "Partially Adverse",
            "Not Adverse",
            "Mandatory",
            "Employee not IC",
            "Revocation",
        )
    },
)

In [18]:
# Wide -> long: each original column name becomes a "Classification" value and
# each cell a "PLR Number"; rows whose "PLR Number" is NA are then discarded.
tagged_plrs_melted = tagged_plrs.melt(
    var_name="Classification", value_name="PLR Number"
).dropna(subset=["PLR Number"])

In [26]:
# Display 20 randomly sampled rows of the reshaped table (no seed is set, so
# the rows differ between runs).
tagged_plrs_melted.sample(20)

Unnamed: 0,Classification,PLR Number
2023,Mandatory,200505023
2081,Mandatory,200945068
677,Partially Adverse,200125076
2042,Mandatory,200546017
1968,Not Adverse,201438006
1962,Not Adverse,201431002
1652,Not Adverse,200515017
1963,Not Adverse,201431038
1889,Not Adverse,201149017
1542,Not Adverse,200242046


In [28]:
# Derive a coarse 'Tag' label from 'Classification'.
# Classifications missing from this mapping are labelled 'Other'.
tag_by_classification = {
    'Adverse, Discretionary': 'Adverse',
    'Not Adverse': 'Non-Adverse',
    'Partially Adverse': 'Partially-Adverse',
}

# map() leaves unmapped classifications as NaN; fillna() turns those into the default
tagged_plrs_melted['Tag'] = (
    tagged_plrs_melted['Classification'].map(tag_by_classification).fillna('Other')
)

# Show a random sample of the labelled rows
tagged_plrs_melted.sample(10)


Unnamed: 0,Classification,PLR Number,Tag
1793,Not Adverse,200834013,Non-Adverse
669,Partially Adverse,200027018,Partially-Adverse
1946,Not Adverse,201408025,Non-Adverse
1488,Not Adverse,200152005,Non-Adverse
1627,Not Adverse,200433013,Non-Adverse
1797,Not Adverse,200840016,Non-Adverse
2056,Mandatory,200810034,Other
80,"Adverse, Discretionary",202125007,Adverse
2201,Mandatory,201714031,Other
1926,Not Adverse,201319004,Non-Adverse


Isolate tagged PLRs from Elisa's PLR library for testing

In [None]:
# remove all the pdfs with _ or - in them
# Function to check if a filename contains "_" or "-"
def has_underscore_or_dash(filename):
    """Return True when ``filename`` contains an underscore or a hyphen."""
    return any(marker in filename for marker in ("_", "-"))

# Source and destination directories
source_folder = "/Users/st414/Documents/PLR/elisa_plrs/files_definite_plr"
destination_folder = "/Users/st414/Documents/PLR/elisa_plrs/duplicates_and_errors"

# Create the destination folder if needed. exist_ok=True replaces the separate
# os.path.exists() check, which left a window between the check and the create.
os.makedirs(destination_folder, exist_ok=True)

# Move every PDF whose name contains "_" or "-" out of the source folder.
for filename in os.listdir(source_folder):
    if filename.endswith(".pdf") and has_underscore_or_dash(filename):
        source_file_path = os.path.join(source_folder, filename)
        destination_file_path = os.path.join(destination_folder, filename)
        shutil.move(source_file_path, destination_file_path)
        print(f"Moved {filename} to {destination_folder}")

Function to check accuracy, recall