## Functions

Function to get a text completion from the GPT API

In [1]:
def get_completion(prompt, model="gpt-3.5-turbo-1106"):
    """Send ``prompt`` as a single user message and return the reply text.

    The request goes through the module-level ``client`` object, which is
    not defined in this cell and must exist before the function is called.

    Args:
        prompt: Text placed in the ``content`` of the one user message.
        model: Model identifier forwarded to the chat-completions call.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    # temperature=0 is passed on every call, as in the original request.
    chat_response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    first_choice = chat_response.choices[0]
    return first_choice.message.content

Function to iterate through multiple PDFs

In [2]:
# function to get a specific year's PDFs
# assign directory
# directory = "/Users/st414/Documents/PLR/sample_plrs"


# iterate over files in that directory
def files_by_year(year, folder_path):
    """Return the names of the files in ``folder_path`` that belong to ``year``.

    A file is selected when its name fits one of four layouts, recognised
    only by the length of the name and the presence of an underscore:

    * 11 characters, first two characters equal to the last two of ``year``
    * 13 characters containing "_", first two characters equal to the last
      two of ``year``
    * 13 characters, first four characters equal to ``str(year)``
    * 15 characters containing "_", first four characters equal to
      ``str(year)``

    Each matching file is returned exactly once, even when its name fits
    more than one layout (e.g. ``2020_1234.pdf`` fits both 13-character
    layouts for year 2020).

    Args:
        year: Year to select; converted with ``str()`` before comparison.
        folder_path: Directory whose entries are listed.

    Returns:
        A list of matching file names (not full paths), sorted by name so
        that repeated runs process files in the same order.
    """
    year_full = str(year)
    year_short = year_full[-2:]

    def _matches(name):
        # Decide which prefix layouts this name length qualifies for.
        length = len(name)
        has_underscore = "_" in name
        short_layout = length == 11 or (length == 13 and has_underscore)
        long_layout = length == 13 or (length == 15 and has_underscore)
        return (short_layout and name[0:2] == year_short) or (
            long_layout and name[0:4] == year_full
        )

    # A single predicate per file guarantees no duplicate entries.
    return [name for name in sorted(os.listdir(folder_path)) if _matches(name)]

In [4]:
# function to iterate through multiple PLRs and get classification
# encoding = tiktoken.encoding_for_model("gpt-3.5-turbo-1106")

def get_plr_classification(year, folder_path, max_tokens=13000):
    """Classify every PLR PDF of ``year`` as Adverse or Non-Adverse.

    For each file returned by ``files_by_year`` the text of all pages is
    extracted, embedded in a prompt together with the module-level
    ``letterRuling_context``, and sent to ``get_completion``.

    PLRs whose text encodes to more than ``max_tokens`` tokens (measured
    with the module-level ``encoding``) are skipped entirely: no request is
    made and no entry is added to the result. Only the PLR text is counted;
    the literature context is not included in the measurement.

    Args:
        year: Year passed through to ``files_by_year``.
        folder_path: Directory containing the PLR PDF files.
        max_tokens: Largest allowed token count for a PLR's text.
            Defaults to 13000, the limit this function previously
            hard-coded.

    Returns:
        A list of single-entry dicts ``{plr_number: response}``, one per
        classified PLR. ``plr_number`` is ``int()`` of the file name up to
        the first "."; ``int()`` accepts single underscores between digits,
        so a name such as ``20_12345.pdf`` yields ``2012345``.
    """
    plr_classification_list = []
    for plr in files_by_year(year, folder_path):
        plr_filepath = os.path.join(folder_path, plr)

        # Close the document as soon as the text is read so handles are not
        # leaked while looping over many PDFs.
        with fitz.open(plr_filepath) as pdf_to_convert:
            plr_text = "".join(page.get_text() for page in pdf_to_convert)

        # Skip over-long PLRs. Falling through here would re-send the
        # previous PLR's prompt (or fail on the first file) and record a
        # classification that does not belong to this PLR.
        if len(encoding.encode(plr_text)) > max_tokens:
            continue

        print(plr)
        prompt = f"""
        Your task is to classify letter rulings as adverse or non-adverse by using
        knowledge and context from the literature provided to you below, delimited
        by triple dollar signs.

        Literature: $$${letterRuling_context}$$$

        Below is the letter ruling, delimited by triple backticks, which has to be classified as Adverse or Non Adverse.

        Letter Ruling: ```{plr_text}```

        Provide your output as one of the two values: Adverse or Non-Adverse.
        """

        response = get_completion(prompt)
        plr_classification_list.append({int(plr.split(".")[0]): response})
    return plr_classification_list

Functions to calculate accuracy, recall, and precision

In [None]:
def list_to_df(list_of_dicts):
    """Flatten a list of dicts into a two-column DataFrame.

    Every key/value pair of every dict becomes one row, in iteration order.

    Args:
        list_of_dicts: Iterable of dicts whose items are turned into rows.

    Returns:
        A DataFrame with the keys in column ``plr_number`` and the values
        in column ``tag``.
    """
    pairs = [
        (number, label)
        for entry in list_of_dicts
        for number, label in entry.items()
    ]
    return pd.DataFrame({
        'plr_number': [number for number, _ in pairs],
        'tag': [label for _, label in pairs],
    })

In [5]:
def calculate_metrics(list_of_dicts, reference_set):
    """Print accuracy, recall and precision of predicted tags.

    The predictions are converted with ``list_to_df`` and merged with
    ``reference_set`` on ``plr_number``. ``pd.merge`` defaults to an inner
    join, so only PLRs present on both sides are scored. "Adverse" is
    treated as the positive class.

    A metric whose denominator is zero (no merged rows, no reference
    "Adverse" rows, or no predicted "Adverse" rows) is reported as ``nan``
    instead of raising ``ZeroDivisionError``.

    Args:
        list_of_dicts: Predictions as accepted by ``list_to_df``.
        reference_set: DataFrame of reference labels. Assumed to have
            ``plr_number`` and ``tag`` columns, with ``plr_number`` of the
            same dtype as the prediction keys -- TODO confirm against the
            caller.

    Returns:
        None. The three metrics are printed as percentages.
    """
    # Convert list of dicts to DataFrame
    list_df = list_to_df(list_of_dicts)

    # Merge on the PLR number; the shared 'tag' column is split into
    # 'tag_ref' (reference) and 'tag_list' (prediction).
    merged_df = pd.merge(reference_set, list_df, on='plr_number', suffixes=('_ref', '_list'))

    def _ratio(numerator, denominator):
        # Guard against an empty merge or a class with no members.
        return numerator / denominator if denominator else float("nan")

    correct = int((merged_df['tag_ref'] == merged_df['tag_list']).sum())
    true_positives = int(((merged_df['tag_ref'] == 'Adverse') & (merged_df['tag_list'] == 'Adverse')).sum())
    false_negatives = int(((merged_df['tag_ref'] == 'Adverse') & (merged_df['tag_list'] != 'Adverse')).sum())
    false_positives = int(((merged_df['tag_ref'] != 'Adverse') & (merged_df['tag_list'] == 'Adverse')).sum())

    accuracy = _ratio(correct, len(merged_df))
    recall = _ratio(true_positives, true_positives + false_negatives)
    precision = _ratio(true_positives, true_positives + false_positives)

    print(f"Accuracy: {accuracy*100:.2f}%")
    print(f"Recall: {recall*100:.2f}%")
    print(f"Precision: {precision*100:.2f}%")

In [6]:
# iterate through multiple years
def iterate_multiple_years(years, folder_path):
    """Classify the PLRs of several years and return one combined list.

    Args:
        years: Iterable of years; each is passed to
            ``get_plr_classification`` in turn.
        folder_path: Directory forwarded unchanged to
            ``get_plr_classification``.

    Returns:
        A flat list holding the results for every year, in the order the
        years were given.
    """
    return [
        classification
        for single_year in years
        for classification in get_plr_classification(single_year, folder_path)
    ]

    

Convert a list of dict items to a pandas DataFrame and write it to the output folder as a CSV file

In [11]:
def list_to_excel(list, output_folder_path, output_filename):
    """Write a list of ``{plr_number: classification}`` dicts to a CSV file.

    Despite the function name, the file is written with
    ``DataFrame.to_csv``, so it is comma-separated text rather than an
    Excel workbook.

    Args:
        list: Iterable of dicts; every key/value pair becomes one row.
            (The parameter name shadows the builtin ``list`` and is kept
            only for compatibility with existing callers.)
        output_folder_path: Directory in which the file is created.
        output_filename: Name of the file to write.

    Returns:
        None.
    """
    # Imported locally so this cell does not rely on another cell's imports.
    import os

    # One row per key/value pair across all dicts.
    rows = [(key, value) for item in list for key, value in item.items()]
    df = pd.DataFrame(rows, columns=['PLR Number', 'Classification'])

    # os.path.join copes with trailing separators and with an empty folder
    # (plain '+' concatenation would then point at the filesystem root).
    df.to_csv(os.path.join(output_folder_path, output_filename), index=False)

Fine-tuning functions

In [None]:
# Function to extract text from pdf

def extract_pdf_text(pdf_file):
    """
    Extracts text from a PDF file.

    The text of every page is concatenated in page order with no separator
    added between pages. The document is closed before returning.

    Args:
        pdf_file: Path to the PDF file.
    Returns:
        Extracted text as a string.
    """
    # The context manager closes the document even if a page fails to
    # extract, so file handles are not leaked across many PDFs.
    with fitz.open(pdf_file) as pdf_to_convert:
        return "".join(page.get_text() for page in pdf_to_convert)

In [None]:
# Function to make pandas dataframe for plrs

def text_to_df(pdf_folder=None):
    """Build a DataFrame holding the extracted text of every PDF in a folder.

    Args:
        pdf_folder: Directory to scan. When omitted, the module-level
            ``train_folder`` is used, which matches the behaviour of
            calling this function with no arguments.

    Returns:
        A DataFrame with one row per PDF and two columns: ``plr_number``
        (the file name, including its extension) and ``text`` (the result
        of ``extract_pdf_text``). The columns are present even when the
        folder contains no PDFs.
    """
    if pdf_folder is None:
        # Backward-compatible default: resolved at call time, so the global
        # only needs to exist when no folder is passed.
        pdf_folder = train_folder

    data = []

    # Sorted so rows come out in the same order on every run; os.listdir
    # gives no ordering guarantee.
    for filename in sorted(os.listdir(pdf_folder)):
        # Case-insensitive so files named *.PDF are not silently skipped.
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            extracted_text = extract_pdf_text(pdf_path)
            data.append({"plr_number": filename, "text": extracted_text})

    # Build the frame once from the collected records.
    return pd.DataFrame(data, columns=["plr_number", "text"])