In [26]:
# "Recurrent Tumor (R)" A&P 50%/A 50%

# Recurrence: A tumor is considered recurrent only if occuring in a prior Mohs or excision site/scar line, previous C&F procedure does not qualify

import os
import pandas as pd
import requests
import base64

#using azure
API_KEY = "2DWhd5NlWo6U8lpUBSWgyCuCL1bgNUAlQyv4sF2Oq6TsNxszq97NJQQJ99AKACHrzpqXJ3w3AAABACOGzNKu"
ENDPOINT = "https://rg-01-hippa-standalone-openai.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-15-preview"

def call_gpt(query):
    headers = {
        "Content-Type": "application/json",
        "api-key": API_KEY,
    }
    
    system_context = """
    You are a medical assistant trained to identify whether a tumor is recurrent based on Mohs patient notes.
    
    Your task is to read the patient note and determine if it indicates the presence of a recurrent tumor.
    
    ONLY respond with 'Yes' or 'No'.
    
    If recurrence is not explicitly mentioned, analyze clinical wording and context to make a best-guess classification.
    
    Recurrence Context:
    - A tumor is considered recurrent only if it is occurring at the site of a prior Mohs surgery or excision (e.g., scar line, previously treated area).
    - Tumors at sites previously treated with Curettage and Fulguration (C&F) do NOT qualify as recurrent.
    """
    
    # Payload for the request
    payload = {
    "messages" : [
            {"role": "system", "content": system_context},
            {"role": "user", "content": f"{query}"},
        ],
      "temperature": 0.7,
      "top_p": 0.95,
      "max_tokens": 800
    }
    
    
    # Send request
    try:
        response = requests.post(ENDPOINT, headers=headers, json=payload)
        response.raise_for_status()  # Will raise an HTTPError if the HTTP request returned an unsuccessful status code
    except requests.RequestException as e:
        print("Response Content:", response.text)
        raise SystemExit(f"Failed to make the request. Error: {e}")

    # Handle the response as needed (e.g., print or process)
    #print(response.json())
    #return response.json().choices[0].message.content
    return response.json()["choices"][0]["message"]["content"]

In [27]:
# Use full Excel sheet
import pandas as pd
Cat = pd.read_excel(r'../De-Identified Mohs Full Data.xlsx',sheet_name='Identified Data',header=0) # Only Size at Greatest Dimension column
Cat.head()

Unnamed: 0,Randomized Study Number,Size at Greatest Dimension (S) ***Be sure this is from the note and not the Mohs episode!,Recurrent Tumor (R),"Location - Nose, Eyelid, Nail, Lip, Genitalia, or Acral (L)",Aggressive Tumor Pathology Type (AT),DFSP (D),Coordinated repair with ENT/oculoplastics (C),Additional C&F,Additional Excision,Multiple Sites,Business Days from Referral to First Appt,Physical Exam (PE),Assessment & Plan (A&P),Addendum (A)
0,36,0.8,Y,N,N,N,N,N,N,N,32.0,"Physical exam:\nGeneral: Well developed, well ...",1. Chronic actinic damage in the setting of hi...,Addendum 7/11/2023\n \nThe pathology results w...
1,240,0.6,Y,Y,N,N,N,N,N,N,40.0,Physical exam:\nGeneral: Well-dressed; well-no...,Procedure Orders\nBiopsy [486623795] ordered b...,Addendum 7/11/2023\nThe pathology results were...
2,32,0.3,N,N,N,N,N,N,N,N,4.0,Physical exam:\nGeneral: Well-dressed; well-no...,Assessment & Plan: \n1. Brown papule on the le...,Addendum 7/13/2023\n \nThe pathology results w...
3,111,6.5,N,N,N,N,Y,N,N,N,35.0,Physical exam:\nVital signs: There were no vit...,Assessment & Plan: \nBiopsy-proven basal cell ...,
4,197,3.5,N,N,N,N,N,Y,Y,N,35.0,"Physical exam:\nGeneral: Well developed, well ...",Assessment & Plan: \n1. Chronic actinic sun d...,Addendum 7/5/2023\n \nThe pathology results we...


In [28]:
# Set options here
sample_size = len(Cat) # Change number to increase or decrease number of patients read from file
#sample_size = 10
multiple_columns = False # Set to True to use all input text columns, False to use only PE column

# Array of indexes containing patients with notes that have problems (Some may be ok, these are from size so a location could exist but not size)
skip_patients_array = [7,8,22,23,32,34,38,39,49,50,52,55,62,66,73,80,88,89,101,102,107,110,121,123,131,133,135,
                       136,137,140,150,155,157,164,170,184,187,193,200,206,210,214,227,228,229,234,252,261,265,279,287,292,293]
skip_patients = False # Set to True to skip bad patient notes, False to keep them in

In [29]:
# Takes the text in the PE, A&P, and A columns and asks ChatGPT to return "Yes" or "No" to the question
# May use only A column
import re

# Always split into 3 GPT calls, one for each column
def inputSplit(input_columns, que, index):
    while len(input_columns) < 3: # Some text columns may be empty so add a blank entry
        input_columns.append("")
        
    input_string = " ".join(map(str, input_columns)) # Apply str() to each index of input_columns then join with a space
    prompt = f"{que} \n\n {input_string}" # Create base prompt (question + input_columns)
    
    if(len(input_string) > 0):
        tmp_ans = [""] * len(input_columns)
        check_ans = [""] * len(input_columns)
        
        # Split into 3 GPT calls, one per column, and remove anything that isn't a "Yes" or "No" from the result
        for i in range(len(input_columns)):
            prompt = f"{que} \n\n {input_columns[i]}" # Recreate prompt with question + 1 column
            tmp_ans[i] = call_gpt(prompt) # Call GPT with prompt and save answer to array
            
            print(f"tmp_ans[{i}] = {str(tmp_ans[i])}")
            
            check_ans[i] = re.split(r'\s+|-', tmp_ans[i])  # Split by spaces or hyphens
            check_ans[i] = [re.sub(r'[^a-zA-Z]', '', j) for j in check_ans[i]]  # Remove non-alphabetic characters
            check_ans[i] = [j.lower() for j in check_ans[i]]  # Convert all to lowercase to handle variations

            check_ans[i] = [j for j in check_ans[i] if j in ['yes', 'no']] # Keep only "yes" or "no"
            
            print(f"check_ans[{i}] = {str(check_ans[i])}")
            
        # Check if 2 arrays are the same output, if so return one of them if it is not empty
        if(check_ans[0] == check_ans[1] or check_ans[0] == check_ans[2]):
            if check_ans[0]:
                return tmp_ans[0]
            elif check_ans[1]:
                return tmp_ans[1]
            elif check_ans[2]:
                return tmp_ans[2]
        elif(check_ans[1] == check_ans[2]):
            if check_ans[1]:
                return tmp_ans[1]
            elif check_ans[2]:
                return tmp_ans[2]

        # Count num of "yes" and "no"
        yes_count = [0] * len(input_columns)
        no_count = [0] * len(input_columns)
        
        for i in range(len(check_ans)):
            yes_count[i] = check_ans[i].count('yes')
            no_count[i] = check_ans[i].count('no')
            
        print(f"Yes counts: {yes_count}")
        print(f"No counts: {no_count}")
        
        # If no matches, return string that has the most "yes" or "no"
        if any(count > 0 for count in yes_count) or any(count > 0 for count in no_count): # Check count strings for any occurences or "yes" or "no"
            max_yes_no = [0] * len(input_columns)
            max_answer = 0
            max_i = -1
            
            # Check if string has more "yes" or "no"
            for i in range(len(input_columns)):
                max_yes_no[i] = max(yes_count[i], no_count[i])
                
            # Look for highest overall "yes" or "no" then return that string
            for i in range(len(input_columns)):
                if max_answer < max_yes_no[i]:
                    max_answer = max_yes_no[i]
                    max_i = i
            return tmp_ans[max_i]
        
        # All non "yes" or "no" answers, return empty string
        return ""
    else:
        # Somehow input_string is empty
        return "input_string empty"
    

true_results = [0 for i in range(sample_size)]
true_results_final = [0 for i in range(sample_size)]

output = [0 for i in range(sample_size)]
output_final = [0 for i in range(sample_size)]

for i in range(0, min(sample_size, len(Cat))):
    # If skip_patients is True and current index is in skip_patients_array, move to next index
    if skip_patients and i in skip_patients_array:
        continue
        
    true_results[i] = Cat.at[i, 'Recurrent Tumor (R)']
    que = 'Question: Does the following patient note indicate that any of the tumors are recurrent?'
    ans = ""
    
    if multiple_columns:
        input_columns = Cat.loc[i, ['Physical Exam (PE)', 'Assessment & Plan (A&P)', 'Addendum (A)']]
        ans = inputSplit(input_columns, que, i) # Result from GPT call
    elif not multiple_columns:
        input_columns = Cat.at[i, 'Addendum (A)']
        prompt = f"{que} \n\n {input_columns}"
        ans = call_gpt(prompt)
    
    print(f"Result[{i}] = {ans}")
    output[i] = ans

Result[0] = Yes
Result[1] = Yes
Result[2] = No
Result[3] = No
Result[4] = No
Result[5] = No
Result[6] = No
Result[7] = No
Result[8] = No
Result[9] = No
Result[10] = No
Result[11] = No
Result[12] = No
Result[13] = No
Result[14] = No
Result[15] = No
Result[16] = No
Result[17] = No
Result[18] = No
Result[19] = No
Result[20] = No
Result[21] = No
Result[22] = No
Result[23] = No
Result[24] = No
Result[25] = No
Result[26] = No
Result[27] = No
Result[28] = No
Result[29] = Yes
Result[30] = No
Result[31] = No
Result[32] = No
Result[33] = No
Result[34] = Yes
Result[35] = No
Result[36] = No
Result[37] = No
Result[38] = No
Result[39] = Yes
Result[40] = No
Result[41] = No
Result[42] = No
Result[43] = No
Result[44] = No
Result[45] = No
Result[46] = No
Result[47] = No
Result[48] = No
Result[49] = No
Result[50] = No
Result[51] = No
Result[52] = No
Result[53] = No
Result[54] = No
Result[55] = No
Result[56] = No
Result[57] = No
Result[58] = No
Result[59] = No
Result[60] = No
Result[61] = No
Result[62] = 

In [30]:
import re

# Go through every GPT response and turn it into "Y" or "N" to compare
for i in range(0, min(sample_size, len(Cat))):
    if skip_patients and i in skip_patients_array:
        continue
        
    yn = output[i]
    print(f"i={i} Answer: {yn}")
    yn_real = str(true_results[i])
    
    yn = re.split(r'\s+|-', yn)  # Split by spaces or hyphens
    yn = [re.sub(r'[^a-zA-Z]', '', j) for j in yn]  # Remove non-alphabetic characters
    yn = [j.lower() for j in yn]  # Convert all to lowercase to handle variations
    
    yn = [j for j in yn if j in ['yes', 'no']] # Keep only "yes" or "no"
    
    if len(yn) == 0:
        output_final[i] = "error_2"  # No valid response detected, bad prediction
    
    yes_count = yn.count('yes')
    no_count = yn.count('no')
    
    if yes_count > no_count:
        output_final[i] = "Y"
    elif no_count > yes_count:
        output_final[i] = "N"
        
    # Check if the given input is valid
    if yn_real.strip() and yn_real.lower() != 'nan':
        if 'y' in yn_real.lower():
            true_results_final[i] = "Y" # Some are input like N/Y or Y/N/N so look for Y and if exists compare with that
        else:
            true_results_final[i] = yn_real.strip()
    else:
        true_results_final[i] = "error_1" # No valid input provided in true_results, bad note
    
    print(f"Result[{i}]: ChatGPT Prediction = {output_final[i]} Answer Given = {true_results_final[i]}")

i=0 Answer: Yes
Result[0]: ChatGPT Prediction = Y Answer Given = Y
i=1 Answer: Yes
Result[1]: ChatGPT Prediction = Y Answer Given = Y
i=2 Answer: No
Result[2]: ChatGPT Prediction = N Answer Given = N
i=3 Answer: No
Result[3]: ChatGPT Prediction = N Answer Given = N
i=4 Answer: No
Result[4]: ChatGPT Prediction = N Answer Given = N
i=5 Answer: No
Result[5]: ChatGPT Prediction = N Answer Given = N
i=6 Answer: No
Result[6]: ChatGPT Prediction = N Answer Given = N
i=7 Answer: No
Result[7]: ChatGPT Prediction = N Answer Given = N
i=8 Answer: No
Result[8]: ChatGPT Prediction = N Answer Given = N
i=9 Answer: No
Result[9]: ChatGPT Prediction = N Answer Given = N
i=10 Answer: No
Result[10]: ChatGPT Prediction = N Answer Given = N
i=11 Answer: No
Result[11]: ChatGPT Prediction = N Answer Given = Y
i=12 Answer: No
Result[12]: ChatGPT Prediction = N Answer Given = N
i=13 Answer: No
Result[13]: ChatGPT Prediction = N Answer Given = N
i=14 Answer: No
Result[14]: ChatGPT Prediction = N Answer Given = 

In [31]:
total_size = 0
correct_predictions = 0
no_response_given = 0
no_alphabetical_result_returned = 0
bad_cases_combined = 0
nrg_not_narr = 0
narr_not_nrg = 0
# nrg = no valid response given in (L) column
# narr = no alphabetical result returned from ChatGPT output

for i in range(0, min((sample_size), len(Cat))):
    if(output_final[i] == "error_2" and true_results_final[i] == "error_1"):
        bad_cases_combined += 1
        
    if(output_final[i] == "error_2"):
        no_alphabetical_result_returned += 1
        if(true_results_final[i] != "error_1"):
            narr_not_nrg += 1
            
    if(true_results_final[i] == "error_1"):
        no_response_given += 1
        if(output_final[i] != "error_2"):
            nrg_not_narr += 1
    
    if(output_final[i] == true_results_final[i]):
        correct_predictions += 1
        
    total_size += 1
    
accuracy = correct_predictions / total_size
print("# of Correct Predictions = " + str(correct_predictions))
print("Sample Size = " + str(total_size))
print("Accuracy = " + "{:.2f}%".format(accuracy * 100))

# Accuracy when removing cases where the Location is not given (best to look at)
no_response_given_accuracy = correct_predictions / (total_size - no_response_given)
print()
print("Number of Notes without a Ground Truth = " + str(no_response_given))
print("Accuracy = " +"{:.2f}%".format(no_response_given_accuracy * 100))

# Accuracy when removing cases where GPT didn't return a valid alphabetical answer (not the best to look at, could be slightly lower)
no_alphabetical_result_returned_accuracy = correct_predictions / (total_size - no_alphabetical_result_returned)
print()
print("Number of PE/A&P/A Notes without a valid GPT answer = " + str(no_alphabetical_result_returned))
print("Accuracy = " + "{:.2f}%".format(no_alphabetical_result_returned_accuracy * 100))

# Accuracy when removing all above bad cases (ok to look at, could be slightly lower)
bad_cases_total_accuracy = correct_predictions / (total_size - (narr_not_nrg + nrg_not_narr + bad_cases_combined))
print()
print("Number of Combined Notes with issues = " + str(narr_not_nrg + nrg_not_narr + bad_cases_combined))
print("Accuracy = " + "{:.2f}%".format(bad_cases_total_accuracy * 100))     

# of Correct Predictions = 274
Sample Size = 294
Accuracy = 93.20%

Number of Notes without a Ground Truth = 3
Accuracy = 94.16%

Number of PE/A&P/A Notes without a valid GPT answer = 0
Accuracy = 93.20%

Number of Combined Notes with issues = 3
Accuracy = 94.16%


In [22]:
n_count = 0
for i in range(0, min((sample_size), len(Cat))):
    if output_final[i].strip() != true_results_final[i].strip() and true_results_final[i] != "error_1" and output[i] != "Yes":
        n_count += 1
        print(f"i = {i}: Unfiltered ChatGPT Prediction = {output[i]}\nUnfiltered Ground Truth = {true_results[i]}")
        print(f"i = {i}: ChatGPT Prediction = {output_final[i]} Ground Truth = {true_results_final[i]}")
        print()
        print("-"*40) # Spacer
print(f"n_count = {n_count}")
# 11,18,255,288 twice

i = 11: Unfiltered ChatGPT Prediction = No
Unfiltered Ground Truth = Y
i = 11: ChatGPT Prediction = N Ground Truth = Y

----------------------------------------
i = 18: Unfiltered ChatGPT Prediction = No
Unfiltered Ground Truth = Y
i = 18: ChatGPT Prediction = N Ground Truth = Y

----------------------------------------
i = 255: Unfiltered ChatGPT Prediction = No
Unfiltered Ground Truth = Y 
i = 255: ChatGPT Prediction = N Ground Truth = Y

----------------------------------------
i = 288: Unfiltered ChatGPT Prediction = No
Unfiltered Ground Truth = N/N
i = 288: ChatGPT Prediction = N Ground Truth = N/N

----------------------------------------
n_count = 4


In [24]:
y_count = 0
for i in range(0, min((sample_size), len(Cat))):
    if output_final[i] != true_results_final[i] and true_results_final[i] != "error_1" and output[i] != "No":
        y_count += 1
        print(f"i = {i}: Unfiltered ChatGPT Prediction = {output[i]}\nUnfiltered Ground Truth = {true_results[i]}")
        print(f"i = {i}: ChatGPT Prediction = {output_final[i]} Ground Truth = {true_results_final[i]}")
        print()
        print("-"*40) # Spacer
print(f"y_count = {y_count}")
# 29,34,75,78,89,91,101,108,140,143,174,183,220,271

i = 29: Unfiltered ChatGPT Prediction = Yes
Unfiltered Ground Truth = N
i = 29: ChatGPT Prediction = Y Ground Truth = N

----------------------------------------
i = 34: Unfiltered ChatGPT Prediction = Yes
Unfiltered Ground Truth = N
i = 34: ChatGPT Prediction = Y Ground Truth = N

----------------------------------------
i = 75: Unfiltered ChatGPT Prediction = Yes
Unfiltered Ground Truth = N
i = 75: ChatGPT Prediction = Y Ground Truth = N

----------------------------------------
i = 78: Unfiltered ChatGPT Prediction = Yes
Unfiltered Ground Truth = N
i = 78: ChatGPT Prediction = Y Ground Truth = N

----------------------------------------
i = 89: Unfiltered ChatGPT Prediction = Yes
Unfiltered Ground Truth = N
i = 89: ChatGPT Prediction = Y Ground Truth = N

----------------------------------------
i = 91: Unfiltered ChatGPT Prediction = Yes
Unfiltered Ground Truth = N
i = 91: ChatGPT Prediction = Y Ground Truth = N

----------------------------------------
i = 101: Unfiltered ChatGPT 

In [25]:
count = 0
for i in range(0, min((sample_size), len(Cat))):
    if true_results_final[i] == "N":
        count += 1
print(f"no_count = {count}")
# 24 and 113 are bad notes for comparing, NA on every ground truth column

no_count = 281
