In [2]:
# Import all the required packages

import os
import pandas as pd
import numpy as np
import tiktoken
import sidetable
import time
from openai import OpenAI
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_random_exponential
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Initialize the OpenAI client with your API key
api_key = '..'
client = OpenAI(api_key=api_key)

In [None]:
# Path to the data files
path = r'...'

# Creating a list of all the paths for files in different experiments
paths = []
for subfolders in os.listdir(path):
    x = os.path.join(path, subfolders)
    paths.append(x)
    
paths

In [5]:
#Reading the sampled datafile for 100 patients
df_test = pd.read_csv(paths[0] + r'\df_test.csv')

In [6]:
df_test['final_deid'] = df_test['final_deid'].astype(str)

In [7]:
def remove_substring(text, substring):
    index = text.find(substring)
    if index != -1:
        return text[:index]  # Return the text up to the found index
    else:
        return text 

In [8]:
remove = "I, the teaching physician"
df_test['final_deid'] = df_test['final_deid'].apply(lambda x: remove_substring(x, remove))

remove = "ATTESTATION"
df_test['final_deid'] = df_test['final_deid'].apply(lambda x: remove_substring(x, remove))

remove = "Critical results were communicated"
df_test['final_deid'] = df_test['final_deid'].apply(lambda x: remove_substring(x, remove))

remove = "Electronically Signed by "
df_test['final_deid'] = df_test['final_deid'].apply(lambda x: remove_substring(x, remove))

In [9]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
#     encoding = tiktoken.encoding_for_model(encoding_name)
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [10]:
# Encoding name for the  model
# encoding_name = "gpt-4" ## if encoding for a specific model is used
encoding_name = 'cl100k_base'

In [11]:
# Now calculate the number of tokens for each truncated string
df_test['num_tokens'] = df_test['final_deid'].apply(lambda x: num_tokens_from_string(x, encoding_name))

# Calculate the total number of tokens
total_tokens = df_test['num_tokens'].sum()
print(f"Total number of tokens in the dataset: {total_tokens}")

Total number of tokens in the dataset: 187698


### Fewshot with baseline

In [None]:
prompt_text = """ Identify if the following radiology impression text indicates outcomes: (1) any cancer, (2) progression/worsening, (3) response/improvement, (4) brain metastases, (5) bone/osseous metastases, (6) adrenal metastases, (7) liver/hepatic metastases, (8) lung/pulmonary metastases, (9) lymph node/nodal metastases, (10) peritoneal metastases. Answer in Yes or No. Do not give an explanation.

EXAMPLE: 
An example of impression and output is given below:

IMPRESSION: 
...

1.	any cancer: Yes 
2.	progression: Yes
3.	response/improvement: No
4.	brain metastases: No 
5.	bone/osseous metastases: No 
6.	adrenal metastases: No 
7.	liver/hepatic metastases: No 
8.	For lung/pulmonary metastases: Yes
9.	For lymph node/nodal metastases: Yes
10.	For peritoneal metastases: No """

num_tokens_from_string(prompt_text, encoding_name)

382

In [13]:
df_test['total_num_tokens'] = df_test['num_tokens'] + 381
df_test['total_num_tokens'].sum()

1102860

In [None]:
# Function to create a single GPT-4 API call
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(10))
def create_gpt4_call(content):
    response = client.chat.completions.create(
        model="model_name (gpt4/gpt4o)",
        messages=[
            {"role": "system", "content": "You are a helpful assistant designed to analyze radiology reports."},
            {"role": "user", "content": content}
        ],
        temperature=1e-12,
        max_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response.choices[0].message.content.strip()

In [17]:
# List of conditions to be checked
conditions = ["any cancer", "progression/worsening", "response/improvement", 
                  "brain metastases", "bone/osseous metastases", "adrenal metastases", 
                  "liver/hepatic metastases", "lung/pulmonary metastases", 
                  "lymph node/nodal metastases", "peritoneal metastases"]

In [None]:
path_results = r'...'

# Function to classify impressions for the fixed set of conditions
def classify_impressions(df, column_name, conditions, prompt_text, num):
    
    # Reset the index of the DataFrame before running the loop. 
    # This will ensure that the DataFrame has a simple integer-based index which should align with loop's index variable.
    df = df.reset_index(drop=True)
    
    # Initialize a dictionary to store predictions
    predictions = {condition: [] for condition in conditions}
    
    # Calculate token count for the static part of the prompt
    static_prompt_tokens = num_tokens_from_string(prompt_text, "cl100k_base")

    # Calculate total number of tokens for all prompts
    total_tokens = sum(df[column_name].apply(lambda x: num_tokens_from_string(f"{prompt_text}\n\n{x}", "cl100k_base")))

    processed_tokens = 0

    # Initialize tqdm with the initial description and total number of rows
    pbar = tqdm(total=len(df), desc="Starting")

    for index, impression in df.iterrows():
        # Constructing the full prompt with impression
        full_prompt = f"{prompt_text}\n\n{impression[column_name]}" 
        
        # Generating a single response for all conditions
        response = create_gpt4_call(full_prompt)

        # Count the number of tokens for the current full prompt
        num_tokens = num_tokens_from_string(full_prompt, "cl100k_base")
        processed_tokens += num_tokens
        tokens_left = total_tokens - processed_tokens

        # Update tqdm description to show both item progress and token count
        pbar.set_description(f"Classifying - {index + 1}/{len(df)} - Tokens Processed: {processed_tokens}, Tokens Left: {tokens_left}")

        # Update progress by one iteration for the item counter
        pbar.update(1)
        
        # Parsing the response to extract labels for each condition
        labels = response.split("\n")
        for i, condition in enumerate(conditions):
            label = labels[i].strip() if i < len(labels) else "No"
            # Extract only the 'Yes' or 'No' part from the label
            label = "Yes" if "Yes" in label else "No"
            predictions[condition].append(label)
   
    # Save intermediate results every nth rows
        if (index + 1) % num == 0:
            for condition, condition_predictions in predictions.items():
                processed_predictions = condition_predictions[:index + 1]

                # Get indices of the rows to update
                indices_to_update = df.index[:index + 1]

                # Use .loc to update the original DataFrame
                df.loc[indices_to_update, f'{condition}_predicted'] = np.where(np.array(processed_predictions) == 'Yes', 1, 0)

            # Save the updated part of the DataFrame
            df.iloc[:index + 1].to_csv(path_results + f'output_at_row_{index + 1}.csv', index=False)

        
    # Convert 'Yes'/'No' labels to binary (1/0) and add to DataFrame
    for condition, condition_predictions in predictions.items():
        df[f'{condition}_predicted'] = np.where(np.array(condition_predictions) == 'Yes', 1, 0)
        

    pbar.close()
    return df, predictions

In [19]:
df, predictions = classify_impressions(df_test,'final_deid', conditions, prompt_text, 1000)

Classifying - 2402/2402 - Tokens Processed: 1105262, Tokens Left: 0: 100%|██████████| 2402/2402 [2:43:24<00:00,  4.08s/it]      


In [None]:
classified_df = df

In [21]:
predicted_label_cols = [col for col in classified_df.columns if col.endswith('_predicted')]
true_label_cols = classified_df.columns[3:13]

In [22]:
true_label_cols, predicted_label_cols

(Index(['any_cancer', 'progression', 'response', 'brain_met', 'bone_met',
        'adrenal_met', 'liver_met', 'lung_met', 'node_met', 'peritoneal_met'],
       dtype='object'),
 ['any cancer_predicted',
  'progression/worsening_predicted',
  'response/improvement_predicted',
  'brain metastases_predicted',
  'bone/osseous metastases_predicted',
  'adrenal metastases_predicted',
  'liver/hepatic metastases_predicted',
  'lung/pulmonary metastases_predicted',
  'lymph node/nodal metastases_predicted',
  'peritoneal metastases_predicted'])

In [23]:
def evaluate_model_performance(df, true_label_cols, predicted_label_cols):
    for true_label_col, predicted_label_col in zip(true_label_cols, predicted_label_cols):
        # Extract true and predicted labels
        true_labels = df[true_label_col]
        predicted_labels = df[predicted_label_col]

        # Compute the evaluation metrics
        accuracy = accuracy_score(true_labels, predicted_labels)
        precision = precision_score(true_labels, predicted_labels)
        recall = recall_score(true_labels, predicted_labels)
        f1 = f1_score(true_labels, predicted_labels)

        # Print the results
        print(f"Evaluation Metrics for {predicted_label_col}:")
        print(f"  Accuracy: {accuracy:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  F1 Score: {f1:.4f}\n")

In [24]:
evaluate_model_performance(classified_df, true_label_cols, predicted_label_cols)

Evaluation Metrics for any cancer_predicted:
  Accuracy: 0.8564
  Precision: 0.8346
  Recall: 0.9346
  F1 Score: 0.8818

Evaluation Metrics for progression/worsening_predicted:
  Accuracy: 0.8418
  Precision: 0.6067
  Recall: 0.8606
  F1 Score: 0.7117

Evaluation Metrics for response/improvement_predicted:
  Accuracy: 0.7573
  Precision: 0.2058
  Recall: 0.9740
  F1 Score: 0.3398

Evaluation Metrics for brain metastases_predicted:
  Accuracy: 0.9713
  Precision: 0.8063
  Recall: 0.7725
  F1 Score: 0.7890

Evaluation Metrics for bone/osseous metastases_predicted:
  Accuracy: 0.9534
  Precision: 0.8619
  Recall: 0.9267
  F1 Score: 0.8931

Evaluation Metrics for adrenal metastases_predicted:
  Accuracy: 0.9858
  Precision: 0.6916
  Recall: 0.9867
  F1 Score: 0.8132

Evaluation Metrics for liver/hepatic metastases_predicted:
  Accuracy: 0.9613
  Precision: 0.7515
  Recall: 0.9658
  F1 Score: 0.8453

Evaluation Metrics for lung/pulmonary metastases_predicted:
  Accuracy: 0.8976
  Precision:

In [27]:
for true_col, pred_col in zip(true_label_cols, predicted_label_cols):
    classified_df['results_' + true_col] = np.where(classified_df[true_col] == classified_df[pred_col], "Correct", "Incorrect")

In [29]:
classified_df.to_csv(path_results + "classified_df_test.csv")