<img src="purpose.png" alt="purpose" width="900"/>

In [2]:
# Import the necessary libraries
import pandas as pd
from openai import OpenAI
import json

In [4]:
# Read the transcription dataset from the CSV file in the working directory
# and preview its first five rows as the cell's output.
df = pd.read_csv(filepath_or_buffer="transcriptions.csv")
df.head(n=5)

Unnamed: 0,medical_specialty,transcription
0,Allergy / Immunology,"SUBJECTIVE:, This 23-year-old white female pr..."
1,Orthopedic,"CHIEF COMPLAINT:, Achilles ruptured tendon.,H..."
2,Bariatrics,"PREOPERATIVE DIAGNOSIS: , Morbid obesity.,POST..."
3,Cardiovascular / Pulmonary,"PREOPERATIVE DIAGNOSES,Airway obstruction seco..."
4,Urology,"CHIEF COMPLAINT:, Urinary retention.,HISTORY ..."


<img src="workflow.png" alt="workflow" width="500"/>

In [5]:
# Initialize the OpenAI client used for the chat-completion requests below.
# No API key is passed here, so credentials must come from the client's own
# defaults -- presumably the OPENAI_API_KEY environment variable; TODO confirm.
client = OpenAI()

# Tool (function-calling) schema passed to the chat-completions request.
# It declares a single function, `extract_dataset`, whose arguments are four
# required string fields. Every field has the same shape
# ({"type": "string", "description": ...}), so the properties are generated
# from (name, description) pairs; insertion order matches the `required` list.
function_callings = [
    {
        'type': 'function',
        'function': {
            'name': 'extract_dataset',
            'description': "extracting patient information from dataset: patient's age, recommended treatment or procedure, medical specialty, ICD code corresponding to the transcript",
            "parameters": {
                "type": "object",
                "properties": {
                    field_name: {"type": "string", "description": field_description}
                    for field_name, field_description in (
                        ("age", "patient's age based on the string supplied."),
                        ("recommended treatment", "A recommended medical treatment, therapy, or procedure suggested based on the string is supplied."),
                        ("medical specialty", "the medical specialty is extracted from the string supplied."),
                        ("ICD code", "ICD code corresponding to the transcript."),
                    )
                },
                "required": ["age", "recommended treatment", "medical specialty", "ICD code"],
                # Reject any argument that is not one of the four declared fields.
                "additionalProperties": False,
            },
            "strict": True,
        },
    }
]

In [6]:
# Extract the transcript column from the dataset; each entry is sent to the
# model as its own request.
transcriptions = df['transcription']

# System prompt. The key names listed at the end must match the property names
# declared in the `extract_dataset` tool schema (which uses spaces, e.g.
# 'recommended treatment'); naming different keys here gives the model
# contradictory instructions.
content = "You are a medical data extraction assistant. Extract the following information from the clinical note provided:\n\n1. Patient's age\n2. Recommended treatment or procedure\n3. Medical specialty related to the case\n4. ICD-10 code for the diagnosis\n\nReturn the result by calling the 'extract_dataset' function with the following keys: 'age', 'recommended treatment', 'medical specialty', and 'ICD code'."

# Force the model to answer through the declared tool. Without `tool_choice`
# the model may reply with plain text, in which case `message.tool_calls` is
# None and indexing it fails.
forced_tool = {
    'type': 'function',
    'function': {'name': function_callings[0]['function']['name']},
}

results = []
for position, transcription in enumerate(transcriptions):
    messages = [{
            'role':'system',
            'content': content
        },{
            'role':'user',
            'content':transcription
        }]
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        tools=function_callings,
        tool_choice=forced_tool
    )
    tool_calls = response.choices[0].message.tool_calls
    if not tool_calls:
        # Fail with a message that identifies the offending record instead of
        # an opaque "NoneType is not subscriptable".
        raise RuntimeError(
            f"Model returned no tool call for transcription at position {position}"
        )
    # The tool-call arguments arrive as a JSON string; decode them into a dict.
    tool_call = tool_calls[0]
    results.append(json.loads(tool_call.function.arguments))

print(results)

[{'age': '23', 'recommended treatment': 'Try Zyrtec instead of Allegra, Nasonex two sprays in each nostril for three weeks.', 'medical specialty': 'Allergy and Immunology', 'ICD code': 'J30.9'}, {'age': '41', 'recommended treatment': 'operative fixation', 'medical specialty': 'Orthopedics', 'ICD code': 'S86.001A'}, {'age': '30', 'recommended treatment': 'Laparoscopic antecolic antegastric Roux-en-Y gastric bypass with EEA anastomosis', 'medical specialty': 'Bariatric Surgery', 'ICD code': 'E66.01'}, {'age': '50', 'recommended treatment': 'Neck exploration; tracheostomy; urgent flexible bronchoscopy; removal of foreign body; dilation of distal trachea; placement of #8 Shiley single cannula tracheostomy tube', 'medical specialty': 'Laryngology and Thoracic Surgery', 'ICD code': 'J38.7'}, {'age': '66', 'recommended treatment': 'Prescription for 6 months of Flomax and Proscar, self-catheterization if urinary retention recurs', 'medical specialty': 'Urology', 'ICD code': 'N40'}]


In [8]:
# Build a table from the extracted records: one row per dictionary in
# `results`, one column per dictionary key. Then print the first five rows.
df_structured = pd.DataFrame(data=results)
print(df_structured.head(n=5))

  age                              recommended treatment  \
0  23  Try Zyrtec instead of Allegra, Nasonex two spr...   
1  41                                 operative fixation   
2  30  Laparoscopic antecolic antegastric Roux-en-Y g...   
3  50  Neck exploration; tracheostomy; urgent flexibl...   
4  66  Prescription for 6 months of Flomax and Prosca...   

                  medical specialty  ICD code  
0            Allergy and Immunology     J30.9  
1                       Orthopedics  S86.001A  
2                 Bariatric Surgery    E66.01  
3  Laryngology and Thoracic Surgery     J38.7  
4                           Urology       N40  
