In [3]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
medical_reports = pd.read_csv('reports_.csv')
medical_reports.head()

Unnamed: 0,medical_specialty,report
0,Cardiovascular / Pulmonary,"2-D M-MODE: , ,1. Left atrial enlargement wit..."
1,Cardiovascular / Pulmonary,1. The left ventricular cavity size and wall ...
2,Cardiovascular / Pulmonary,"2-D ECHOCARDIOGRAM,Multiple views of the heart..."
3,Cardiovascular / Pulmonary,"DESCRIPTION:,1. Normal cardiac chambers size...."
4,Cardiovascular / Pulmonary,"2-D STUDY,1. Mild aortic stenosis, widely calc..."


In [5]:
medical_reports.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2179 entries, 0 to 2178
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   medical_specialty  2179 non-null   object
 1   report             2179 non-null   object
dtypes: object(2)
memory usage: 34.2+ KB


In [7]:
# Drop rows with a missing report. Rebinding (instead of inplace=True) keeps the
# cell idempotent and avoids mutating the frame behind cells that already showed it.
medical_reports = medical_reports.dropna(subset=['report'])

In [8]:
medical_reports.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2179 entries, 0 to 2178
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   medical_specialty  2179 non-null   object
 1   report             2179 non-null   object
dtypes: object(2)
memory usage: 34.2+ KB


In [None]:
# medical_reports.fillna()

In [11]:
# Count the reports available per specialty; this shows how many rows each class
# can contribute to the train / validation / final-test split built below.

medical_reports.groupby('medical_specialty').count()

Unnamed: 0_level_0,report
medical_specialty,Unnamed: 1_level_1
Cardiovascular / Pulmonary,371
Gastroenterology,224
Neurology,223
Radiology,273
Surgery,1088


In [39]:
# Draw a balanced subset of 50 reports per specialty. The fixed random_state makes
# the sample (and every split derived from it) reproducible across kernel restarts.
grouped_data = medical_reports.groupby('medical_specialty').sample(n=50, random_state=42)

In [40]:
grouped_data['medical_specialty'].value_counts()

medical_specialty
Cardiovascular / Pulmonary    50
Gastroenterology              50
Neurology                     50
Radiology                     50
Surgery                       50
Name: count, dtype: int64

In [41]:
# Hold out 10 of the sampled reports per specialty for validation + test.
# Seeded so the held-out rows are the same on every run.
val_test_data = grouped_data.groupby('medical_specialty').sample(n=10, random_state=42)

In [42]:
# val_test_data holds 10 sampled rows per specialty, so the first 5 and the last 5
# rows of each group do not overlap: 5 validation and 5 test reports per class.
val_data = val_test_data.groupby('medical_specialty').head(5)
test_data = val_test_data.groupby('medical_specialty').tail(5)

In [43]:
# Training set = every row of the balanced sample whose index label was not
# held out in val_test_data.
train_data = grouped_data[~grouped_data.index.isin(val_test_data.index)]

In [44]:
print(len(train_data), len(val_data), len(test_data))

200 25 25


## Dataset Statistics

In [45]:
import tiktoken

In [46]:
def num_tokens(string, encoding_name='cl100k_base'):
    """Return how many tokens `string` encodes to under a tiktoken encoding.

    Parameters
    ----------
    string : str
        Text to tokenize.
    encoding_name : str, default 'cl100k_base'
        Name passed to ``tiktoken.get_encoding``. The default preserves the
        encoding this notebook originally hard-coded.

    Returns
    -------
    int
        Length of the token sequence produced by ``encoding.encode(string)``.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # Return the length directly; the original bound it to a local called
    # `num_tokens`, shadowing this function's own name inside its body.
    return len(encoding.encode(string))

In [47]:
num_tokens("Hello world")

2

In [48]:
report_lengths = train_data['report'].apply(num_tokens)

In [49]:
report_lengths.describe()

count     200.00000
mean      656.71500
std       458.20499
min         5.00000
25%       322.75000
50%       550.00000
75%       854.00000
max      3701.00000
Name: report, dtype: float64

In [51]:
# Rough cost estimate from the total token count of the training reports.
# Only the report text is counted (report_lengths); system-prompt and label
# tokens are not included.
# NOTE(review): 0.008 looks like a USD price per 1,000 training tokens — confirm
# against current fine-tuning pricing before relying on this figure.
report_lengths.sum() * 0.008/1000

1.050744

## Data Formatting

In [52]:
train_data['medical_specialty'].unique()

array(['Cardiovascular / Pulmonary', 'Gastroenterology', 'Neurology',
       'Radiology', 'Surgery'], dtype=object)

In [None]:
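# Chat fine-tuning format: each example is one conversation with three messages.
#   system    --> the classification instructions (system prompt)
#   user      --> report
#   assistant --> medical_specialty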

In [53]:
system_prompt = "Given the medical description report, classify it into one of these categories: 'Cardiovascular / Pulmonary', 'Gastroenterology', 'Neurology', 'Radiology', 'Surgery'"

In [54]:
print(system_prompt)

Given the medical description report, classify it into one of these categories: 'Cardiovascular / Pulmonary', 'Gastroenterology', 'Neurology', 'Radiology', 'Surgery'


In [55]:
sample_prompt = {"messages":[{'role':'system','content':system_prompt},
                             {'role':'user','content':train_data['report'].iloc[0]},
                             {'role':'assistant', 'content':train_data['medical_specialty'].iloc[0]}
                            ]}

In [61]:
def df_to_format(df, system_message=None):
    """Convert labelled reports into chat-format fine-tuning examples.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'report'`` and ``'medical_specialty'``.
    system_message : str, optional
        Content of the system message in every example. When omitted, the
        notebook-level ``system_prompt`` is used, as in the original version.

    Returns
    -------
    list of dict
        One ``{"messages": [...]}`` entry per row, in row order, holding a
        system, a user (the report) and an assistant (the specialty) message.
    """
    if system_message is None:
        # Fall back to the global prompt so existing `df_to_format(df)` calls
        # keep producing exactly the same output.
        system_message = system_prompt

    # Zip the two columns directly: the row index was never used, and this
    # avoids building a Series per row the way iterrows() does.
    return [
        {"messages": [
            {'role': 'system', 'content': system_message},
            {'role': 'user', 'content': report},
            {'role': 'assistant', 'content': specialty},
        ]}
        for report, specialty in zip(df['report'], df['medical_specialty'])
    ]

In [67]:
train_formatdata = df_to_format(train_data)

In [68]:
train_formatdata[10]

{'messages': [{'role': 'system',
   'content': "Given the medical description report, classify it into one of these categories: 'Cardiovascular / Pulmonary', 'Gastroenterology', 'Neurology', 'Radiology', 'Surgery'"},
  {'role': 'user',
   'content': 'On review of systems, the patient admits to hypertension and occasional heartburn.  She undergoes mammograms every six months, which have been negative for malignancy.  She denies fevers, chills, weight loss, fatigue, diabetes mellitus, thyroid disease, upper extremity trauma, night sweats, DVT, pulmonary embolism, anorexia, bone pain, headaches, seizures, angina, peripheral edema, claudication, orthopnea, PND, coronary artery disease, rheumatoid arthritis, rashes, upper extremity edema, cat scratches, cough, hemoptysis, shortness of breath, dyspnea at two flights of stairs, hoarseness, GI bleeding, change in bowel habits, dysphagia, ulcers, hematuria, or history of TB exposure.  She has had negative PPD.,PAST MEDICAL HISTORY:,  Hypertensi

### Convert to JSON Lines (JSONL)

In [64]:
import json

In [65]:
# Serialise the training examples as JSON Lines: one JSON object per line.
with open('fine_tuning_data.jsonl','w') as jsonl_file:
    jsonl_file.writelines(json.dumps(example) + "\n" for example in train_formatdata)

In [69]:
val_formatdata = df_to_format(val_data)

In [70]:
# Serialise the validation examples as JSON Lines: one JSON object per line.
with open('fine_tuning_data_val.jsonl','w') as jsonl_file:
    jsonl_file.writelines(json.dumps(example) + '\n' for example in val_formatdata)

## Training the Model

In [None]:
import os

import openai
from openai import OpenAI

# Read the API key from the environment instead of embedding it in the notebook.
# An explicit empty string is not None, so it overrides the client's
# OPENAI_API_KEY environment fallback and every request fails authentication.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [None]:
# Upload the training set. The context manager closes the file handle once the
# upload call returns (the bare open() previously left it open).
with open("fine_tuning_data.jsonl", "rb") as training_file:
    file_upload_response = client.files.create(
        file=training_file, purpose="fine-tune"
    )

In [None]:
file_upload_response.status

In [None]:
# Upload the validation set. The context manager closes the file handle once the
# upload call returns (the bare open() previously left it open).
with open("fine_tuning_data_val.jsonl", "rb") as validation_file:
    file_upload_response_val = client.files.create(
        file=validation_file, purpose="fine-tune"
    )

In [None]:
file_upload_response_val.status

In [None]:
# Start a fine-tuning job on "gpt-3.5-turbo" for one epoch, using the ids of the
# two uploaded files as training and validation data.
fine_tuning_response = client.fine_tuning.jobs.create(
    model="gpt-3.5-turbo",
    training_file=file_upload_response.id,
    validation_file=file_upload_response_val.id,
    hyperparameters={"n_epochs": 1},
)

In [None]:
def classify_report(test_report, model):
    """Ask a chat model to classify one medical report.

    Sends the notebook-level ``system_prompt`` as the system message and
    ``test_report`` as the user message through the global ``client``.

    Parameters
    ----------
    test_report : str
        Report text sent as the user message.
    model : str
        Model name passed to ``client.chat.completions.create``.

    Returns
    -------
    str
        Content of the first choice with surrounding whitespace stripped, or
        ``""`` when the response carries no text content.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": test_report},
        ],
    )
    content = completion.choices[0].message.content
    # The API types `content` as optional; guard so a response without text
    # yields an empty prediction instead of AttributeError on None.strip().
    return (content or "").strip()

In [None]:
import numpy as np

# Exact-match accuracy of each listed model on the held-out test reports.
model_list = ["gpt-3.5-turbo"] # add the name of your finetuned model here
for model in model_list:
    # Labels are stripped so they compare cleanly with the stripped model output.
    ground_truth_classes = [label.strip() for label in test_data["medical_specialty"]]
    predicted_classes = [
        classify_report(report_text, model=model)
        for report_text in test_data["report"]
    ]

    accuracy = (np.array(predicted_classes) == np.array(ground_truth_classes)).mean()
    print(model, accuracy)