# Import packages

In [1]:
import warnings
warnings.filterwarnings("ignore")

# Standard library
import concurrent.futures
import os

#Math and Vectors
import numpy as np
import pandas as pd

#Visualizations
import plotly.express as px

#ML
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Utils functions
from utils import prediction, compile_prompt, get_embedding, ml_models, create_auc_chart, gpt_reasoning
# Show every column when a DataFrame is displayed instead of eliding the middle of wide frames.
pd.set_option('display.max_columns', None)


# Load data

In [2]:
# Read the raw Kaggle heart-attack table and display its (rows, columns).
raw_csv_path = "./data/raw data/heart_attack_predicton_kaggle.csv"
df = pd.read_csv(raw_csv_path)
df.shape

(303, 14)

In [3]:
# Peek at the first two records to confirm the columns parsed as expected.
df.iloc[:2]

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1


# Preprocessing

## Check missing values

In [4]:
# Count missing entries per column (isnull is an alias of isna).
df.isnull().sum()

age         0
sex         0
cp          0
trtbps      0
chol        0
fbs         0
restecg     0
thalachh    0
exng        0
oldpeak     0
slp         0
caa         0
thall       0
output      0
dtype: int64

## Outcome distribution

In [5]:
# Class balance of the target column.
df.loc[:, 'output'].value_counts()

1    165
0    138
Name: output, dtype: int64

# Machine learning model

## One hot encoding

In [6]:
# Expand each categorical feature into indicator columns; every other column
# (including the `output` target) passes through untouched.
cat_cols = [
    'sex',
    'exng',
    'cp',
    'fbs',
    'restecg',
    'slp',
    'thall',
]
df_model = pd.get_dummies(data=df, columns=cat_cols)
df_model.shape

(303, 27)

## Train test split

In [7]:
# Separate the target (as a plain list) from the feature matrix.
y = df_model['output'].tolist()
X = df_model.drop(columns=['output'])

In [8]:
# 80/20 stratified hold-out split; the fixed seed keeps the validation rows
# identical across the splits performed later in this notebook.
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
    shuffle=True,
    stratify=y,
)

## Model training

In [9]:
# Build the model dictionary from utils and keep a named handle on each estimator.
models = ml_models()
lr, lasso, ridge, rf = (models[key] for key in ('LR', 'LASSO', 'RIDGE', 'RF'))

In [10]:
# Fit every baseline model, keep its positive-class probabilities on the
# validation set, and report the validation AUC.
pred_dict = dict()
for k, m in models.items():
    print(k)
    m.fit(X_tr, y_tr)
    # Column 1 of predict_proba is the probability of class 1.
    preds = m.predict_proba(X_val)[:, 1]
    pred_dict[k] = preds
    auc = roc_auc_score(y_val, preds)
    print(f"{k}: ", auc)

LR
LR:  0.7088744588744589
LASSO
LASSO:  0.7088744588744589
RIDGE
RIDGE:  0.7088744588744589
RF


RF:  0.8744588744588745


## Model performance

In [11]:
# Hand the baseline models' validation probabilities and the true labels to the
# plotting helper imported from utils.
# NOTE(review): the third argument is presumably the chart title — confirm in utils.
create_auc_chart(pred_dict, y_val, 'Model AUC')

# ChatGPT prediction

## Prepare data

In [12]:
# Work on an independent copy so the text re-labelling below never touches `df`.
df_gpt = df.copy(deep=True)
df_gpt.shape

(303, 14)

In [13]:
# Replace the integer codes with the phrases that will be placed in the GPT prompt.
# Two-level features use np.where; multi-level features use np.select, whose
# `default` receives every code that is not listed explicitly.
df_gpt['sex'] = np.where(df_gpt['sex'].eq(1), 'Male', 'Female')

df_gpt['cp'] = np.select(
    [df_gpt['cp'].eq(1), df_gpt['cp'].eq(2), df_gpt['cp'].eq(3)],
    ['Typical angina', 'Atypical angina', 'Non-anginal pain'],
    default='Asymptomatic',
)

df_gpt['fbs'] = np.where(
    df_gpt['fbs'].eq(1),
    'Fasting blood sugar > 120 mg/dl',
    'Fasting blood sugar <= 120 mg/dl',
)

df_gpt['restecg'] = np.select(
    [df_gpt['restecg'].eq(0), df_gpt['restecg'].eq(1)],
    [
        'Normal',
        'Having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)',
    ],
    default="Showing probable or definite left ventricular hypertrophy by Estes' criteria",
)

df_gpt['exng'] = np.where(
    df_gpt['exng'].eq(1),
    'Exercise induced angina',
    'Without exercise induced angina',
)

df_gpt['slp'] = np.select(
    [df_gpt['slp'].eq(0), df_gpt['slp'].eq(1)],
    [
        'The slope of the peak exercise ST segment is downsloping',
        'The slope of the peak exercise ST segment is flat',
    ],
    default='The slope of the peak exercise ST segment is upsloping',
)

df_gpt['thall'] = np.select(
    [df_gpt['thall'].eq(1), df_gpt['thall'].eq(2)],
    ['Thall is Fixed defect', 'Thall is normal'],
    default='Thall is reversable defect',
)
df_gpt.shape

(303, 14)

In [14]:
# Separate the target from the text-labelled features, then repeat the same
# seeded, stratified 80/20 split used for the baseline models.
y = df_gpt['output']
X = df_gpt.drop(columns=['output'])

X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
    shuffle=True,
    stratify=y,
)

In [15]:
# Turn each validation row into a {column: value} dict, one per applicant.
application_list = X_val.to_dict('records')
len(application_list)

61

## API call to GPT-3.5 model - text-davinci-003

In [17]:
### Get zero-shot predictions from the GPT-3.5 model (text-davinci-003) using a thread pool.
# The API call is slow, billed and non-deterministic, so the saved responses are
# reused whenever they exist; the model is only queried when the cache file is
# missing, and the fresh responses are then written to that same file.
responses_path = './data/processed data/Heart_Attach_ChatGPT_Ony_Prediction.csv'
if os.path.exists(responses_path):
    responses_df = pd.read_csv(responses_path)
else:
    # prediction() receives one (applicant record, flag) tuple per task.
    # NOTE(review): the False flag presumably disables the explanation — confirm in utils.
    combined_data = [(application, False) for application in application_list]
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # executor.map yields results in input order, so response i belongs to
        # validation row i.
        responses = list(executor.map(prediction, combined_data))
    responses_df = pd.DataFrame(responses)
    os.makedirs(os.path.dirname(responses_path), exist_ok=True)
    responses_df.to_csv(responses_path, index=False)
responses_df.shape

In [None]:
# Load the saved API responses and convert the free-text decision to a 0/1 label.
responses_df = pd.read_csv('./data/processed data/Heart_Attach_ChatGPT_Ony_Prediction.csv')
# The model does not reliably reproduce the capitalisation or surrounding
# whitespace of the requested answer, so normalise before comparing; anything
# that is not the "more chance" answer (including missing values) maps to 0.
decision_text = responses_df['decision'].astype(str).str.strip().str.lower()
responses_df['output'] = np.where(decision_text == 'more chance of heart attack', 1, 0)
responses_df.shape

(61, 2)

## GPT3.5 zero shot prediction result

In [32]:
# AUC of the hard 0/1 GPT decisions against the validation labels
# (rows are matched by position, not by index).
gpt_labels = responses_df['output']
auc_gpt = roc_auc_score(y_true=y_val, y_score=gpt_labels)
auc_gpt

0.48160173160173153

# Model with OpenAI's embedding

## Prompt engineering

In [None]:
# Build one prompt per applicant by passing each row to compile_prompt.
prompt_per_row = df_gpt.apply(compile_prompt, axis='columns')
df_gpt['combined'] = prompt_per_row

## OpenAI embedding

In [55]:

# Embed every prompt with OpenAI's text-embedding-ada-002 and append one column
# per embedding dimension. The API round-trips are slow and billed, so the saved
# table is reused when it exists; otherwise the embeddings are computed once and
# written to the file that the following cell loads.
embedding_path = './data/processed data/Heart_Attack_df_gpt_with_embedding_before_onehot.csv'
if os.path.exists(embedding_path):
    df_gpt = pd.read_csv(embedding_path)
else:
    ada_embedding = df_gpt['combined'].apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
    # Building the frame from a list of vectors avoids the very slow
    # per-row `.apply(pd.Series)` expansion.
    embedding_frame = pd.DataFrame(ada_embedding.tolist(), index=df_gpt.index)
    # The prompt text is no longer needed once it has been embedded.
    df_gpt = df_gpt.drop(columns=['combined']).join(embedding_frame)
    os.makedirs(os.path.dirname(embedding_path), exist_ok=True)
    df_gpt.to_csv(embedding_path, index=False)
df_gpt.shape


(303, 1550)

In [35]:
# Load the saved embedding table and prefix every column after the first 14
# (the original dataset columns) with 'Embedding_'.
df_gpt = pd.read_csv('.//data/processed data/Heart_Attack_df_gpt_with_embedding_before_onehot.csv')
loaded_names = df_gpt.columns.tolist()
df_gpt.columns = loaded_names[:14] + [f'Embedding_{name}' for name in loaded_names[14:]]
df_gpt.shape

(303, 1550)

In [36]:
# Append the embedding columns to the original (integer-coded) dataset.
embedding_cols = [col for col in df_gpt.columns.tolist() if col.startswith('Embedding_')]
# Remove any embedding columns already attached by an earlier run of this cell,
# so re-executing it does not append a second copy of every embedding column.
df = pd.concat([df.drop(columns=embedding_cols, errors='ignore'), df_gpt[embedding_cols]], axis=1)
df.shape

(303, 1550)

## Train model

In [38]:
# Separate the target (as a plain list) from the features + embeddings, then
# apply the same seeded, stratified 80/20 split as before.
y = df['output'].tolist()
X = df.drop(columns=['output'])

X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
    shuffle=True,
    stratify=y,
)

In [62]:
# Fresh model dictionary for the embedding experiment, with a named handle per estimator.
models = ml_models()
lr, lasso, ridge, rf = (models[key] for key in ('LR', 'LASSO', 'RIDGE', 'RF'))

In [43]:
# Fit every model on the features + embeddings, store the positive-class
# validation probabilities under a suffixed key, and report the validation AUC.
pred_dict_gpt = dict()
for k, m in models.items():
    print(k)
    m.fit(X_tr, y_tr)
    # Column 1 of predict_proba is the probability of class 1.
    preds = m.predict_proba(X_val)[:, 1]
    label = f"{k}_With_GPT_Embedding"
    pred_dict_gpt[label] = preds
    auc = roc_auc_score(y_val, preds)
    print(f"{label}: ", auc)

LR
LR_With_GPT_Embedding:  0.7099567099567099
LASSO
LASSO_With_GPT_Embedding:  0.7099567099567099
RIDGE
RIDGE_With_GPT_Embedding:  0.7099567099567099
RF
RF_With_GPT_Embedding:  0.8841991341991343


## Model performance

In [44]:
# Merge baseline and embedding predictions into one mapping (baseline entries
# first) and chart them together.
pred_dict_combine = {**pred_dict, **pred_dict_gpt}
create_auc_chart(pred_dict_combine, y_val, 'Model AUC')

# Explainability

In [18]:
# Take the first applicant record from application_list as the example to explain.
application_data = application_list[0]
application_data

{'age': 51,
 'sex': 'Male',
 'cp': 'Atypical angina',
 'trtbps': 125,
 'chol': 245,
 'fbs': 'Fasting blood sugar > 120 mg/dl',
 'restecg': 'Normal',
 'thalachh': 166,
 'exng': 'Without exercise induced angina',
 'oldpeak': 2.4,
 'slp': 'The slope of the peak exercise ST segment is flat',
 'caa': 0,
 'thall': 'Thall is normal'}

In [25]:
# Send the example applicant to the gpt_reasoning helper from utils and display what it returns.
# NOTE(review): the return value appears to be a raw JSON string with "decision" and
# "reasoning" keys (see the displayed output) — confirm in utils before parsing it.
response = gpt_reasoning(application_data)
response

You are a medical expert / underwriter in a global insurance company. Your job is to evaluate the chance of having heart attack.
        Please encode your response as json in the following format
        {
            "decision": "<Either less chance of heart attack or more chance of heart attack>",
            "reasoning": "<Provide a 300 words explaination of why you made this decision>"
        }
        ---- BEGIN OF THE DATA ----
        What is the age of the applicant?: 51
        What is the sex of the applicant?: Male
        What is the chest pain type of the applicant?: Atypical angina
        What is the resting blood pressure (in mm Hg) of the applicant?: 125
        What is the cholestoral in mg/dl fetched via BMI sensor of the applicant?: 245
        What is the fasting blood sugar level of the applicant?: Fasting blood sugar > 120 mg/dl
        What is the resting electrocardiographic results of the applicant?: Normal
        What is the maximum heart rate achieved of 

'{\n  "decision": "less chance of heart attack",\n  "reasoning": "Based on the information provided, the applicant has several factors that indicate a lower chance of having a heart attack. \n\nFirstly, the age of the applicant is 51, which is not considered young but also not in the high-risk range for heart attacks. \n\nSecondly, the applicant is male. While men generally have a higher risk than women for heart attacks, it is not the sole determining factor. \n\nThirdly, the chest pain type reported by the applicant is atypical angina. Atypical angina is characterized by chest pain that is less predictable and may have different patterns compared to typical angina. This can indicate a lower risk of heart attack compared to typical angina.\n\nFourthly, the resting blood pressure of the applicant is 125 mm Hg. This falls within the normal range and does not indicate hypertension, which is a risk factor for heart attacks.\n\nFifthly, the cholesterol level of the applicant is 245 mg/dl. 