In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [23]:
# Load the dataset from "data.csv" (path relative to the notebook's working
# directory) and preview its first rows.
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,person ID,age,gender,chest pain type,resting blood pressure,serum cholesterol in mg/dl,fasting blood sugar > 120 mg/dl,resting electrocardiographic results,maximum heart rate achieved,exercise induced angina,oldpeak = ST depression induced by exercise relative to rest,the slope of the peak exercise ST segment,number of major vessels (0-3) colored by flourosopy,Has heart disease? (Prediction Target)
0,1,63,M,Type 3,145,233,Yes,hypertrophy of heart,150,No,2.3,0,0,Yes
1,2,37,M,Type 2,130,250,No,myocardial infarction,187,No,3.5,0,0,Yes
2,3,41,F,Type 1,130,204,No,hypertrophy of heart,172,No,1.4,2,0,Yes
3,4,56,M,Type 1,120,236,No,myocardial infarction,178,No,0.8,2,0,Yes
4,5,57,F,Type 0,120,354,No,myocardial infarction,163,Yes,0.6,2,0,Yes


In [24]:
# print(df.describe())
# Report how many missing values each column of `df` contains.
print(df.isna().sum())

person ID                                                       0
age                                                             0
gender                                                          0
chest pain type                                                 0
resting blood pressure                                          0
serum cholesterol in mg/dl                                      0
fasting blood sugar > 120 mg/dl                                 0
resting electrocardiographic results                            0
maximum heart rate achieved                                     0
exercise induced angina                                         0
oldpeak = ST depression induced by exercise relative to rest    0
the slope of the peak exercise ST segment                       0
number of major vessels (0-3) colored by flourosopy             0
Has heart disease? (Prediction Target)                          0
dtype: int64


# Preprocessing

In [30]:
# Columns that are label-encoded below: each distinct value in a column is
# replaced by an integer code.
categorical_cols = [
    "gender",
    "chest pain type",
    "fasting blood sugar > 120 mg/dl",
    "resting electrocardiographic results",
    "exercise induced angina",
    "Has heart disease? (Prediction Target)",
]

# Encode on a copy so the frame loaded into `df` is left untouched.
encoded_df = df.copy()
label_mappings = {}

for col in categorical_cols:
    # A fresh encoder per column: codes are assigned independently per column.
    encoder = LabelEncoder().fit(encoded_df[col])
    # Remember original label -> integer code so the encoding can be inspected.
    label_mappings[col] = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    encoded_df[col] = encoder.transform(encoded_df[col])

In [43]:
# Print, for every encoded column, which integer code each original label got.
for k, v in label_mappings.items():
    print(k)  # column name

    for k2, v2 in v.items():
        print(f"{k2}:\t{v2}")  # original label, then its integer code
    print("---"*10)  # divider between columns

gender
F:	0
M:	1
------------------------------
chest pain type
Type 0:	0
Type 1:	1
Type 2:	2
Type 3:	3
------------------------------
fasting blood sugar > 120 mg/dl
No:	0
Yes:	1
------------------------------
resting electrocardiographic results
hypertrophy of heart:	0
ischemia:	1
myocardial infarction:	2
------------------------------
exercise induced angina
No:	0
Yes:	1
------------------------------
Has heart disease? (Prediction Target)
No:	0
Yes:	1
------------------------------


# Development

In [44]:
def generate_splitpoints(df: pd.DataFrame, col: str):
    """Return the candidate thresholds for splitting ``df`` on a numeric column.

    The candidates are the midpoints between each pair of consecutive distinct
    values of ``df[col]`` (taken in ascending order).

    Args:
        df: Frame that contains the column.
        col: Name of a numeric column of ``df``.

    Returns:
        A 1-D float64 ``np.ndarray`` of ascending midpoints. It has one entry
        fewer than the number of distinct non-missing values, and is empty
        when the column holds fewer than two distinct values (including when
        the column is empty).

    Raises:
        AssertionError: If ``df`` is None, ``col`` is not a column of ``df``,
            or the column is not numeric.
    """
    assert df is not None, "Dataframe does not exist"
    assert col in list(df.columns), "The column is not in the dataframe"
    assert pd.api.types.is_numeric_dtype(df[col]), "The column is not numeric"

    # Missing values cannot act as thresholds, so drop them first. Converting
    # to float64 before adding keeps narrow integer dtypes (e.g. int8) from
    # overflowing when two neighbouring values are summed.
    vals = np.unique(df[col].dropna().to_numpy(dtype=np.float64))

    # np.unique returns the distinct values already sorted; pairing each value
    # with its successor yields every midpoint at once. With fewer than two
    # values both slices are empty and so is the result.
    return (vals[:-1] + vals[1:]) / 2.0

In [None]:
def _node_entropy(counts: tuple) -> float:
    """Return the base-2 Shannon entropy of one node given its per-class counts."""
    total = sum(counts)
    if total == 0:
        # An empty node carries no samples and therefore no uncertainty.
        return 0.0

    result = 0.0
    for count in counts:
        if count > 0:  # a class with zero samples contributes nothing (0 * log 0 := 0)
            p = count / total
            result -= p * float(np.log2(p))
    return result


def entropy(left: tuple, right: tuple, N: int):
    """Return the weighted entropy of a binary split.

    Each side of the split is described by a tuple of per-class sample counts.
    The result is the entropy of each side (base 2) weighted by the share of
    the ``N`` samples that fall on that side.

    Args:
        left: Per-class sample counts on the left side of the split.
        right: Per-class sample counts on the right side of the split.
        N: Total number of samples used to weight both sides.

    Returns:
        The weighted entropy as a float; 0.0 when both sides are pure.

    Raises:
        ValueError: If ``N`` is not positive.
    """
    if N <= 0:
        raise ValueError("N must be a positive number of samples")

    n_left = sum(left)
    n_right = sum(right)

    left_weight = n_left / N
    right_weight = n_right / N

    return left_weight * _node_entropy(left) + right_weight * _node_entropy(right)
    

In [76]:
def find_split(df: pd.DataFrame, split_col: str, label_col: str, metric: str = "entropy"):
    """Print the class balance on both sides of every candidate threshold.

    For each threshold from ``generate_splitpoints(df, split_col)`` the rows
    are divided into a "left" group (``split_col`` value strictly below the
    threshold) and a "right" group (value at or above it). The threshold, the
    size of each group and its ``(label == 1, label == 0)`` counts are printed.

    Args:
        df: Frame holding both columns.
        split_col: Numeric column whose thresholds are examined.
        label_col: Column holding the labels; rows are counted where it equals
            1 and where it equals 0.
        metric: Accepted but not used by the current implementation.

    Returns:
        None. All results are written to standard output.
    """
    thresholds = generate_splitpoints(df, split_col)

    table = df[[split_col, label_col]].to_numpy()
    table = table[np.argsort(table[:, 0])]
    feature = table[:, 0]
    target = table[:, 1]

    def class_counts(mask):
        # (number of rows labelled 1, number of rows labelled 0) within mask
        picked = target[mask]
        ones = int(np.count_nonzero(picked == 1))
        zeros = int(np.count_nonzero(picked == 0))
        return (ones, zeros)

    for sp in thresholds:
        print(sp)

        # Two explicit comparisons rather than a negation, so a row that
        # satisfies neither (e.g. NaN) lands in neither group.
        below = feature < sp
        at_or_above = feature >= sp

        split_lt = class_counts(below)
        split_gt = class_counts(at_or_above)

        print(f"Left size: {int(np.count_nonzero(below))}\t | \t(1, 0): {split_lt}")
        print(f"Right size: {int(np.count_nonzero(at_or_above))}\t | \t(1, 0): {split_gt}")
        print("---"*10)

    

In [77]:
# Print the per-threshold class counts for resting blood pressure against the
# label-encoded target column.
find_split(encoded_df, "resting blood pressure", "Has heart disease? (Prediction Target)")

97.0
Left size: 2	 | 	(1, 0): (2, 0)
Right size: 301	 | 	(1, 0): (163, 138)
------------------------------
100.5
Left size: 6	 | 	(1, 0): (4, 2)
Right size: 297	 | 	(1, 0): (161, 136)
------------------------------
101.5
Left size: 7	 | 	(1, 0): (5, 2)
Right size: 296	 | 	(1, 0): (160, 136)
------------------------------
103.0
Left size: 9	 | 	(1, 0): (7, 2)
Right size: 294	 | 	(1, 0): (158, 136)
------------------------------
104.5
Left size: 10	 | 	(1, 0): (8, 2)
Right size: 293	 | 	(1, 0): (157, 136)
------------------------------
105.5
Left size: 13	 | 	(1, 0): (11, 2)
Right size: 290	 | 	(1, 0): (154, 136)
------------------------------
107.0
Left size: 14	 | 	(1, 0): (12, 2)
Right size: 289	 | 	(1, 0): (153, 136)
------------------------------
109.0
Left size: 20	 | 	(1, 0): (16, 4)
Right size: 283	 | 	(1, 0): (149, 134)
------------------------------
111.0
Left size: 39	 | 	(1, 0): (24, 15)
Right size: 264	 | 	(1, 0): (141, 123)
------------------------------
113.0
Left size: 48