imports:

In [128]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter

load dataset:

In [129]:
data = pd.read_csv("heart_attack_prediction_dataset.csv")
data

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,MSV9918,60,Male,121,94/76,61,1,1,1,0,...,10.806373,235420,19.655895,67,7,7,Thailand,Asia,Northern Hemisphere,0
8759,QSV6764,28,Female,120,157/102,73,1,0,0,1,...,3.833038,217881,23.993866,617,4,9,Canada,North America,Northern Hemisphere,0
8760,XKA5925,47,Male,250,161/75,105,0,1,1,1,...,2.375214,36998,35.406146,527,4,4,Brazil,South America,Southern Hemisphere,1
8761,EPE6801,36,Male,178,119/67,60,1,0,1,0,...,0.029104,209943,27.294020,114,2,8,Brazil,South America,Southern Hemisphere,0


check and drop irrelevant data:

In [130]:
print(data.isnull().sum)

bp = data['Blood Pressure'].str.split('/', expand=True)
data['BP_Systolic']  = pd.to_numeric(bp[0], errors='coerce')
data['BP_Diastolic'] = pd.to_numeric(bp[1], errors='coerce')
data["Sex"] = data["Sex"].map({'Male':1,'Female':0}) #1 represents male, 0 represents female.
data["Diet"] = data["Diet"].map({'Unhealthy':-1,'Average':0, 'Healthy':1}) #-1 represents unhealthy, 0 represents average, 1 re[resents healthy.
data = data.drop(columns=["Blood Pressure", "Hemisphere", "Patient ID", "Continent", "Country"])
data



<bound method DataFrame.sum of       Patient ID    Age    Sex  Cholesterol  Blood Pressure  Heart Rate  \
0          False  False  False        False           False       False   
1          False  False  False        False           False       False   
2          False  False  False        False           False       False   
3          False  False  False        False           False       False   
4          False  False  False        False           False       False   
...          ...    ...    ...          ...             ...         ...   
8758       False  False  False        False           False       False   
8759       False  False  False        False           False       False   
8760       False  False  False        False           False       False   
8761       False  False  False        False           False       False   
8762       False  False  False        False           False       False   

      Diabetes  Family History  Smoking  Obesity  ...  \
0        Fa

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,...,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Heart Attack Risk,BP_Systolic,BP_Diastolic
0,67,1,208,72,0,0,1,0,0,4.168189,...,9,6.615001,261404,31.251233,286,0,6,0,158,88
1,21,1,389,98,1,1,1,1,1,1.813242,...,1,4.963459,285768,27.194973,235,1,7,0,165,93
2,21,0,324,72,1,0,0,0,0,2.078353,...,9,9.463426,235282,28.176571,587,4,4,0,174,99
3,84,1,383,73,1,1,1,0,1,9.828130,...,9,7.648981,125640,36.464704,378,3,4,0,163,100
4,66,1,318,93,1,1,1,1,0,5.804299,...,6,1.514821,160555,21.809144,231,1,5,0,91,88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,60,1,121,61,1,1,1,0,1,7.917342,...,8,10.806373,235420,19.655895,67,7,7,0,94,76
8759,28,0,120,73,1,0,0,1,0,16.558426,...,8,3.833038,217881,23.993866,617,4,9,0,157,102
8760,47,1,250,105,0,1,1,1,1,3.148438,...,5,2.375214,36998,35.406146,527,4,4,1,161,75
8761,36,1,178,60,1,0,1,0,0,3.789950,...,5,0.029104,209943,27.294020,114,2,8,0,119,67


In [131]:
# data = (data - data.mean())/data.std()
# for column in data.columns:
#     out_of_range = (data[column] < -3) | (data[column] > 3)
#     if out_of_range.any():
#         print(f"Column '{column}' has values outside [-3, 3]")
#     else:
#         print(f"✅ Column '{column}' is all within [-3, 3]")

In [132]:
valid_ranges = {
    'Age': (0, 120),
    'Cholesterol': (100, 500),
    'Systolic BP': (70, 250),
    'Diastolic BP': (40, 150),
    'Heart Rate': (30, 200),
    'Stress Level': (0, 10),
    'Sedentary Hours Per Day': (0, 24),
    'Income': (0, 1_000_000),
    'BMI': (10, 70),
    'Triglycerides': (30, 1000),
    'Exercise Hours Per Week': (0, 40),
    'Physical Activity Days Per Week': (0, 7),
    'Sleep Hours Per Day': (0, 24),
}
print("🔎 Checking for invalid values in your dataset...")
for col, (min_val, max_val) in valid_ranges.items():
    if col in data.columns:
        invalid = data[(data[col] < min_val) | (data[col] > max_val)]
        if not invalid.empty:
            print(f"\n⚠️ Column '{col}' has {len(invalid)} invalid value(s) outside range ({min_val}, {max_val}):")
            print(invalid[[col]])

# Optional summary count
print("\n📊 Summary of invalid values:")
for col, (min_val, max_val) in valid_ranges.items():
    if col in data.columns:
        count = ((data[col] < min_val) | (data[col] > max_val)).sum()
        print(f"{col}: {count} invalid value(s)")

🔎 Checking for invalid values in your dataset...

📊 Summary of invalid values:
Age: 0 invalid value(s)
Cholesterol: 0 invalid value(s)
Heart Rate: 0 invalid value(s)
Stress Level: 0 invalid value(s)
Sedentary Hours Per Day: 0 invalid value(s)
Income: 0 invalid value(s)
BMI: 0 invalid value(s)
Triglycerides: 0 invalid value(s)
Exercise Hours Per Week: 0 invalid value(s)
Physical Activity Days Per Week: 0 invalid value(s)
Sleep Hours Per Day: 0 invalid value(s)


part 1 - Decision trees

In [133]:
X = data.drop("Heart Attack Risk", axis=1)
y = data["Heart Attack Risk"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Initialize and train the classifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

In [134]:
# Predict on the test set
y_pred = model.predict(X_test)

# Print accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Detailed report
print(classification_report(y_test, y_pred))

Accuracy: 0.5199695701787752
              precision    recall  f1-score   support

           0       0.63      0.60      0.62      1691
           1       0.34      0.37      0.36       938

    accuracy                           0.52      2629
   macro avg       0.49      0.49      0.49      2629
weighted avg       0.53      0.52      0.52      2629



now, we want to build implementation of decision tree:

In [135]:
from collections import Counter

class Leaf:
    def __init__(self, rows):
        self.rows = rows
        self.predictions = Counter(row[-1] for row in rows)

class DecisionNode:
    def __init__(self, question, true_branch, false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

class Question:
    def __init__(self, feature, value):
        self.feature = feature
        self.value = value

    def is_matched(self, ex):
        value = ex[self.feature]
        return value >= self.value

    def __repr__(self):
        return f" is feature [{self.feature}] >= {self.value}?"

class DecisionTree:
    def __init__(self, max_depth=10, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.root = None

    def fit(self, rows):
        # Build the tree
        self.root = self._build_tree(rows, depth=0)
        return self

    def predict(self, samples):
        if self.root is None:
            raise ValueError("Tree must be fitted before making predictions")
        return [self._prediction(sample, self.root) for sample in samples]

    def predict_proba(self, samples):
        if self.root is None:
            raise ValueError("Tree must be fitted before making predictions")
        return [self._prediction_proba(sample, self.root) for sample in samples]

    def _build_tree(self, rows, depth=0):
        # Stop Criteria: Check if node is pure (all same labels) / we've reached maximum depth / we have too few samples
        labels = [row[-1] for row in rows]
        if len(set(labels)) == 1 or depth >= self.max_depth or len(rows) < self.min_samples_leaf:
            return Leaf(rows)

        # Try to find the best split
        gain, question = self._find_best_split(rows)

        # Stop Criteria: Check if no improvement possible (gain = 0)
        if gain == 0:
            return Leaf(rows)

        # Check if split would create leaves that are too small
        true_rows, false_rows = self._partition(rows, question)

        if len(true_rows) == 0 or len(false_rows) == 0:
            return Leaf(rows)

        # Recursively build subtrees with incremented depth
        true_branch = self._build_tree(true_rows, depth + 1)
        false_branch = self._build_tree(false_rows, depth + 1)

        return DecisionNode(question, true_branch, false_branch)

    def _prediction(self, row, node):
        if isinstance(node, Leaf):
            return node.predictions.most_common(1)[0][0]  # Return most common class
        if node.question.is_matched(row):
            return self._prediction(row, node.true_branch)
        else:
            return self._prediction(row, node.false_branch)

    def _prediction_proba(self, row, node):
        """Return full predictions Counter for probability calculation"""
        if isinstance(node, Leaf):
            return node.predictions
        if node.question.is_matched(row):
            return self._prediction_proba(row, node.true_branch)
        else:
            return self._prediction_proba(row, node.false_branch)

    def _gini(self, rows):
        counts = Counter(rows)
        impurity = 1
        for label in counts:
            probability = counts[label] / float(len(rows))
            impurity -= probability ** 2
        return impurity

    def _info_gain(self, left, right, current_uncertainty):
        # do current_uncertainty - weighted average of impurity of nodes
        p = float(len(left)) / (len(left) + len(right))
        return current_uncertainty - (p * self._gini(left)) - ((1-p) * self._gini(right))

    def _partition(self, rows, question):
        good_rows, bad_rows = [], []
        for row in rows:
            if question.is_matched(row):
                good_rows.append(row)
            else:
                bad_rows.append(row)
        return good_rows, bad_rows

    def _find_best_split(self, rows):
        best_gain = 0
        best_question = None
        current_uncertainty = self._gini([row[-1] for row in rows])
        len_of_features = len(rows[0]) - 1

        for col in range(len_of_features):
            values = set(row[col] for row in rows)
            for val in values:
                question = Question(col, val)
                good_rows, bad_rows = self._partition(rows, question)
                if not good_rows or not bad_rows:
                    continue
                gain = self._info_gain([row[-1] for row in good_rows], [row[-1] for row in bad_rows], current_uncertainty)
                if gain > best_gain:
                    best_gain = gain
                    best_question = question
        return best_gain, best_question

