In [None]:
import pandas as pd
import numpy as np

In [2]:
# Read the heart-disease table from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='heart.csv')

In [3]:
# Print each column's name, non-null count and dtype for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1190 non-null   int64  
 1   sex                  1190 non-null   int64  
 2   chest pain type      1190 non-null   int64  
 3   resting bp s         1190 non-null   int64  
 4   cholesterol          1190 non-null   int64  
 5   fasting blood sugar  1190 non-null   int64  
 6   resting ecg          1190 non-null   int64  
 7   max heart rate       1190 non-null   int64  
 8   exercise angina      1190 non-null   int64  
 9   oldpeak              1190 non-null   float64
 10  ST slope             1190 non-null   int64  
 11  target               1190 non-null   int64  
dtypes: float64(1), int64(11)
memory usage: 111.7 KB


In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text


In [5]:
# Hold out the test rows *before* choosing features, so the selection step
# never sees the data the models are later scored on (avoids leakage).
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Rank features by absolute correlation with the target, using training rows only.
# The target is removed by name rather than by position: skipping "the first
# entry" would pick the wrong columns if a feature tied with the target at |r| = 1.
target_corr = (
    train_df.corr()['target']
    .drop('target')
    .abs()
    .sort_values(ascending=False)
)

# Select the two features with the highest correlation with the target
top_features = target_corr.index[:2]
print(f'Top features: {top_features}')

# Full-data views are kept for later cells that refer to X / y directly.
X = df[top_features]
y = df['target']

X_train, X_test = train_df[top_features], test_df[top_features]
y_train, y_test = train_df['target'], test_df['target']

In [6]:
# First model: a decision tree with default hyperparameters, scored on the test split.
model1 = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred1 = model1.predict(X_test)
accuracy1 = accuracy_score(y_true=y_test, y_pred=y_pred1)
print(f'Accuracy of model 1: {accuracy1}')

In [7]:
# Second model: same data, but tree depth capped at 5, scored on the test split.
model2 = DecisionTreeClassifier(
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)
y_pred2 = model2.predict(X_test)
accuracy2 = accuracy_score(y_true=y_test, y_pred=y_pred2)
print(f'Accuracy of model 2: {accuracy2}')

In [8]:
# Visualize decision boundaries
def plot_decision_boundary(model, X, y, resolution=300):
    """Draw a fitted classifier's decision regions with the samples on top.

    Parameters
    ----------
    model : object with a ``predict`` method
        Classifier already fitted on two features.
    X : pandas.DataFrame
        Feature frame; column 0 is drawn on the x-axis and column 1 on the
        y-axis. The column names are used as axis labels.
    y : array-like
        Class label per row of ``X``; used to colour the scatter points.
    resolution : int, default 300
        Number of grid points sampled along each axis, so the mesh always has
        ``resolution ** 2`` cells regardless of the features' value ranges.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    # Pad the plotted area by one unit on every side of the data.
    x_min, x_max = X.iloc[:, 0].min() - 1, X.iloc[:, 0].max() + 1
    y_min, y_max = X.iloc[:, 1].min() - 1, X.iloc[:, 1].max() + 1

    # A fixed point count bounds memory; a fixed step (e.g. 0.01) would need
    # billions of cells for features whose range spans hundreds of units.
    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, resolution),
        np.linspace(y_min, y_max, resolution),
    )
    grid = np.c_[xx.ravel(), yy.ravel()]

    # Estimators fitted on a DataFrame remember their column names; hand them
    # a frame with the same names so predict() does not warn about a mismatch.
    if hasattr(model, 'feature_names_in_'):
        grid = pd.DataFrame(grid, columns=model.feature_names_in_)

    Z = np.asarray(model.predict(grid)).reshape(xx.shape)

    plt.contourf(xx, yy, Z, alpha=0.3)
    plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, edgecolors='k', marker='o')
    plt.xlabel(X.columns[0])
    plt.ylabel(X.columns[1])
    plt.show()

# Show the test-split decision regions for each fitted tree, in order.
for fitted_tree in (model1, model2):
    plot_decision_boundary(fitted_tree, X_test, y_test)

In [9]:
# Explore hyperparameters
# Each entry maps one DecisionTreeClassifier argument to the values swept for it;
# all other arguments stay at their defaults while that one is varied.
hyperparameters = {
    'max_depth': range(1, 21),
    'min_samples_split': range(2, 21),
    'min_samples_leaf': range(1, 21),
    'min_weight_fraction_leaf': [i/10 for i in range(0, 5)],
    'max_features': range(1, len(top_features)+1),
    'max_leaf_nodes': range(2, 21),
    'min_impurity_decrease': [i/10 for i in range(0, 5)]
}

for param, values in hyperparameters.items():
    accuracies = []
    for value in values:
        model = DecisionTreeClassifier(**{param: value}, random_state=42)
        # Score with 5-fold cross-validation on the training split only.
        # Comparing settings on X_test would tune the model to the test data
        # and make the final test accuracy an optimistic estimate.
        fold_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        accuracies.append(fold_scores.mean())
    plt.figure()
    plt.plot(values, accuracies)
    plt.xlabel(param)
    plt.ylabel('Cross-validated accuracy')
    plt.title(f'Impact of {param} on model accuracy')
    plt.show()

In [10]:
# Fit the chosen configuration (depth <= 5, at least 4 samples to split)
# and print the learned splits as indented text rules.
best_model = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=4,
    random_state=42,
).fit(X_train, y_train)
tree_rules = export_text(best_model, feature_names=X.columns.tolist())
print(tree_rules)