# 🧠 Project 2 - Decision Tree Classifier
This notebook demonstrates how to prepare data, train decision trees, visualize them, and evaluate performance on the UCI Heart Disease dataset.

In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn graphviz

In [None]:
import pandas as pd
from prepare_training_data import prepare_splits, train_all_models, evaluate_all
from visualize import visualize_all, plot_class_distributions, plot_original_distribution, analyze_accuracy_vs_depth

In [None]:
# Load the raw heart-disease table; the path is relative to the notebook's
# working directory.
dataset_path = "Data/heart_disease.csv"
hd_df = pd.read_csv(dataset_path)

# Feature matrix: every column except the label column "target"
# (axis=1 makes drop() remove a column rather than a row).
feature = hd_df.drop("target", axis=1)

# Binary label: any target value greater than 0 becomes 1, everything else 0.
# The comparison is vectorized instead of calling a Python lambda per row.
y = hd_df["target"].gt(0).astype("int64")

# Sanity check: show the distinct label values left after binarization.
print(y.unique())

In [None]:
# Chart the binarized label `y` for the full dataset, before any train/test
# splitting. The second argument is passed as a plain string — presumably the
# chart title; confirm against visualize.py.
plot_original_distribution(y, "Original Heart Dataset")

In [None]:
# Proportion pairs to experiment with; each pair sums to 1.
# (presumably (train fraction, test fraction) — confirm against prepare_splits)
splits = [
    (0.4, 0.6),
    (0.6, 0.4),
    (0.8, 0.2),
    (0.9, 0.1),
]

# Build the prepared datasets from the features, the binary label and the
# proportion pairs above.
datasets = prepare_splits(feature, y, splits)

In [None]:
# Chart the class distributions of the prepared datasets, passing along the
# proportion pairs they were built from. The last argument is a plain string —
# presumably a dataset label used in the chart titles; confirm in visualize.py.
plot_class_distributions(datasets, splits, "Heart Disease")

In [None]:
# Fit the classifiers on the prepared datasets.
clfs = train_all_models(datasets)

# Render the trained models (clfs) as trees, supplying the feature column
# names and the two class labels for display.
visualize_all(
    clfs,
    feature_names=feature.columns.tolist(),
    class_names=["No disease", "Disease"],
)

In [None]:
# Evaluate the trained classifiers against the prepared datasets.
# class_names supplies the display labels for the two target classes
# (presumably in label order 0, 1 — confirm in prepare_training_data.py).
evaluate_all(clfs, datasets, class_names=["No disease", "Disease"])

In [None]:
# Run the accuracy-vs-depth analysis on a single prepared dataset.
# The feature frame in this notebook is named `feature`; the previous `X` is
# not defined in any visible cell and raised NameError on a fresh kernel.
results_df = analyze_accuracy_vs_depth(
    dataset=datasets[2],  # 80/20 split (index 2 of `splits`)
    feature_names=feature.columns.tolist(),
    class_names=["No disease", "Disease"],
)

print(results_df)