# 🧠 Project 2 - Decision Tree Classifier
This notebook demonstrates how to prepare data, train decision trees, visualize them, and evaluate performance on the UCI Heart Disease dataset.

In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn graphviz

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
from sklearn.tree import export_graphviz
from IPython.display import display

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from matplotlib.ticker import MaxNLocator
import pandas as pd
# Apply seaborn's "whitegrid" style globally so every later plot picks it up.
sns.set(style="whitegrid")

In [None]:
def prepare_splits(feature, target, splits=((0.4, 0.6), (0.6, 0.4), (0.8, 0.2), (0.9, 0.1))):
    """
    One-hot encode the features and build one stratified train/test split per proportion.

    Arguments:
    - feature: Feature set (DataFrame)
    - target: Target labels (Series or array); also used to stratify every split
    - splits: Iterable of (train_size, test_size) pairs handed to train_test_split
      (default: 40/60, 60/40, 80/20, 90/10)

    Returns:
    - datasets: List of (feature_train, feature_test, label_train, label_test)
      tuples, one per entry of `splits`, in the same order
    - feature_names: List with the column names of the encoded feature matrix
    """

    # One-hot encode categorical features (drop_first=True is forwarded to get_dummies)
    feature_encoded = pd.get_dummies(feature, drop_first=True)
    feature_names = feature_encoded.columns.tolist()

    # Every split uses the same random_state, so re-running gives the same partitions.
    datasets = [
        tuple(
            train_test_split(
                feature_encoded,
                target,
                train_size=train_size,
                test_size=test_size,
                stratify=target,
                shuffle=True,
                random_state=42,
            )
        )
        for train_size, test_size in splits
    ]

    return datasets, feature_names


In [None]:
def train_model(feature_train, label_train):
    """
    Fit a single decision tree on one training split.

    Arguments:
    - feature_train: training feature columns (everything except the target)
    - label_train: training target labels

    Returns:
    - the fitted DecisionTreeClassifier
    """
    # criterion='entropy' selects the split-quality measure;
    # random_state=42 makes repeated runs build the same tree.
    tree = DecisionTreeClassifier(criterion='entropy', random_state=42)

    # fit() returns the estimator itself, so the fitted tree can be returned directly.
    return tree.fit(feature_train, label_train)

In [None]:
def train_all_models(datasets):
    """
    Train one decision tree per prepared split.

    Arguments:
    - datasets: list of (feature_train, feature_test, label_train, label_test) tuples

    Returns:
    - list of fitted trees, in the same order as `datasets`
    """
    # Only the training halves of each split are needed here.
    return [
        train_model(train_features, train_labels)
        for (train_features, _, train_labels, _) in datasets
    ]

In [None]:
def evaluate_model(clf, feature_test, label_test, class_names, title=""):
    """
    Print a classification report and draw the confusion matrix for one fitted tree.

    Arguments:
    - clf: trained decision tree model
    - feature_test: test feature columns (everything except the target)
    - label_test: true target labels of the test set
    - class_names: display names passed to the report and to the matrix plot
    - title: text appended after "Confusion Matrix " in the plot title
    """
    # Predict the test labels with the fitted model
    predictions = clf.predict(feature_test)

    # Text report
    print(classification_report(label_test, predictions, target_names=class_names))

    # Confusion matrix, drawn with the blue colormap
    matrix_display = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(label_test, predictions),
        display_labels=class_names,
    )
    matrix_display.plot(cmap=plt.cm.Blues)

    # The colorbar added by plot() is expected to be the last axes of the figure;
    # force integer ticks on it because the cells hold counts.
    colorbar_axes = matrix_display.figure_.axes[-1]
    colorbar_axes.yaxis.set_major_locator(MaxNLocator(integer=True))

    # Title and grid are set on the matrix axes itself.
    matrix_axes = matrix_display.ax_
    matrix_axes.set_title(f"Confusion Matrix {title}")
    matrix_axes.grid(False)

    plt.show()

In [None]:
def evaluate_all(clfs, datasets, class_names):
    """
    2.3 Evaluating the decision tree classifiers.

    Arguments:
    - clfs: list of fitted decision trees
    - datasets: list of (feature_train, feature_test, label_train, label_test)
      tuples, paired with `clfs` by position
    - class_names: display names of the classes
    """
    for tree_number, (clf, split) in enumerate(zip(clfs, datasets), start=1):
        feature_train, feature_test, _, label_test = split

        # Recompute the actual train/test percentages from the split sizes
        n_train = len(feature_train)
        n_test = len(feature_test)
        percent_train = round(n_train * 100 / (n_train + n_test))
        percent_test = 100 - percent_train

        print(f"Evaluation Tree {tree_number}: ({percent_train:.1f}% train)")

        # Report + confusion matrix, titled with the tree depth and the split
        evaluate_model(
            clf,
            feature_test,
            label_test,
            class_names,
            title=f"(Depth = {clf.get_depth()}, {percent_train}/{percent_test} Split)",
        )
In [None]:
def visualize_tree(clf, feature_names, class_names):
    """
    Render one fitted decision tree inline with Graphviz.

    Arguments:
    - clf: decision tree model
    - feature_names: list of feature names
    - class_names: list of class display names
    """
    # Export options:
    #   out_file=None           -> return the DOT description as a string instead of writing a file
    #   filled / rounded        -> colored, rounded node boxes
    #   special_characters=True -> allow special characters in labels
    export_options = dict(
        out_file=None,
        feature_names=feature_names,
        class_names=class_names,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    dot_source = export_graphviz(clf, **export_options)

    # Build the graph from the DOT text and show it in the notebook
    display(graphviz.Source(dot_source))

In [None]:
def visualize_all(clfs, feature_names, class_names, splits):
    """
    2.2 Building the decision tree classifiers: draw every trained tree.

    Arguments:
    - clfs: list of fitted decision trees
    - feature_names: list of feature names
    - class_names: list of class display names
    - splits: (train, test) proportions, looked up by the position of each tree
    """
    for position, clf in enumerate(clfs):
        split = splits[position]
        train_fraction = split[0]
        test_fraction = split[1]

        # Heading that says which split this tree was trained on
        print(f"Dataset {train_fraction * 100:.0f}/{test_fraction * 100:.0f} Split")
        visualize_tree(clf, feature_names, class_names)

In [None]:
def plot_class_distributions(datasets, splits, title):
    """
    Draw side-by-side class-count bar charts (train vs. test) for every split.

    Arguments:
    - datasets: list of (feature_train, feature_test, label_train, label_test) tuples
    - splits: (train, test) proportions, looked up by the position of each dataset
    - title: dataset name used as the prefix of each figure's super-title
    """
    for i, (_, _, label_train, label_test) in enumerate(datasets):
        # round() instead of int(): a product such as 0.29 * 100 evaluates to
        # 28.999999999999996, which int() would truncate to 28.
        train_pct = round(splits[i][0] * 100)
        test_pct = round(splits[i][1] * 100)

        # One figure with two charts in a single row, 10 x 4 inches overall
        fig, axs = plt.subplots(1, 2, figsize=(10, 4))

        panels = (
            (axs[0], label_train, f"Train {train_pct}%"),
            (axs[1], label_test, f"Test {test_pct}%"),
        )
        for ax, labels, panel_title in panels:
            # One bar per class label found in this half of the split
            sns.countplot(x=labels, ax=ax)
            ax.set_title(panel_title)
            ax.set_xlabel("Target")
            ax.set_ylabel("Count")
            # Counts are whole numbers, so keep the y ticks integral
            ax.yaxis.set_major_locator(MaxNLocator(integer=True))
            # Light dashed grid to make the bar heights easier to read
            ax.grid(True, linestyle="--", alpha=0.5)
            _annotate_bar_counts(ax)

        fig.suptitle(f"{title} {train_pct}/{test_pct}")
        fig.tight_layout()
        plt.show()


def _annotate_bar_counts(ax):
    """Write the integer height of every bar on `ax` just above that bar."""
    for bar in ax.patches:
        height = bar.get_height()
        ax.annotate(
            f'{int(height)}',
            (bar.get_x() + bar.get_width() / 2., height),
            ha='center', va='center',
            fontsize=12, color='black',
            xytext=(0, 9), textcoords='offset points',
        )

In [None]:
def plot_original_distribution(target, title="Original Dataset"):
    """
    Draw a bar chart with the number of samples per class in `target`.

    Arguments:
    - target: target labels of the full dataset
    - title: chart title
    """
    # One bar per distinct class value
    ax = sns.countplot(x=target)

    # Write the count above every bar
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2.
        ax.annotate(
            f'{int(bar_height)}',
            (bar_center, bar_height),
            ha='center', va='center',
            fontsize=12, color='black',
            xytext=(0, 9), textcoords='offset points',
        )

    ax.set_title(title)
    ax.set_xlabel("Label")
    ax.set_ylabel("Count")
    # Light dashed grid to make the bar heights easier to read
    ax.grid(True, linestyle="--", alpha=0.5)
    plt.show()

In [None]:
def analyze_accuracy_vs_depth(dataset, feature_names, class_names, max_depth_values=(None, 2, 3, 4, 5, 6, 7)):
    """
    2.4 The depth and accuracy of a decision tree.

    Trains one entropy-based tree per `max_depth` value on a single split,
    prints its test accuracy, draws the tree, and collects the accuracies.

    Arguments:
    - dataset: one (X_train, X_test, y_train, y_test) tuple
    - feature_names: list of feature names (used when drawing the trees)
    - class_names: list of class display names (used when drawing the trees)
    - max_depth_values: depths to try; None means no depth limit

    Returns:
    - DataFrame with columns "max_depth" and "accuracy", one row per depth in
      the order given; an unlimited depth is stored as the string "None"
    """
    # accuracy_score is not part of the notebook's import cell, so import it here.
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test = dataset
    results = []

    for depth in max_depth_values:
        # Same settings as the unrestricted trees, plus the depth limit
        clf = DecisionTreeClassifier(criterion='entropy', max_depth=depth, random_state=42)
        clf.fit(X_train, y_train)

        # Fraction of test samples predicted correctly
        acc = accuracy_score(y_test, clf.predict(X_test))

        # Store None as text so the column can be used directly as tick labels
        results.append(("None" if depth is None else depth, acc))

        print(f"Decision Tree with max_depth = {depth}, Accuracy = {acc:.4f}")
        # Reuse the shared renderer instead of repeating the export code
        visualize_tree(clf, feature_names, class_names)

    return pd.DataFrame(results, columns=["max_depth", "accuracy"])

In [None]:
def plot_accuracy_vs_depth(results, title="Accuracy vs. Tree Depth"):
    """
    Line chart of test accuracy against max_depth.

    Arguments:
    - results: DataFrame with columns "max_depth" and "accuracy"; an unlimited
      depth is expected to be stored as the string "None"
    - title: chart title
    """
    # Put the limited depths first and the unlimited ("None") rows last
    is_unlimited = results["max_depth"] == "None"
    ordered = pd.concat(
        [results[~is_unlimited], results[is_unlimited]],
        ignore_index=True,
    )

    # Evenly spaced x positions; the depth values are only used as tick labels
    positions = ordered.index

    plt.figure()
    plt.plot(positions, ordered["accuracy"], marker='o')
    plt.xticks(positions, ordered["max_depth"])

    plt.xlabel("max_depth")
    plt.ylabel("Accuracy")
    plt.title(title)
    plt.grid(True)
    plt.show()

In [None]:
# Location of the CSV, relative to the notebook's working directory
dataset_path = "Data/heart_disease.csv"
hd_df = pd.read_csv(dataset_path)

# Features: every column except "target"
feature = hd_df.drop(columns="target")

# Binary target: any value above 0 becomes 1, everything else becomes 0
y = hd_df["target"].gt(0).astype("int64")
print(y.unique())

In [None]:
# Class balance of the full dataset, before any splitting
plot_original_distribution(target=y, title="Original Heart Dataset")

In [None]:
# (train, test) proportions to experiment with
splits = [(0.4, 0.6), (0.6, 0.4), (0.8, 0.2), (0.9, 0.1)]
# Encode the features and build one train/test split per proportion
datasets, feature_names = prepare_splits(feature=feature, target=y, splits=splits)

In [None]:
# Class balance of the train and test halves of every prepared split
plot_class_distributions(datasets=datasets, splits=splits, title="Heart Disease")

In [None]:
# Fit one decision tree per split
clfs = train_all_models(datasets=datasets)
# Draw each fitted tree, labelled with the split it was trained on
visualize_all(
    clfs=clfs,
    feature_names=feature_names,
    class_names=["No disease", "Disease"],
    splits=splits,
)

In [None]:
# Classification report and confusion matrix for every tree on its own test set
evaluate_all(
    clfs=clfs,
    datasets=datasets,
    class_names=["No disease", "Disease"],
)

In [None]:
# datasets[2] is the 80/20 split (third entry of `splits`)
results_df = analyze_accuracy_vs_depth(datasets[2], feature_names, ["No disease", "Disease"])

print(results_df)

In [None]:
# Test accuracy as a function of the depth limit
plot_accuracy_vs_depth(results=results_df, title="Accuracy vs Depth")