In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
import matplotlib.pyplot as plt
import numpy as np

import warnings
from sklearn.exceptions import DataConversionWarning

# Silence scikit-learn's DataConversionWarning by category.
warnings.filterwarnings("ignore", category=DataConversionWarning)
# NOTE: the blanket filter below hides every other warning as well, so the
# category-specific filter above is redundant while this one is active.
warnings.filterwarnings(action="ignore")

## Regression Tree

In [None]:
# Toy regression data: the integers 1..10 as input and a hand-written response
data = dict(
    X=list(range(1, 11)),
    y=[1.3, 3.5, 4.2, 5.0, 6.8, 7.4, 8.0, 8.4, 9.6, 10.1],
)
df = pd.DataFrame(data)

# Keep X two-dimensional (a one-column DataFrame) and y as a Series,
# then hold out 20% of the rows as a test set
X, y = df[['X']], df['y']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Scatter the raw (X, y) pairs and save the figure as a PDF
plt.figure(figsize=(5, 4))
plt.scatter(df['X'], df['y'], color='blue')
plt.gca().set_title('Relationship between X and y')
plt.gca().set_xlabel('X')
plt.gca().set_ylabel('y')
plt.grid(True)
plt.tight_layout()
plt.savefig('pictures/dt-xy.pdf')  # the pictures/ directory must already exist
plt.show()

In [None]:
# Fit a regression tree with no depth limit on the training split.
# fit() hands back the estimator, so construction and training are chained;
# the bare name on the last line keeps the estimator as the cell's output.
regressor = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
regressor


In [None]:
from matplotlib.gridspec import GridSpec

def plot_tree_and_predictions(depth, X_train, y_train, df):
    """Fit a depth-limited regression tree and plot it beside its predictions.

    The left panel shows the fitted tree; the right panel shows the points in
    ``df``, the tree's prediction curve over the observed range of ``df['X']``
    and a dashed vertical line at every split threshold. The figure is saved to
    ``pictures/dt-depth-{depth}.pdf`` and then shown.

    Parameters
    ----------
    depth : int
        Value passed as ``max_depth`` to ``DecisionTreeRegressor``.
    X_train : DataFrame or ndarray
        Training features (a single column, plotted as 'X').
    y_train : array-like
        Training targets.
    df : DataFrame
        Full dataset with columns 'X' and 'y', used for the scatter plot and
        for the range of the prediction curve.
    """
    # Train the model
    regressor = DecisionTreeRegressor(max_depth=depth, random_state=42)
    regressor.fit(X_train, y_train)

    # Predictions for a range of X values
    X_range = np.linspace(df['X'].min(), df['X'].max(), 100).reshape(-1, 1)
    # When the tree was fit on a DataFrame, predict on a DataFrame with the same
    # column names; passing a bare ndarray triggers scikit-learn's
    # "X does not have valid feature names" warning.
    if hasattr(X_train, 'columns'):
        X_query = pd.DataFrame(X_range, columns=X_train.columns)
    else:
        X_query = X_range
    y_pred = regressor.predict(X_query)

    # Set up a figure with two subplots with different widths
    fig = plt.figure(figsize=(16, 6))
    gs = GridSpec(1, 2, width_ratios=[3, 1])  # tree panel is 3x wider

    # Plotting the decision tree
    ax0 = fig.add_subplot(gs[0])
    tree.plot_tree(regressor, filled=True, feature_names=['X'], rounded=True, ax=ax0)
    ax0.set_title(f'Decision Tree with max_depth = {depth}')

    # Plotting the original data and the predictions
    ax1 = fig.add_subplot(gs[1])
    ax1.scatter(df['X'], df['y'], color='blue', label='Original Data')
    ax1.plot(X_range, y_pred, color='red', label='Tree Predictions')
    ax1.set_title(f'Decision Tree Predictions with max_depth = {depth}')
    ax1.set_xlabel('X')
    ax1.set_ylabel('y')

    # Drawing vertical lines for the split points.
    # Internal nodes are selected through the feature array (-2 marks a leaf)
    # rather than by comparing thresholds with -2, so a genuine split located
    # at -2.0 would not be discarded.
    fitted_tree = regressor.tree_
    split_points = fitted_tree.threshold[fitted_tree.feature != -2]
    for sp in split_points:
        ax1.axvline(x=sp, color='green', linestyle='--')

    ax1.legend()
    ax1.grid(True)

    plt.tight_layout()
    plt.savefig(f'pictures/dt-depth-{depth}.pdf')
    plt.show()
    # Release the figure: this function is called repeatedly in a loop
    plt.close(fig)

# Draw the tree and its prediction curve for max_depth = 1, 2 and 3
for depth in range(1, 4):
    plot_tree_and_predictions(depth, X_train, y_train, df)

In [None]:
# # Visualizing the entire regression tree
# plt.figure(figsize=(10, 6))
# tree.plot_tree(regressor, filled=True, feature_names=['X'], rounded=True)
# plt.show()

## Classification Tree

In [None]:
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.datasets import make_moons


# Alternative synthetic datasets (uncomment one of them to swap it in):
# X, y = make_classification(n_samples=300, n_features=2, n_redundant=0,
#                            n_informative=2, n_clusters_per_class=2,
#                            random_state=42)
# X, y = make_circles(n_samples=300, noise=0.05, factor=0.5, random_state=42)

# Dataset actually used below: 300 noisy "two moons" samples
X, y = make_moons(n_samples=300, noise=0.1, random_state=42)

# Scatter the two feature columns against each other, coloured by class
plt.figure(figsize=(6, 4))
plt.scatter(*X.T, c=y, cmap='viridis', edgecolor='k', s=50)
plt.gca().set_title("Synthetic 2D Classification Dataset")
plt.gca().set_xlabel("X1")
plt.gca().set_ylabel("X2")
plt.colorbar(label='Class')
plt.savefig('pictures/moon_dataset.pdf')
plt.show()


In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import numpy as np
from sklearn.metrics import accuracy_score

def plot_classification_tree_and_predictions_2D(depth, X, y):
    """Fit a depth-limited classification tree on 2-D data and visualise it.

    The left panel draws the fitted tree. The right panel shades the predicted
    class over a 500 x 500 grid, overlays the samples and marks every split
    with a dashed line. The accuracy in the title is computed on the same
    (X, y) the tree was fit on. The figure is saved to
    ``pictures/dt-class-moon-depth-{depth}.pdf`` and then shown.

    Parameters
    ----------
    depth : int
        Value passed as ``max_depth`` to ``DecisionTreeClassifier``.
    X : ndarray
        Feature matrix; columns 0 and 1 are plotted.
    y : array-like
        Class labels, used for fitting, colouring and the accuracy.
    """
    # Fit the classifier and score it on its own training data
    clf = DecisionTreeClassifier(max_depth=depth, random_state=42)
    clf.fit(X, y)
    train_accuracy = accuracy_score(y, clf.predict(X))

    # Two panels side by side; the tree diagram is twice as wide
    fig = plt.figure(figsize=(16, 6))
    layout = GridSpec(1, 2, width_ratios=[3, 1.5])

    # Left panel: the fitted tree
    tree_ax = fig.add_subplot(layout[0])
    plot_tree(clf, filled=True, ax=tree_ax, class_names=['0', '1'])
    tree_ax.set_title(f'Decision Tree with max_depth = {depth}')

    # Right panel: evaluate the classifier on a dense grid that extends one
    # unit beyond the data in every direction
    boundary_ax = fig.add_subplot(layout[1])
    grid_x = np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 500)
    grid_y = np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 500)
    xx, yy = np.meshgrid(grid_x, grid_y)
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    grid_labels = clf.predict(grid_points).reshape(xx.shape)

    # Shaded decision regions with the actual samples on top
    boundary_ax.contourf(xx, yy, grid_labels, alpha=0.3, cmap='viridis')
    boundary_ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor='k', cmap='viridis')

    # One dashed line per split: vertical for column 0, horizontal for column 1.
    # Leaf nodes carry feature == -2, which has no entry in the table below.
    fitted = clf.tree_
    draw_split = {0: boundary_ax.axvline, 1: boundary_ax.axhline}
    for node in range(fitted.node_count):
        draw = draw_split.get(int(fitted.feature[node]))
        if draw is not None:
            draw(fitted.threshold[node], color='green', linestyle='--')

    # Title with accuracy
    boundary_ax.set_title(f'Decision Boundaries (Accuracy: {train_accuracy:.2f})')
    boundary_ax.set_xlabel('X[0]')
    boundary_ax.set_ylabel('X[1]')
    boundary_ax.grid(True)

    plt.tight_layout()
    plt.savefig(f'pictures/dt-class-moon-depth-{depth}.pdf')
    plt.show()


# Compare classification trees of depth 1 through 4 on the current X, y
for depth in range(1, 5):
    plot_classification_tree_and_predictions_2D(depth, X, y)


## Random Forest

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Processed Cleveland heart-disease file from the UCI repository.
# Column names are supplied explicitly and "?" is read as a missing value.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
column_names = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
]

# Read the file and discard every row that contains a missing value
data = pd.read_csv(url, names=column_names, na_values="?").dropna()

# Feature matrix: every column except the label
X = data.drop(columns='target').to_numpy()

# Binary labels: target > 0 becomes 1, everything else becomes -1
y = np.where(data['target'].to_numpy() > 0, 1, -1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from scipy.stats import mode

class MyRandomForestClassifier:
    """A minimal random forest built from scikit-learn decision trees.

    Every tree is trained on a bootstrap sample of the rows (drawn with
    replacement) restricted to a random subset of the columns (drawn without
    replacement). Predictions are combined by majority vote.

    Parameters
    ----------
    n_estimators : int, default=10
        Number of trees to train.
    max_depth : int or None, default=None
        ``max_depth`` passed to every ``DecisionTreeClassifier``.
    random_state : int or None, default=None
        Seed for the row/column sampling and for the individual trees. When
        None, NumPy's global random state is used, so ``np.random.seed`` still
        controls the outcome.
    """

    def __init__(self, n_estimators=10, max_depth=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.trees = []          # list of (fitted tree, feature indices it was trained on)
        self.n_features_ = None  # number of columns seen by fit()

    def fit(self, X, y):
        """Train ``n_estimators`` trees on bootstrapped rows and random column subsets.

        Any previously fitted trees are discarded. Returns ``self``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape

        # The np.random module and RandomState expose the same choice/randint API
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        # Start from a clean slate so that refitting does not accumulate trees
        self.trees = []
        self.n_features_ = n_features

        # Each tree sees floor(sqrt(n_features)) columns, but never fewer than one
        n_selected_features = max(1, int(np.sqrt(n_features)))

        for _ in range(self.n_estimators):
            row_indices = rng.choice(n_samples, size=n_samples, replace=True)
            feature_indices = rng.choice(n_features, size=n_selected_features, replace=False)
            X_sample, y_sample = X[row_indices][:, feature_indices], y[row_indices]
            estimator = DecisionTreeClassifier(
                max_depth=self.max_depth,
                random_state=int(rng.randint(np.iinfo(np.int32).max)),
            )
            estimator.fit(X_sample, y_sample)
            self.trees.append((estimator, feature_indices))
        return self

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Raises
        ------
        ValueError
            If the forest holds no fitted trees.
        """
        if not self.trees:
            raise ValueError("This forest has no trees; call fit() first.")
        X = np.asarray(X)
        # One row of votes per tree, transposed to one row of votes per sample;
        # each tree only receives the columns it was trained on
        predictions = np.array(
            [estimator.predict(X[:, features])
             for estimator, features in self.trees]
        ).T
        return mode(predictions, axis=1).mode.ravel()

    def feature_importances(self):
        """Return per-feature importances averaged over all trees.

        Each tree's ``feature_importances_`` are mapped back to the original
        column positions; a column a tree did not use contributes zero for
        that tree.

        Raises
        ------
        ValueError
            If the forest holds no fitted trees.
        """
        if not self.trees:
            raise ValueError("This forest has no trees; call fit() first.")

        # Sized from the data seen in fit(), not from a notebook-level global
        importances = np.zeros(self.n_features_)

        # Sum up feature importances from each tree
        for estimator, feature_indices in self.trees:
            np.add.at(importances, feature_indices, estimator.feature_importances_)

        # Average the importances over all trees
        importances /= len(self.trees)

        return importances


In [None]:
from sklearn.metrics import accuracy_score

# Train a single decision tree classifier.
# random_state is pinned (42, as elsewhere in this notebook) so the reported
# accuracy is the same on every run.
tree_clf = DecisionTreeClassifier(max_depth=5, random_state=42)
tree_clf.fit(X_train, y_train)
tree_pred = tree_clf.predict(X_test)
tree_accuracy = accuracy_score(y_test, tree_pred)

# Train your custom random forest classifier.
# Seed NumPy's global generator first so that any row/feature sampling that
# draws from np.random is repeatable across runs.
np.random.seed(42)
forest_clf = MyRandomForestClassifier(n_estimators=100, max_depth=5)
forest_clf.fit(X_train, y_train)
forest_pred = forest_clf.predict(X_test)
forest_accuracy = accuracy_score(y_test, forest_pred)

# Compare their accuracies
print("Decision Tree Accuracy:", tree_accuracy)
print("Random Forest Accuracy:", forest_accuracy)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Fix the class order explicitly so both matrices share one layout and the
# heatmap ticks show the actual class labels instead of positional indices.
class_labels = np.unique(np.concatenate((y_train, y_test))).tolist()

cm_forest = confusion_matrix(y_test, forest_pred, labels=class_labels)
cm_tree = confusion_matrix(y_test, tree_pred, labels=class_labels)

# Plotting the confusion matrices side by side: decision tree, then forest
fig, ax = plt.subplots(1, 2, figsize=(8, 4))

panels = [
    (cm_tree, 'Decision Tree', tree_accuracy),
    (cm_forest, 'Random Forest', forest_accuracy),
]
for panel_ax, (matrix, model_name, model_accuracy) in zip(ax, panels):
    sns.heatmap(
        matrix, annot=True, fmt='d', cmap='Blues',
        xticklabels=class_labels, yticklabels=class_labels, ax=panel_ax,
    )
    panel_ax.set_title(f'{model_name} Accuracy: {model_accuracy:.3f}')
    panel_ax.set_xlabel('Predicted Label')
    panel_ax.set_ylabel('True Label')

plt.tight_layout()
plt.savefig('pictures/dt_vs_rf_confusion_matrix.pdf')
plt.show()



In [None]:
# Display the forest's per-feature importances (averaged over its trees)
forest_clf.feature_importances()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Requires forest_clf (the fitted forest) and column_names from earlier cells.
# Rescale the importances so that the largest one equals 100.
feature_importances = forest_clf.feature_importances()
feature_importances = (feature_importances / feature_importances.max()) * 100

feature_names = column_names[:-1]  # every name except the trailing 'target'

# Feature positions ordered by ascending importance, then reversed
sorted_indices = np.argsort(feature_importances)
sorted_indices_desc = sorted_indices[::-1]

# Horizontal bar chart: red bars with black outlines, one bar per feature
plt.figure(figsize=(6, 6))
ax = plt.gca()  # the figure's single Axes
ax.set_title("Importances in My Random Forest", fontsize=18)
bars = ax.barh(
    range(len(sorted_indices_desc)),
    feature_importances[sorted_indices_desc],
    align='center',
    color='red',
    edgecolor='black',
)
ax.set_yticks(range(len(sorted_indices_desc)))
ax.set_yticklabels(np.array(feature_names)[sorted_indices_desc], fontsize=14)
ax.set_xlabel("Variable Importance", fontsize=16)

# Keep only the bottom spine, drawn slightly thicker
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_linewidth(1.5)
plt.tight_layout()

plt.savefig('pictures/rf_feature_importance.pdf')
plt.show()
