# DecisionTreeModel

In [1]:
import torch
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.metrics import accuracy_score, r2_score
import numpy as np
import logging
from typing import Optional, Union, Dict, Any, Tuple

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DecisionTreeNode:
    """
    A node in the Decision Tree.

    Attributes:
        feature_index (Optional[int]): Index of the feature used for splitting.
        threshold (Optional[float]): Threshold value for the split.
        left (Optional[DecisionTreeNode]): Left child node.
        right (Optional[DecisionTreeNode]): Right child node.
        value (Optional[Union[float, int]]): Value to predict if the node is a leaf.
        gain (float): Gain achieved by the split.
        class_counts (Optional[np.ndarray]): Class counts in the node (for classification).
    """
    def __init__(self,
                 feature_index: Optional[int] = None,
                 threshold: Optional[float] = None,
                 left: Optional['DecisionTreeNode'] = None,
                 right: Optional['DecisionTreeNode'] = None,
                 value: Optional[Union[float, int]] = None,
                 gain: float = 0.0,
                 class_counts: Optional[np.ndarray] = None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
        self.gain = gain
        self.class_counts = class_counts


class DecisionTreeModel(BaseEstimator, ClassifierMixin, RegressorMixin):
    """
    A Decision Tree model implemented in PyTorch, compatible with scikit-learn.
    Supports both classification and regression tasks with GPU acceleration.

    Parameters:
        max_depth (Optional[int]): Maximum depth of the tree. If None, nodes are expanded until all leaves are pure.
        min_samples_split (int): Minimum number of samples required to split an internal node.
        max_features (Union[int, float, str, None]): Number or fraction of features to consider when looking for the best split.
            If int, then consider `max_features` features at each split.
            If float, then `max_features` is a fraction and `int(max_features * n_features)` features are considered.
            If 'sqrt', then `max_features = sqrt(n_features)`.
            If 'log2', then `max_features = log2(n_features)`.
            If None, then `max_features = n_features`.
        task (str): 'classification' or 'regression'.
        random_state (Optional[int]): Seed for reproducibility.
        min_gain (float): Minimum gain required to make a split. Splits with gain below this threshold are ignored.
        classes_ (Optional[np.ndarray]): Global classes for classification alignment.
    """
    def __init__(self,
                 max_depth: Optional[int] = None,
                 min_samples_split: int = 2,
                 max_features: Union[int, float, str, None] = None,
                 task: str = 'classification',
                 random_state: Optional[int] = None,
                 min_gain: float = 1e-7,
                 classes_: Optional[np.ndarray] = None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.task = task
        self.random_state = random_state
        self.min_gain = min_gain
        self.root_ = None
        self.is_fitted_ = False
        self.device = 'cpu'
        self.classes_ = classes_  # Store global classes
        self.n_classes_ = len(self.classes_) if self.classes_ is not None else None

        if self.random_state is not None:
            torch.manual_seed(self.random_state)
            np.random.seed(self.random_state)

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'DecisionTreeModel':
        """
        Build the decision tree from the training set (X, y).

        Parameters:
            X (np.ndarray): Training data of shape (n_samples, n_features).
            y (np.ndarray): Target values of shape (n_samples,).

        Returns:
            self
        """
        # Validate input
        X, y = check_X_y(X, y)
        n_samples, n_features = X.shape
        self.n_features_ = n_features  # Store for later use

        # Validate task
        self._validate_fit(X, y)

        # Determine device
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        logger.info(f"Using device: {self.device}")

        # Convert to torch tensors and move to device
        X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
        if self.task == 'classification':
            y_tensor = torch.tensor(y, dtype=torch.long).to(self.device)
            if self.classes_ is None:
                self.classes_ = torch.unique(y_tensor).cpu().numpy()
                self.n_classes_ = len(self.classes_)
            else:
                self.n_classes_ = len(self.classes_)
        elif self.task == 'regression':
            y_tensor = torch.tensor(y, dtype=torch.float32).to(self.device)
        else:
            raise ValueError("Invalid task. Choose 'classification' or 'regression'.")

        # Build the tree
        self.root_ = self._build_tree(X_tensor, y_tensor, depth=0)
        self.is_fitted_ = True
        return self

    def _build_tree(self, X: torch.Tensor, y: torch.Tensor, depth: int) -> DecisionTreeNode:
        """
        Recursively build the decision tree.

        Parameters:
            X (torch.Tensor): Feature matrix.
            y (torch.Tensor): Target vector.
            depth (int): Current depth of the tree.

        Returns:
            DecisionTreeNode: The root node of the subtree.
        """
        num_samples, num_features = X.shape

        # Check for pure node or stopping criteria
        if self.task == 'classification':
            unique_classes = torch.unique(y)
            if len(unique_classes) == 1:
                leaf_value, class_counts = self._calculate_leaf_value(y)
                return DecisionTreeNode(value=leaf_value, class_counts=class_counts)
        elif self.task == 'regression':
            if torch.all(y == y[0]):
                leaf_value = self._calculate_leaf_value(y)
                return DecisionTreeNode(value=leaf_value)

        if self.max_depth is not None and depth >= self.max_depth:
            if self.task == 'classification':
                leaf_value, class_counts = self._calculate_leaf_value(y)
                return DecisionTreeNode(value=leaf_value, class_counts=class_counts)
            elif self.task == 'regression':
                leaf_value = self._calculate_leaf_value(y)
                return DecisionTreeNode(value=leaf_value)

        if num_samples < self.min_samples_split:
            if self.task == 'classification':
                leaf_value, class_counts = self._calculate_leaf_value(y)
                return DecisionTreeNode(value=leaf_value, class_counts=class_counts)
            elif self.task == 'regression':
                leaf_value = self._calculate_leaf_value(y)
                return DecisionTreeNode(value=leaf_value)

        # Find the best split
        best_split = self._get_best_split(X, y, num_features)

        if best_split["gain"] < self.min_gain:
            if self.task == 'classification':
                leaf_value, class_counts = self._calculate_leaf_value(y)
                return DecisionTreeNode(value=leaf_value, class_counts=class_counts)
            elif self.task == 'regression':
                leaf_value = self._calculate_leaf_value(y)
                return DecisionTreeNode(value=leaf_value)

        # Recursively build left and right subtrees
        left_subtree = self._build_tree(best_split["X_left"], best_split["y_left"], depth + 1)
        right_subtree = self._build_tree(best_split["X_right"], best_split["y_right"], depth + 1)

        return DecisionTreeNode(
            feature_index=best_split["feature_index"],
            threshold=best_split["threshold"],
            left=left_subtree,
            right=right_subtree,
            gain=best_split["gain"]
        )

    def _get_best_split(self, X: torch.Tensor, y: torch.Tensor, num_features: int) -> Dict[str, Any]:
        """
        Find the best split for the current node.

        Parameters:
            X (torch.Tensor): Feature matrix.
            y (torch.Tensor): Target vector.
            num_features (int): Number of features.

        Returns:
            dict: Information about the best split.
        """
        best_split = {}
        max_gain = -1

        if self.task == 'classification':
            parent_loss = self._gini(y)
        elif self.task == 'regression':
            parent_loss = self._mse(y)

        # Determine number of features to consider
        if self.max_features is None:
            n_features = num_features
        elif isinstance(self.max_features, int):
            n_features = min(self.max_features, num_features)
        elif isinstance(self.max_features, float):
            n_features = max(1, int(self.max_features * num_features))
        elif isinstance(self.max_features, str):
            if self.max_features == 'sqrt':
                n_features = max(1, int(np.sqrt(num_features)))
            elif self.max_features == 'log2':
                n_features = max(1, int(np.log2(num_features)))
            else:
                raise ValueError("Unsupported string value for max_features. Use 'sqrt' or 'log2'.")
        else:
            raise ValueError("max_features must be int, float, str ('sqrt' or 'log2'), or None")

        # Randomly select features to consider
        feature_indices = torch.randperm(num_features)[:n_features].tolist()

        for feature_index in feature_indices:
            X_column = X[:, feature_index]
            # Sort the feature values and corresponding targets
            sorted_indices = torch.argsort(X_column)
            X_sorted = X_column[sorted_indices]
            y_sorted = y[sorted_indices]

            # Compute potential split thresholds (unique midpoints)
            unique_values = torch.unique(X_sorted)
            if len(unique_values) == 1:
                continue  # No split possible on this feature
            thresholds = (unique_values[:-1] + unique_values[1:]) / 2.0

            for threshold in thresholds:
                # Determine the split
                left_mask = X_sorted <= threshold
                right_mask = X_sorted > threshold

                y_left = y_sorted[left_mask]
                y_right = y_sorted[right_mask]
                X_left = X[sorted_indices[left_mask], :]
                X_right = X[sorted_indices[right_mask], :]

                if len(y_left) == 0 or len(y_right) == 0:
                    continue  # Skip invalid splits

                # Calculate loss
                if self.task == 'classification':
                    loss = (len(y_left) / len(y)) * self._gini(y_left) + \
                           (len(y_right) / len(y)) * self._gini(y_right)
                elif self.task == 'regression':
                    loss = (len(y_left) / len(y)) * self._mse(y_left) + \
                           (len(y_right) / len(y)) * self._mse(y_right)

                gain = parent_loss - loss

                if gain > max_gain:
                    max_gain = gain
                    best_split = {
                        "feature_index": feature_index,
                        "threshold": threshold.item(),
                        "X_left": X_left,    # Full feature matrix for left split
                        "y_left": y_left,
                        "X_right": X_right,  # Full feature matrix for right split
                        "y_right": y_right,
                        "gain": gain
                    }

        return best_split

    def _gini(self, y: torch.Tensor) -> float:
        """
        Calculate Gini impurity for classification.

        Parameters:
            y (torch.Tensor): Target vector.

        Returns:
            float: Gini impurity.
        """
        if self.classes_ is None:
            raise AttributeError("Classes not defined. Ensure that classes_ is set correctly.")
        class_counts = torch.bincount(y, minlength=self.n_classes_).float()
        probabilities = class_counts / len(y)
        gini = 1.0 - torch.sum(probabilities ** 2)
        return gini.item()

    def _mse(self, y: torch.Tensor) -> float:
        """
        Calculate Mean Squared Error for regression.

        Parameters:
            y (torch.Tensor): Target vector.

        Returns:
            float: MSE.
        """
        mse = torch.mean((y - torch.mean(y)) ** 2)
        return mse.item()

    def _calculate_leaf_value(self, y: torch.Tensor) -> Union[float, int, Tuple[int, np.ndarray]]:
        """
        Calculate the value to assign to a leaf node.

        Parameters:
            y (torch.Tensor): Target vector.

        Returns:
            float or int or tuple: Prediction value and class counts (for classification).
        """
        if self.task == 'classification':
            # Ensure class_counts includes all classes by setting minlength
            counts = torch.bincount(y, minlength=self.n_classes_).cpu().numpy()
            most_common = torch.mode(y).values.item()
            return most_common, counts
        elif self.task == 'regression':
            return torch.mean(y).item()

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predict target values for samples in X.

        Parameters:
            X (np.ndarray): Input data of shape (n_queries, n_features).

        Returns:
            np.ndarray: Predicted class labels or regression values.
        """
        check_is_fitted(self, 'is_fitted_')
        X = check_array(X)
        # Convert to torch tensor and move to device
        X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
        predictions = []

        for x in X_tensor:
            predictions.append(self._traverse_tree(x))

        return np.array(predictions)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
        Predict class probabilities for samples in X.

        Parameters:
            X (np.ndarray): Input data of shape (n_queries, n_features).

        Returns:
            np.ndarray: Predicted probabilities of shape (n_queries, n_classes).
        """
        if self.task != 'classification':
            raise AttributeError("predict_proba is only available for classification tasks.")
        check_is_fitted(self, 'is_fitted_')
        if self.classes_ is None:
            raise AttributeError("Classes not found. Ensure that the model is fitted properly.")
        X = check_array(X)

        n_samples = X.shape[0]
        n_classes = len(self.classes_)
        proba = np.zeros((n_samples, n_classes))

        # Convert to torch tensor and move to device
        X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)

        for i, x in enumerate(X_tensor):
            node = self.root_
            while node.value is None:
                if x[node.feature_index].item() <= node.threshold:
                    node = node.left
                else:
                    node = node.right
            # Normalize class counts to probabilities
            if node.class_counts is not None:
                proba[i] = node.class_counts / np.sum(node.class_counts)
            else:
                # Fallback if class_counts are not available
                proba[i, node.value] = 1.0

        return proba

    def _traverse_tree(self, x: torch.Tensor) -> Union[float, int]:
        """
        Traverse the tree to make a prediction for a single sample.

        Parameters:
            x (torch.Tensor): Single sample.

        Returns:
            float or int: Predicted value.
        """
        node = self.root_
        while node.value is None:
            if x[node.feature_index].item() <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """
        Compute the score of the model.
        For classification: accuracy.
        For regression: R² score.

        Parameters:
            X (np.ndarray): Test samples of shape (n_samples, n_features).
            y (np.ndarray): True labels or target values of shape (n_samples,).

        Returns:
            float: Score.
        """
        X = check_array(X)
        y = np.array(y)
        y_pred = self.predict(X)

        if self.task == 'classification':
            return accuracy_score(y, y_pred) * 100.0  # Percentage
        elif self.task == 'regression':
            return r2_score(y, y_pred)
        else:
            raise ValueError("Invalid task. Choose 'classification' or 'regression'.")

    def feature_importances_(self) -> np.ndarray:
        """
        Calculate feature importances based on the trained tree.

        Returns:
            np.ndarray: Array of feature importances.
        """
        if not self.is_fitted_:
            raise AttributeError("This DecisionTreeModel instance is not fitted yet.")

        importances = np.zeros(self.n_features_)

        def _accumulate_importances(node: DecisionTreeNode):
            if node is None or node.value is not None:
                return
            importances[node.feature_index] += node.gain
            _accumulate_importances(node.left)
            _accumulate_importances(node.right)

        _accumulate_importances(self.root_)
        total_gain = np.sum(importances)
        if total_gain > 0:
            importances /= total_gain
        return importances

    @property
    def max_features_(self) -> int:
        """
        Determine the number of features based on max_features parameter.

        Returns:
            int: Number of features to consider.
        """
        if self.max_features is None:
            return self.n_features_
        elif isinstance(self.max_features, int):
            return min(self.max_features, self.n_features_)
        elif isinstance(self.max_features, float):
            return max(1, int(self.max_features * self.n_features_))
        elif isinstance(self.max_features, str):
            if self.max_features == 'sqrt':
                return max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == 'log2':
                return max(1, int(np.log2(self.n_features_)))
        raise ValueError("Invalid max_features parameter.")

    def _serialize_node(self, node: Optional[DecisionTreeNode]) -> Optional[Dict[str, Any]]:
        """
        Serialize a DecisionTreeNode into a dictionary.

        Parameters:
            node (Optional[DecisionTreeNode]): Node to serialize.

        Returns:
            Optional[Dict[str, Any]]: Serialized node.
        """
        if node is None:
            return None
        return {
            'feature_index': node.feature_index,
            'threshold': node.threshold,
            'value': node.value,
            'gain': node.gain,
            'class_counts': node.class_counts.tolist() if node.class_counts is not None else None,
            'left': self._serialize_node(node.left),
            'right': self._serialize_node(node.right)
        }

    def _deserialize_node(self, node_dict: Optional[Dict[str, Any]]) -> Optional[DecisionTreeNode]:
        """
        Deserialize a dictionary into a DecisionTreeNode.

        Parameters:
            node_dict (Optional[Dict[str, Any]]): Serialized node.

        Returns:
            Optional[DecisionTreeNode]: Deserialized node.
        """
        if node_dict is None:
            return None
        class_counts = np.array(node_dict['class_counts']) if node_dict['class_counts'] is not None else None
        node = DecisionTreeNode(
            feature_index=node_dict['feature_index'],
            threshold=node_dict['threshold'],
            value=node_dict['value'],
            gain=node_dict.get('gain', 0.0),
            class_counts=class_counts
        )
        node.left = self._deserialize_node(node_dict['left'])
        node.right = self._deserialize_node(node_dict['right'])
        return node

    def load_model_from_dict(self, node_dict: Optional[Dict[str, Any]]) -> None:
        """
        Load a trained Decision Tree model from a dictionary.

        Parameters:
            node_dict (Optional[Dict[str, Any]]): Serialized node.
        """
        self.root_ = self._deserialize_node(node_dict)
        self.is_fitted_ = True
        logger.info("Decision Tree loaded from dictionary.")

    def save_model(self, filepath: str) -> None:
        """
        Save the trained Decision Tree model to a file.

        Parameters:
            filepath (str): Path to the file where the model will be saved.
        """
        check_is_fitted(self, 'is_fitted_')
        state = {
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'max_features': self.max_features,
            'task': self.task,
            'random_state': self.random_state,
            'min_gain': self.min_gain,
            'n_features_': self.n_features_,
            'classes_': self.classes_.tolist() if self.classes_ is not None else None,
            'root_': self._serialize_node(self.root_)
        }
        torch.save(state, filepath)  # Use torch's save method
        logger.info(f"Decision Tree model saved to {filepath}")

    def load_model(self, filepath: str) -> None:
        """
        Load a trained Decision Tree model from a file.

        Parameters:
            filepath (str): Path to the file from which the model will be loaded.
        """
        state = torch.load(filepath, map_location=self.device)
        self.max_depth = state['max_depth']
        self.min_samples_split = state['min_samples_split']
        self.max_features = state['max_features']
        self.task = state['task']
        self.random_state = state['random_state']
        self.min_gain = state.get('min_gain', 1e-7)
        self.n_features_ = state['n_features_']
        self.classes_ = np.array(state['classes_']) if state['classes_'] is not None else None
        self.n_classes_ = len(self.classes_) if self.classes_ is not None else None
        self.root_ = self._deserialize_node(state['root_'])
        self.is_fitted_ = True
        logger.info(f"Decision Tree model loaded from {filepath}")

    def _validate_fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """
        Validate the fit parameters.

        Parameters:
            X (np.ndarray): Feature matrix.
            y (np.ndarray): Target vector.

        Raises:
            ValueError: If task is invalid.
        """
        if self.task not in ['classification', 'regression']:
            raise ValueError("Invalid task. Choose 'classification' or 'regression'.")

# RandomForestModel

In [2]:
import torch
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.metrics import accuracy_score, r2_score, roc_auc_score
from collections import Counter
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
from typing import Optional, Union, Dict, Any, List

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class RandomForestModel(BaseEstimator, ClassifierMixin, RegressorMixin):
    """
    A Random Forest model implemented in PyTorch, compatible with scikit-learn.
    Supports both classification and regression tasks with GPU acceleration.

    Parameters:
        n_estimators (int): Number of trees in the forest.
        max_depth (Optional[int]): Maximum depth of the trees. If None, nodes are expanded until all leaves are pure.
        min_samples_split (int): Minimum number of samples required to split an internal node.
        max_features (Union[int, float, str, None]): Number or fraction of features to consider when looking for the best split.
            If int, then consider `max_features` features at each split.
            If float, then `max_features` is a fraction and `int(max_features * n_features)` features are considered.
            If 'sqrt', then `max_features = sqrt(n_features)`.
            If 'log2', then `max_features = log2(n_features)`.
            If None, then `max_features = n_features`.
        bootstrap (bool): Whether bootstrap samples are used when building trees.
        random_state (Optional[int]): Seed for reproducibility.
        task (str): 'classification' or 'regression'.
        min_gain (float): Minimum gain required to make a split. Splits with gain below this threshold are ignored.
    """

    def __init__(self,
                 n_estimators: int = 100,
                 max_depth: Optional[int] = None,
                 min_samples_split: int = 2,
                 max_features: Union[int, float, str, None] = None,
                 bootstrap: bool = True,
                 random_state: Optional[int] = None,
                 task: str = 'classification',
                 min_gain: float = 1e-7):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.random_state = random_state
        self.task = task
        self.min_gain = min_gain
        self.trees_: List['DecisionTreeModel'] = []
        self.is_fitted_ = False
        self.oob_score_: Optional[float] = None
        self.classes_: Optional[np.ndarray] = None  # To store unique classes for classification

        if self.random_state is not None:
            torch.manual_seed(self.random_state)
            np.random.seed(self.random_state)

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'RandomForestModel':
        """
        Build a forest of trees from the training set (X, y).

        Parameters:
            X (np.ndarray): Training data of shape (n_samples, n_features).
            y (np.ndarray): Target values of shape (n_samples,).

        Returns:
            self
        """
        # Validate input
        X, y = check_X_y(X, y)
        n_samples, n_features = X.shape
        self.n_features_ = n_features  # Store for later use

        # Validate task
        self._validate_fit(X, y)

        # Determine device
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        logger.info(f"Using device: {self.device}")

        # For classification, store unique classes
        if self.task == 'classification':
            self.classes_ = np.unique(y)
            logger.info(f"Classes found: {self.classes_}")

        # Initialize OOB predictions
        if self.bootstrap:
            if self.task == 'classification':
                # For classification, store votes for each class per sample
                self.oob_votes_ = [Counter() for _ in range(n_samples)]
            elif self.task == 'regression':
                # For regression, accumulate predictions
                self.oob_preds_ = np.zeros(n_samples)
                self.oob_counts_ = np.zeros(n_samples)
        else:
            self.oob_score_ = None  # OOB score not applicable

        # Function to train a single tree
        def train_tree(i: int) -> Dict[str, Any]:
            # Bootstrap sampling
            if self.bootstrap:
                indices = np.random.choice(n_samples, size=n_samples, replace=True)
                X_sample = X[indices]
                y_sample = y[indices]
                oob_indices = np.setdiff1d(np.arange(n_samples), indices)
            else:
                X_sample = X
                y_sample = y
                oob_indices = np.array([])

            # Generate unique random state
            tree_random_state = self.random_state + i if self.random_state is not None else None

            # Initialize and train Decision Tree
            tree = DecisionTreeModel(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                max_features=self.max_features,
                task=self.task,
                random_state=tree_random_state,
                min_gain=self.min_gain,
                classes_=self.classes_ if self.task == 'classification' else None  # Pass global classes
            )
            tree.fit(X_sample, y_sample)

            # Collect OOB predictions
            oob_pred = None
            if self.bootstrap and len(oob_indices) > 0:
                if self.task == 'classification':
                    oob_pred = tree.predict(X[oob_indices])
                elif self.task == 'regression':
                    oob_pred = tree.predict(X[oob_indices])

            return {
                'tree': tree,
                'oob_pred': oob_pred,
                'oob_indices': oob_indices
            }

        # Train trees in parallel
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(train_tree, i) for i in range(self.n_estimators)]
            for future in as_completed(futures):
                result = future.result()
                tree = result['tree']
                self.trees_.append(tree)
                logger.info(f"Trained tree {len(self.trees_)}/{self.n_estimators}")

                if self.bootstrap:
                    if self.task == 'classification' and result['oob_pred'] is not None:
                        for idx, pred in zip(result['oob_indices'], result['oob_pred']):
                            self.oob_votes_[idx][pred] += 1
                    elif self.task == 'regression' and result['oob_pred'] is not None:
                        self.oob_preds_[result['oob_indices']] += result['oob_pred']
                        self.oob_counts_[result['oob_indices']] += 1

        # Calculate OOB score if applicable
        if self.bootstrap:
            if self.task == 'classification':
                # Determine majority vote for each OOB sample
                oob_pred_majority = np.array([
                    vote.most_common(1)[0][0] if len(vote) > 0 else -1  # Assign -1 if no votes
                    for vote in self.oob_votes_
                ])
                # Filter out samples with no OOB votes
                valid_mask = oob_pred_majority != -1
                if np.any(valid_mask):
                    self.oob_score_ = accuracy_score(y[valid_mask], oob_pred_majority[valid_mask]) * 100.0
                    logger.info(f"OOB Score: {self.oob_score_:.2f}%")
                else:
                    self.oob_score_ = None
                    logger.warning("No OOB samples were found. OOB score is not available.")
            elif self.task == 'regression':
                # Avoid division by zero
                valid_mask = self.oob_counts_ > 0
                if np.any(valid_mask):
                    self.oob_preds_[valid_mask] /= self.oob_counts_[valid_mask]
                    self.oob_score_ = r2_score(y[valid_mask], self.oob_preds_[valid_mask])
                    logger.info(f"OOB R² Score: {self.oob_score_:.4f}")
                else:
                    self.oob_score_ = None
                    logger.warning("No OOB samples were found. OOB score is not available.")

        self.is_fitted_ = True
        return self

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
        Predict class probabilities for samples in X.

        Parameters:
            X (np.ndarray): Input data of shape (n_queries, n_features).

        Returns:
            np.ndarray: Predicted probabilities of shape (n_queries, n_classes).
        """
        check_is_fitted(self, 'is_fitted_')
        if self.task != 'classification':
            raise AttributeError("predict_proba is only available for classification tasks.")
        if self.classes_ is None:
            raise AttributeError("Classes not found. Ensure that the model is fitted properly.")
        X = check_array(X)

        n_samples = X.shape[0]
        n_classes = len(self.classes_)
        proba = np.zeros((n_samples, n_classes))

        # Iterate over each tree and accumulate probabilities
        for tree in self.trees_:
            tree_proba = tree.predict_proba(X)
            tree_classes = tree.classes_

            # Create a mapping from tree classes to indices in self.classes_
            class_indices = {cls: idx for idx, cls in enumerate(self.classes_)}

            # Initialize tree_proba_aligned with zeros
            tree_proba_aligned = np.zeros((n_samples, n_classes))

            # Assign probabilities to the corresponding class indices
            for i, cls in enumerate(tree_classes):
                if cls in class_indices:
                    tree_proba_aligned[:, class_indices[cls]] = tree_proba[:, i]

            # Accumulate the aligned probabilities
            proba += tree_proba_aligned

        # Average the probabilities
        proba /= self.n_estimators

        return proba

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predict target values for samples in X.

        Parameters:
            X (np.ndarray): Input data of shape (n_queries, n_features).

        Returns:
            np.ndarray: Predicted class labels or regression values.
        """
        check_is_fitted(self, 'is_fitted_')
        X = check_array(X)

        # Collect predictions from all trees
        if self.task == 'classification':
            # Collect class predictions
            tree_preds = np.array([tree.predict(X) for tree in self.trees_])
            # Transpose to shape (n_samples, n_estimators)
            tree_preds = tree_preds.T
            # Majority vote
            y_pred = np.array([
                Counter(row).most_common(1)[0][0] if len(row) > 0 else self.classes_[0] for row in tree_preds
            ])
        elif self.task == 'regression':
            # Collect regression predictions
            tree_preds = np.array([tree.predict(X) for tree in self.trees_])
            # Average predictions
            y_pred = np.mean(tree_preds, axis=0)
        else:
            raise ValueError("Invalid task. Choose 'classification' or 'regression'.")

        return y_pred

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """
        Compute the score of the model.
        For classification: accuracy.
        For regression: R² score.

        Parameters:
            X (np.ndarray): Test samples of shape (n_samples, n_features).
            y (np.ndarray): True labels or target values of shape (n_samples,).

        Returns:
            float: Score.
        """
        X = check_array(X)
        y = np.array(y)
        y_pred = self.predict(X)

        if self.task == 'classification':
            return accuracy_score(y, y_pred) * 100.0  # Percentage
        elif self.task == 'regression':
            return r2_score(y, y_pred)
        else:
            raise ValueError("Invalid task. Choose 'classification' or 'regression'.")

    def feature_importances_(self) -> np.ndarray:
        """
        Calculate feature importances based on all trees in the forest.

        Returns:
            np.ndarray: Array of feature importances.
        """
        if not self.is_fitted_:
            raise AttributeError("This RandomForestModel instance is not fitted yet.")

        # Initialize importances array with the total number of features
        importances = np.zeros(self.n_features_)

        for tree in self.trees_:
            importances += tree.feature_importances_()

        importances /= self.n_estimators
        return importances

    @property
    def oob_score(self) -> Optional[float]:
        """
        Get the out-of-bag (OOB) score.

        Returns:
            Optional[float]: OOB score if bootstrap is True; otherwise, None.
        """
        return self.oob_score_

    def save_model(self, filepath: str) -> None:
        """
        Save the trained Random Forest model to a file.

        Parameters:
            filepath (str): Path to the file where the model will be saved.
        """
        check_is_fitted(self, 'is_fitted_')
        state = {
            'n_estimators': self.n_estimators,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'max_features': self.max_features,
            'bootstrap': self.bootstrap,
            'random_state': self.random_state,
            'task': self.task,
            'min_gain': self.min_gain,
            'classes_': self.classes_.tolist() if self.classes_ is not None else None,
            'trees_': [tree._serialize_node(tree.root_) for tree in self.trees_]
        }
        torch.save(state, filepath)
        logger.info(f"Random Forest model saved to {filepath}")

    def load_model(self, filepath: str) -> None:
        """
        Load a trained Random Forest model from a file.

        Parameters:
            filepath (str): Path to the file from which the model will be loaded.
        """
        state = torch.load(filepath, map_location=self.device if hasattr(self, 'device') else 'cpu')

        self.n_estimators = state['n_estimators']
        self.max_depth = state['max_depth']
        self.min_samples_split = state['min_samples_split']
        self.max_features = state['max_features']
        self.bootstrap = state['bootstrap']
        self.random_state = state['random_state']
        self.task = state['task']
        self.min_gain = state.get('min_gain', 1e-7)
        self.classes_ = np.array(state['classes_']) if state['classes_'] is not None else None
        self.trees_ = []

        for i, tree_dict in enumerate(state['trees_']):
            tree = DecisionTreeModel(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                max_features=self.max_features,
                task=self.task,
                random_state=self.random_state + i if self.random_state is not None else None,
                min_gain=self.min_gain,
                classes_=self.classes_ if self.task == 'classification' else None  # Pass global classes
            )
            tree.load_model_from_dict(tree_dict)  # Load tree from its dict
            tree.device = self.device if hasattr(self, 'device') else 'cpu'  # Ensure tree uses the same device
            self.trees_.append(tree)
            logger.info(f"Loaded tree {i + 1}/{self.n_estimators}")

        # Note: OOB score cannot be recalculated without original training data
        if self.bootstrap:
            self.oob_score_ = None
            logger.warning("OOB score cannot be recalculated after loading the model without original training data.")

        self.is_fitted_ = True
        logger.info(f"Random Forest model loaded from {filepath}")

    def get_params(self, deep: bool = True) -> Dict[str, Any]:
        """
        Get parameters for this estimator.

        Returns:
            Dict[str, Any]: Parameter names mapped to their values.
        """
        return {
            'n_estimators': self.n_estimators,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'max_features': self.max_features,
            'bootstrap': self.bootstrap,
            'random_state': self.random_state,
            'task': self.task,
            'min_gain': self.min_gain
        }

    def set_params(self, **params: Any) -> 'RandomForestModel':
        """
        Set the parameters of this estimator.

        Parameters:
            **params: Estimator parameters.

        Returns:
            self
        """
        for key, value in params.items():
            setattr(self, key, value)
        return self

    def _validate_fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """
        Validate the fit parameters.

        Parameters:
            X (np.ndarray): Feature matrix.
            y (np.ndarray): Target vector.

        Raises:
            ValueError: If task is invalid.
        """
        if self.task not in ['classification', 'regression']:
            raise ValueError("Invalid task. Choose 'classification' or 'regression'.")

    def _compute_score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """
        Compute the OOB score based on the task.

        Parameters:
            y_true (np.ndarray): True target values.
            y_pred (np.ndarray): OOB predicted values.

        Returns:
            float: OOB score.
        """
        if self.task == 'classification':
            return accuracy_score(y_true, y_pred) * 100.0
        elif self.task == 'regression':
            return r2_score(y_true, y_pred)
        else:
            raise ValueError("Invalid task. Choose 'classification' or 'regression'.")

# Demonstration

In [3]:
import time
import warnings
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.datasets import load_iris, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, r2_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


def classification_comparison():
    """
    Compare custom RandomForestModel with scikit-learn's RandomForestClassifier on the Iris dataset.
    """
    print("=== Classification Comparison: Custom RandomForestModel vs. scikit-learn ===")
    iris = load_iris()
    X, y = iris.data, iris.target

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Feature Scaling (Optional but Recommended)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Determine number of features for 'sqrt'
    n_features = X_train.shape[1]
    max_features = int(np.sqrt(n_features))
    print(f"Number of features: {n_features}, max_features set to: {max_features}")

    # Custom RandomForestModel
    print("\n--- Custom RandomForestModel ---")
    custom_rf = RandomForestModel(
        n_estimators=10,
        max_depth=5,
        min_samples_split=2,
        max_features=max_features,
        bootstrap=True,
        random_state=42,
        task='classification'
    )
    start_time = time.time()
    custom_rf.fit(X_train, y_train)
    train_time_custom = time.time() - start_time
    print(f"Training Time: {train_time_custom:.4f} seconds")

    start_time = time.time()
    y_pred_custom = custom_rf.predict(X_test)
    y_pred_proba_custom = custom_rf.predict_proba(X_test)
    predict_time_custom = time.time() - start_time
    accuracy_custom = accuracy_score(y_test, y_pred_custom) * 100.0
    roc_auc_custom = roc_auc_score(y_test, y_pred_proba_custom, multi_class='ovo')
    print(f"Prediction Time: {predict_time_custom:.4f} seconds")
    print(f"Accuracy: {accuracy_custom:.2f}%")
    print(f"ROC AUC: {roc_auc_custom:.4f}")

    # scikit-learn RandomForestClassifier
    print("\n--- scikit-learn RandomForestClassifier ---")
    sklearn_rf = RandomForestClassifier(
        n_estimators=10,
        max_depth=5,
        min_samples_split=2,
        max_features='sqrt',
        bootstrap=True,
        random_state=42
    )
    start_time = time.time()
    sklearn_rf.fit(X_train, y_train)
    train_time_sklearn = time.time() - start_time
    print(f"Training Time: {train_time_sklearn:.4f} seconds")

    start_time = time.time()
    y_pred_sklearn = sklearn_rf.predict(X_test)
    y_pred_proba_sklearn = sklearn_rf.predict_proba(X_test)
    predict_time_sklearn = time.time() - start_time
    accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn) * 100.0
    roc_auc_sklearn = roc_auc_score(y_test, y_pred_proba_sklearn, multi_class='ovo')
    print(f"Prediction Time: {predict_time_sklearn:.4f} seconds")
    print(f"Accuracy: {accuracy_sklearn:.2f}%")
    print(f"ROC AUC: {roc_auc_sklearn:.4f}")

    # Summary
    print("\n--- Summary ---")
    print(f"Custom RandomForestModel Training Time: {train_time_custom:.4f} seconds")
    print(f"scikit-learn RandomForestClassifier Training Time: {train_time_sklearn:.4f} seconds")
    print(f"Custom RandomForestModel Prediction Time: {predict_time_custom:.4f} seconds")
    print(f"scikit-learn RandomForestClassifier Prediction Time: {predict_time_sklearn:.4f} seconds")
    print(f"Custom RandomForestModel Accuracy: {accuracy_custom:.2f}%")
    print(f"Custom RandomForestModel ROC AUC: {roc_auc_custom:.4f}")
    print(f"scikit-learn RandomForestClassifier Accuracy: {accuracy_sklearn:.2f}%")
    print(f"scikit-learn RandomForestClassifier ROC AUC: {roc_auc_sklearn:.4f}")

    return accuracy_custom, roc_auc_custom, accuracy_sklearn, roc_auc_sklearn


def regression_comparison():
    """
    Compare custom RandomForestModel with scikit-learn's RandomForestRegressor on the Diabetes dataset.
    """
    print("\n=== Regression Comparison: Custom RandomForestModel vs. scikit-learn ===")
    diabetes = load_diabetes()
    X, y = diabetes.data, diabetes.target

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Feature Scaling
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Determine number of features for 'sqrt'
    n_features = X_train.shape[1]
    max_features = int(np.sqrt(n_features))
    print(f"Number of features: {n_features}, max_features set to: {max_features}")

    # Custom RandomForestModel
    print("\n--- Custom RandomForestModel ---")
    custom_rf = RandomForestModel(
        n_estimators=10,
        max_depth=5,
        min_samples_split=2,
        max_features=max_features,
        bootstrap=True,
        random_state=42,
        task='regression'
    )
    start_time = time.time()
    custom_rf.fit(X_train, y_train)
    train_time_custom = time.time() - start_time
    print(f"Training Time: {train_time_custom:.4f} seconds")

    start_time = time.time()
    y_pred_custom = custom_rf.predict(X_test)
    predict_time_custom = time.time() - start_time
    r2_custom = r2_score(y_test, y_pred_custom)
    rmse_custom = np.sqrt(np.mean((y_test - y_pred_custom) ** 2))
    print(f"Prediction Time: {predict_time_custom:.4f} seconds")
    print(f"R² Score: {r2_custom:.4f}")
    print(f"RMSE: {rmse_custom:.4f}")

    # scikit-learn RandomForestRegressor
    print("\n--- scikit-learn RandomForestRegressor ---")
    sklearn_rf = RandomForestRegressor(
        n_estimators=10,
        max_depth=5,
        min_samples_split=2,
        max_features='sqrt',
        bootstrap=True,
        random_state=42
    )
    start_time = time.time()
    sklearn_rf.fit(X_train, y_train)
    train_time_sklearn = time.time() - start_time
    print(f"Training Time: {train_time_sklearn:.4f} seconds")

    start_time = time.time()
    y_pred_sklearn = sklearn_rf.predict(X_test)
    predict_time_sklearn = time.time() - start_time
    r2_sklearn = r2_score(y_test, y_pred_sklearn)
    rmse_sklearn = np.sqrt(np.mean((y_test - y_pred_sklearn) ** 2))
    print(f"Prediction Time: {predict_time_sklearn:.4f} seconds")
    print(f"R² Score: {r2_sklearn:.4f}")
    print(f"RMSE: {rmse_sklearn:.4f}")

    # Summary
    print("\n--- Summary ---")
    print(f"Custom RandomForestModel Training Time: {train_time_custom:.4f} seconds")
    print(f"scikit-learn RandomForestRegressor Training Time: {train_time_sklearn:.4f} seconds")
    print(f"Custom RandomForestModel Prediction Time: {predict_time_custom:.4f} seconds")
    print(f"scikit-learn RandomForestRegressor Prediction Time: {predict_time_sklearn:.4f} seconds")
    print(f"Custom RandomForestModel R² Score: {r2_custom:.4f}")
    print(f"Custom RandomForestModel RMSE: {rmse_custom:.4f}")
    print(f"scikit-learn RandomForestRegressor R² Score: {r2_sklearn:.4f}")
    print(f"scikit-learn RandomForestRegressor RMSE: {rmse_sklearn:.4f}")

    return r2_custom, rmse_custom, r2_sklearn, rmse_sklearn


if __name__ == "__main__":
    accuracy_custom, roc_auc_custom, accuracy_sklearn, roc_auc_sklearn = classification_comparison()
    r2_custom, rmse_custom, r2_sklearn, rmse_sklearn = regression_comparison()

    # Create DataFrame
    metrics_df = pd.DataFrame({
        'Model': ['CUSTOM', 'SKLEARN'],
        'Accuracy': [accuracy_custom, accuracy_sklearn],
        'ROC AUC': [roc_auc_custom, roc_auc_sklearn],
        'R² Score': [r2_custom, r2_sklearn],
        'RMSE': [rmse_custom, rmse_sklearn]
    })
    print("\n")
    display(metrics_df)

=== Classification Comparison: Custom RandomForestModel vs. scikit-learn ===
Number of features: 4, max_features set to: 2

--- Custom RandomForestModel ---
Training Time: 1.4191 seconds
Prediction Time: 0.0354 seconds
Accuracy: 93.33%
ROC AUC: 0.9892

--- scikit-learn RandomForestClassifier ---
Training Time: 0.0143 seconds
Prediction Time: 0.0024 seconds
Accuracy: 96.67%
ROC AUC: 1.0000

--- Summary ---
Custom RandomForestModel Training Time: 1.4191 seconds
scikit-learn RandomForestClassifier Training Time: 0.0143 seconds
Custom RandomForestModel Prediction Time: 0.0354 seconds
scikit-learn RandomForestClassifier Prediction Time: 0.0024 seconds
Custom RandomForestModel Accuracy: 93.33%
Custom RandomForestModel ROC AUC: 0.9892
scikit-learn RandomForestClassifier Accuracy: 96.67%
scikit-learn RandomForestClassifier ROC AUC: 1.0000

=== Regression Comparison: Custom RandomForestModel vs. scikit-learn ===
Number of features: 10, max_features set to: 3

--- Custom RandomForestModel ---
Tr

Unnamed: 0,Model,Accuracy,ROC AUC,R² Score,RMSE
0,CUSTOM,93.333333,0.989167,0.412588,55.787068
1,SKLEARN,96.666667,1.0,0.423558,55.263706
