In [22]:
import pandas as pd
import numpy as np
from mse_wmse import mse, weighted_mse

In [2]:
# Load the raw dataset used by every cell below.
df = pd.read_csv('data.csv')

In [96]:
def split(X: np.ndarray, y: np.ndarray, feature: int) -> "tuple[float | None, float]":
    """Find the best threshold for splitting a node on a single feature.

    Every distinct value of column ``feature`` is tried as a threshold: rows
    with ``X[:, feature] <= threshold`` go to the left child, rows with
    ``X[:, feature] > threshold`` go to the right child, and the candidate is
    scored with ``weighted_mse(left_y, right_y)``. The candidate with the
    lowest score wins; on ties the first candidate tried is kept.

    Candidates that leave either child empty (for example the column maximum,
    which sends every row to the left) are skipped: they do not split the
    node, and scoring an empty child is not meaningful.

    Args:
        X: 2-D feature matrix, indexed as ``X[:, feature]``.
        y: Target values, indexed with the same row masks as ``X``.
        feature: Column index of the feature to split on.

    Returns:
        ``(best_threshold, best_metric)``. If no candidate yields two
        non-empty children (empty input or a constant column), the result is
        ``(None, float('inf'))``.
    """
    # Slice the column once instead of on every iteration.
    column = X[:, feature]

    best_threshold = None
    best_metric = float('inf')

    for split_value in np.unique(column):
        left_indices = column <= split_value
        right_indices = column > split_value

        # A threshold that leaves one side empty is not a split.
        if not left_indices.any() or not right_indices.any():
            continue

        metric = weighted_mse(y[left_indices], y[right_indices])

        # Strict comparison keeps the earliest candidate on ties.
        if metric < best_metric:
            best_metric = metric
            best_threshold = split_value
    return best_threshold, best_metric

In [97]:
# Feature matrix: every column of df except the last one, as a NumPy array.
# NOTE(review): this assumes 'delay_days' is the last column of data.csv;
# if it is not, the target stays inside X — confirm against the file.
X = df.iloc[:, :-1].values
# Target vector: the 'delay_days' column as a NumPy array.
y = df['delay_days'].values

In [98]:
# Best threshold and its weighted-MSE score when splitting on column 0 only.
split(X, y, 0)

(48, 406.743974171499)

In [99]:
from __future__ import annotations

import numpy as np


def best_split(X: np.ndarray, y: np.ndarray) -> tuple[int, float]:
    """Pick the feature and threshold with the lowest split metric.

    Runs ``split`` on every column of ``X`` and keeps the column whose best
    threshold scores lowest. On ties the lowest column index is kept.

    Args:
        X: 2-D feature matrix; one candidate feature per column.
        y: Target values passed through to ``split``.

    Returns:
        ``(feature_index, threshold)`` of the winning split, or
        ``(None, None)`` if no column produced a metric below infinity.
    """
    winner = (None, None)
    lowest = float('inf')

    for column in range(X.shape[1]):
        cut, score = split(X, y, column)
        # Only a strictly better score replaces the current winner.
        if not score < lowest:
            continue
        lowest = score
        winner = (column, cut)

    return winner

In [100]:
# Best (feature index, threshold) pair across all columns of X.
best_split(X, y)

(1, 44443)

In [102]:
from __future__ import annotations

from dataclasses import dataclass


@dataclass
class Node:
    """Decision tree node.

    ``feature``, ``threshold`` and ``n_samples`` are required when creating a
    node; ``left``, ``right``, ``mse`` and ``value`` default to ``None``.
    """
    # Presumably the column index this node splits on — confirm with the tree builder.
    feature: int
    # Presumably the split threshold for `feature` — confirm with the tree builder.
    threshold: float
    # Presumably the number of samples that reach this node — TODO confirm.
    n_samples: int
    # Child nodes; None until assigned.
    left: Node | None = None
    right: Node | None = None
    # Presumably the MSE of the targets at this node — TODO confirm.
    mse: float | None = None
    # Presumably the node's prediction; NOTE(review): annotated int although
    # a regression target may call for float — confirm.
    value: int | None = None