In [None]:
import numpy as np

class Node:
    """One node of a binary decision tree.

    A node is either an internal split (``feature_index``, ``threshold``,
    ``left`` and ``right`` are set) or a leaf (``value`` is set). All fields
    default to ``None``.
    """

    def __init__(
        self,
        feature_index=None,
        threshold=None,
        left=None,
        right=None,
        info_gain=None,
        value=None,
    ):
        # Split description (internal nodes).
        self.feature_index = feature_index
        self.threshold = threshold
        # Child subtrees (internal nodes).
        self.left = left
        self.right = right
        # Optional score of the split stored alongside it.
        self.info_gain = info_gain
        # Prediction stored at the node (leaves).
        self.value = value

    def is_leaf_node(self):
        """Return True when the node carries a prediction value."""
        has_prediction = self.value is not None
        return has_prediction

class DecisionTreeRegressor:
    """Binary regression tree grown greedily by variance reduction.

    Each internal node tests ``x[feature_index] <= threshold``; each leaf
    stores the mean of the training targets that reached it.

    Parameters
    ----------
    max_depth : int or None, default 3
        Maximum depth of the tree. ``None`` disables the depth limit; growth
        then stops only when a node is pure or cannot be split.
    """

    def __init__(self, max_depth = 3):
        self.max_depth = max_depth
        self.root = None

    def fit(self, x, y):
        """Grow the tree on feature matrix ``x`` and targets ``y``.

        ``x`` must be two-dimensional (samples x features) and ``y``
        one-dimensional; both are converted with ``np.asarray`` so nested
        lists are accepted as well as arrays. Returns ``None``.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        self.root = self.build_tree(x, y)

    def _mean_of_labels(self, y):
        """Return the leaf prediction for targets ``y`` (their mean)."""
        return np.mean(y)

    def build_tree(self, x, y, depth=0):
        """Recursively build the subtree for samples ``x`` / ``y``.

        Returns a leaf ``Node`` when all targets are equal, when the depth
        limit is reached, or when no threshold separates the samples into
        two non-empty groups. Otherwise returns an internal ``Node`` whose
        children are built from the two sides of the best split.
        """
        _, num_features = x.shape
        num_labels = len(np.unique(y))

        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if num_labels == 1 or depth_exhausted:
            return Node(value=self._mean_of_labels(y))

        feature_index = np.arange(num_features)
        best_feature, best_threshold = self._best_split(x, y, feature_index)

        # No usable split (e.g. every feature is constant on these samples):
        # stop here instead of recursing on an unchanged / empty partition,
        # which would never terminate when max_depth is None.
        if best_feature is None:
            return Node(value=self._mean_of_labels(y))

        left_indices, right_indices = self._split(x[:, best_feature], best_threshold)
        left = self.build_tree(x[left_indices, :], y[left_indices], depth + 1)
        right = self.build_tree(x[right_indices, :], y[right_indices], depth + 1)
        return Node(best_feature, best_threshold, left, right)

    def _most_common_label(self, y):
        """Return the most frequent value in ``y``.

        Not called by any other method of this class.
        """
        unique_labels, counts = np.unique(y, return_counts=True)
        return unique_labels[np.argmax(counts)]

    def _split(self, x, threshold):
        """Return (left, right) index arrays for ``x <= threshold`` / ``x > threshold``."""
        left_indices = np.argwhere(x <= threshold).flatten()
        right_indices = np.argwhere(x > threshold).flatten()
        return left_indices, right_indices

    def _best_split(self, x, y, feature_index):
        """Find the (feature, threshold) pair with the largest variance reduction.

        Candidate thresholds are midpoints of consecutive sorted feature
        values. Only thresholds that leave samples on both sides are
        considered. Ties keep the first candidate encountered.

        Returns ``(None, None)`` when no feature admits such a threshold.
        """
        best_gain, split_index, split_threshold = float('-inf'), None, None

        for feature in feature_index:
            x_column = x[:, feature]
            if x_column.size < 2:
                continue

            x_column_sorted = np.sort(x_column)
            midpoints = (x_column_sorted[:-1] + x_column_sorted[1:]) / 2
            # A threshold >= the column maximum sends every sample left and
            # leaves the right side empty, so it is not a real split.
            # np.unique drops repeated candidates while keeping ascending order.
            thresholds = np.unique(midpoints[midpoints < x_column_sorted[-1]])

            for threshold in thresholds:
                gain = self._information_gain(y, x_column, threshold)

                if gain > best_gain:
                    best_gain = gain
                    split_index = feature
                    split_threshold = threshold

        return split_index, split_threshold

    def _information_gain(self, y, x_column, threshold):
        """Return the variance reduction obtained by splitting at ``threshold``.

        Computed as the parent's variance minus the size-weighted variance of
        the two children. Returns 0 if either side of the split is empty.
        """
        parent_entropy = self._entropy(y)

        left_indices, right_indices = self._split(x_column, threshold)

        if len(left_indices) == 0 or len(right_indices) == 0:
            return 0

        n, n_l, n_r = len(y), len(left_indices), len(right_indices)
        e_l, e_r = self._entropy(y[left_indices]), self._entropy(y[right_indices])

        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r
        return parent_entropy - child_entropy

    def _entropy(self, y):
        """Return the impurity of ``y``.

        Despite the name this is the mean squared deviation from the mean
        (population variance), not an entropy.
        """
        y_mean = np.mean(y)
        return np.mean((y - y_mean) ** 2)

    def predict(self, X):
        """Return an array with one prediction per row of ``X``."""
        return np.array([self._traverse_tree(x, self.root) for x in np.asarray(X)])

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its value."""
        if node.is_leaf_node():
            return node.value
        if x[node.feature_index] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)


In [None]:
# Toy dataset: five samples with two features each, and one target per sample.
X = np.array([
    [1, 2],
    [2, 3],
    [3, 4],
    [4, 5],
    [5, 6],
])
y = np.array([1, 1, 0, 0, 0])

In [None]:
# Fit the hand-written tree on the toy data, then predict on the same samples.
regressor = DecisionTreeRegressor(max_depth=3)  # 3 is the class default
regressor.fit(X, y)
y_pred = regressor.predict(X)

In [None]:
# Mean squared error of the predictions against the training targets.
mse = np.mean((y - y_pred) ** 2)
print("Mean Squared Error:", mse)

def r2_score_percentage(y_true, y_pred):
  """Return the coefficient of determination (R^2) as a percentage.

  Computes ``(1 - RSS / TSS) * 100`` where RSS is the sum of squared
  residuals and TSS the sum of squared deviations of ``y_true`` from its mean.

  Parameters
  ----------
  y_true, y_pred : array-like of equal length
      Observed and predicted targets (lists or NumPy arrays).

  Returns
  -------
  float
      R^2 scaled to percent. When ``y_true`` is constant (TSS == 0) the ratio
      is undefined; 100.0 is returned for a perfect prediction and 0.0
      otherwise, instead of NaN / -inf. Empty input yields NaN.
  """
  y_true = np.asarray(y_true, dtype=float)
  y_pred = np.asarray(y_pred, dtype=float)

  if y_true.size == 0:
    # Nothing to score; avoid the mean-of-empty warning.
    return float("nan")

  tss = np.sum((y_true - np.mean(y_true)) ** 2)
  rss = np.sum((y_true - y_pred) ** 2)

  if tss == 0:
    # Constant targets: avoid dividing by zero.
    return 100.0 if rss == 0 else 0.0

  return (1 - (rss / tss)) * 100

# R^2 of the predictions, reported in percent.
r2_percentage = r2_score_percentage(y_true=y, y_pred=y_pred)
print("R2 Score (Percentage):", r2_percentage)

Mean Squared Error: 0.0
R2 Score (Percentage): 100.0


# Gradient Boosting (scikit-learn baseline)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Same five-sample toy dataset as used for the decision tree.
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
y = np.array([1, 1, 0, 0, 0])

# Library gradient boosting model, seeded for repeatable results.
regressor = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
regressor.fit(X, y)

# Score on the training samples themselves.
y_pred = regressor.predict(X)
mse = np.mean((y - y_pred) ** 2)
print("Mean Squared Error:", mse)

def r2_score_percentage(y_true, y_pred):
  """Return the coefficient of determination (R^2) as a percentage.

  Computes ``(1 - RSS / TSS) * 100`` where RSS is the sum of squared
  residuals and TSS the sum of squared deviations of ``y_true`` from its mean.

  Parameters
  ----------
  y_true, y_pred : array-like of equal length
      Observed and predicted targets (lists or NumPy arrays).

  Returns
  -------
  float
      R^2 scaled to percent. When ``y_true`` is constant (TSS == 0) the ratio
      is undefined; 100.0 is returned for a perfect prediction and 0.0
      otherwise, instead of NaN / -inf. Empty input yields NaN.
  """
  y_true = np.asarray(y_true, dtype=float)
  y_pred = np.asarray(y_pred, dtype=float)

  if y_true.size == 0:
    # Nothing to score; avoid the mean-of-empty warning.
    return float("nan")

  tss = np.sum((y_true - np.mean(y_true)) ** 2)
  rss = np.sum((y_true - y_pred) ** 2)

  if tss == 0:
    # Constant targets: avoid dividing by zero.
    return 100.0 if rss == 0 else 0.0

  return (1 - (rss / tss)) * 100

# R^2 of the predictions, reported in percent.
r2_percentage = r2_score_percentage(y_true=y, y_pred=y_pred)
print("R2 Score (Percentage):", r2_percentage)

Mean Squared Error: 1.6932189860901083e-10
R2 Score (Percentage): 99.99999992944922


# Gradient Boosting from scratch using NumPy

In [None]:
class GradientBoostingRegressor:
  """Gradient boosting for squared-error regression built on DecisionTreeRegressor.

  The model starts from the mean of the targets and then adds
  ``n_estimators`` trees, each fitted to the current residuals and scaled by
  ``learning_rate``.

  Parameters
  ----------
  n_estimators : int, default 100
      Number of trees to fit.
  learning_rate : float, default 0.1
      Factor applied to every tree's prediction.
  max_depth : int, default 3
      Depth limit passed to each tree.
  """

  def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
    self.n_estimators = n_estimators
    self.learning_rate = learning_rate
    self.max_depth = max_depth
    self.trees = []

  def fit(self, X, y):
    """Fit the ensemble on features ``X`` and targets ``y``.

    Any previously fitted trees are discarded, so calling ``fit`` again
    retrains from scratch. Returns ``None``.
    """
    X = np.asarray(X)
    y = np.asarray(y, dtype=float)

    # Reset the ensemble: without this a second fit() would append to the
    # trees of the first one and predict() would sum both ensembles.
    self.trees = []
    self.initial_prediction = np.mean(y)
    residuals = y - self.initial_prediction

    for _ in range(self.n_estimators):
      tree = DecisionTreeRegressor(max_depth=self.max_depth)
      tree.fit(X, residuals)
      predictions = tree.predict(X)
      self.trees.append(tree)
      # Rebind rather than update in place so a tree that kept a reference
      # to its training targets is not modified afterwards.
      residuals = residuals - self.learning_rate * predictions

  def predict(self, X):
    """Return predictions for ``X``: the initial mean plus all scaled tree outputs."""
    X = np.asarray(X)
    y_pred = np.full(len(X), self.initial_prediction, dtype=float)
    for tree in self.trees:
      y_pred += self.learning_rate * tree.predict(X)
    return y_pred


In [None]:
# Same five-sample toy dataset as in the earlier cells.
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
y = np.array([1, 1, 0, 0, 0])

# From-scratch gradient boosting model defined in this notebook.
regressor = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=10,
)
regressor.fit(X, y)

# Score on the training samples themselves.
y_pred = regressor.predict(X)
mse = np.mean((y - y_pred) ** 2)
print("Mean Squared Error:", mse)

def r2_score_percentage(y_true, y_pred):
  """Return the coefficient of determination (R^2) as a percentage.

  Computes ``(1 - RSS / TSS) * 100`` where RSS is the sum of squared
  residuals and TSS the sum of squared deviations of ``y_true`` from its mean.

  Parameters
  ----------
  y_true, y_pred : array-like of equal length
      Observed and predicted targets (lists or NumPy arrays).

  Returns
  -------
  float
      R^2 scaled to percent. When ``y_true`` is constant (TSS == 0) the ratio
      is undefined; 100.0 is returned for a perfect prediction and 0.0
      otherwise, instead of NaN / -inf. Empty input yields NaN.
  """
  y_true = np.asarray(y_true, dtype=float)
  y_pred = np.asarray(y_pred, dtype=float)

  if y_true.size == 0:
    # Nothing to score; avoid the mean-of-empty warning.
    return float("nan")

  tss = np.sum((y_true - np.mean(y_true)) ** 2)
  rss = np.sum((y_true - y_pred) ** 2)

  if tss == 0:
    # Constant targets: avoid dividing by zero.
    return 100.0 if rss == 0 else 0.0

  return (1 - (rss / tss)) * 100

# R^2 of the predictions, reported in percent.
r2_percentage = r2_score_percentage(y_true=y, y_pred=y_pred)
print("R2 Score (Percentage):", r2_percentage)

Mean Squared Error: 1.6932189860901083e-10
R2 Score (Percentage): 99.99999992944922
