## Using the Wine Dataset

The Wine dataset contains 13 numeric features and three target classes.

In [None]:
# Import libraries
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
import numpy as np

## Train-Test Split

We'll split the dataset into training and testing sets for evaluation.

In [None]:
# Fetch the Wine data and pull out the feature matrix and the label vector
wine = load_wine()
X, y = wine.data, wine.target

# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Implement and Train Random Forest

In [None]:
# Decision Tree Implementation (as base learners for Random Forest)
class DecisionTree:
    """Binary classification tree grown greedily on Gini impurity reduction.

    Every node is itself a ``DecisionTree``. After ``fit``, ``self.tree`` is
    either a class label (leaf node) or a dict with the keys ``'feature'``,
    ``'threshold'``, ``'left'`` and ``'right'`` (internal node), where the two
    children are ``DecisionTree`` instances.
    """

    def __init__(self, max_depth=None):
        """Create an unfitted tree.

        Args:
            max_depth: Depth at which a leaf is forced. ``None`` lets the tree
                grow until every node is pure or cannot be split.
        """
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y, depth=0):
        """Grow the (sub)tree rooted at this node.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of class labels, one per row of ``X``.
            depth: Depth of this node; used by the recursion, callers
                normally leave it at 0.

        Raises:
            ValueError: If ``y`` is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if y.size == 0:
            # Fail early with a clear message instead of an IndexError from
            # the majority vote on an empty Counter.
            raise ValueError("Cannot fit a DecisionTree on an empty dataset")

        # Stop when the node is pure or the depth budget is used up.
        if len(set(y)) == 1 or depth == self.max_depth:
            self.tree = self._majority_label(y)
            return

        best_feature, best_threshold = self._best_split(X, y)
        if best_feature is None:
            # No threshold separates the samples (all rows identical).
            self.tree = self._majority_label(y)
            return

        column = X[:, best_feature]
        left_mask = column <= best_threshold
        right_mask = column > best_threshold

        left_child = DecisionTree(max_depth=self.max_depth)
        right_child = DecisionTree(max_depth=self.max_depth)
        left_child.fit(X[left_mask], y[left_mask], depth + 1)
        right_child.fit(X[right_mask], y[right_mask], depth + 1)

        self.tree = {
            'feature': best_feature,
            'threshold': best_threshold,
            'left': left_child,
            'right': right_child,
        }

    def predict(self, X):
        """Predict the label of one sample, or of each row of a 2-D batch.

        Args:
            X: 1-D feature vector, or 2-D array-like with one sample per row.

        Returns:
            The predicted label for a 1-D input; a NumPy array of labels for
            a 2-D input. An unfitted tree yields ``None``.
        """
        X = np.asarray(X)
        if X.ndim > 1:
            return np.array([self.predict(row) for row in X])

        # Walk down iteratively so deep trees cannot hit the recursion limit.
        node = self
        while isinstance(node.tree, dict):
            split = node.tree
            if X[split['feature']] <= split['threshold']:
                node = split['left']
            else:
                node = split['right']
        return node.tree

    def _best_split(self, X, y):
        """Return the (feature, threshold) pair with the highest Gini gain.

        Every distinct value of every feature is tried as a ``<=`` threshold.
        Returns ``(None, None)`` when no threshold leaves both sides
        non-empty.
        """
        _, n_features = X.shape
        best_gain = -1
        best_feature = None
        best_threshold = None

        for feature in range(n_features):
            column = X[:, feature]  # hoisted: reused for every threshold
            for threshold in np.unique(column):
                left_mask = column <= threshold
                right_mask = column > threshold
                # A split that leaves one side empty separates nothing.
                if not left_mask.any() or not right_mask.any():
                    continue

                gain = self._information_gain(y, left_mask, right_mask)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _information_gain(self, y, left_indices, right_indices):
        """Return the Gini impurity reduction achieved by a split.

        Args:
            y: Labels of the samples at the node.
            left_indices: Boolean mask selecting the left child's samples.
            right_indices: Boolean mask selecting the right child's samples.
        """
        n = len(y)
        # count_nonzero counts in C; built-in sum() would loop in Python.
        n_left = np.count_nonzero(left_indices)
        n_right = np.count_nonzero(right_indices)

        return (
            self._gini(y)
            - (n_left / n) * self._gini(y[left_indices])
            - (n_right / n) * self._gini(y[right_indices])
        )

    @staticmethod
    def _gini(labels):
        """Return the Gini impurity of ``labels`` (0 for an empty set)."""
        total = len(labels)
        if total == 0:
            return 0
        return 1 - sum((count / total) ** 2 for count in Counter(labels).values())

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent label in ``labels``."""
        return Counter(labels).most_common(1)[0][0]

In [None]:
# Define the Random Forest class
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers with majority voting.

    Each tree is trained on a bootstrap sample of the rows and on a random
    subset of the columns. The column subset of every tree is remembered in
    ``feature_indices`` so that ``predict`` can present each tree with the
    same columns it was trained on.
    """

    def __init__(self, n_estimators=10, max_depth=None, max_features=None,
                 random_state=None):
        """Configure the forest.

        Args:
            n_estimators: Number of trees to grow.
            max_depth: Depth limit handed to every tree (``None`` = no limit).
            max_features: Number of columns sampled per tree. A falsy value
                (``None`` or 0) means ``int(sqrt(n_features))``, at least 1.
            random_state: Optional seed for a private random generator. When
                ``None`` the global ``np.random`` state is used.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state
        self.trees = []
        self.feature_indices = []  # feature_indices[i] = columns seen by trees[i]

    def fit(self, X, y):
        """Grow ``n_estimators`` trees on bootstrap samples of ``(X, y)``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of class labels, one per row of ``X``.

        Raises:
            ValueError: If the data is empty, ``X`` and ``y`` disagree in
                length, or ``max_features`` exceeds the number of columns.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot fit a RandomForest on an empty dataset")
        if len(y) != n_samples:
            raise ValueError("X and y must contain the same number of samples")

        # Resolve the default locally so the constructor argument is left
        # untouched and a later refit on different data is not stuck with a
        # stale value.
        n_sub = self.max_features or max(1, int(np.sqrt(n_features)))
        if n_sub > n_features:
            raise ValueError(
                f"max_features={n_sub} exceeds the number of features ({n_features})"
            )
        self.max_features_ = n_sub

        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        self.trees = []
        self.feature_indices = []
        for _ in range(self.n_estimators):
            rows = rng.choice(n_samples, n_samples, replace=True)
            cols = rng.choice(n_features, n_sub, replace=False)

            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X[rows][:, cols], y[rows])

            self.trees.append(tree)
            self.feature_indices.append(cols)

    def predict(self, X):
        """Predict a label for every row of ``X`` by majority vote.

        Args:
            X: 2-D array-like of shape (n_samples, n_features), with the same
                column layout as the data passed to ``fit``.

        Returns:
            NumPy array holding one predicted label per row.

        Raises:
            ValueError: If the forest has no trees (not fitted yet).
        """
        if not self.trees:
            raise ValueError("RandomForest is not fitted; call fit() first")
        X = np.asarray(X)

        # Each tree indexes features relative to its own column subset, so the
        # input must be projected onto that subset before it is queried.
        per_tree = [
            [tree.predict(row) for row in X[:, cols]]
            for tree, cols in zip(self.trees, self.feature_indices)
        ]
        votes = np.array(per_tree).T  # shape: (n_samples, n_trees)
        return np.array([Counter(row).most_common(1)[0][0] for row in votes])

In [None]:
# Build a 20-tree forest (each tree at most 5 levels deep) and fit it on the training split
rf = RandomForest(n_estimators=20, max_depth=5)
rf.fit(X_train, y_train)

# Score the held-out test split
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Random Forest Accuracy on Wine Dataset: {accuracy:.2f}")

Random Forest Accuracy on Wine Dataset: 0.39
