# Random forests classifier

## Lecture 7

### GRA 4160
### Predictive modelling with machine learning

#### Lecturer: Vegard H. Larsen

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

from scipy.stats import mode


## Build a random forest from the `DecisionTreeClassifier` class

In [39]:
# Read the Titanic training data
df = pd.read_csv("../data/titanic/train.csv")

# Keep only complete rows, then encode Sex as 1 for 'male' and 0 otherwise.
# NOTE(review): dropna() without `subset` discards a row when ANY column is
# missing, including columns that are not used as features below -- consider
# dropna(subset=[...]) if more rows should be kept.
df = df.dropna()
df['Sex'] = (df['Sex'] == 'male').astype('int64')

# Feature matrix and target vector
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X = df[feature_columns]
y = df['Survived']

# Hold out a test set (default split size, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Show the first 5 rows of the training features
X_train.head()


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
230,1,0,35.0,1,0,83.475
724,1,1,27.0,1,0,53.1
257,1,0,30.0,0,0,86.5
434,1,1,50.0,1,0,55.9
195,1,0,58.0,0,0,146.5208


In [40]:
# Draw the random feature subset used by one tree of the forest
def random_subset(n_features):
    """Return floor(sqrt(n_features)) distinct random feature indices.

    The indices are drawn from range(n_features) without replacement using
    NumPy's global random state. Note that the caller draws one subset per
    tree, not one per split.
    """
    subset_size = int(np.sqrt(n_features))
    return np.random.choice(n_features, size=subset_size, replace=False)

In [41]:
# Define a function to train a decision tree on a bootstrapped sample of the data
def train_tree(X_train, y_train, n_features, max_depth=None):
    """Fit one decision tree on a bootstrap sample and a random feature subset.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features (indexed positionally with ``.iloc``).
    y_train : pandas.Series
        Training targets aligned row-by-row with ``X_train``.
    n_features : int
        Number of columns to draw the feature subset from.
    max_depth : int or None, default=None
        Maximum depth passed to ``DecisionTreeClassifier``. ``None`` (the
        default) keeps the previous behaviour of growing unrestricted trees.

    Returns
    -------
    (DecisionTreeClassifier, numpy.ndarray)
        The fitted tree and the column positions it was trained on; the same
        columns must be selected again at prediction time.
    """
    # Bootstrap: draw n_samples row positions with replacement
    n_samples = X_train.shape[0]
    sample_indices = np.random.choice(n_samples, size=n_samples, replace=True)
    X_boot = X_train.iloc[sample_indices]
    y_boot = y_train.iloc[sample_indices]

    # One random feature subset for the whole tree (not re-drawn per split)
    features = random_subset(n_features)
    X_boot_subset = X_boot.iloc[:, features]

    # max_features=None: the tree may use every column of the subset at each split
    dt = DecisionTreeClassifier(max_depth=max_depth, max_features=None, random_state=1)
    dt.fit(X_boot_subset, y_boot)
    return dt, features

In [42]:
# Combine the individual trees into a forest prediction by majority vote
def predict(X, trees):
    """Predict a class label for every row of X by majority vote.

    Parameters
    ----------
    X : pandas.DataFrame
        Samples to classify (columns are selected positionally with ``.iloc``).
    trees : sequence of (model, feature_positions)
        Each model is asked to predict on the columns it was trained on.

    Returns
    -------
    numpy.ndarray
        One label per row. Labels are cast to int and counted with
        ``np.bincount``, so they must be non-negative integers; ties go to
        the smallest label.
    """
    n_rows = X.shape[0]
    votes = np.zeros((n_rows, len(trees)))
    for col, member in enumerate(trees):
        model, columns = member[0], member[1]
        votes[:, col] = model.predict(X.iloc[:, columns])

    # bincount only accepts integer input
    votes = votes.astype(int)

    def majority(row):
        return np.bincount(row).argmax()

    return np.apply_along_axis(majority, axis=1, arr=votes)

In [43]:
# Grow the forest: every tree gets its own bootstrap sample and feature subset
n_trees = 100
# NOTE(review): max_depth is not passed to train_tree below, so it has no
# effect on the trees grown here.
max_depth = 3
n_features = X_train.shape[1]
trees = [train_tree(X_train, y_train, n_features) for _ in range(n_trees)]


In [44]:
# Make predictions on the testing data (majority vote over all trained trees)

y_pred = predict(X_test, trees)

In [45]:
# Score the hand-built forest on the held-out test set
accuracy = accuracy_score(y_test, y_pred)    # (TP + TN) / (TP + TN + FP + FN)
precision = precision_score(y_test, y_pred)  # TP / (TP + FP)
recall = recall_score(y_test, y_pred)        # TP / (TP + FN)
f1 = f1_score(y_test, y_pred)                # 2 * precision * recall / (precision + recall)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.7608695652173914
Precision: 0.7777777777777778
Recall: 0.9032258064516129
F1: 0.835820895522388


In [46]:
# Share of positive (survived) labels in the test set, for comparison with the scores above
survival_rate = y_test.sum() / len(y_test)
print(f'Survival rate in test set: {survival_rate:.2f}')

Survival rate in test set: 0.67


## Using the `RandomForestClassifier`

In [47]:
# Build scikit-learn's random forest (100 trees, depth at most 3) and fit it
# on the training data in one step; fit() returns the estimator itself
rfc = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=30).fit(X_train, y_train)

# Predict the held-out test set
y_pred = rfc.predict(X_test)

In [48]:
# Score the scikit-learn random forest on the held-out test set
accuracy = accuracy_score(y_test, y_pred)    # (TP + TN) / (TP + TN + FP + FN)
precision = precision_score(y_test, y_pred)  # TP / (TP + FP)
recall = recall_score(y_test, y_pred)        # TP / (TP + FN)
f1 = f1_score(y_test, y_pred)                # 2 * precision * recall / (precision + recall)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.7608695652173914
Precision: 0.7631578947368421
Recall: 0.9354838709677419
F1: 0.8405797101449275


## Building ExtraTrees (Extremely Randomized Trees)

**Selecting a Subset of Features:** At each node in the tree, both Random Forest and Extra Trees algorithms start by selecting a random subset of the features (or predictors).

**Determining the Split Point:**

- *Random Forest:* Once the subset of features is selected, the Random Forest algorithm will search for the best possible split point among these features. This involves finding the value that best separates the data according to the target variable, often using a criterion like Gini impurity or entropy in classification tasks. This process is somewhat similar to what a standard decision tree does, but limited to a subset of features.

- *Extra Trees:* In contrast, the Extra Trees algorithm introduces more randomness. After selecting a subset of features, instead of searching for the optimal split point of each feature, it draws a split point at random for each feature. Then, among these randomly generated candidate splits, it picks the one that scores best on the split criterion. This means the chosen split is the best of a few random candidates rather than the best possible split, so it is generally not optimal from a statistical perspective.

**Impact of Random Splits:**
This increased randomness in choosing splits can lead to more diversified trees within the ensemble, as it reduces the likelihood of creating similar trees even if they are based on the same training data.

As a result, the individual trees in an Extra Trees ensemble can have higher bias compared to those in a Random Forest, but when combined, the ensemble as a whole often has lower variance. This is because the random splits lead to less correlated trees, which is beneficial in an ensemble method.

In [33]:
# Read the Titanic training data
df = pd.read_csv("../data/titanic/train.csv")

# Keep only complete rows, then encode Sex as 1 for 'male' and 0 otherwise.
# NOTE(review): dropna() without `subset` discards a row when ANY column is
# missing, including columns that are not used as features below.
df = df.dropna()
df['Sex'] = (df['Sex'] == 'male').astype('int64')

# Feature matrix and target vector
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X = df[feature_columns]
y = df['Survived']

# Hold out a test set (default split size, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [34]:
# Build scikit-learn's Extra Trees ensemble (100 trees, depth at most 3) and
# fit it on the training data; fit() returns the estimator itself
etc = ExtraTreesClassifier(n_estimators=100, max_depth=3, random_state=1).fit(X_train, y_train)

# Predict the held-out test set
y_pred = etc.predict(X_test)

# Score the predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

precision = precision_score(y_test, y_pred)
print("Precision:", precision)

recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# F1 is computed but not printed in this cell
f1 = f1_score(y_test, y_pred)

Accuracy: 0.7391304347826086
Precision: 0.7878787878787878
Recall: 0.8387096774193549


In [35]:
# Share of positive (survived) labels in the test set, for comparison with the scores above
survival_rate = y_test.sum() / len(y_test)
print(f'Survival rate in test set: {survival_rate:.2f}')

Survival rate in test set: 0.67


## Can you build an Extra Trees classifier using only the DecisionTreeClassifier class?

In [36]:
class SimpleRandomSplitTree(BaseEstimator, ClassifierMixin):
    """Classification tree whose split feature and threshold are both random.

    At every internal node a feature is picked at random and a threshold is
    drawn uniformly between that feature's minimum and maximum within the
    node; no impurity criterion is evaluated. A leaf predicts the most
    frequent class among the training samples that reached it.

    The fitted tree is stored in ``tree_`` as nested tuples
    ``(feature_idx, split_value, left_child, right_child)``; a leaf is a bare
    integer index into ``classes_``. All randomness comes from NumPy's global
    random state.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum depth of the tree. ``None`` grows the tree until every leaf
        is pure or cannot be split any further.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        """Grow the tree on (X, y) and return ``self``."""
        # Validate shapes/dtypes and make sure y is a classification target
        X, y = check_X_y(X, y)
        check_classification_targets(y)

        # Encode labels as 0..n_classes-1 so np.bincount can count them
        self.classes_, y = np.unique(y, return_inverse=True)
        self.n_classes_ = len(self.classes_)
        self.tree_ = self._grow_tree(X, y, depth=0)
        return self

    def _grow_tree(self, X, y, depth):
        """Recursively build the subtree for the samples (X, y) at ``depth``."""
        # Stop when the node is pure or the depth limit is reached
        if len(set(y)) == 1 or (self.max_depth is not None and depth >= self.max_depth):
            return np.argmax(np.bincount(y))

        n_features = X.shape[1]

        # Try every feature at most once, in random order. Sampling the
        # candidates with replacement could skip a splittable feature and
        # turn the node into a leaf even though a valid split exists.
        for feature_idx in np.random.permutation(n_features):
            feature_idx = int(feature_idx)
            column = X[:, feature_idx]
            lowest, highest = column.min(), column.max()

            # A constant feature cannot separate the samples
            if lowest == highest:
                continue

            split_value = np.random.uniform(lowest, highest)
            left_idx = column < split_value
            right_idx = ~left_idx

            # Only accept a split that puts samples on both sides
            if np.any(left_idx) and np.any(right_idx):
                left_child = self._grow_tree(X[left_idx], y[left_idx], depth + 1)
                right_child = self._grow_tree(X[right_idx], y[right_idx], depth + 1)
                return (feature_idx, split_value, left_child, right_child)

        # No usable split found: fall back to a majority-class leaf
        return np.argmax(np.bincount(y))

    def predict(self, X):
        """Return the predicted class label for every row of X."""
        # Fail fast on an unfitted estimator before validating the input
        check_is_fitted(self)
        X = check_array(X)

        predictions = [self._predict_one(x, self.tree_) for x in X]
        # Map encoded class indices back to the original labels
        return self.classes_[np.array(predictions)]

    def _predict_one(self, x, node):
        """Walk a single sample ``x`` down the tree starting at ``node``."""
        # Anything that is not a tuple is a leaf holding the class index
        if not isinstance(node, tuple):
            return node

        feature_idx, split_value, left_child, right_child = node
        if x[feature_idx] < split_value:
            return self._predict_one(x, left_child)
        return self._predict_one(x, right_child)

# Ensemble of SimpleRandomSplitTree models combined by majority vote
class SimpleExtraTreesClassifier:
    """Extra-Trees-style ensemble built from ``SimpleRandomSplitTree``.

    Every tree is trained on the full training set but only on a random
    subset of the columns (drawn once per tree). Predictions are the most
    frequent label across all trees.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees to train.
    max_depth : int or None, default=None
        Depth limit forwarded to every tree.
    max_features : {'sqrt', 'log2'} or anything else, default='sqrt'
        Number of columns per tree: ``floor(sqrt(n))``, ``floor(log2(n))``
        (both at least 1), or all columns for any other value.

    Attributes
    ----------
    trees : list of (SimpleRandomSplitTree, numpy.ndarray)
        Fitted trees with the column positions each one was trained on.
    """

    def __init__(self, n_estimators=100, max_depth=None, max_features='sqrt'):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.trees = []

    def _n_selected_features(self, n_features):
        """Return how many columns each tree is trained on."""
        if self.max_features == 'sqrt':
            size = int(np.sqrt(n_features))
        elif self.max_features == 'log2':
            size = int(np.log2(n_features))
        else:
            size = n_features
        # log2(1) == 0 would otherwise select no columns at all
        return max(1, size)

    @staticmethod
    def _take_columns(X, features_idx):
        """Select columns by position from a DataFrame or an array-like."""
        if hasattr(X, 'iloc'):
            return X.iloc[:, features_idx]
        return np.asarray(X)[:, features_idx]

    def fit(self, X, y):
        """Train ``n_estimators`` trees on (X, y) and return ``self``.

        Refitting replaces the previously trained trees instead of appending
        to them.
        """
        n_features = X.shape[1]
        size = self._n_selected_features(n_features)

        trees = []
        for _ in range(self.n_estimators):
            # Randomly select the columns this tree may use
            features_idx = np.random.choice(n_features, size=size, replace=False)

            tree = SimpleRandomSplitTree(max_depth=self.max_depth)
            tree.fit(self._take_columns(X, features_idx), y)
            trees.append((tree, features_idx))

        self.trees = trees
        return self

    def predict(self, X):
        """Return the majority-vote label for every row of X.

        Raises
        ------
        ValueError
            If the ensemble holds no fitted trees.
        """
        if not self.trees:
            raise ValueError(
                "This SimpleExtraTreesClassifier instance is not fitted yet; "
                "call fit() before predict()."
            )

        # Size by the trees actually stored, not by n_estimators, so the two
        # can never disagree. Labels are stored as integers.
        predictions = np.zeros((len(self.trees), len(X)), dtype=np.int64)
        for i, (tree, features_idx) in enumerate(self.trees):
            predictions[i] = tree.predict(self._take_columns(X, features_idx))

        # Most frequent label per sample across all trees (axis 0)
        final_predictions = mode(predictions, axis=0)
        # ravel() flattens the result whether or not the mode keeps the reduced axis
        return np.ravel(final_predictions.mode)

In [50]:
# Example Usage: fit the hand-written Extra Trees ensemble with its default
# settings on the training split, then predict the test split
clf = SimpleExtraTreesClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [51]:
# Report test-set metrics for the hand-written Extra Trees ensemble
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(label, metric(y_test, y_pred))

Accuracy: 0.6739130434782609
Precision: 0.6818181818181818
Recall: 0.967741935483871
