# Random forests classifier

## Lecture 7

### GRA 4160
### Predictive modelling with machine learning

#### Lecturer: Vegard H. Larsen

In [44]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

from scipy.stats import mode


## Build a random forest from the `DecisionTreeClassifier` class

In [45]:
# Load the data into a pandas dataframe
df = pd.read_csv("../data/titanic/train.csv")

# Preprocess the data
df = df.dropna()
df['Sex'] = df['Sex'].apply(lambda x: 1 if x == 'male' else 0)

# Split the data into training and test sets
X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
y = df['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Print the first 5 rows of the training data
X_train.head()


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
230,1,0,35.0,1,0,83.475
724,1,1,27.0,1,0,53.1
257,1,0,30.0,0,0,86.5
434,1,1,50.0,1,0,55.9
195,1,0,58.0,0,0,146.5208


In [46]:
# Define a function to generate a random subset of features
def random_subset(n_features):

    # Determine the number of features to consider at each split
    k = int(np.sqrt(n_features))

    # Select a random subset of k features without replacement
    features = np.random.choice(n_features, size=k, replace=False)
    return features

In [47]:
# Define a function to train a decision tree on a bootstrapped sample of the data
def train_tree(X_train, y_train, n_features):

    # Create a bootstrapped sample of the data
    n_samples = X_train.shape[0]
    sample_indices = np.random.choice(n_samples, size=n_samples, replace=True)
    X_boot = X_train.iloc[sample_indices]
    y_boot = y_train.iloc[sample_indices]

    # Select a random subset of features
    features = random_subset(n_features)
    X_boot_subset = X_boot.iloc[:, features]

    # Train a decision tree on the bootstrapped sample
    dt = DecisionTreeClassifier(max_features=None, random_state=1)
    dt.fit(X_boot_subset, y_boot)
    return dt, features

In [48]:
# Define a function to predict the class labels for a new data point
def predict(X, trees):

    # Predict the class label for each tree and aggregate the predictions
    y_pred = np.zeros((X.shape[0], len(trees)))
    for i, tree in enumerate(trees):
        features = tree[1]
        X_subset = X.iloc[:, features]
        y_pred[:, i] = tree[0].predict(X_subset)

    # Convert the predictions to integer type
    y_pred = y_pred.astype(int)
    y_pred_agg = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=y_pred)
    return y_pred_agg

In [49]:
# Train multiple decision trees
n_trees = 100
max_depth = 3
trees = []
n_features = X_train.shape[1]
for i in range(n_trees):
    dt, features = train_tree(X_train, y_train, n_features)
    trees.append((dt, features))


In [50]:
# Make predictions on the testing data

y_pred = predict(X_test, trees)

In [51]:
# Calculate the accuracy of the random forest (TP + TN) / (TP + TN + FP + FN)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate the precision of the random forest (TP /(TP+ FP))
precision = precision_score(y_test, y_pred)
print("Precision:", precision)

# Calculate the recall of the random forest (TP /(TP+ FN))
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# Calculate the F1 score of the random forest (2 * precision * recall / (precision + recall))
f1 = f1_score(y_test, y_pred)
print("F1:", f1)

Accuracy: 0.7608695652173914
Precision: 0.7631578947368421
Recall: 0.9354838709677419
F1: 0.8405797101449275


In [52]:
print(f'Survival rate in test set: {y_test.sum()/len(y_test):.2f}')

Survival rate in test set: 0.67


## Using the `RandomForestClassifier`

In [53]:
# Create a RandomForestClassifier object
rfc = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=1)

# Train the random forest classifier on the training data
rfc.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = rfc.predict(X_test)

In [54]:
# Calculate the accuracy of the random forest (TP + TN) / (TP + TN + FP + FN)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate the precision of the random forest (TP /(TP+ FP))
precision = precision_score(y_test, y_pred)
print("Precision:", precision)

# Calculate the recall of the random forest (TP /(TP+ FN))
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# Calculate the F1 score of the random forest (2 * precision * recall / (precision + recall))
f1 = f1_score(y_test, y_pred)
print("F1:", f1)

Accuracy: 0.6956521739130435
Precision: 0.7428571428571429
Recall: 0.8387096774193549
F1: 0.7878787878787878


## Building ExtraTrees (Extremely Randomized Trees)

**Selecting a Subset of Features:** At each node in the tree, both Random Forest and Extra Trees algorithms start by selecting a random subset of the features (or predictors).

**Determining the Split Point:**

- *Random Forest:* Once the subset of features is selected, the Random Forest algorithm will search for the best possible split point among these features. This involves finding the value that best separates the data according to the target variable, often using a criterion like Gini impurity or entropy in classification tasks. This process is somewhat similar to what a standard decision tree does, but limited to a subset of features.

- *Extra Trees:* In contrast, the Extra Trees algorithm introduces more randomness. After selecting a subset of features, instead of searching for the most optimal split based on some criterion, it randomly selects a split point for each feature. Then, among these randomly generated splits, it chooses one to split the node. This means that the algorithm does not necessarily choose the best split from a statistical perspective, but rather a random one.

**Impact of Random Splits:**
This increased randomness in choosing splits can lead to more diversified trees within the ensemble, as it reduces the likelihood of creating similar trees even if they are based on the same training data.

As a result, the individual trees in an Extra Trees ensemble can have higher bias compared to those in a Random Forest, but when combined, the ensemble as a whole often has lower variance. This is because the random splits lead to less correlated trees, which is beneficial in an ensemble method.

In [55]:
# Load the data into a pandas dataframe
df = pd.read_csv("../data/titanic/train.csv")

# Preprocess the data
df = df.dropna()
df['Sex'] = df['Sex'].apply(lambda x: 1 if x == 'male' else 0)

# Split the data into training and test sets
X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
y = df['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [58]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
230,1,0,35.0,1,0,83.4750
724,1,1,27.0,1,0,53.1000
257,1,0,30.0,0,0,86.5000
434,1,1,50.0,1,0,55.9000
195,1,0,58.0,0,0,146.5208
...,...,...,...,...,...,...
647,1,1,56.0,0,0,35.5000
679,1,1,36.0,0,1,512.3292
345,2,0,24.0,0,0,13.0000
690,1,1,31.0,1,0,57.0000


In [56]:
# Create an Extra Trees classifier object
etc = ExtraTreesClassifier(n_estimators=100, max_depth=3, random_state=1)

# Train the Extra Trees classifier on the training data
etc.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = etc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.7391304347826086
Precision: 0.7878787878787878
Recall: 0.8387096774193549


In [57]:
print(f'Survival rate in test set: {y_test.sum()/len(y_test):.2f}')

Survival rate in test set: 0.67
