# Random forests classifier

## Lecture 7

### GRA 4160
### Predictive modelling with machine learning

#### Lecturer: Vegard H. Larsen

## Build a random forest from the `DecisionTreeClassifier` class

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data into a pandas dataframe
df = pd.read_csv("../data/titanic/train.csv")

# Preprocess the data
df = df.dropna()
df['Sex'] = df['Sex'].apply(lambda x: 1 if x == 'male' else 0)

# Split the data into training and test sets
X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
y = df['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [2]:
import numpy as np

# Define a function to generate a random subset of features
def random_subset(n_features):

    # Determine the number of features to consider at each split
    k = int(np.sqrt(n_features))

    # Select a random subset of k features without replacement
    features = np.random.choice(n_features, size=k, replace=False)
    return features

In [3]:
from sklearn.tree import DecisionTreeClassifier

# Define a function to train a decision tree on a bootstrapped sample of the data
def train_tree(X_train, y_train, n_features):

    # Create a bootstrapped sample of the data
    n_samples = X_train.shape[0]
    sample_indices = np.random.choice(n_samples, size=n_samples, replace=True)
    X_boot = X_train.iloc[sample_indices]
    y_boot = y_train.iloc[sample_indices]

    # Select a random subset of features
    features = random_subset(n_features)
    X_boot_subset = X_boot.iloc[:, features]

    # Train a decision tree on the bootstrapped sample
    dt = DecisionTreeClassifier(max_features=None, random_state=1)
    dt.fit(X_boot_subset, y_boot)
    return dt, features

In [4]:
# Define a function to predict the class labels for a new data point
def predict(X, trees):

    # Predict the class label for each tree and aggregate the predictions
    y_pred = np.zeros((X.shape[0], len(trees)))
    for i, tree in enumerate(trees):
        features = tree[1]
        X_subset = X.iloc[:, features]
        y_pred[:, i] = tree[0].predict(X_subset)

    # Convert the predictions to integer type
    y_pred = y_pred.astype(int)
    y_pred_agg = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=y_pred)
    return y_pred_agg

In [5]:
# Train multiple decision trees
n_trees = 100
max_depth = 3
trees = []
n_features = X_train.shape[1]
for i in range(n_trees):
    dt, features = train_tree(X_train, y_train, n_features)
    trees.append((dt, features))


In [6]:
# Make predictions on the testing data

y_pred = predict(X_test, trees)

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate the accuracy of the random forest (TP + TN) / (TP + TN + FP + FN)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate the precision of the random forest (TP /(TP+ FP))
precision = precision_score(y_test, y_pred)
print("Precision:", precision)

# Calculate the recall of the random forest (TP /(TP+ FN))
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# Calculate the F1 score of the random forest (2 * precision * recall / (precision + recall))
f1 = f1_score(y_test, y_pred)
print("F1:", f1)

Accuracy: 0.7391304347826086
Precision: 0.7567567567567568
Recall: 0.9032258064516129
F1: 0.823529411764706


In [8]:
print(f'Survival rate in test set: {y_test.sum()/len(y_test):.2f}')

Survival rate in test set: 0.67


## Using the `RandomForestClassifier`

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Create a RandomForestClassifier object
rfc = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=30)

# Train the random forest classifier on the training data
rfc.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = rfc.predict(X_test)

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Calculate the accuracy of the random forest (TP + TN) / (TP + TN + FP + FN)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate the precision of the random forest (TP /(TP+ FP))
precision = precision_score(y_test, y_pred)
print("Precision:", precision)

# Calculate the recall of the random forest (TP /(TP+ FN))
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# Calculate the F1 score of the random forest (2 * precision * recall / (precision + recall))
f1 = f1_score(y_test, y_pred)
print("F1:", f1)

Accuracy: 0.7608695652173914
Precision: 0.7631578947368421
Recall: 0.9354838709677419
F1: 0.8405797101449276
