**Syed Muhammad Saad**

**AI-22015**

**RANDOM FOREST LAB 6**

In [23]:
import pandas as pd
import numpy as np
import itertools
import random
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt


In [24]:
dataset = pd.read_csv('/content/advertising.csv')


def gini_index(y):
    m = len(y)
    if m <= 1:
        return 0

    counts = y.value_counts()
    gini = 1 - sum((count / m) ** 2 for count in counts)
    return gini

def entropy(y):
    a = y.value_counts() / y.shape[0]
    return np.sum(-a * np.log2(a + 1e-9))


def gini_information_gain(y, mask):
    total_size = len(y)
    left_size = sum(mask)
    right_size = total_size - left_size
    if left_size == 0 or right_size == 0:
        return 0
    left_gini = gini_index(y[mask])
    right_gini = gini_index(y[~mask])
    gain = gini_index(y) - (left_size / total_size * left_gini + right_size / total_size * right_gini)
    return gain


def entropy_information_gain(y, mask):
    total_size = len(y)
    left_size = sum(mask)
    right_size = total_size - left_size
    if left_size == 0 or right_size == 0:
        return 0
    left_entropy = entropy(y[mask])
    right_entropy = entropy(y[~mask])
    gain = entropy(y) - (left_size / total_size * left_entropy + right_size / total_size * right_entropy)
    return gain


def best_split(dataset, y, func=entropy):
    best_gain = -float('inf')
    best_split_value = None
    best_split_variable = None
    best_mask = None
    for column in dataset.columns:
        values = dataset[column].unique()
        for value in values:
            if dataset[column].dtype != 'O':  # If it's numeric
                mask = dataset[column] < value
            else:
                mask = dataset[column].isin([value])
            if func == entropy:
                gain = entropy_information_gain(y, mask)
            else:
                gain = gini_information_gain(y, mask)
            if gain > best_gain:
                best_gain = gain
                best_split_value = value
                best_split_variable = column
                best_mask = mask
    return best_split_variable, best_split_value, best_mask, best_gain


def train_tree(dataset, y, max_depth=None, min_samples_split=2, min_information_gain=1e-5, depth=0, func=entropy):
    if len(y.unique()) == 1:  # Base case: Only one class
        return y.iloc[0]
    if depth == max_depth or len(dataset) < min_samples_split:
        return y.mode()[0]  # Majority class
    best_variable, best_value, best_mask, best_gain = best_split(dataset, y, func)
    if best_gain < min_information_gain:
        return y.mode()[0]
    left_dataset = dataset[best_mask]
    right_dataset = dataset[~best_mask]
    left_y = y[best_mask]
    right_y = y[~best_mask]
    left_tree = train_tree(left_dataset, left_y, max_depth, min_samples_split, min_information_gain, depth+1, func)
    right_tree = train_tree(right_dataset, right_y, max_depth, min_samples_split, min_information_gain, depth+1, func)
    tree = {f'{best_variable} <= {best_value}': [left_tree, right_tree]}
    return tree


def classify(observation, tree):
    if isinstance(tree, dict):
        question = list(tree.keys())[0]
        feature, value = question.split(' <= ')
        feature_value = observation[feature]
        if isinstance(tree[question][0], dict):
            if feature_value <= float(value):
                return classify(observation, tree[question][0])
            else:
                return classify(observation, tree[question][1])
        else:
            return tree[question][0]
    else:
        return tree


def create_bootstrapped_subsets(dataset, n_subsets=10):
    subsets = []
    for _ in range(n_subsets):
        subset = dataset.sample(frac=1, replace=True)
        subsets.append(subset)
    return subsets


def train_multiple_trees(subsets, target_column, max_depth=10):
    trees = []
    for subset in subsets:
        X = subset.drop(columns=[target_column])
        y = subset[target_column]
        tree = train_tree(X, y, max_depth=max_depth, func=gini_index)
        trees.append(tree)
    return trees


def evaluate_trees(trees, X, y):
    accuracies = []
    for tree in trees:
        predictions = X.apply(lambda observation: classify(observation, tree), axis=1)
        accuracy = accuracy_score(y, predictions)
        accuracies.append(accuracy)
    return accuracies


def select_best_trees(trees, X, y, n_best=2):
    accuracies = evaluate_trees(trees, X, y)
    best_tree_indices = np.argsort(accuracies)[-n_best:]
    best_trees = [trees[i] for i in best_tree_indices]
    return best_trees


def random_forest_predict(observation, best_trees):
    tree_predictions = [classify(observation, tree) for tree in best_trees]
    return max(set(tree_predictions), key=tree_predictions.count)


subsets = create_bootstrapped_subsets(dataset, n_subsets=10)
target_column = 'Clicked on Ad'
X = dataset.drop(columns=[target_column])
y = dataset[target_column]

trees = train_multiple_trees(subsets, target_column)
best_trees = select_best_trees(trees, X, y, n_best=2)


predictions = X.apply(lambda observation: random_forest_predict(observation, best_trees), axis=1)
random_forest_accuracy = accuracy_score(y, predictions)
print(f"Random Forest Accuracy: {random_forest_accuracy}")


Random Forest Accuracy: 0.448


In [25]:
def print_tree(tree, indent=""):
    if isinstance(tree, dict):
        question = list(tree.keys())[0]
        print(f"{indent}Decision: {question}")
        print(f"{indent}-> Left:")
        print_tree(tree[question][0], indent + "  ")
        print(f"{indent}-> Right:")
        print_tree(tree[question][1], indent + "  ")
    else:
        print(f"{indent}Predict: {tree}")

for i, tree in enumerate(best_trees):
    print(f"Tree {i+1}:\n")
    print_tree(tree)
    print("\n" + "="*50 + "\n")

Tree 1:

Decision: Daily Internet Usage <= 177.55
-> Left:
  Decision: Daily Time Spent on Site <= 71.89
  -> Left:
    Decision: Area Income <= 76984.21
    -> Left:
      Predict: 1
    -> Right:
      Predict: 0
  -> Right:
    Decision: Daily Internet Usage <= 163.05
    -> Left:
      Decision: Daily Time Spent on Site <= 87.97
      -> Left:
        Decision: Age <= 30
        -> Left:
          Predict: 0
        -> Right:
          Predict: 1
      -> Right:
        Decision: Age <= 45
        -> Left:
          Predict: 0
        -> Right:
          Predict: 1
    -> Right:
      Decision: Age <= 58
      -> Left:
        Decision: Area Income <= 48761.14
        -> Left:
          Predict: 1
        -> Right:
          Predict: 0
      -> Right:
        Predict: 1
-> Right:
  Decision: Daily Time Spent on Site <= 55.55
  -> Left:
    Decision: Area Income <= 73882.91
    -> Left:
      Decision: Ad Topic Line <= Virtual context-sensitive support
      -> Left:
        Predict

**ON DIABETES DATASET**

In [28]:
dataset = pd.read_csv('/content/diabetes.csv')


dataset = dataset.dropna()


categorical_columns = dataset.select_dtypes(include=['object']).columns
for column in categorical_columns:
    dataset[column] = dataset[column].astype('category').cat.codes


target_column = 'Outcome'
X = dataset.drop(columns=[target_column])
y = dataset[target_column]


max_depth_values = [5, 10, 15]
min_samples_split_values = [2, 5, 10]
min_samples_leaf_values = [1, 2, 4]

best_trees = []

for max_depth in max_depth_values:
    for min_samples_split in min_samples_split_values:
        for min_samples_leaf in min_samples_leaf_values:
            print(f"Training trees with max_depth={max_depth}, min_samples_split={min_samples_split}, min_samples_leaf={min_samples_leaf}")

            subsets = create_bootstrapped_subsets(dataset, n_subsets=4)

            trees = train_multiple_trees(subsets, target_column, max_depth=max_depth)

            accuracies = evaluate_trees(trees, X, y)
            best_trees.append((max_depth, min_samples_split, min_samples_leaf, accuracies))

            print(f"Accuracy for this combination: {accuracies}")

Training trees with max_depth=5, min_samples_split=2, min_samples_leaf=1
Accuracy for this combination: [0.6940104166666666, 0.7122395833333334, 0.62109375, 0.37109375]
Training trees with max_depth=5, min_samples_split=2, min_samples_leaf=2
Accuracy for this combination: [0.609375, 0.7252604166666666, 0.7057291666666666, 0.5169270833333334]
Training trees with max_depth=5, min_samples_split=2, min_samples_leaf=4
Accuracy for this combination: [0.7057291666666666, 0.5559895833333334, 0.7200520833333334, 0.734375]
Training trees with max_depth=5, min_samples_split=5, min_samples_leaf=1
Accuracy for this combination: [0.6692708333333334, 0.7486979166666666, 0.57421875, 0.5455729166666666]
Training trees with max_depth=5, min_samples_split=5, min_samples_leaf=2
Accuracy for this combination: [0.7135416666666666, 0.58984375, 0.6770833333333334, 0.7643229166666666]
Training trees with max_depth=5, min_samples_split=5, min_samples_leaf=4
Accuracy for this combination: [0.6419270833333334, 0.

In [29]:
for(max_depth, min_samples_split, min_samples_leaf, accuracies) in best_trees:
    print(f"Results for max_depth={max_depth}, min_samples_split={min_samples_split}, min_samples_leaf={min_samples_leaf}")
    print(f"Accuracy: {accuracies}")


Results for max_depth=5, min_samples_split=2, min_samples_leaf=1
Accuracy: [0.6940104166666666, 0.7122395833333334, 0.62109375, 0.37109375]
Results for max_depth=5, min_samples_split=2, min_samples_leaf=2
Accuracy: [0.609375, 0.7252604166666666, 0.7057291666666666, 0.5169270833333334]
Results for max_depth=5, min_samples_split=2, min_samples_leaf=4
Accuracy: [0.7057291666666666, 0.5559895833333334, 0.7200520833333334, 0.734375]
Results for max_depth=5, min_samples_split=5, min_samples_leaf=1
Accuracy: [0.6692708333333334, 0.7486979166666666, 0.57421875, 0.5455729166666666]
Results for max_depth=5, min_samples_split=5, min_samples_leaf=2
Accuracy: [0.7135416666666666, 0.58984375, 0.6770833333333334, 0.7643229166666666]
Results for max_depth=5, min_samples_split=5, min_samples_leaf=4
Accuracy: [0.6419270833333334, 0.73046875, 0.71875, 0.7265625]
Results for max_depth=5, min_samples_split=10, min_samples_leaf=1
Accuracy: [0.7447916666666666, 0.6940104166666666, 0.7122395833333334, 0.74609

In [30]:
def print_tree(tree, indent=""):
    if isinstance(tree, dict):
        question = list(tree.keys())[0]
        print(f"{indent}Decision: {question}")
        print(f"{indent}-> Left:")
        print_tree(tree[question][0], indent + "  ")
        print(f"{indent}-> Right:")
        print_tree(tree[question][1], indent + "  ")
    else:
        print(f"{indent}Predict: {tree}")

for i, tree in enumerate(best_trees):
    print(f"Tree {i+1}:\n")
    print_tree(tree)
    print("\n" + "="*50 + "\n")

Tree 1:

Predict: (5, 2, 1, [0.6940104166666666, 0.7122395833333334, 0.62109375, 0.37109375])


Tree 2:

Predict: (5, 2, 2, [0.609375, 0.7252604166666666, 0.7057291666666666, 0.5169270833333334])


Tree 3:

Predict: (5, 2, 4, [0.7057291666666666, 0.5559895833333334, 0.7200520833333334, 0.734375])


Tree 4:

Predict: (5, 5, 1, [0.6692708333333334, 0.7486979166666666, 0.57421875, 0.5455729166666666])


Tree 5:

Predict: (5, 5, 2, [0.7135416666666666, 0.58984375, 0.6770833333333334, 0.7643229166666666])


Tree 6:

Predict: (5, 5, 4, [0.6419270833333334, 0.73046875, 0.71875, 0.7265625])


Tree 7:

Predict: (5, 10, 1, [0.7447916666666666, 0.6940104166666666, 0.7122395833333334, 0.74609375])


Tree 8:

Predict: (5, 10, 2, [0.7669270833333334, 0.7096354166666666, 0.7083333333333334, 0.6497395833333334])


Tree 9:

Predict: (5, 10, 4, [0.5299479166666666, 0.6822916666666666, 0.5690104166666666, 0.7174479166666666])


Tree 10:

Predict: (10, 2, 1, [0.6588541666666666, 0.6458333333333334, 0.7239