<a href="https://colab.research.google.com/github/yashkapur0403/Neural-Networks-Practise/blob/main/random_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:

from google.colab import drive
import zipfile, os, cv2
import numpy as np

drive.mount('/content/drive')  # allow permission
zip_path = "/content/drive/My Drive/Colab Notebooks/chest_xray.zip"
extract_path = "/content/chest_xray"
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# STEP 2: Load images and prepare data
def load_images_from_folder(folder_path, label):
    data = []
    for filename in os.listdir(folder_path):
        path = os.path.join(folder_path, filename)
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if img is not None:
            img = cv2.resize(img, (64, 64))
            img_flat = img.flatten() / 255.0
            data.append((img_flat, label))
    return data

normal_data = load_images_from_folder("/content/chest_xray/chest_xray/train/NORMAL", 0)
pneumonia_data = load_images_from_folder("/content/chest_xray/chest_xray/train/PNEUMONIA", 1)
all_data = normal_data + pneumonia_data

X_train = np.array([x for x, _ in all_data])
y_train = np.array([y for _, y in all_data])

normal_test = load_images_from_folder("/content/chest_xray/chest_xray/test/NORMAL", 0)
pneumonia_test = load_images_from_folder("/content/chest_xray/chest_xray/test/PNEUMONIA", 1)
all_test = normal_test + pneumonia_test

X_test = np.array([x for x, _ in all_test])
y_test = np.array([y for _, y in all_test])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
# Gini Impurity
def gini(y):
    classes, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return 1 - np.sum(probs ** 2)

# Splitting the dataset
def split(X, y, feature_index, threshold):
    left_mask = X[:, feature_index] <= threshold
    right_mask = ~left_mask
    return X[left_mask], y[left_mask], X[right_mask], y[right_mask]

# Best split - EFFICIENT VERSION
def best_split(X, y):
    m, n = X.shape
    best_gini = 1
    best_idx, best_thresh = None, None

    for feature_index in range(n):
        feature_values = X[:, feature_index]
        min_val, max_val = feature_values.min(), feature_values.max()

        # Use fewer thresholds but better distributed
        if min_val == max_val:
            continue

        # Sample 15 thresholds between min and max
        thresholds = np.linspace(min_val, max_val, 15)[1:-1]

        for t in thresholds:
            _, y_left, _, y_right = split(X, y, feature_index, t)
            if len(y_left) == 0 or len(y_right) == 0:
                continue

            # Add minimum samples requirement to prevent overfitting
            if len(y_left) < 5 or len(y_right) < 5:
                continue

            g = (len(y_left) * gini(y_left) + len(y_right) * gini(y_right)) / m
            if g < best_gini:
                best_gini = g
                best_idx = feature_index
                best_thresh = t
    return best_idx, best_thresh

# Build the tree
class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

def build_tree(X, y, depth=0, max_depth=8, min_samples_split=10):
    # Add more stopping conditions to prevent overfitting
    print(f"Tree depth: {depth}, Samples: {len(y)}")
    if (len(set(y)) == 1 or
        depth == max_depth or
        len(y) < min_samples_split):
        # Leaf node
        values, counts = np.unique(y, return_counts=True)
        return Node(value=values[np.argmax(counts)])

    feat_idx, thresh = best_split(X, y)
    if feat_idx is None:
        values, counts = np.unique(y, return_counts=True)
        return Node(value=values[np.argmax(counts)])

    X_left, y_left, X_right, y_right = split(X, y, feat_idx, thresh)

    # Ensure minimum samples in each split
    if len(y_left) < 5 or len(y_right) < 5:
        values, counts = np.unique(y, return_counts=True)
        return Node(value=values[np.argmax(counts)])

    left_child = build_tree(X_left, y_left, depth + 1, max_depth, min_samples_split)
    right_child = build_tree(X_right, y_right, depth + 1, max_depth, min_samples_split)
    return Node(feature_index=feat_idx, threshold=thresh, left=left_child, right=right_child)

# Predict
def predict_one(x, node):
    if node.value is not None:
        return node.value
    if x[node.feature_index] <= node.threshold:
        return predict_one(x, node.left)
    else:
        return predict_one(x, node.right)

def predict(X, tree):
    return np.array([predict_one(row, tree) for row in X])

# Shuffle data manually without sklearn
def manual_shuffle(X, y, random_state=42):
    np.random.seed(random_state)
    indices = np.random.permutation(len(X))
    return X[indices], y[indices]


In [32]:
class RandomForest:
    def __init__(self, n_trees=5, max_depth=5, min_samples_split=30):   # <- FIXED HERE
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X_train, y_train):
        self.trees = []
        for i in range(self.n_trees):
            print(f"Training tree {i+1}/{self.n_trees}")  # 🧠 Add this line

            idxs = np.random.choice(len(X_train), len(X_train), replace=True)
            X_sample, y_sample = X_train[idxs], y_train[idxs]
            tree = build_tree(X_sample, y_sample,
                              max_depth=self.max_depth,
                              min_samples_split=self.min_samples_split)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([predict(X, tree) for tree in self.trees])
        return np.array([
            np.bincount(tree_preds[:, i]).argmax()
            for i in range(len(X))
        ])


In [33]:
X_shuffled, y_shuffled = manual_shuffle(X_train, y_train, random_state=42)
X_small = X_shuffled[:200]
y_small = y_shuffled[:200]

# Create and train Random Forest
rf = RandomForest(n_trees=10, max_depth=3, min_samples_split=30)
rf.fit(X_small, y_small)

# Test predictions
train_preds = rf.predict(X_small)
train_accuracy = np.sum(train_preds == y_small) / len(y_small)
print("Training Accuracy:", train_accuracy * 100, "%")

test_preds = rf.predict(X_test)
test_accuracy = np.sum(test_preds == y_test) / len(y_test)
print("Test Accuracy:", test_accuracy * 100, "%")


Training tree 1/10
Tree depth: 0, Samples: 200
Tree depth: 1, Samples: 56
Tree depth: 2, Samples: 11
Tree depth: 2, Samples: 45
Tree depth: 3, Samples: 35
Tree depth: 3, Samples: 10
Tree depth: 1, Samples: 144
Tree depth: 2, Samples: 7
Tree depth: 2, Samples: 137
Tree depth: 3, Samples: 131
Tree depth: 3, Samples: 6
Training tree 2/10
Tree depth: 0, Samples: 200
Tree depth: 1, Samples: 124
Tree depth: 2, Samples: 106
Tree depth: 3, Samples: 100
Tree depth: 3, Samples: 6
Tree depth: 2, Samples: 18
Tree depth: 1, Samples: 76
Tree depth: 2, Samples: 60
Tree depth: 3, Samples: 50
Tree depth: 3, Samples: 10
Tree depth: 2, Samples: 16
Training tree 3/10
Tree depth: 0, Samples: 200
Tree depth: 1, Samples: 148
Tree depth: 2, Samples: 136
Tree depth: 3, Samples: 8
Tree depth: 3, Samples: 128
Tree depth: 2, Samples: 12
Tree depth: 1, Samples: 52
Tree depth: 2, Samples: 38
Tree depth: 3, Samples: 5
Tree depth: 3, Samples: 33
Tree depth: 2, Samples: 14
Training tree 4/10
Tree depth: 0, Samples: 20

In [34]:
def fit(self, X_train, y_train):
    self.trees = []
    for i in range(self.n_trees):
        print(f"Training tree {i+1}/{self.n_trees}")

