<a href="https://colab.research.google.com/github/yashkapur0403/Neural-Networks-Practise/blob/main/decision_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# STEP 1: Mount Drive and Extract Dataset
from google.colab import drive
import zipfile, os, cv2
import numpy as np

# Mount Google Drive inside the Colab VM so the dataset archive is reachable
# under /content/drive.
drive.mount('/content/drive')  # Colab asks for access permission here
# Archive location on Drive and the local directory it is unpacked into.
zip_path = "/content/drive/My Drive/Colab Notebooks/chest_xray.zip"
extract_path = "/content/chest_xray"
# Unpack the whole archive; the context manager closes the zip file afterwards.
# NOTE(review): there is no "already extracted" guard, so the extraction is
# repeated every time this cell is re-run.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# STEP 2: Load images and prepare data
def load_images_from_folder(folder_path, label, image_size=(64, 64)):
    """Load every readable image in a folder as a flattened, scaled vector.

    Each directory entry is read with OpenCV as a grayscale image, resized to
    ``image_size``, flattened to one dimension and divided by 255.0.  Entries
    that OpenCV cannot decode (``cv2.imread`` returns ``None``) are skipped.

    Args:
        folder_path: Directory whose entries are handed to ``cv2.imread``.
        label: Class label attached to every sample loaded from this folder.
        image_size: Target size passed straight to ``cv2.resize``; defaults to
            the original hard-coded ``(64, 64)``.

    Returns:
        A list of ``(flat_pixels, label)`` tuples, ordered by file name.
    """
    data = []
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and any seeded shuffle applied to it later) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        path = os.path.join(folder_path, filename)
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # Not an image, or unreadable: skip it instead of failing.
            continue
        img = cv2.resize(img, image_size)
        data.append((img.flatten() / 255.0, label))
    return data

# Seed NumPy's global RNG before shuffling.  Without a seed the sample order
# differs on every run, so any fixed-size subset taken downstream (and the
# accuracies reported from it) cannot be reproduced.
np.random.seed(42)

# Training split: label 0 = NORMAL, label 1 = PNEUMONIA.
normal_data = load_images_from_folder("/content/chest_xray/chest_xray/train/NORMAL", 0)
pneumonia_data = load_images_from_folder("/content/chest_xray/chest_xray/train/PNEUMONIA", 1)
all_data = normal_data + pneumonia_data
# Shuffle in place so the two classes are interleaved rather than back to back.
np.random.shuffle(all_data)

# Separate the (pixels, label) pairs into a feature matrix and a label vector.
X_train = np.array([x for x, _ in all_data])
y_train = np.array([y for _, y in all_data])

# Test split, prepared exactly like the training split.
normal_test = load_images_from_folder("/content/chest_xray/chest_xray/test/NORMAL", 0)
pneumonia_test = load_images_from_folder("/content/chest_xray/chest_xray/test/PNEUMONIA", 1)
all_test = normal_test + pneumonia_test
np.random.shuffle(all_test)

X_test = np.array([x for x, _ in all_test])
y_test = np.array([y for _, y in all_test])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import numpy as np

# Gini Impurity
def gini(y):
    """Return the Gini impurity of the labels in ``y``.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    entries in ``y`` that carry class label ``k``.
    """
    # np.unique(..., return_counts=True) yields (distinct labels, how often
    # each one occurs); only the occurrence counts are needed here.
    _, occurrences = np.unique(y, return_counts=True)
    fractions = occurrences / occurrences.sum()
    squared_total = np.sum(fractions ** 2)
    return 1 - squared_total

# Splitting the dataset
def split(X, y, feature_index, threshold):
    """Partition samples on a single feature.

    Rows whose value in column ``feature_index`` is ``<= threshold`` form the
    left partition; every other row forms the right partition.

    Returns:
        ``(X_left, y_left, X_right, y_right)``.
    """
    # Boolean row selectors: one flags the left rows, its negation the right.
    goes_left = X[:, feature_index] <= threshold
    goes_right = np.logical_not(goes_left)
    left_part = (X[goes_left], y[goes_left])
    right_part = (X[goes_right], y[goes_right])
    return left_part + right_part

# Best split - EFFICIENT VERSION
def best_split(X, y, n_thresholds=15, min_samples_leaf=15):
    """Find the (feature, threshold) pair with the lowest weighted Gini impurity.

    For every feature, ``n_thresholds`` evenly spaced values between the
    feature's minimum and maximum are generated and the two end points are
    dropped; each remaining value is tried as a ``<=`` threshold.

    Args:
        X: 2-D feature matrix (rows are samples, columns are features).
        y: Label array with one entry per row of ``X``.
        n_thresholds: Number of evenly spaced points generated per feature
            before the end points are removed (default 15, i.e. 13 candidate
            thresholds).  Values below 3 leave no candidates.
        min_samples_leaf: A candidate is rejected when either side would
            contain fewer samples than this (default 15).  Values below 1 are
            treated as 1, so an empty side is never accepted.

    Returns:
        ``(best_idx, best_thresh)``, or ``(None, None)`` when no candidate
        satisfies the constraints.
    """
    m, n = X.shape
    best_gini = 1
    best_idx, best_thresh = None, None
    if m == 0:
        # Nothing to split; min()/max() below would fail on an empty column.
        return best_idx, best_thresh

    min_leaf = max(1, min_samples_leaf)

    for feature_index in range(n):
        feature_values = X[:, feature_index]
        min_val, max_val = feature_values.min(), feature_values.max()

        # A constant feature cannot separate anything.
        if min_val == max_val:
            continue

        # Evenly spaced candidates, excluding min and max themselves.
        thresholds = np.linspace(min_val, max_val, n_thresholds)[1:-1]

        for t in thresholds:
            # Only the labels are needed to score a split, so mask y directly
            # instead of copying both halves of the full feature matrix.
            left_mask = feature_values <= t
            n_left = np.count_nonzero(left_mask)
            n_right = m - n_left

            # Reject splits that would create a too-small (or empty) side.
            if n_left < min_leaf or n_right < min_leaf:
                continue

            y_left, y_right = y[left_mask], y[~left_mask]
            # Size-weighted average of the two child impurities.
            g = (n_left * gini(y_left) + n_right * gini(y_right)) / m
            # Strict '<' keeps the first candidate found among equal scores.
            if g < best_gini:
                best_gini = g
                best_idx = feature_index
                best_thresh = t
    return best_idx, best_thresh

# Build the tree
class Node:
    """One vertex of the decision tree.

    An internal node stores a split rule (``feature_index`` and
    ``threshold``) together with its ``left`` and ``right`` children.  A leaf
    stores only ``value``; a node whose ``value`` is not ``None`` is treated
    as a leaf by the prediction code.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        # Prediction held by a leaf; stays None on internal nodes.
        self.value = value
        # Split rule of an internal node.
        self.feature_index, self.threshold = feature_index, threshold
        # Child subtrees of an internal node.
        self.left, self.right = left, right

def _majority_leaf(y):
    """Return a leaf ``Node`` that predicts the most frequent label in ``y``."""
    values, counts = np.unique(y, return_counts=True)
    # np.argmax returns the first maximum, so ties go to the smallest label.
    return Node(value=values[np.argmax(counts)])


def build_tree(X, y, depth=0, max_depth=6, min_samples_split=25):
    """Recursively grow a binary decision tree.

    Growth stops, and a majority-vote leaf is returned, when any of these holds:
      * all labels in ``y`` are identical,
      * ``depth`` has reached (or exceeds) ``max_depth``,
      * fewer than ``min_samples_split`` samples remain,
      * ``best_split`` finds no admissible split,
      * the chosen split would leave fewer than 15 samples on either side.

    Args:
        X: 2-D feature matrix (rows are samples).
        y: Label array aligned with the rows of ``X``.
        depth: Depth of the node being built; callers normally leave it at 0.
        max_depth: Maximum depth at which a node may still be split.
        min_samples_split: Minimum number of samples required to try a split.

    Returns:
        The root ``Node`` of the (sub)tree.

    Raises:
        ValueError: if ``y`` is empty.
    """
    if len(y) == 0:
        # Fail with a clear message instead of an opaque argmax error.
        raise ValueError("build_tree needs at least one sample")

    is_pure = len(np.unique(y)) == 1
    # '>=' rather than '==' so that a call starting below max_depth still stops.
    if is_pure or depth >= max_depth or len(y) < min_samples_split:
        return _majority_leaf(y)

    feat_idx, thresh = best_split(X, y)
    if feat_idx is None:
        return _majority_leaf(y)

    X_left, y_left, X_right, y_right = split(X, y, feat_idx, thresh)

    # Safety net: never create a child with fewer than 15 samples.
    if len(y_left) < 15 or len(y_right) < 15:
        return _majority_leaf(y)

    left_child = build_tree(X_left, y_left, depth + 1, max_depth, min_samples_split)
    right_child = build_tree(X_right, y_right, depth + 1, max_depth, min_samples_split)
    return Node(feature_index=feat_idx, threshold=thresh, left=left_child, right=right_child)

# Predict
def predict_one(x, node):
    """Return the prediction for a single sample ``x``.

    Starting at ``node``, descend the tree until a leaf (a node whose
    ``value`` is not ``None``) is reached and return that leaf's value.  At
    each internal node the sample goes left when
    ``x[feature_index] <= threshold`` and right otherwise.
    """
    current = node
    while current.value is None:
        goes_left = x[current.feature_index] <= current.threshold
        current = current.left if goes_left else current.right
    return current.value

def predict(X, tree):
    """Predict a label for every row of ``X`` using the fitted ``tree``.

    Each row is classified independently with ``predict_one``; the results
    are returned as a NumPy array in row order.
    """
    labels = []
    for sample in X:
        labels.append(predict_one(sample, tree))
    return np.array(labels)

# Shuffle data manually without sklearn
def manual_shuffle(X, y, random_state=42):
    """Shuffle ``X`` and ``y`` with the same random permutation.

    The global NumPy RNG is re-seeded with ``random_state`` first, so the
    permutation is repeatable for a given seed (and the global RNG state is
    changed as a side effect).

    Returns:
        ``(X_shuffled, y_shuffled)`` with rows still aligned to each other.
    """
    np.random.seed(random_state)
    n_rows = len(X)
    order = np.random.permutation(n_rows)
    shuffled_X = X[order]
    shuffled_y = y[order]
    return shuffled_X, shuffled_y

# Shuffle the training set with the NumPy-only helper (seed 42), then keep just
# the first 100 samples so the pure-Python tree builder finishes quickly.
X_shuffled, y_shuffled = manual_shuffle(X_train, y_train, random_state=42)
X_small = X_shuffled[:100]  # Reduced from 500 to 100 for faster execution
y_small = y_shuffled[:100]

# Fit the tree on the 100-sample subset, overriding build_tree's defaults
# (max_depth=6, min_samples_split=25) with a deeper limit and a smaller
# split minimum.
tree = build_tree(X_small, y_small, max_depth=8, min_samples_split=15)
# Predict on the same samples the tree was fitted to, i.e. training predictions.
preds = predict(X_small, tree)

# Accuracy = fraction of predictions that match the true labels.
accuracy = np.sum(preds == y_small) / len(y_small)
print("Training Accuracy:", accuracy * 100 , "%")

# Evaluate on the full held-out test set
test_preds = predict(X_test, tree)
test_acc = np.sum(test_preds == y_test) / len(y_test)
print("Test Accuracy:", test_acc * 100, "%")

Training Accuracy: 91.0 %
Test Accuracy: 64.90384615384616 %


In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the tree's predictions on the 100-sample training subset
# (y_small and preds are defined in the previous cell, which must run first).
# NOTE(review): scikit-learn documents rows as true labels and columns as
# predicted labels — confirm against the installed version.
print("Confusion Matrix on Training Data:")
print(confusion_matrix(y_small, preds))
