<a href="https://colab.research.google.com/github/yashkapur0403/Lung-X-Ray-Pnuemonia/blob/main/XGBoost_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:

from google.colab import drive
import zipfile, os, cv2
import numpy as np

# Mount Google Drive; Colab asks the user to grant permission.
drive.mount('/content/drive')

# Where the dataset archive lives on Drive, and where it is unpacked locally.
zip_path = "/content/drive/My Drive/Colab Notebooks/chest_xray.zip"
extract_path = "/content/chest_xray"

# Unpack every member of the archive into the local working directory.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

# STEP 2: Load images and prepare data
def load_images_from_folder(folder_path, label, image_size=(64, 64)):
    """Load every decodable image in a folder as a flattened feature vector.

    Each directory entry is read in grayscale with OpenCV, resized to
    ``image_size``, flattened to 1-D and divided by 255.0 (which maps 8-bit
    pixel values into [0, 1]). Entries that ``cv2.imread`` cannot decode
    (it returns ``None``) are silently skipped.

    Args:
        folder_path: Directory whose entries are tried as images.
        label: Value paired with every successfully loaded image.
        image_size: Target size handed to ``cv2.resize``. Defaults to
            ``(64, 64)``, the size that used to be hard-coded.

    Returns:
        A list of ``(flat_pixels, label)`` tuples, ordered by file name.
    """
    data = []
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and everything trained on it) reproducible across runs.
    for filename in sorted(os.listdir(folder_path)):
        path = os.path.join(folder_path, filename)
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # Not an image OpenCV can decode (or unreadable) - skip it.
            continue
        img = cv2.resize(img, image_size)
        data.append((img.flatten() / 255.0, label))
    return data

# --- Training split: NORMAL is labelled 0, PNEUMONIA is labelled 1 ---
normal_data = load_images_from_folder("/content/chest_xray/chest_xray/train/NORMAL", 0)
pneumonia_data = load_images_from_folder("/content/chest_xray/chest_xray/train/PNEUMONIA", 1)
all_data = normal_data + pneumonia_data

# Separate the (features, label) pairs into a feature matrix and a label vector.
X_train = np.array([sample[0] for sample in all_data])
y_train = np.array([sample[1] for sample in all_data])

# --- Test split: same labelling convention ---
normal_test = load_images_from_folder("/content/chest_xray/chest_xray/test/NORMAL", 0)
pneumonia_test = load_images_from_folder("/content/chest_xray/chest_xray/test/PNEUMONIA", 1)
all_test = normal_test + pneumonia_test

X_test = np.array([sample[0] for sample in all_test])
y_test = np.array([sample[1] for sample in all_test])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
import numpy as np

def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive formula evaluates ``exp(-x)``, which overflows for large
    negative ``x``. Here only ``exp(-|x|)`` is evaluated, whose argument is
    never positive, so it cannot overflow; the two algebraically equivalent
    forms below are then selected by the sign of ``x``.

    Args:
        x: Scalar or array-like of real values.

    Returns:
        The element-wise sigmoid: a NumPy scalar for scalar input, otherwise
        an array with the same shape as ``x``.
    """
    z = np.asarray(x)
    decay = np.exp(-np.abs(z))  # always in (0, 1], never overflows
    # x >= 0: 1 / (1 + e^-x)        x < 0: e^x / (1 + e^x)
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d array to a scalar and leaves n-d arrays as is.
    return result[()]

class XGBoostFullTree:
    """Small XGBoost-style gradient-boosted tree classifier for 0/1 labels.

    Every boosting round fits one regression tree to the first- and
    second-order gradients of the logistic loss. A tree is stored either as a
    bare number (a leaf weight) or as a dict with the keys ``"feat"``,
    ``"thresh"``, ``"left"`` and ``"right"`` (an internal node whose children
    are themselves trees).

    Args:
        n_estimators: Number of boosting rounds (trees) built by ``fit``.
        learning_rate: Shrinkage applied to each tree's output.
        reg_lambda: L2 term added to the Hessian sum in leaf weights and gains.
        max_depth: Maximum number of split levels per tree.
    """

    def __init__(self, n_estimators=50, learning_rate=0.1, reg_lambda=0.0, max_depth=2):
        self.n_estimators = n_estimators
        self.lr = learning_rate
        self.reg_lambda = reg_lambda
        self.max_depth = max_depth
        self.models = []
        # Raw score (log-odds) used before any tree is added. 0.0 means an
        # unfitted model predicts a probability of 0.5.
        self.base_score = 0.0

    def fit(self, X, y):
        """Train the ensemble on feature matrix ``X`` and 0/1 labels ``y``.

        Any previously fitted trees are discarded. The log-odds of the positive
        rate is stored as ``base_score`` so inference starts from the same
        prior that training did.
        """
        X = np.asarray(X)
        y = np.asarray(y, dtype=float)
        n_samples = X.shape[0]

        # Clip so that an all-0 or all-1 label vector still has finite log-odds.
        pos_ratio = np.clip(np.mean(y), 1e-6, 1 - 1e-6)
        self.base_score = float(np.log(pos_ratio / (1 - pos_ratio)))
        self.models = []  # refitting must not stack onto an earlier ensemble
        F = np.full(n_samples, self.base_score)

        for i in range(self.n_estimators):
            print(f"Training tree {i+1}/{self.n_estimators}")
            prob = sigmoid(F)
            g = prob - y            # gradient of the log-loss w.r.t. F
            h = prob * (1 - prob)   # Hessian of the log-loss w.r.t. F

            tree = self._build_tree(X, g, h, self.max_depth)
            self.models.append(tree)

            F += self.lr * self._tree_outputs(tree, X)

    def _leaf_value(self, G, H):
        """Leaf weight ``-G / (H + lambda)``; 0.0 if the denominator is zero."""
        denom = H + self.reg_lambda
        # H can hit exactly 0 once probabilities saturate and lambda is 0;
        # a zero step avoids poisoning the scores with NaN/inf.
        return -G / denom if denom != 0 else 0.0

    def _score(self, G, H):
        """Structure score ``G^2 / (H + lambda)``; 0.0 if the denominator is zero."""
        denom = H + self.reg_lambda
        return (G ** 2) / denom if denom != 0 else 0.0

    def _build_tree(self, X, g, h, depth):
        """Recursively grow a tree on samples ``X`` with gradients ``g``, ``h``.

        Returns a leaf weight when ``depth`` is exhausted, at most one sample
        remains, all rows are identical, or no candidate split separates the
        samples; otherwise returns a node dict.
        """
        G, H = np.sum(g), np.sum(h)
        if depth == 0 or len(X) <= 1 or np.all(X == X[0]):
            return self._leaf_value(G, H)

        parent_score = self._score(G, H)  # same for every candidate split
        best_feat, best_thresh, best_gain = None, None, -np.inf
        for feat in range(X.shape[1]):
            column = X[:, feat]
            lo, hi = column.min(), column.max()
            if lo == hi:
                # A constant feature cannot separate the samples.
                continue
            # Eight interior, evenly spaced candidate thresholds.
            for t in np.linspace(lo, hi, 10)[1:-1]:
                left = column <= t
                right = ~left
                if left.sum() == 0 or right.sum() == 0:
                    continue

                G_L, H_L = np.sum(g[left]), np.sum(h[left])
                G_R, H_R = np.sum(g[right]), np.sum(h[right])

                gain = 0.5 * (
                    self._score(G_L, H_L) +
                    self._score(G_R, H_R) -
                    parent_score
                )

                if gain > best_gain:
                    best_feat, best_thresh, best_gain = feat, t, gain

        if best_feat is None:
            return self._leaf_value(G, H)

        left = X[:, best_feat] <= best_thresh
        right = ~left

        return {
            "feat": best_feat,
            "thresh": best_thresh,
            "left": self._build_tree(X[left], g[left], h[left], depth - 1),
            "right": self._build_tree(X[right], g[right], h[right], depth - 1)
        }

    def _apply_tree(self, tree, x):
        """Return the leaf weight that a single sample ``x`` reaches in ``tree``."""
        node = tree
        while isinstance(node, dict):
            if x[node["feat"]] <= node["thresh"]:
                node = node["left"]
            else:
                node = node["right"]
        return node

    def _tree_outputs(self, tree, X):
        """Evaluate ``tree`` on every row of ``X`` and return a 1-D float array."""
        return np.array([self._apply_tree(tree, x) for x in X], dtype=float)

    def _raw_scores(self, X):
        """Log-odds for each row: ``base_score`` plus the shrunken tree outputs."""
        F = np.full(X.shape[0], self.base_score, dtype=float)
        for tree in self.models:
            F += self.lr * self._tree_outputs(tree, X)
        return F

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for each row of ``X``."""
        return sigmoid(self._raw_scores(np.asarray(X)))

    def predict(self, X):
        """Return hard 0/1 predictions using a 0.5 probability cut-off."""
        return (self.predict_proba(X) >= 0.5).astype(int)


In [14]:
from sklearn.metrics import accuracy_score

# Fit the boosted-tree classifier on the flattened training images.
model = XGBoostFullTree(n_estimators=50, learning_rate=0.1, max_depth=2)
model.fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred_test = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", test_accuracy)

# Accuracy on the data the model was trained on.
y_pred_train = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
print("Train Accuracy:", train_accuracy)


Training tree 1/50
Training tree 2/50
Training tree 3/50
Training tree 4/50
Training tree 5/50
Training tree 6/50
Training tree 7/50
Training tree 8/50
Training tree 9/50
Training tree 10/50
Training tree 11/50
Training tree 12/50
Training tree 13/50
Training tree 14/50
Training tree 15/50
Training tree 16/50
Training tree 17/50
Training tree 18/50
Training tree 19/50
Training tree 20/50
Training tree 21/50
Training tree 22/50
Training tree 23/50
Training tree 24/50
Training tree 25/50
Training tree 26/50
Training tree 27/50
Training tree 28/50
Training tree 29/50
Training tree 30/50
Training tree 31/50
Training tree 32/50
Training tree 33/50
Training tree 34/50
Training tree 35/50
Training tree 36/50
Training tree 37/50
Training tree 38/50
Training tree 39/50
Training tree 40/50
Training tree 41/50
Training tree 42/50
Training tree 43/50
Training tree 44/50
Training tree 45/50
Training tree 46/50
Training tree 47/50
Training tree 48/50
Training tree 49/50
Training tree 50/50
Test Accu