<a href="https://colab.research.google.com/github/yashkapur0403/Lung-X-Ray-Pnuemonia/blob/main/Gradient_boosting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

from google.colab import drive
import zipfile, os, cv2
import numpy as np

# STEP 1: Mount Google Drive (Colab asks for permission) and unpack the dataset.
drive.mount('/content/drive')

# Archive location on the mounted drive, and the local folder it is unpacked into.
zip_path = "/content/drive/My Drive/Colab Notebooks/chest_xray.zip"
extract_path = "/content/chest_xray"

# ZipFile opens in read mode by default; the context manager closes it afterwards.
with zipfile.ZipFile(zip_path) as zip_ref:
    zip_ref.extractall(path=extract_path)

# STEP 2: Load images and prepare data
def load_images_from_folder(folder_path, label, size=(64, 64)):
    """Load every readable image in a folder as a flattened, scaled vector.

    Each entry of ``folder_path`` is read in grayscale with ``cv2.imread``,
    resized with ``cv2.resize``, flattened to 1-D and divided by 255.0.
    Entries for which ``cv2.imread`` returns ``None`` are skipped.

    Args:
        folder_path: Directory whose entries are read.
        label: Value paired with every image loaded from this folder.
        size: Target size forwarded unchanged to ``cv2.resize``. Defaults to
            ``(64, 64)``, giving vectors of length 4096.

    Returns:
        A list of ``(flat_pixels, label)`` tuples, ordered by file name.
    """
    data = []
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and therefore everything computed from it) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        path = os.path.join(folder_path, filename)
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # Not a readable image: skip it.
            continue
        img = cv2.resize(img, size)
        # Divide by 255.0 to scale the pixel values.
        data.append((img.flatten() / 255.0, label))
    return data

# Read both classes of each split from disk: label 0 = NORMAL, label 1 = PNEUMONIA.
normal_data = load_images_from_folder("/content/chest_xray/chest_xray/train/NORMAL", 0)
pneumonia_data = load_images_from_folder("/content/chest_xray/chest_xray/train/PNEUMONIA", 1)
normal_test = load_images_from_folder("/content/chest_xray/chest_xray/test/NORMAL", 0)
pneumonia_test = load_images_from_folder("/content/chest_xray/chest_xray/test/PNEUMONIA", 1)

# Concatenate the per-class lists (NORMAL samples first, then PNEUMONIA).
all_data = normal_data + pneumonia_data
all_test = normal_test + pneumonia_test

# Split the (pixels, label) pairs into a feature matrix and a label vector.
X_train = np.array([sample[0] for sample in all_data])
y_train = np.array([sample[1] for sample in all_data])

X_test = np.array([sample[0] for sample in all_test])
y_test = np.array([sample[1] for sample in all_test])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Sigmoid function
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive formula evaluates ``exp(-x)``, which overflows (emitting a
    RuntimeWarning) for large negative ``x``. Here only ``exp(-|x|)`` is
    evaluated, whose argument is never positive, so it cannot overflow.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        Values in [0, 1] with the same shape as ``x``; a scalar input gives a
        scalar result.
    """
    x = np.asarray(x)
    z = np.exp(-np.abs(x))
    # x >= 0:  1 / (1 + e^-x)          with e^-x == z
    # x <  0:  e^x / (1 + e^x)         with e^x  == z
    result = np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # arrays with one or more dimensions unchanged.
    return result[()]

class GBMClassifier:
    """Binary gradient-boosting classifier built from depth-1 trees (stumps).

    Every boosting round fits one stump to the residuals ``y - p`` of the
    current probabilities and sets its two leaf values with one
    Newton-Raphson step. Labels are expected to be 0/1.
    """

    def __init__(self, n_estimators=50, learning_rate=0.1):
        """Store the hyper-parameters; no training happens here.

        Args:
            n_estimators: Number of boosting rounds (one stump per round).
            learning_rate: Shrinkage factor applied to every leaf value.
        """
        self.n_estimators = n_estimators
        self.lr = learning_rate
        # Log-odds prior learned by fit(); every prediction starts from it.
        self.init_score = 0.0
        # Fitted stumps as (feature, threshold, gamma_left, gamma_right).
        self.models = []

    def fit(self, X, y):
        """Fit the ensemble on training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of 0/1 labels, one per row of ``X``.

        Raises:
            ValueError: If ``X`` contains no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y, dtype=float)
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit GBMClassifier on an empty dataset")

        # Start from scratch so that calling fit() twice does not stack ensembles.
        self.models = []

        # Initial prediction: log-odds of the positive class. The class
        # frequency is clipped to [1e-6, 1 - 1e-6] so the log stays finite
        # when only one class is present.
        p = np.clip(np.mean(y), 1e-6, 1 - 1e-6)
        self.init_score = float(np.log(p / (1 - p)))
        F = np.full(n_samples, self.init_score)

        # Candidate thresholds depend only on X, so build them once instead
        # of once per boosting round: the 8 interior points of a 10-point
        # grid between each feature's min and max.
        thresholds = [
            np.linspace(np.min(X[:, f]), np.max(X[:, f]), 10)[1:-1]
            for f in range(n_features)
        ]

        for _ in range(self.n_estimators):
            a = sigmoid(F)        # current predicted probabilities
            residuals = y - a     # what the next stump has to explain

            best_feat, best_thresh = self._best_stump(X, residuals, thresholds)
            if best_feat is None:
                # No threshold puts samples on both sides (e.g. every feature
                # is constant), so further rounds cannot add a stump.
                break

            left_idx = X[:, best_feat] <= best_thresh
            right_idx = ~left_idx

            # Newton-Raphson leaf values: sum(residuals) / sum(p * (1 - p)).
            # The 1e-8 keeps the division defined when the denominator is 0.
            curvature = a * (1 - a)
            gamma_left = np.sum(residuals[left_idx]) / (np.sum(curvature[left_idx]) + 1e-8)
            gamma_right = np.sum(residuals[right_idx]) / (np.sum(curvature[right_idx]) + 1e-8)

            # Move the running scores by the shrunken leaf values.
            F[left_idx] += self.lr * gamma_left
            F[right_idx] += self.lr * gamma_right

            # Remember the stump so predict_proba can replay it.
            self.models.append((best_feat, best_thresh, gamma_left, gamma_right))

    def _best_stump(self, X, residuals, thresholds):
        """Pick the split maximising ``|sum(left residuals)| + |sum(right residuals)|``.

        Args:
            X: Feature matrix of shape (n_samples, n_features).
            residuals: Current residuals, one per sample.
            thresholds: One array of candidate thresholds per feature.

        Returns:
            ``(feature, threshold)`` of the best split, or ``(None, None)``
            when no candidate leaves samples on both sides.
        """
        best_feat, best_thresh, best_gain = None, None, -np.inf
        for f, candidates in enumerate(thresholds):
            column = X[:, f]
            for t in candidates:
                go_left = column <= t
                n_left = np.count_nonzero(go_left)
                if n_left == 0 or n_left == go_left.size:
                    continue  # one side would be empty
                gain = np.abs(residuals[go_left].sum()) + np.abs(residuals[~go_left].sum())
                if gain > best_gain:
                    best_feat, best_thresh, best_gain = f, t, gain
        return best_feat, best_thresh

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for every row of ``X``.

        Scoring starts from the same log-odds prior that ``fit`` trained
        from, then adds every stump's shrunken leaf value.
        """
        X = np.asarray(X)
        F = np.full(X.shape[0], self.init_score, dtype=float)
        for feat, thresh, gamma_left, gamma_right in self.models:
            left = X[:, feat] <= thresh
            right = ~left
            F[left] += self.lr * gamma_left
            F[right] += self.lr * gamma_right
        return sigmoid(F)

    def predict(self, X):
        """Return hard 0/1 predictions: 1 where the probability is >= 0.5."""
        return (self.predict_proba(X) >= 0.5).astype(int)


In [None]:
# This cell only imports sklearn's accuracy_score to measure the accuracy of our model.

from sklearn.metrics import accuracy_score

# Train the from-scratch GBM on the flattened training images.
model123 = GBMClassifier(n_estimators=50, learning_rate=0.1)
model123.fit(X_train, y_train)

# Accuracy on the test set (acc) and on the data the model was trained on (acc1).
acc = accuracy_score(y_test, model123.predict(X_test))
acc1 = accuracy_score(y_train, model123.predict(X_train))

# ":.2f" prints two decimals. The previous spec ": " was only the
# space-sign option, which produced a stray leading space and full precision.
print(f"Training Accuracy: {acc1 * 100:.2f}%")
print(f"Test Accuracy: {acc * 100:.2f}%")


Training Accuracy:  87.68227168073676%
Test Accuracy:  79.96794871794873%


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline: scikit-learn's gradient boosting with its default hyper-parameters.
# random_state is fixed so that the reported accuracy is the same on every run.
sk_model = GradientBoostingClassifier(random_state=42)
sk_model.fit(X_train, y_train)
print("Sklearn Test Accuracy:", sk_model.score(X_test, y_test))


Sklearn Test Accuracy: 0.717948717948718
