In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import numpy as np
import cv2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Path to dataset in Google Drive
data_path = "/content/drive/MyDrive/chest_xray"

# Categories (Classes)
categories = ["NORMAL", "PNEUMONIA"]

# Lists to store data
X, y = [], []

# Read images and preprocess
for category in categories:
    folder_path = os.path.join(data_path, "train", category)  # Change 'train' to 'test' for testing data
    label = categories.index(category)  # Convert labels to 0 (Normal) & 1 (Pneumonia)

    for file in os.listdir(folder_path):
        img_path = os.path.join(folder_path, file)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  # Convert to grayscale
        img = cv2.resize(img, (128, 128))  # Resize to 128x128
        X.append(img)
        y.append(label)

# Convert to numpy arrays
X = np.array(X) / 255.0  # Normalize pixel values
y = np.array(y)

# Flatten images (for ML models)
X = X.reshape(len(X), -1)

# Split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ***`Logistic Regression`***

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

log_reg = LogisticRegression(max_iter=5000)
log_reg.fit(X_train, y_train)

# Predictions
y_pred = log_reg.predict(X_test)

# Evaluation
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Logistic Regression Accuracy: 0.9588122605363985
              precision    recall  f1-score   support

           0       0.94      0.91      0.92       287
           1       0.97      0.98      0.97       757

    accuracy                           0.96      1044
   macro avg       0.95      0.94      0.95      1044
weighted avg       0.96      0.96      0.96      1044



# ***`🌳 Decision Tree Classifier`***

In [6]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=5)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))


Decision Tree Accuracy: 0.8831417624521073
              precision    recall  f1-score   support

           0       0.82      0.74      0.78       287
           1       0.90      0.94      0.92       757

    accuracy                           0.88      1044
   macro avg       0.86      0.84      0.85      1044
weighted avg       0.88      0.88      0.88      1044



# ***`🌲 Random Forest Classifier`***

In [7]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.9521072796934866
              precision    recall  f1-score   support

           0       0.96      0.86      0.91       287
           1       0.95      0.99      0.97       757

    accuracy                           0.95      1044
   macro avg       0.96      0.92      0.94      1044
weighted avg       0.95      0.95      0.95      1044



# ***`🔥 XGBoost Classifier`***

In [8]:
from xgboost import XGBClassifier

xgb = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.9597701149425287
              precision    recall  f1-score   support

           0       0.96      0.89      0.92       287
           1       0.96      0.99      0.97       757

    accuracy                           0.96      1044
   macro avg       0.96      0.94      0.95      1044
weighted avg       0.96      0.96      0.96      1044



# ***`📌 Hyperparameter Tuning 🎯 Hyperparameter Tuning for Random Forest`***

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [50, 100],  # Reduce values
    'max_depth': [5, 10],       # Reduce depth
    'min_samples_split': [2, 5]  # Reduce options
}

# Reduce cv to 3
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)
# Best hyperparameters
print("Best Parameters:", grid.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}


# ***`📌Save the Best Model`***

In [10]:
# Best hyperparameters
print("Best Parameters:", grid.best_params_)

# Save best model
import joblib
joblib.dump(grid.best_estimator_, 'optimized_rf_model.pkl')

# Load & use model
loaded_model = joblib.load('optimized_rf_model.pkl')
y_pred = loaded_model.predict(X_test)

Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}


# ***`📌Load the Model & Predict on New Image`***

In [12]:
model = joblib.load("optimized_rf_model.pkl")

# Load a new image
new_image_path = "/content/drive/MyDrive/chest_xray/test/NORMAL/IM-0001-0001.jpeg"

# Preprocess the image
img = cv2.imread(new_image_path, cv2.IMREAD_GRAYSCALE)
img = cv2.resize(img, (128, 128)) / 255.0
img = img.flatten().reshape(1, -1)  # Reshape for model

# Predict
prediction = model.predict(img)
print("Prediction:", "Pneumonia" if prediction[0] == 1 else "Normal")


Prediction: Normal
