# Applying SMOTE


In [1]:
# import libraries
import os
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from imblearn.over_sampling import SMOTE

In [None]:
from google.colab import drive
drive.mount('/content/drive')

from google.colab import userdata
import os

# Load Kaggle API credentials
os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USER')
os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')

# Download and unzip dataset
!kaggle datasets download -d jtiptj/chest-xray-pneumoniacovid19tuberculosis
!unzip -q chest-xray-pneumoniacovid19tuberculosis.zip -d /content/

# Define dataset directory
DATASET_DIR = "/content/archive/train"


In [None]:
# Set parameters
# Using the training set from the dataset structure: archive/train/
# Intended (height, width) of the resized input images. Keep this equal to the
# spatial part of the VGG16 input_shape, (224, 224, 3), and to the size that
# load_dataset_rgb resizes to.
image_size = (224, 224)

In [3]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array

def load_dataset_rgb(dataset_dir, image_size=(224, 224)):
    """Load every image found under ``dataset_dir/<class>/`` as a resized array.

    Each immediate sub-directory of ``dataset_dir`` is treated as one class.
    A sample's label is the index of its class name in the alphabetically
    sorted list of class names.

    Args:
        dataset_dir: Root folder containing one sub-directory per class.
        image_size: Size handed to ``load_img`` as ``target_size``.

    Returns:
        A tuple ``(images, labels, class_names)``:
        ``images`` is a NumPy array stacking all successfully loaded images,
        ``labels`` is a NumPy array of integer class indices aligned with
        ``images``, and ``class_names`` is the sorted list of class names, so
        ``class_names[label]`` recovers the name.

    Files that fail to load are reported with ``print`` and skipped, so a
    single unreadable file does not abort the whole load.
    """
    images = []
    labels = []
    class_names = sorted(
        entry for entry in os.listdir(dataset_dir)
        if os.path.isdir(os.path.join(dataset_dir, entry))
    )

    for label, cls in enumerate(class_names):
        cls_dir = os.path.join(dataset_dir, cls)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # sample order (and anything seeded downstream) reproducible.
        for img_file in sorted(os.listdir(cls_dir)):
            img_path = os.path.join(cls_dir, img_file)
            if not os.path.isfile(img_path):
                # Nested folders are not images; skip them quietly.
                continue
            try:
                img = img_to_array(load_img(img_path, target_size=image_size))  # RGB by default
            except Exception as e:
                print(f"Error loading {img_path}: {e}")
                continue
            images.append(img)
            labels.append(label)

    return np.array(images), np.array(labels), class_names


In [4]:
# Load the training dataset
# Pass the configured size explicitly so the parameter cell actually controls
# the resize instead of relying on the loader's default happening to match.
X, y, class_names = load_dataset_rgb(DATASET_DIR, image_size=image_size)
# An empty load would only surface later as an obscure shape error; stop here.
assert len(X) > 0, f"No images were loaded from {DATASET_DIR}"
print("Loaded training images shape:", X.shape)
print("Loaded labels shape:", y.shape)
print("Classes found:", class_names)

Loaded training images shape: (6326, 224, 224, 3)
Loaded labels shape: (6326,)
Classes found: ['COVID19', 'NORMAL', 'PNEUMONIA', 'TURBERCULOSIS']


In [5]:
# Preprocess images for VGG16
# preprocess_input writes its result over a floating-point NumPy input in
# place. Calling it on X directly would overwrite the raw pixel values, and
# re-running this cell would then subtract the channel means a second time.
# Working on a copy keeps X intact and makes the cell safe to re-run.
# NOTE: the copy temporarily doubles the memory held by the images; if RAM is
# tight and the raw pixels are no longer needed, `del X` after this cell.
X_preprocessed = preprocess_input(X.copy())

In [7]:
# Feature extractor: the VGG16 convolutional base with ImageNet weights and no
# classification head, plus a pooling layer used afterwards to collapse each
# feature map into a single vector per image.
base_model = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)
global_avg_pool = tf.keras.layers.GlobalAveragePooling2D()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 0us/step


In [8]:
# Extract deep features from images: run the convolutional base over all
# preprocessed images, then average-pool the resulting maps into one vector
# per image and convert the result to a NumPy array.
features = global_avg_pool(base_model.predict(X_preprocessed)).numpy()
print(f"Extracted features shape: {features.shape}")

[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2888s[0m 15s/step
Extracted features shape: (6326, 512)


In [9]:
# Balance the dataset by running SMOTE on the extracted feature vectors
# (not on raw pixels); the fixed random_state keeps the resampling repeatable.
smote = SMOTE(random_state=42)
features_bal, y_bal = smote.fit_resample(X=features, y=y)
print(f"After SMOTE - features shape: {features_bal.shape}")
print(f"After SMOTE - labels shape: {y_bal.shape}")

After SMOTE - features shape: (15500, 512)
After SMOTE - labels shape: (15500,)


In [10]:
# Persist the balanced feature matrix and its labels as .npy files in the
# current working directory.
np.save(file='features_balanced.npy', arr=features_bal)
np.save(file='labels_balanced.npy', arr=y_bal)
print("Balanced dataset saved as 'features_balanced.npy' and 'labels_balanced.npy'.")

Balanced dataset saved as 'features_balanced.npy' and 'labels_balanced.npy'.
