In [None]:
# import system libs
import os
import time
import shutil
import pathlib
import itertools
from PIL import Image

# import data handling tools
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# import Deep learning Libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam, Adamax
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation, Dropout, BatchNormalization
from tensorflow.keras import regularizers

# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")

print ('modules loaded')

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# ----------------- 1. Dataset directory -----------------
data_dir = '/kaggle/input/luekemia-detection-blood-smear-images/augumented_dataset'

# Class-folder name -> integer target used for the binary (sigmoid) output.
class_to_index = {'healthy': 0, 'cancerous': 1}

# ----------------- 2. Generate file paths and labels -----------------
filepaths = []
labels = []

# Walk each class folder. Directory listings are sorted because os.listdir
# returns entries in arbitrary order; without a fixed starting order the
# seeded shuffle/split below would not be reproducible across runs/machines.
for folder_name in sorted(os.listdir(data_dir)):
    folder_path = os.path.join(data_dir, folder_name)

    # Skip stray files sitting next to the class folders
    if not os.path.isdir(folder_path):
        continue

    for file_name in sorted(os.listdir(folder_path)):
        filepaths.append(os.path.join(folder_path, file_name))
        labels.append(folder_name)  # class name = folder name

# ----------------- 3. Create DataFrame -----------------
df = pd.DataFrame({
    'filepaths': filepaths,
    'labels': labels
})

if df.empty:
    raise ValueError(f"No files found under class folders of {data_dir!r}")

# ----------------- 4. Encode labels as integers -----------------
# healthy = 0, cancerous = 1
# Series.map turns any folder name missing from the mapping into NaN, which
# would silently poison the targets; fail loudly instead.
unknown_classes = sorted(set(df['labels']) - set(class_to_index))
if unknown_classes:
    raise ValueError(
        f"Unexpected class folders {unknown_classes}; expected {sorted(class_to_index)}"
    )
df['labels'] = df['labels'].map(class_to_index)

# ----------------- 5. Shuffle the dataframe -----------------
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# ----------------- 6. Split into training and validation -----------------
# Stratified 80/20 split so both parts keep the overall class ratio.
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['labels'], random_state=42)

# ----------------- 7. Print summary -----------------
print("Total images:", len(df))
print("Training set:", len(train_df))
print("Validation set:", len(val_df))
print("Sample data:\n", train_df.head())


In [None]:
# Three-way split of `df` into train / validation / test.
# NOTE: this reassigns `train_df`, replacing the one produced by the earlier
# 80/20 split; the generators below consume train_df / valid_df / test_df.
# Both splits are stratified on the label column (as the earlier split is) so
# every subset keeps the overall class ratio.

# train dataframe: 80% of all rows
train_df, dummy_df = train_test_split(df,  train_size= 0.8, shuffle= True, random_state= 123,
                                      stratify= df['labels'])

# valid and test dataframe: the remaining 20% is divided 60/40,
# i.e. about 12% validation and 8% test of the full dataset
valid_df, test_df = train_test_split(dummy_df,  train_size= 0.6, shuffle= True, random_state= 123,
                                     stratify= dummy_df['labels'])

In [None]:
# Target image size (images are resized to this on load) and batching
batch_size = 16
img_size = (224, 224)
channels = 3
img_shape = (img_size[0], img_size[1], channels)

# Plain generators: no rescaling and no augmentation is configured here.
# NOTE(review): the model built further down uses EfficientNetB3, which per the
# Keras docs expects raw [0, 255] pixel inputs -- confirm before adding rescale.
tr_gen = ImageDataGenerator()
ts_gen = ImageDataGenerator()

# Arguments shared by all three generators. class_mode='raw' passes the integer
# values of the 'labels' column through unchanged as targets.
flow_kwargs = dict(x_col= 'filepaths', y_col= 'labels', target_size= img_size,
                   class_mode= 'raw', color_mode= 'rgb', batch_size= batch_size)

# Only the training data is shuffled.
train_gen = tr_gen.flow_from_dataframe( train_df, shuffle= True, **flow_kwargs)

# Validation and test are read in dataframe order so that batches (and any
# predictions made from these generators) line up with valid_df / test_df rows.
valid_gen = ts_gen.flow_from_dataframe( valid_df, shuffle= False, **flow_kwargs)

test_gen = ts_gen.flow_from_dataframe( test_df, shuffle= False, **flow_kwargs)

In [11]:
from tensorflow.keras import Sequential, layers, regularizers
from tensorflow.keras.optimizers import Adamax
import tensorflow as tf

# ------------------ Model Setup ------------------
# Input geometry: 224x224 RGB
img_size = (224, 224)
channels = 3
img_shape = (*img_size, channels)

# ImageNet-pretrained EfficientNetB3 without its classifier; global max
# pooling collapses the final feature map into one vector per image.
base_model = tf.keras.applications.EfficientNetB3(
    weights="imagenet",
    include_top=False,
    pooling='max',
    input_shape=img_shape,
)
# Stage 1: keep the whole backbone frozen, only the head is trained
base_model.trainable = False

# Classification head stacked on the pooled backbone features
head_layers = [
    layers.BatchNormalization(),
    layers.Dense(
        256,
        activation='relu',
        kernel_regularizer=regularizers.l2(1e-4),
        activity_regularizer=regularizers.l1(1e-5),
        bias_regularizer=regularizers.l1(1e-5),
    ),
    layers.Dropout(0.45),
    # Single sigmoid unit: binary classification
    layers.Dense(1, activation='sigmoid'),
]
model = Sequential([base_model, *head_layers])

# Compile for binary classification
head_optimizer = Adamax(learning_rate=1e-4)
model.compile(optimizer=head_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


I0000 00:00:1759318308.451389      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb3_notop.h5
[1m43941136/43941136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [None]:
# Total number of training epochs for this stage
epochs = 10

# Train on train_gen and evaluate on valid_gen after every epoch.
# The per-epoch metrics are kept in `history` for the plot further down.
history = model.fit(
    train_gen,
    validation_data=valid_gen,
    validation_steps=None,
    epochs=epochs,
    shuffle=False,
    verbose=1,
)

Epoch 1/10


I0000 00:00:1759318363.026573     114 service.cc:148] XLA service 0x7ee7e4001e60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1759318363.027461     114 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1759318366.117681     114 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m  3/500[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m32s[0m 65ms/step - accuracy: 0.4826 - loss: 0.9329   

I0000 00:00:1759318380.959886     114 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 126ms/step - accuracy: 0.6761 - loss: 0.7272 - val_accuracy: 0.7950 - val_loss: 0.4868
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 69ms/step - accuracy: 0.7662 - loss: 0.5838 - val_accuracy: 0.8142 - val_loss: 0.4636
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 68ms/step - accuracy: 0.7706 - loss: 0.5597 - val_accuracy: 0.8183 - val_loss: 0.4494
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 68ms/step - accuracy: 0.7984 - loss: 0.5063 - val_accuracy: 0.8275 - val_loss: 0.4433
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 69ms/step - accuracy: 0.8004 - loss: 0.5070 - val_accuracy: 0.8317 - val_loss: 0.4345
Epoch 6/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 77ms/step - accuracy: 0.8055 - loss: 0.4930 - val_accuracy: 0.8325 - val_loss: 0.4297
Epoch 7/10
[1m500/500[0

In [None]:
# Fine-tuning setup: make the top 100 backbone layers trainable.
#
# The backbone was frozen as a whole earlier (base_model.trainable = False).
# A Keras layer whose own `trainable` flag is False exposes no trainable
# weights regardless of its sub-layers' flags, so flipping only the inner
# layers would leave the entire backbone frozen. Re-enable the container
# first (this also marks every sub-layer trainable), then re-freeze the rest.
base_model.trainable = True

# Everything below the top 100 layers stays frozen
for layer in base_model.layers[:-100]:
    layer.trainable = False

# Inside the unfrozen part, keep BatchNormalization layers frozen so they keep
# running in inference mode with their pretrained statistics, as the Keras
# transfer-learning guide recommends when fine-tuning.
for layer in base_model.layers[-100:]:
    if isinstance(layer, tf.keras.layers.BatchNormalization):
        layer.trainable = False

# Recompile so the new trainable set takes effect, with a lower learning rate
# to avoid wrecking the pretrained weights.
# NOTE: compiling does not train anything -- model.fit must be called again
# for the fine-tuning stage to actually run.
model.compile(
    optimizer=Adamax(learning_rate=1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy']
)


In [None]:
# Training vs. validation accuracy per epoch, read from the `history` object
fig, ax = plt.subplots(figsize=(8, 6))
for metric_key, curve_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    ax.plot(history.history[metric_key], label=curve_label)
ax.set_title('Accuracy vs Epochs')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
ax.grid(True)
plt.show()
