# CNN, Convolutional Neural Network

CNN on the HEp2 Cell Dataset.  Default hyperparameters and GridSearch hyperparameter tuning.

The first set of code cells contains function definitions; the last set calls those functions.  Run the definition cells first, in order, and then run the function-call cells.

# Label Related Functions

Loading of the matlab file and displaying some information on it, mainly the distribution of the labels.

In [31]:
import scipy.io
import numpy as np

def load_labels(mat_file, num_labels=63445):
    """
    Read the 'labels' variable from a MATLAB .mat file as a flat array.

    Args:
        mat_file: Path of the .mat file to load.
        num_labels: Upper bound on how many labels (from the start) are kept.

    Returns:
        1-D array holding at most num_labels labels.
    """
    print("Loading labels...")
    mat_contents = scipy.io.loadmat(mat_file)
    # The stored variable may be 2-D (e.g. a column vector); flatten it first,
    # then keep only the leading num_labels entries.
    flat_labels = mat_contents['labels'].flatten()
    labels = flat_labels[:num_labels]
    print(f"Loaded {len(labels)} labels.\n")
    return labels

def print_label_distribution(labels):
    """
    Print how often each distinct label occurs, with its share of the total.

    Args:
        labels: Sequence/array of labels.

    Output is one header line followed by one line per distinct label, in the
    sorted order produced by np.unique.
    """
    values, occurrences = np.unique(labels, return_counts=True)
    n_labels = len(labels)
    print("Label Distribution:")
    for idx in range(len(values)):
        # Share of this label among all labels, expressed in percent.
        share = (occurrences[idx] / n_labels) * 100
        print(f"Label {values[idx]}: {occurrences[idx]} ({share:.2f}%)")


# Preprocessing Function

Each image is read as grayscale, resized to 64 x 64 pixels, and normalized to values between 0 and 1.

In [32]:
import cv2
import numpy as np
import os

def preprocess_images(image_dir, num_images, target_size=(64, 64)):
    """
    Load, resize and normalize the images named 1.png .. <num_images>.png.

    Each image is read as grayscale, resized to target_size and divided by
    255.0. Files that are missing or cannot be decoded are reported with a
    warning and skipped.

    NOTE: every skipped file makes the returned array shorter than num_images,
    so its rows no longer line up one-to-one with a label array loaded
    separately. Callers should compare lengths before pairing them.

    Args:
        image_dir: Directory containing the numbered .png files.
        num_images: Highest image number to load (numbering starts at 1).
        target_size: Size passed to cv2.resize, as (width, height).

    Returns:
        np.ndarray of the successfully loaded images (empty if none loaded).
    """
    print("Preprocessing images...")
    images = []
    for i in range(1, num_images + 1):
        img_path = os.path.join(image_dir, f"{i}.png")
        if not os.path.exists(img_path):
            print(f"Warning: Image {img_path} not found")
            continue

        image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            # cv2.imread does not raise on an unreadable/corrupt file; it
            # returns None, which would otherwise crash inside cv2.resize.
            print(f"Warning: Image {img_path} could not be read")
            continue

        image = cv2.resize(image, target_size)
        images.append(image / 255.0)

        if i % 5000 == 0:
            print(f"Processed images {i-4999}-{i}")

    print("Finished preprocessing images.\n")
    return np.array(images)


# Data Splitting, CNN Model Building, Training, Validation, Testing

Here we split the dataset, build the CNN model, train it, validate it, and test it.

In [41]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

def train_and_evaluate(X, y, sample_size=25000, learning_rate=0.001, dropout_rate=0.5, conv_filters=32,
                       epochs=10, batch_size=32):
    """
    Train and evaluate CNN model with configurable hyperparameters.

    The data is subsampled to sample_size examples, split 60/20/20 into
    training, validation and test sets, and a two-convolution-layer CNN is
    trained on it. Accuracy and a classification report are printed for the
    validation and test sets.

    Args:
        X: Input features as a NumPy array of shape (n_samples, height, width, channels)
        y: Target labels (any set of distinct values, e.g. 1..6)
        sample_size: Number of samples to use; if None or not smaller than
            len(X), the whole dataset is used
        learning_rate: Learning rate for Adam optimizer
        dropout_rate: Dropout rate for regularization
        conv_filters: Number of filters in first conv layer (second layer will be doubled)
        epochs: Number of training epochs
        batch_size: Mini-batch size used during training

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If X and y do not contain the same number of samples.
    """
    print("Starting training and evaluation...")

    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
        )

    # Map the raw labels onto 0..K-1 before one-hot encoding. Calling
    # to_categorical directly on labels such as 1..6 produces K+1 columns
    # (an always-empty class 0) and therefore a spurious extra output unit.
    classes, y_index = np.unique(np.asarray(y).ravel(), return_inverse=True)
    num_classes = len(classes)
    y_onehot = to_categorical(y_index, num_classes=num_classes)

    # Sample sample_size examples directly from the original dataset.
    # train_test_split rejects an integer train_size that is not smaller than
    # the dataset, so fall back to using everything in that case.
    if sample_size is None or sample_size >= len(X):
        X_sampled, y_sampled = X, y_onehot
    else:
        X_sampled, _, y_sampled, _ = train_test_split(
            X, y_onehot, train_size=sample_size, random_state=42
        )
    print(f"Sampled data shape: {X_sampled.shape}")

    # Split the sampled data into new training, validation, and testing datasets
    X_train, X_temp, y_train, y_temp = train_test_split(X_sampled, y_sampled, test_size=0.4, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    print(f"Sampled data split: {len(X_train)} training, {len(X_val)} validation, {len(X_test)} testing samples")

    # Build the CNN model with configurable hyperparameters
    model = Sequential([
        Conv2D(conv_filters, (3, 3), activation='relu', input_shape=X_sampled.shape[1:]),
        MaxPooling2D((2, 2)),
        Conv2D(conv_filters * 2, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(dropout_rate),
        Dense(num_classes, activation='softmax')
    ])

    # Compile the model with configurable learning rate
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Train the model
    print(f"Training CNN model with learning_rate={learning_rate}, dropout_rate={dropout_rate}, conv_filters={conv_filters}")
    model.fit(X_train, y_train, epochs=epochs, validation_data=(X_val, y_val), batch_size=batch_size)
    print("CNN model trained")

    # Evaluate the model
    print("Evaluating model...")
    val_loss, val_accuracy = model.evaluate(X_val, y_val)
    print(f"Validation Accuracy: {val_accuracy}")
    print("Validation Classification Report:")
    # Translate class indices back to the original label values so the
    # report rows carry the dataset's own labels.
    y_val_pred = classes[np.argmax(model.predict(X_val), axis=1)]
    y_val_true = classes[np.argmax(y_val, axis=1)]
    print(classification_report(y_val_true, y_val_pred))

    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"Test Accuracy: {test_accuracy}")
    print("Test Classification Report:")
    y_test_pred = classes[np.argmax(model.predict(X_test), axis=1)]
    y_test_true = classes[np.argmax(y_test, axis=1)]
    print(classification_report(y_test_true, y_test_pred))
    

# Function Calls

The following cells call the functions defined above.

## Label Related

In [34]:
# Local directory paths
image_dir = 'cells'  # Directory containing images
mat_file = 'labels.mat'  # Labels file
num_images = 63445

# Load labels and preprocess images
labels = load_labels(mat_file, num_labels=num_images)
print_label_distribution(labels)
images = preprocess_images(image_dir, num_images)

# preprocess_images skips files it cannot find, which would silently shift
# every later image against its label. Fail here rather than train on
# mismatched pairs.
assert len(images) == len(labels), (
    f"Loaded {len(images)} images but {len(labels)} labels; "
    "some image files were skipped, so images and labels no longer align"
)

Loading labels...
Loaded 63445 labels.

Label Distribution:
Label 1: 14367 (22.64%)
Label 2: 14655 (23.10%)
Label 3: 13257 (20.90%)
Label 4: 13737 (21.65%)
Label 5: 5086 (8.02%)
Label 6: 2343 (3.69%)
Preprocessing images...
Processed images 1-5000
Processed images 5001-10000
Processed images 10001-15000
Processed images 15001-20000
Processed images 20001-25000
Processed images 25001-30000
Processed images 30001-35000
Processed images 35001-40000
Processed images 40001-45000
Processed images 45001-50000
Processed images 50001-55000
Processed images 55001-60000
Finished preprocessing images.



## Reshape for CNN

In [35]:

# Build the model inputs: labels as an array, and images with an explicit
# trailing single-channel axis, i.e. shape (n_images, 64, 64, 1).
y = np.array(labels)
X = np.reshape(np.array(images), (-1, 64, 64, 1))

## Data Split, Model Building, Hyperparameters, Training, Validation, Testing

The pre-defined settings below, which you can switch between and test, do not perform as well as the settings in cnn.ipynb. The best of them reaches 83% accuracy, which is close to the 86% of the best model in that other CNN notebook.

In [None]:
# Train and evaluate the model

# Ensure y has the correct number of classes
num_classes = len(np.unique(y))
print(f"Number of classes: {num_classes}")

# Pre-defined hyperparameter settings. Change SELECTED_PRESET to try another
# one instead of commenting calls in and out.
HYPERPARAMETER_PRESETS = {
    # Function defaults
    "default": {},
    # Conservative/lightweight setup
    "conservative": dict(learning_rate=0.0001, dropout_rate=0.3, conv_filters=16),
    # Aggressive learning setup
    "aggressive": dict(learning_rate=0.01, dropout_rate=0.4, conv_filters=64),
    # Complex feature detection setup
    # this setting performs the best of these pre-defined settings.  83% accuracy on the test set.
    "complex": dict(learning_rate=0.001, dropout_rate=0.5, conv_filters=128),
    # Careful learning setup
    "careful": dict(learning_rate=0.00001, dropout_rate=0.6, conv_filters=32),
}
SELECTED_PRESET = "complex"

train_and_evaluate(X, y, **HYPERPARAMETER_PRESETS[SELECTED_PRESET])

Number of classes: 6
Starting training and evaluation...
Sampled data shape: (25000, 64, 64, 1)
Sampled data split: 15000 training, 5000 validation, 5000 testing samples
Training CNN model with learning_rate=0.001, dropout_rate=0.5, conv_filters=128
Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 156ms/step - accuracy: 0.4397 - loss: 1.3483 - val_accuracy: 0.7028 - val_loss: 0.8027
Epoch 2/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 158ms/step - accuracy: 0.6708 - loss: 0.8464 - val_accuracy: 0.7318 - val_loss: 0.6981
Epoch 3/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 155ms/step - accuracy: 0.7214 - loss: 0.7170 - val_accuracy: 0.7592 - val_loss: 0.6066
Epoch 4/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 156ms/step - accuracy: 0.7427 - loss: 0.6551 - val_accuracy: 0.7838 - val_loss: 0.5769
Epoch 5/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 161ms/step - accuracy: 0.7738 - loss: 0.6153 - val_accuracy: 0.8076 - val_loss: 0.5236
Epoch 6/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 159ms/step - accuracy: 0.7904 - loss: 0.5465 - val_accuracy: 0.8188 - val_loss: 0.4967
Epoch 7/10
[1m469/46

83% is not bad, and it is the best of these settings. The other file has an 86% accuracy model.