# CNN, Convolutional Neural Network

CNN on the HEp2 Cell Dataset.  Default hyperparameters and GridSearch hyperparameter tuning.

The first set of code cells contains function definitions; the last set calls those functions.  Run the definition cells in order first, then the set of function-call cells.

# Label Related Functions

Loading of the matlab file and displaying some information on it, mainly the distribution of the labels.

In [4]:
import scipy.io
import numpy as np

def load_labels(mat_file, num_labels=63445):
    """Read the ``labels`` variable of a MATLAB file as a flat array.

    Args:
        mat_file: Path to a ``.mat`` file that contains a ``labels`` variable.
        num_labels: Upper bound on how many leading labels are kept.

    Returns:
        1-D NumPy array with at most ``num_labels`` entries.
    """
    print("Loading labels...")
    mat_contents = scipy.io.loadmat(mat_file)
    # loadmat returns MATLAB vectors as 2-D arrays, hence the flatten.
    flat_labels = mat_contents['labels'].flatten()
    selected = flat_labels[:num_labels]
    print(f"Loaded {len(selected)} labels.\n")
    return selected

def print_label_distribution(labels):
    """Print the count and percentage share of every distinct label.

    Args:
        labels: Array-like of label values.
    """
    values, frequencies = np.unique(labels, return_counts=True)
    n_labels = len(labels)
    print("Label Distribution:")
    for idx in range(len(values)):
        share = (frequencies[idx] / n_labels) * 100
        print(f"Label {values[idx]}: {frequencies[idx]} ({share:.2f}%)")


# Preprocessing Function

Preprocessing: each image is loaded as grayscale, resized to 64 x 64 pixels, and normalized so that pixel values lie in the range 0-1.

In [5]:
import cv2
import numpy as np
import os

def preprocess_images(image_dir, num_images, image_size=(64, 64)):
    """Load numbered PNG files as resized, normalized grayscale arrays.

    Files are looked up as ``1.png`` ... ``<num_images>.png`` inside
    ``image_dir``. Each one is read as grayscale, resized to ``image_size``
    and divided by 255.0.

    Files that are missing, or that OpenCV cannot decode, are reported and
    skipped. The result can therefore hold fewer than ``num_images`` entries,
    in which case row positions no longer correspond to file numbers.

    Args:
        image_dir: Directory containing the numbered ``.png`` files.
        num_images: Highest file number to attempt to load.
        image_size: ``(width, height)`` handed to ``cv2.resize``.

    Returns:
        NumPy array built from the list of successfully processed images.
    """
    print("Preprocessing images...")
    images = []
    skipped = 0
    for i in range(1, num_images + 1):
        img_path = os.path.join(image_dir, f"{i}.png")
        if not os.path.exists(img_path):
            print(f"Warning: Image {img_path} not found")
            skipped += 1
            continue

        image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        # cv2.imread reports an unreadable/corrupt file by returning None
        # rather than raising; handing None to cv2.resize would crash.
        if image is None:
            print(f"Warning: Image {img_path} could not be read")
            skipped += 1
            continue

        image = cv2.resize(image, image_size)
        image = image / 255.0
        images.append(image)

        if i % 5000 == 0:
            print(f"Processed images {i-4999}-{i}")

    if skipped:
        print(f"Warning: skipped {skipped} of {num_images} images")
    print("Finished preprocessing images.\n")
    return np.array(images)


# Data Splitting, CNN Model Building, Training, Validation, Testing

Here we split the dataset, build the cnn model, train it, validate it, and test it.

In [14]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Conv2D, Dense, Dropout, Flatten, Input, MaxPooling2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

def _report_split(model, X_split, y_split_onehot, classes, split_name):
    """Print accuracy and a classification report for one held-out split."""
    _, accuracy = model.evaluate(X_split, y_split_onehot)
    print(f"{split_name} Accuracy: {accuracy}")
    print(f"{split_name} Classification Report:")
    # Translate one-hot column indices back to the original label values so
    # the report is labelled the way the dataset is.
    predicted = classes[np.argmax(model.predict(X_split), axis=1)]
    actual = classes[np.argmax(y_split_onehot, axis=1)]
    print(classification_report(actual, predicted))


def train_and_evaluate(X, y, sample_size=63000, epochs=10, batch_size=32, random_state=42):
    """Sample the data, split it 60/20/20, then train and evaluate a small CNN.

    The model is two Conv2D/MaxPooling2D stages followed by a 128-unit dense
    layer with dropout and a softmax output. Progress, accuracies and
    classification reports for the validation and test splits are printed.

    Args:
        X: Image array whose first axis indexes samples; the remaining axes
            define the model's input shape.
        y: 1-D array-like of class labels, one per sample. Label values need
            not start at 0 or be contiguous.
        sample_size: Number of samples to draw before splitting. If ``None``
            or not smaller than the dataset, the whole dataset is used.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used for training.
        random_state: Seed passed to every ``train_test_split`` call.

    Raises:
        ValueError: If ``X`` and ``y`` contain different numbers of samples.
    """
    print("Starting training and evaluation...")

    X = np.asarray(X)
    y = np.asarray(y).ravel()
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
        )

    # Convert labels to categorical. Labels are first mapped onto 0..K-1:
    # to_categorical on raw labels that start at 1 would add an extra,
    # never-used class 0 to the softmax layer.
    classes, y_index = np.unique(y, return_inverse=True)
    y_onehot = to_categorical(y_index, num_classes=len(classes))

    # Sample images directly from the original dataset. train_test_split
    # rejects an integer train_size that is not smaller than the dataset,
    # so in that case simply keep everything.
    if sample_size is None or sample_size >= len(X):
        X_sampled, y_sampled = X, y_onehot
    else:
        X_sampled, _, y_sampled, _ = train_test_split(
            X, y_onehot, train_size=sample_size, random_state=random_state
        )
    print(f"Sampled data shape: {X_sampled.shape}")

    # Split the sampled data into new training, validation, and testing datasets
    X_train, X_temp, y_train, y_temp = train_test_split(
        X_sampled, y_sampled, test_size=0.4, random_state=random_state
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=random_state
    )

    print(f"Sampled data split: {len(X_train)} training, {len(X_val)} validation, {len(X_test)} testing samples")

    # Build the CNN model. An explicit Input layer takes its shape from the
    # data instead of passing a hard-coded input_shape to the first Conv2D.
    model = Sequential([
        Input(shape=X_sampled.shape[1:]),
        Conv2D(32, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(len(classes), activation='softmax')
    ])

    # Compile the model
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    print("Training CNN model...")
    model.fit(X_train, y_train, epochs=epochs, validation_data=(X_val, y_val), batch_size=batch_size)
    print("CNN model trained")

    # Evaluate the model
    print("Evaluating model...")
    _report_split(model, X_val, y_val, classes, "Validation")
    _report_split(model, X_test, y_test, classes, "Test")


# Function Calls

The following cells call the functions defined above.

## Label Related

In [7]:
# Local directory paths
image_dir = 'cells'  # Directory containing images
mat_file = 'labels.mat'  # Labels file
num_images = 63445

# Load labels and preprocess images
labels = load_labels(mat_file, num_labels=num_images)
print_label_distribution(labels)
images = preprocess_images(image_dir, num_images)

# preprocess_images skips files it cannot find, so the two arrays can end up
# with different lengths. Fail here with a clear message rather than later,
# inside the train/test split.
if len(images) != len(labels):
    raise ValueError(
        f"Loaded {len(images)} images but {len(labels)} labels; "
        "images and labels must correspond one-to-one"
    )

Loading labels...
Loaded 63445 labels.

Label Distribution:
Label 1: 14367 (22.64%)
Label 2: 14655 (23.10%)
Label 3: 13257 (20.90%)
Label 4: 13737 (21.65%)
Label 5: 5086 (8.02%)
Label 6: 2343 (3.69%)
Preprocessing images...
Processed images 1-5000
Processed images 5001-10000
Processed images 10001-15000
Processed images 15001-20000
Processed images 20001-25000
Processed images 25001-30000
Processed images 30001-35000
Processed images 35001-40000
Processed images 40001-45000
Processed images 45001-50000
Processed images 50001-55000
Processed images 55001-60000
Finished preprocessing images.



## Reshape for CNN

In [8]:

# Labels become a NumPy array; images gain a trailing single-channel axis
# so that every sample has shape (64, 64, 1).
y = np.array(labels)
X = np.reshape(np.array(images), (-1, 64, 64, 1))

## Data Split, Model Building, Training, Validation, and Testing

In [15]:
# Train and evaluate the model.
# Uses train_and_evaluate's default sample_size; the function itself prints
# the split sizes, training progress, accuracies and classification reports.
train_and_evaluate(X, y)

Starting training and evaluation...
Sampled data shape: (63000, 64, 64, 1)
Sampled data split: 37800 training, 12600 validation, 12600 testing samples
Training CNN model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1182/1182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 25ms/step - accuracy: 0.5317 - loss: 1.1501 - val_accuracy: 0.7543 - val_loss: 0.6695
Epoch 2/10
[1m1182/1182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 24ms/step - accuracy: 0.7372 - loss: 0.6919 - val_accuracy: 0.7916 - val_loss: 0.5809
Epoch 3/10
[1m1182/1182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 24ms/step - accuracy: 0.7760 - loss: 0.6010 - val_accuracy: 0.7936 - val_loss: 0.5326
Epoch 4/10
[1m1182/1182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 25ms/step - accuracy: 0.7971 - loss: 0.5434 - val_accuracy: 0.8447 - val_loss: 0.4547
Epoch 5/10
[1m1182/1182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 24ms/step - accuracy: 0.8133 - loss: 0.5032 - val_accuracy: 0.8477 - val_loss: 0.4426
Epoch 6/10
[1m1182/1182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 25ms/step - accuracy: 0.8243 - loss: 0.4797 - val_accuracy: 0.8544 - val_loss: 0.4319
Epoc

Validation and test accuracy are both about 86%.