In [1]:
import os
import glob
import cv2
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Image size for resizing (keep it relatively small for MLP)
IMG_SIZE = 64
DATA_ROOT = "data/ct_sections"
RANDOM_STATE = 69

In [2]:
import os
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kabil007/lungcancer4types-imagedataset")
print("Path to dataset files:", path)

# Many Kaggle datasets put the actual splits under a 'Data' folder.
candidate = os.path.join(path, "Data")
DATA_ROOT = candidate if os.path.isdir(candidate) else path
print("Using DATA_ROOT:", DATA_ROOT)


  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/cristiantiut/.cache/kagglehub/datasets/kabil007/lungcancer4types-imagedataset/versions/1
Using DATA_ROOT: /Users/cristiantiut/.cache/kagglehub/datasets/kabil007/lungcancer4types-imagedataset/versions/1/Data


In [3]:
def _normalize_label(folder_name: str, canonical_classes):
    """Map folder names like 'adenocarcinoma_N0_M0_Ib' -> 'adenocarcinoma'."""
    f = folder_name.strip().lower()
    for c in canonical_classes:
        c = c.lower()
        if f == c or f.startswith(c + "_"):
            return c
    return None


def load_dataset(root_dir, img_size=64, canonical_classes=None, class_names=None):
    """Load images from a single split folder (e.g., DATA_ROOT/train).

    If canonical_classes is provided, folder names are normalized to those canonical labels.
    If class_names is provided, it is used as the fixed label order (important for train vs test consistency).
    """
    X, y = [], []

    class_folders = sorted(
        [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))]
    )

    # Determine final class_names (label order)
    if canonical_classes is not None:
        present = set()
        for d in class_folders:
            lbl = _normalize_label(d, canonical_classes)
            if lbl is not None:
                present.add(lbl)
        if class_names is None:
            class_names = sorted(present)
    else:
        if class_names is None:
            class_names = class_folders

    class_to_idx = {cls: idx for idx, cls in enumerate(class_names)}

    exts = ("*.png", "*.jpg", "*.jpeg", "*.bmp")

    for folder in class_folders:
        if canonical_classes is not None:
            lbl = _normalize_label(folder, canonical_classes)
        else:
            lbl = folder

        if lbl is None or lbl not in class_to_idx:
            continue

        class_dir = os.path.join(root_dir, folder)
        for ext in exts:
            for img_path in glob.glob(os.path.join(class_dir, ext)):
                img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                if img is None:
                    continue
                img = cv2.resize(img, (img_size, img_size))
                img = img.astype(np.float32) / 255.0
                X.append(img.flatten())
                y.append(class_to_idx[lbl])

    X = np.array(X, dtype=np.float32)
    y = np.array(y, dtype=np.int64)
    return X, y, class_names


assert DATA_ROOT is not None, "DATA_ROOT is not set. Run the dataset download cell first."

# Canonical 4 types for this dataset (handles train folders that include staging suffixes)
CANONICAL_CLASSES = [
    "adenocarcinoma",
    "large.cell.carcinoma",
    "normal",
    "squamous.cell.carcinoma",
]


In [4]:
# If the dataset already provides train/valid/test folders, use them directly.
train_dir = os.path.join(DATA_ROOT, "train")
valid_dir = os.path.join(DATA_ROOT, "valid")
test_dir  = os.path.join(DATA_ROOT, "test")

if os.path.isdir(train_dir) and os.path.isdir(test_dir):
    # Train on train (+ valid if present), test on test
    X_train, y_train, class_names = load_dataset(train_dir, IMG_SIZE, CANONICAL_CLASSES)

    if os.path.isdir(valid_dir):
        X_valid, y_valid, _ = load_dataset(valid_dir, IMG_SIZE, CANONICAL_CLASSES, class_names=class_names)
        X_train = np.vstack([X_train, X_valid])
        y_train = np.concatenate([y_train, y_valid])

    X_test, y_test, _ = load_dataset(test_dir, IMG_SIZE, CANONICAL_CLASSES, class_names=class_names)

    print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
    print("Classes:", class_names)

else:
    # Fallback: single-folder layout -> do a random split
    X, y, class_names = load_dataset(DATA_ROOT, IMG_SIZE)
    print("Dataset shape:", X.shape, "Labels shape:", y.shape)
    print("Classes:", class_names)

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        stratify=y,
        random_state=RANDOM_STATE
    )

# Standardize features (zero mean, unit variance)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Train shape: (685, 4096) Test shape: (315, 4096)
Classes: ['adenocarcinoma', 'large.cell.carcinoma', 'normal', 'squamous.cell.carcinoma']


In [5]:
mlp = MLPClassifier(
    hidden_layer_sizes=(512, 256),   # two hidden layers
    activation='relu',
    solver='adam',
    alpha=1e-4,                      # L2 regularization
    batch_size=64,
    learning_rate_init=1e-3,
    max_iter=50,                     # increase if underfitting
    shuffle=True,
    random_state=RANDOM_STATE,
    early_stopping=True,             # use part of training data as validation
    validation_fraction=0.1,
    n_iter_no_change=10,
    verbose=True                     # show training progress
)

mlp.fit(X_train_scaled, y_train)

Iteration 1, loss = 1.75652055
Validation score: 0.652174
Iteration 2, loss = 0.59466768
Validation score: 0.811594
Iteration 3, loss = 0.32949687
Validation score: 0.840580
Iteration 4, loss = 0.14681798
Validation score: 0.811594
Iteration 5, loss = 0.09482001
Validation score: 0.898551
Iteration 6, loss = 0.06226850
Validation score: 0.855072
Iteration 7, loss = 0.07118482
Validation score: 0.855072
Iteration 8, loss = 0.01593531
Validation score: 0.855072
Iteration 9, loss = 0.02166781
Validation score: 0.855072
Iteration 10, loss = 0.01502532
Validation score: 0.840580
Iteration 11, loss = 0.00727767
Validation score: 0.855072
Iteration 12, loss = 0.02903664
Validation score: 0.869565
Iteration 13, loss = 0.01798000
Validation score: 0.869565
Iteration 14, loss = 0.00878219
Validation score: 0.869565
Iteration 15, loss = 0.00771772
Validation score: 0.869565
Iteration 16, loss = 0.00653728
Validation score: 0.869565
Validation score did not improve more than tol=0.000100 for 10 co

0,1,2
,"hidden_layer_sizes  hidden_layer_sizes: array-like of shape(n_layers - 2,), default=(100,) The ith element represents the number of neurons in the ith hidden layer.","(512, ...)"
,"activation  activation: {'identity', 'logistic', 'tanh', 'relu'}, default='relu' Activation function for the hidden layer. - 'identity', no-op activation, useful to implement linear bottleneck,  returns f(x) = x - 'logistic', the logistic sigmoid function,  returns f(x) = 1 / (1 + exp(-x)). - 'tanh', the hyperbolic tan function,  returns f(x) = tanh(x). - 'relu', the rectified linear unit function,  returns f(x) = max(0, x)",'relu'
,"solver  solver: {'lbfgs', 'sgd', 'adam'}, default='adam' The solver for weight optimization. - 'lbfgs' is an optimizer in the family of quasi-Newton methods. - 'sgd' refers to stochastic gradient descent. - 'adam' refers to a stochastic gradient-based optimizer proposed  by Kingma, Diederik, and Jimmy Ba For a comparison between Adam optimizer and SGD, see :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_training_curves.py`. Note: The default solver 'adam' works pretty well on relatively large datasets (with thousands of training samples or more) in terms of both training time and validation score. For small datasets, however, 'lbfgs' can converge faster and perform better.",'adam'
,"alpha  alpha: float, default=0.0001 Strength of the L2 regularization term. The L2 regularization term is divided by the sample size when added to the loss. For an example usage and visualization of varying regularization, see :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_alpha.py`.",0.0001
,"batch_size  batch_size: int, default='auto' Size of minibatches for stochastic optimizers. If the solver is 'lbfgs', the classifier will not use minibatch. When set to ""auto"", `batch_size=min(200, n_samples)`.",64
,"learning_rate  learning_rate: {'constant', 'invscaling', 'adaptive'}, default='constant' Learning rate schedule for weight updates. - 'constant' is a constant learning rate given by  'learning_rate_init'. - 'invscaling' gradually decreases the learning rate at each  time step 't' using an inverse scaling exponent of 'power_t'.  effective_learning_rate = learning_rate_init / pow(t, power_t) - 'adaptive' keeps the learning rate constant to  'learning_rate_init' as long as training loss keeps decreasing.  Each time two consecutive epochs fail to decrease training loss by at  least tol, or fail to increase validation score by at least tol if  'early_stopping' is on, the current learning rate is divided by 5. Only used when ``solver='sgd'``.",'constant'
,"learning_rate_init  learning_rate_init: float, default=0.001 The initial learning rate used. It controls the step-size in updating the weights. Only used when solver='sgd' or 'adam'.",0.001
,"power_t  power_t: float, default=0.5 The exponent for inverse scaling learning rate. It is used in updating effective learning rate when the learning_rate is set to 'invscaling'. Only used when solver='sgd'.",0.5
,"max_iter  max_iter: int, default=200 Maximum number of iterations. The solver iterates until convergence (determined by 'tol') or this number of iterations. For stochastic solvers ('sgd', 'adam'), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps.",50
,"shuffle  shuffle: bool, default=True Whether to shuffle samples in each iteration. Only used when solver='sgd' or 'adam'.",True


In [6]:
y_pred = mlp.predict(X_test_scaled)

print("Classification report:")
print(classification_report(y_test, y_pred, target_names=class_names))

print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

Classification report:
                         precision    recall  f1-score   support

         adenocarcinoma       0.52      0.57      0.55       120
   large.cell.carcinoma       0.33      0.02      0.04        51
                 normal       0.69      0.91      0.78        54
squamous.cell.carcinoma       0.49      0.59      0.54        90

               accuracy                           0.55       315
              macro avg       0.51      0.52      0.48       315
           weighted avg       0.51      0.55      0.50       315

Confusion matrix:
[[69  1 15 35]
 [25  1  5 20]
 [ 5  0 49  0]
 [34  1  2 53]]


In [8]:
def predict_ct_section(img_path, model, scaler, img_size=64, class_names=None):
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise ValueError(f"Could not read image: {img_path}")

    img = cv2.resize(img, (img_size, img_size))
    img = img.astype(np.float32) / 255.0
    x = img.flatten().reshape(1, -1)

    x_scaled = scaler.transform(x)
    probs = model.predict_proba(x_scaled)[0]
    pred_idx = np.argmax(probs)
    pred_class = class_names[pred_idx] if class_names else pred_idx
    return pred_class, probs


test_img = "some_ct_section.png"
pred_class, probs = predict_ct_section(test_img, mlp, scaler, IMG_SIZE, class_names)
print("Predicted class:", pred_class)
print("Probabilities:", probs)

Predicted class: normal
Probabilities: [1.0857916e-01 8.7999309e-11 8.9142084e-01 2.5088964e-09]


In [10]:
import joblib

os.makedirs('models', exist_ok=True)
joblib.dump(mlp, 'models/mlp.joblib')            # save model
joblib.dump(scaler, 'models/scaler.joblib')      # save scaler
np.save('models/class_names.npy', np.array(class_names))