## GalaxAI: Automated Galaxy Morphology Classification System

An end-to-end Machine Learning pipeline for classifying galaxy images into morphological categories, with cloud deployment, monitoring, and retraining capabilities.

- Domain: Astronomy and Astrophysics
- Data Type: RGB Galaxy Images (Non-tabular)
- Model Type: Multi-class Classification (10 classes)

## Data Sources
- Primary Dataset: Galaxy10 DECaLS
- Source: astroNN Galaxy10 DECaLS Dataset
- URL:  https://astronn.readthedocs.io/en/latest/galaxy10.html

Specifications:
- Image Size: 256×256 pixels (RGB)
- Format: HDF5 file
- Total Images: 17,736
- Classes: 10 morphological categories

Importing Libraries

In [None]:
!pip install astroNN tensorflow matplotlib pandas scikit-learn h5py pillow
import h5py
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from astroNN.datasets import galaxy10
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import EfficientNetV2S
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score
)
from sklearn.model_selection import train_test_split


Collecting astroNN
  Downloading astroNN-1.1.0-py3-none-any.whl.metadata (5.0 kB)
Collecting astroquery (from astroNN)
  Downloading astroquery-0.4.11-py3-none-any.whl.metadata (6.5 kB)
Collecting pyvo>=1.5 (from astroquery->astroNN)
  Downloading pyvo-1.8-py3-none-any.whl.metadata (4.7 kB)
Downloading astroNN-1.1.0-py3-none-any.whl (9.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m93.7 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading astroquery-0.4.11-py3-none-any.whl (11.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m141.0 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading pyvo-1.8-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m76.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyvo, astroquery, astroNN


Access Data

In [4]:


file_path = 'Galaxy10_DECals.h5'

# Inspect the raw HDF5 file (when it is present in the working directory):
# list its datasets, describe the image array and print the class balance.
if os.path.exists(file_path):
    with h5py.File(file_path, 'r') as f:
        print("=" * 60)
        print("Galaxy10_DECals.h5 File Contents")
        print("=" * 60)

        # List all datasets
        print("\nDatasets in file:")
        for key in f:
            print(f"  - {key}")

        # Get detailed information about each dataset
        print("\nDetailed Information:")
        print("-" * 60)

        if 'images' in f:
            # Only metadata is touched here; the pixel data is not read.
            images_data = f['images']
            print("\nImages Dataset:")
            print(f"  Shape: {images_data.shape}")
            print(f"  Data type: {images_data.dtype}")
            print(f"  Size: {images_data.size:,} elements")
            print(f"  Memory: {images_data.nbytes / (1024**2):.2f} MB")

        # NOTE(review): astroNN's published loading example reads the labels
        # from a dataset named 'ans' rather than 'labels' -- accept either so
        # this section is not silently skipped; confirm against the real file.
        label_key = next((k for k in ('labels', 'ans') if k in f), None)
        if label_key is not None:
            labels_data = f[label_key]
            # Read the labels from disk once and reuse them; slicing an h5py
            # dataset performs a fresh read every time it is evaluated.
            label_values = labels_data[:]
            unique, counts = np.unique(label_values, return_counts=True)
            total = len(label_values)

            print("\nLabels Dataset:")
            print(f"  Shape: {labels_data.shape}")
            print(f"  Data type: {labels_data.dtype}")
            print(f"  Unique classes: {len(unique)}")
            print("  Class distribution:")

            # Show class distribution
            for class_id, count in zip(unique, counts):
                print(f"    Class {class_id}: {count:,} images ({count/total*100:.2f}%)")

        print("\n" + "=" * 60)
else:
    # Help the reader locate the file by showing where we looked.
    print(f"File '{file_path}' not found in current directory.")
    print(f"Current directory: {os.getcwd()}")
    print(f"Files in directory: {os.listdir('.')}")

File 'Galaxy10_DECals.h5' not found in current directory.
Current directory: /content
Files in directory: ['.config', 'sample_data']


Loading Data

In [5]:
def load_galaxy_data():
    """Load Galaxy10 DECaLS dataset.

    Returns:
        tuple: ``(images, labels)`` as returned by ``galaxy10.load_data()``,
        with ``images`` converted to float32 and divided by 255 (i.e. scaled
        to [0, 1] assuming 8-bit pixel data). ``labels`` is returned as-is.

    Note:
        The float32 copy of the full image array is large (4 bytes per
        channel value), so avoid making further full-size copies of it.
    """
    images, labels = galaxy10.load_data()
    # astype() already produces one full float32 copy. Dividing in place
    # yields the same values without allocating a second full-size temporary.
    images = images.astype('float32')
    images /= 255.0  # Normalize
    return images, labels



### Data Processing
Train/Val/Test split

In [6]:
def split_data(images, labels, test_size=0.15, val_size=0.15):
    """Partition the data into stratified train, validation and test subsets.

    Args:
        images: Samples to split.
        labels: Class label for each sample; used for stratification.
        test_size: Fraction of the full dataset held out for testing.
        val_size: Fraction of the full dataset held out for validation.

    Returns:
        Three ``(X, y)`` tuples in the order train, validation, test.
    """
    # Stage 1: peel the test set off the complete dataset.
    X_rest, X_test, y_rest, y_test = train_test_split(
        images, labels, test_size=test_size, stratify=labels, random_state=42
    )

    # Stage 2: val_size is relative to the full dataset, so rescale it to the
    # share of what is left after the test set has been removed.
    val_fraction = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_fraction,
        stratify=y_rest, random_state=42
    )

    train_set = (X_train, y_train)
    val_set = (X_val, y_val)
    test_set = (X_test, y_test)
    return train_set, val_set, test_set



### Model Creation

Data augmentation for galaxy images

In [7]:
def create_augmentation_layer():
    """Data augmentation for galaxy images.

    Returns:
        A ``tf.keras.Sequential`` that applies random rotation, flips, zoom,
        brightness and contrast changes. It is intended for images scaled to
        [0, 1], which is how ``load_galaxy_data`` returns them.
    """
    return tf.keras.Sequential([
        tf.keras.layers.RandomRotation(0.5),      # Galaxies have no orientation
        tf.keras.layers.RandomFlip("horizontal_and_vertical"),
        tf.keras.layers.RandomZoom(0.2),
        # RandomBrightness assumes value_range=(0, 255) by default; with
        # [0, 1] inputs the brightness shift would be ~255x too large and
        # would wipe out the image, so the actual range is given explicitly.
        tf.keras.layers.RandomBrightness(0.2, value_range=(0.0, 1.0)),
        tf.keras.layers.RandomContrast(0.2),
    ])

Save images to train/test folder structure

## Data preprocessing


In [8]:
def save_images_to_folders(images, labels, base_path, num_classes=10):
    """Write images as PNG files into one sub-folder per class.

    Files are saved as ``<base_path>/class_<label>/galaxy_<index>.png``,
    where ``index`` is the position of the image in ``images``.

    Args:
        images: Iterable of image arrays. Values are multiplied by 255 before
            saving, so they are expected to be scaled to [0, 1].
        labels: Iterable of integer class ids, one per image.
        base_path: Root directory that receives the ``class_<id>`` folders.
        num_classes: Folders ``class_0`` .. ``class_<num_classes - 1>`` are
            always created, even when a class has no images.
    """
    # Always create the standard folders, plus one for any label that falls
    # outside 0..num_classes-1 so that saving cannot fail on a missing folder.
    class_ids = set(range(num_classes))
    class_ids.update(int(label) for label in np.unique(np.asarray(labels)))
    for class_id in sorted(class_ids):
        os.makedirs(os.path.join(base_path, f"class_{class_id}"), exist_ok=True)

    for idx, (img, label) in enumerate(zip(images, labels)):
        # Round before casting: a plain astype(uint8) truncates, so a value
        # such as 2.9999998 (float32 round-trip of 3/255*255) would become 2.
        pixels = np.clip(np.rint(np.asarray(img) * 255), 0, 255).astype(np.uint8)
        target = os.path.join(base_path, f"class_{int(label)}", f"galaxy_{idx}.png")
        Image.fromarray(pixels).save(target)


Model Architecture

In [9]:
def create_galaxy_classifier(num_classes=10, input_shape=(256, 256, 3)):
    """Create EfficientNetV2-S based classifier.

    Args:
        num_classes: Number of output classes (size of the softmax layer).
        input_shape: Shape of one input image. Inputs are expected to be
            scaled to [0, 1], matching ``load_galaxy_data`` and
            ``preprocess_image``.

    Returns:
        tuple: ``(model, base_model)`` where ``model`` is the full classifier
        and ``base_model`` is the frozen EfficientNetV2-S backbone inside it.
    """

    # Base model with pre-trained weights
    base_model = EfficientNetV2S(
        include_top=False,
        weights='imagenet',
        input_shape=input_shape,
        pooling='avg'
    )

    # Freeze base initially
    base_model.trainable = False

    # Build model
    inputs = tf.keras.Input(shape=input_shape)
    # Keras' EfficientNetV2 applies its own input preprocessing and expects
    # raw pixel values in [0, 255]. The rest of this file feeds images scaled
    # to [0, 1], so map them back before they reach the backbone.
    x = layers.Rescaling(255.0)(inputs)
    # training=False keeps the backbone's BatchNorm layers in inference mode.
    x = base_model(x, training=False)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(512, activation='relu',
                     kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.4)(x)
    x = layers.Dense(256, activation='relu')(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    model = Model(inputs, outputs, name='GalaxAI_Classifier')
    return model, base_model

def compile_model(model, learning_rate=1e-4):
    """Attach optimizer, loss and metrics to *model* and hand it back.

    Args:
        model: Object exposing a Keras-style ``compile`` method.
        learning_rate: Learning rate given to the AdamW optimizer.

    Returns:
        The same ``model`` object after ``compile`` has been called on it.
    """
    optimizer = tf.keras.optimizers.AdamW(learning_rate=learning_rate)
    # Top-2 accuracy: counts a hit when the true class is among the two
    # highest-scoring predictions.
    top2_metric = tf.keras.metrics.SparseTopKCategoricalAccuracy(
        k=2, name='top2_acc'
    )
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy', top2_metric],
    )
    return model

def train_model(model, train_data, val_data, epochs=50, callbacks=None):
    """Fit *model* on the training data and return the fit history.

    Args:
        model: Compiled model exposing ``fit``.
        train_data: Indexable pair; element 0 holds the inputs and element 1
            the labels.
        val_data: Passed through unchanged as ``validation_data``.
        epochs: Maximum number of epochs.
        callbacks: Callbacks to use. When ``None``, early stopping,
            learning-rate reduction on plateau and best-model checkpointing
            are installed.

    Returns:
        Whatever ``model.fit`` returns.
    """
    features = train_data[0]
    targets = train_data[1]

    if callbacks is None:
        early_stop = tf.keras.callbacks.EarlyStopping(
            patience=10, restore_best_weights=True
        )
        lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)
        checkpoint = tf.keras.callbacks.ModelCheckpoint(
            'models/best_model.h5', save_best_only=True
        )
        callbacks = [early_stop, lr_on_plateau, checkpoint]

    # Class weights are derived from the training labels on every call.
    return model.fit(
        features, targets,
        validation_data=val_data,
        epochs=epochs,
        batch_size=32,
        callbacks=callbacks,
        class_weight=compute_class_weights(targets),
    )

def compute_class_weights(labels):
    """Compute balanced class weights.

    Each class present in ``labels`` gets the weight
    ``n_samples / (n_classes * n_samples_in_class)``, which is the formula
    scikit-learn uses for ``class_weight='balanced'``.

    Args:
        labels: Array-like of integer class ids.

    Returns:
        dict: Maps each class id found in ``labels`` (as ``int``) to its
        weight (as ``float``). Empty when ``labels`` is empty.
    """
    flat = np.asarray(labels).ravel()
    classes, counts = np.unique(flat, return_counts=True)
    weights = flat.size / (len(classes) * counts)
    # Key by the actual class id, not by position: enumerate() would give
    # wrong keys whenever the labels present are not exactly 0..k-1.
    return {int(cls): float(weight) for cls, weight in zip(classes, weights)}

def save_model(model, path='models/galaxai_model.h5'):
    """Persist *model* at *path* by delegating to its ``save`` method.

    Args:
        model: Object exposing ``save(path)``.
        path: Destination handed to ``model.save``.
    """
    destination = path
    model.save(destination)
    
def load_model(path='models/galaxai_model.h5'):
    """Read a previously saved model back from *path*.

    Args:
        path: Location handed to ``tf.keras.models.load_model``.

    Returns:
        The object returned by ``tf.keras.models.load_model``.
    """
    restored = tf.keras.models.load_model(path)
    return restored

In [None]:
# prediction


# Human-readable class names. The position in the list is the integer class
# id that the prediction helpers look up (CLASS_NAMES[class_id]).
CLASS_NAMES = [
    "Disturbed",              # 0
    "Merging",                # 1
    "Round Smooth",           # 2
    "In-between Smooth",      # 3
    "Cigar-shaped",           # 4
    "Barred Spiral",          # 5
    "Unbarred Tight Spiral",  # 6
    "Unbarred Loose Spiral",  # 7
    "Edge-on No Bulge",       # 8
    "Edge-on With Bulge",     # 9
]

def preprocess_image(image_path_or_bytes, target_size=(256, 256)):
    """Preprocess a single image for prediction.

    Args:
        image_path_or_bytes: Raw encoded image data (``bytes`` or
            ``bytearray``), or anything ``PIL.Image.open`` accepts.
        target_size: ``(width, height)`` the image is resized to.

    Returns:
        numpy.ndarray: float32 array of shape ``(1, height, width, 3)`` with
        values divided by 255, i.e. a batch containing the single image.
    """
    if isinstance(image_path_or_bytes, (bytes, bytearray)):
        import io
        source = io.BytesIO(image_path_or_bytes)
    else:
        source = image_path_or_bytes

    # Image.open keeps the underlying file open until the image is closed.
    # convert() returns an independent, fully loaded copy, so the original
    # can be released as soon as the conversion is done.
    with Image.open(source) as img:
        rgb = img.convert('RGB').resize(target_size)

    # float32 keeps the dtype consistent with the training data.
    img_array = np.asarray(rgb, dtype=np.float32) / 255.0
    return np.expand_dims(img_array, axis=0)

def predict_single(model, image):
    """Classify one image and report the winning class with all scores.

    Args:
        model: Object exposing ``predict``.
        image: Forwarded to ``preprocess_image``.

    Returns:
        dict with keys ``class_id``, ``class_name``, ``confidence`` and
        ``all_probabilities`` (class name -> score).
    """
    batch = preprocess_image(image)
    # The batch holds a single image, so keep only the first row of scores.
    scores = model.predict(batch, verbose=0)[0]

    best = int(np.argmax(scores))
    best_score = float(scores[best])

    return {
        "class_id": best,
        "class_name": CLASS_NAMES[best],
        "confidence": best_score,
        "all_probabilities": {
            name: float(scores[idx])
            for idx, name in enumerate(CLASS_NAMES)
        }
    }

def predict_batch(model, image_paths):
    """Run ``predict_single`` on every path and tag each result with its path.

    Args:
        model: Forwarded to ``predict_single``.
        image_paths: Iterable of image locations.

    Returns:
        list of result dicts, each extended with an ``image_path`` string.
    """
    return [
        {**predict_single(model, image_path), "image_path": str(image_path)}
        for image_path in image_paths
    ]

: 

In [11]:
# Load the normalised images and their labels, then create the stratified
# train/validation/test subsets (split_data defaults: 15% test, 15% validation).
# The X_*/y_* names defined here are used by the training and evaluation cells.
(images, labels) = load_galaxy_data()
(X_train, y_train), (X_val, y_val), (X_test, y_test) = split_data(images, labels)



/root/.astroNN/datasets/Galaxy10_DECals.h5 was found!


: 

: 

In [None]:
# Build the classifier. base_model is the backbone, which
# create_galaxy_classifier returns frozen (trainable = False).
model, base_model = create_galaxy_classifier(num_classes=10, input_shape=(256, 256, 3))


NameError: name 'create_galaxy_classifier' is not defined

In [None]:
# Compile the model: AdamW optimizer, sparse categorical cross-entropy loss,
# accuracy and top-2 accuracy metrics (see compile_model).
model = compile_model(model, learning_rate=1e-4)


NameError: name 'compile_model' is not defined

In [None]:
# Train for up to 50 epochs. No callbacks are passed, so train_model installs
# its defaults (early stopping, LR reduction on plateau, best-model checkpoint).
print("Training model...")
history = train_model(
    model, 
    (X_train, y_train), 
    (X_val, y_val), 
    epochs=50
)

Training model...


NameError: name 'X_train' is not defined

In [None]:


# The model must already be trained (see the training cell above) before running this cell.


# Score the held-out test set. y_pred holds one row of class scores per image;
# the highest-scoring column of each row is taken as the predicted class id.
print("\nMaking predictions on test set...")
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

for description, array in (("Predictions", y_pred), ("Predicted classes", y_pred_classes)):
    print(f"{description} shape: {array.shape}")

In [None]:





# Metrics
# Imported here because the confusion-matrix plot below uses it. The original
# plot relied on `sns` (seaborn), which this notebook never imports or installs.
from sklearn.metrics import ConfusionMatrixDisplay

# Pin the label set so the report and the confusion matrix always have one
# row/column per entry of CLASS_NAMES, even if a class is absent from y_test.
class_ids = list(range(len(CLASS_NAMES)))

print("=== MODEL EVALUATION METRICS ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_classes):.4f}")
print(f"Precision (macro): {precision_score(y_test, y_pred_classes, average='macro'):.4f}")
print(f"Recall (macro): {recall_score(y_test, y_pred_classes, average='macro'):.4f}")
print(f"F1-Score (macro): {f1_score(y_test, y_pred_classes, average='macro'):.4f}")
print(f"F1-Score (weighted): {f1_score(y_test, y_pred_classes, average='weighted'):.4f}")

# Per-class metrics
print("\n=== CLASSIFICATION REPORT ===")
print(classification_report(y_test, y_pred_classes,
                            labels=class_ids, target_names=CLASS_NAMES))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_classes, labels=class_ids)
fig, ax = plt.subplots(figsize=(12, 10))
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=CLASS_NAMES).plot(
    ax=ax, cmap='Blues', values_format='d', xticks_rotation=45
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.savefig('confusion_matrix.png')

# ROC-AUC (One-vs-Rest)
# y_pred holds the per-class scores from model.predict, not the argmax ids.
roc_auc = roc_auc_score(y_test, y_pred, multi_class='ovr')
print(f"\nROC-AUC (OvR): {roc_auc:.4f}")

# 6. SAVE MODEL
model.save('models/galaxai_model.h5')

NameError: name 'model' is not defined