In [None]:
import os, glob, re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [None]:
# Parse labels.csv line by line, keeping only rows that carry at least three
# comma-separated fields and whose third field (the label) is non-empty.
labels = []
with open("labels.csv", "r") as f:
    for raw_line in f:
        fields = [field.strip() for field in raw_line.split(",")]
        if len(fields) < 3 or fields[2] == "":
            continue  # too short or unlabeled: skip this row
        labels.append(fields[:3])  # ra, dec, label

# The coordinates were read as text; convert both columns to floats at once.
labels_df = pd.DataFrame(labels, columns=["ra", "dec", "label"]).astype(
    {"ra": float, "dec": float}
)
print("✅ Loaded labels:", len(labels_df))

✅ Loaded labels: 2178


In [None]:
# Folders (relative to the working directory) that are searched for PNG files.
image_folders = ["typ/typ_PNG", "exo/exo_PNG"]
image_paths = []

for folder in image_folders:
    # Recursive search for any PNG inside subfolders.
    # glob returns matches in filesystem order, which differs between machines;
    # sorting keeps the row order (and therefore the seeded train/validation
    # split performed later) reproducible.
    pattern = os.path.join(folder, "**", "*.png")
    image_paths.extend(sorted(glob.glob(pattern, recursive=True)))

print(f"✅ Found {len(image_paths)} images total")

✅ Found 2107 images total


In [None]:
def extract_coords(filename):
    """Parse two leading whitespace-separated numbers from a file's base name.

    Only the base name of ``filename`` is inspected, and the numbers must sit
    at its very start.

    Returns:
        A ``(first, second)`` tuple of floats when the base name starts with
        two numbers, otherwise ``(None, None)``.
    """
    base_name = os.path.basename(filename)
    found = re.match(r"([-+]?\d*\.?\d+)\s+([-+]?\d*\.?\d+)", base_name)
    if found is None:
        return None, None
    first_text, second_text = found.groups()
    return float(first_text), float(second_text)

# Parse every image path, then unzip the (ra, dec) pairs into parallel tuples.
coords = list(map(extract_coords, image_paths))
ra_vals, dec_vals = zip(*coords)

# One row per image: its path plus the coordinates parsed from its file name.
images_df = pd.DataFrame({"path": image_paths, "ra": ra_vals, "dec": dec_vals})

In [None]:
def find_closest_label(row, max_dist=0.05):
    """Return the label of the ``labels_df`` entry nearest to ``row``.

    Distance is the plain Euclidean distance between the ``ra``/``dec`` values
    of ``row`` and those of every entry in the module-level ``labels_df``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with ``ra`` and ``dec``.
        max_dist: Largest distance, in the same units as ``ra``/``dec``, at
            which the nearest entry still counts as a match.

    Returns:
        The ``label`` of the nearest entry when it lies within ``max_dist``;
        ``None`` when it is farther away, when ``row`` has no usable
        coordinates, or when ``labels_df`` is empty.
    """
    # NOTE(review): this is a flat metric on the raw coordinate values -- no
    # cos(dec) scaling and no wrap-around in ra. Confirm that is acceptable
    # for the sky region and units used here.

    # A file name that could not be parsed yields missing coordinates. Every
    # distance would then be NaN and idxmin() has no valid minimum to return,
    # so treat such rows (and an empty label table) as "no match" up front.
    if pd.isna(row["ra"]) or pd.isna(row["dec"]) or labels_df.empty:
        return None

    diffs = np.sqrt((labels_df["ra"] - row["ra"])**2 + (labels_df["dec"] - row["dec"])**2)
    idx = diffs.idxmin()
    if diffs[idx] <= max_dist:
        return labels_df.loc[idx, "label"]
    return None

# Look up the nearest label for every image, then keep only the images that
# actually received one.
nearest_labels = images_df.apply(find_closest_label, axis=1)
images_df = images_df.assign(label=nearest_labels).dropna(subset=["label"])

print(f"✅ Matched {len(images_df)} images with labels")


✅ Matched 2007 images with labels


In [None]:
# Hold out 20% of the matched images for validation, stratified by label and
# with a fixed seed.
train_df, val_df = train_test_split(
    images_df,
    test_size=0.2,
    stratify=images_df["label"],
    random_state=42,
)


In [None]:
from imblearn.over_sampling import RandomOverSampler

# Split the training frame into the target column and everything else.
y_train = train_df["label"]
X_train = train_df.drop(columns="label")

# Randomly oversample the training rows only; the validation frame is untouched.
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Reassemble features and labels into a single balanced training DataFrame.
train_df_balanced = pd.DataFrame(X_train_resampled, columns=X_train.columns)
train_df_balanced["label"] = y_train_resampled

print(f"✅ Original training data size: {len(train_df)}")
print(f"✅ Resampled training data size: {len(train_df_balanced)}")
print("✅ Ready to use balanced training data for ImageDataGenerator.")

✅ Original training data size: 1605
✅ Resampled training data size: 4833
✅ Ready to use balanced training data for ImageDataGenerator.


In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

img_size = (128, 128)
batch_size = 32

# Fix the class -> index mapping once and hand it to both generators, so the
# one-hot columns mean the same thing for training and validation even if one
# frame happens to miss a class. Sorted order matches what the generator would
# infer on its own.
class_names = sorted(train_df_balanced["label"].unique())

# Data augmentation only -- deliberately no `rescale=1./255`.
# The Keras EfficientNet models used downstream embed their own input
# preprocessing and expect raw pixel values in the [0, 255] range; rescaling
# here as well would feed the pretrained weights near-zero inputs.
train_datagen = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=20,
    zoom_range=0.2,
    width_shift_range=0.2,
    height_shift_range=0.2,
)

# Validation images are passed through unchanged (no augmentation, no rescale).
val_datagen = ImageDataGenerator()

train_ds = train_datagen.flow_from_dataframe(
    train_df_balanced,  # Use the balanced training data
    x_col="path",
    y_col="label",
    classes=class_names,
    target_size=img_size,
    batch_size=batch_size,
    class_mode="categorical",
    seed=42,  # reproducible shuffling and augmentation draws
)

val_ds = val_datagen.flow_from_dataframe(
    val_df,
    x_col="path",
    y_col="label",
    classes=class_names,
    target_size=img_size,
    batch_size=batch_size,
    class_mode="categorical",
    shuffle=False,  # keep row order so predictions can be aligned with val_df
)

print("✅ Ready for training!")

Found 4833 validated image filenames belonging to 9 classes.
Found 402 validated image filenames belonging to 9 classes.
✅ Ready for training!


In [None]:
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import BatchNormalization, GlobalAveragePooling2D, Dense, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

IMG_SIZE = 128  # must match the generators' target_size
# Size the softmax head from the generator so it always agrees with the
# one-hot labels it will be trained on.
num_classes = len(train_ds.class_indices)
# Number of layers at the top of the backbone that are fine-tuned.
NUM_TRAINABLE_LAYERS = 200

# --- Build the model ---
inputs = Input(shape=(IMG_SIZE, IMG_SIZE, 3))
base_model = EfficientNetB0(include_top=False, weights='imagenet', input_tensor=inputs)
base_model.trainable = True

# Freeze everything except the last NUM_TRAINABLE_LAYERS layers.
for layer in base_model.layers[:-NUM_TRAINABLE_LAYERS]:
    layer.trainable = False

# Keep every BatchNormalization layer frozen while fine-tuning: letting them
# update their statistics on a small dataset wrecks the pretrained features
# and makes validation (inference-mode) results collapse.
for layer in base_model.layers:
    if isinstance(layer, BatchNormalization):
        layer.trainable = False

# Classification head: pooled backbone features -> dropout -> softmax.
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dropout(0.5)(x)
outputs = Dense(num_classes, activation='softmax')(x)

model = Model(inputs, outputs)
model.compile(
    # Fine-tuning pretrained weights needs a small step size; the previous
    # value of 0.01 made the validation loss diverge.
    optimizer=Adam(learning_rate=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

In [None]:
# Fine-tune on the balanced training generator, evaluating on the validation
# generator after each of the 10 epochs.
history_finetune = model.fit(train_ds, validation_data=val_ds, epochs=10)

  self._warn_if_super_not_called()


Epoch 1/10
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m339s[0m 2s/step - accuracy: 0.2097 - loss: 2.2360 - val_accuracy: 0.0025 - val_loss: 7.2436
Epoch 2/10
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m294s[0m 2s/step - accuracy: 0.3714 - loss: 1.6864 - val_accuracy: 0.0025 - val_loss: 5.2882
Epoch 3/10
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m292s[0m 2s/step - accuracy: 0.4374 - loss: 1.5316 - val_accuracy: 0.0100 - val_loss: 14.0448
Epoch 4/10
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.4593 - loss: 1.4652

KeyboardInterrupt: 