# Model Definition and Evaluation
## Table of Contents
1. [Model Selection](#model-selection)
2. [Feature Engineering](#feature-engineering)
3. [Hyperparameter Tuning](#hyperparameter-tuning)
4. [Implementation](#implementation)
5. [Evaluation Metrics](#evaluation-metrics)
6. [Comparative Analysis](#comparative-analysis)


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras_tuner as kt
# pyplot is matplotlib's plotting interface; "matplot" is not a module of
# matplotlib, so the previous `import matplot as plt` could not provide it.
import matplotlib.pyplot as plt

## Model Selection

[Discuss the type(s) of models you consider for this task, and justify the selection.]



## Feature Engineering

[Describe any additional feature engineering you've performed beyond what was done for the baseline model.]


In [None]:
import os

# Project root is taken to be the parent of the current working directory.
base_dir = os.path.dirname(os.getcwd())

# Both image folders live under <base_dir>/1_DatasetCharacteristics/EruptionImages;
# only the trailing path components differ.
path_images, path_images_synth = (
    os.path.join(base_dir, "1_DatasetCharacteristics", "EruptionImages", *tail)
    for tail in (("sorted images", "train_val"), ("synth",))
)

print(base_dir, path_images)

In [None]:
# Resolution every image is resized to on load, and the number of images per batch.
IMG_SIZE = (224, 224)
BATCH_SIZE = 32

# Load the train/validation images as batches of (images, labels); labels are
# inferred from the sub-folder each image sits in and encoded as 0/1.
dataset = tf.keras.utils.image_dataset_from_directory(
    path_images,
    label_mode='binary',
    color_mode='rgb',   # three channels, so an alpha channel is dropped if present
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=True,   # randomize the order
    seed=42,   # fixed seed for the shuffling
)

# Get the class names
print("Class names:", dataset.class_names)

# Tally how many images carry each binary label (0 or 1) across all batches.
counts = {0: 0, 1: 0}

for images, labels in dataset:
    unique, counts_batch = np.unique(labels.numpy(), return_counts=True)
    for u, c in zip(unique, counts_batch):
        # np.unique yields NumPy scalars (float labels, int64 counts). Cast both
        # to built-in ints so the lookup does not rely on float/int hash
        # equality and the printed dict shows plain numbers, matching what
        # count_labels() does for the train/validation splits.
        counts[int(u)] += int(c)

print(f"Label counts: {counts}")

In [None]:
import tensorflow as tf
import os

# 1. Setup paths: augmentation reads from the train/val image folder and
#    writes into the synthetic-image folder.
input_dir, output_dir = path_images, path_images_synth

# Make sure the destination exists; an already existing folder is not an error.
os.makedirs(output_dir, exist_ok=True)

# 2. Define augmentation function
def augment_image(image):
    """Return a randomly rotated, brightness- and contrast-jittered copy of an image.

    Args:
        image: Float image tensor with pixel values on the 0-255 scale (the
            processing loop in this file passes a decoded RGB image cast to
            float32).

    Returns:
        Float tensor on the same 0-255 scale, clipped to [0, 255]. Height and
        width are swapped when the random rotation is 90° or 270°.
    """
    # Randomly rotate by 0°, 90°, 180°, or 270°
    k = tf.random.uniform(shape=[], minval=0, maxval=4, dtype=tf.int32)
    image = tf.image.rot90(image, k)

    # Random brightness and contrast.
    # random_brightness adds a delta from [-max_delta, max_delta) directly to
    # float pixel values, so on 0-255 data a 20 % shift needs max_delta scaled
    # by 255; a bare 0.2 would change pixels by less than one intensity level.
    image = tf.image.random_brightness(image, max_delta=0.2 * 255.0)
    image = tf.image.random_contrast(image, lower=0.8, upper=1.2)

    # Brightness/contrast jitter can leave the valid pixel range; clip before
    # the caller casts back to uint8.
    image = tf.clip_by_value(image, 0, 255)
    return image

# 3. Process each image
# input_dir is the folder image_dataset_from_directory infers labels from, i.e.
# it holds one sub-folder per class. A flat os.listdir() on it only returns
# those folder names and never matches an image file, so walk the tree and
# mirror the class sub-folders under output_dir to keep the labels.
image_extensions = ('.jpg', '.jpeg', '.png')
source_count = 0

for root, _, filenames in os.walk(input_dir):
    # Same relative location below output_dir ('.' collapses to output_dir itself).
    target_dir = os.path.normpath(
        os.path.join(output_dir, os.path.relpath(root, input_dir))
    )

    for filename in sorted(filenames):
        # Case-insensitive match so files such as "IMG_01.JPG" are not skipped.
        if not filename.lower().endswith(image_extensions):
            continue

        os.makedirs(target_dir, exist_ok=True)
        img_path = os.path.join(root, filename)
        img_raw = tf.io.read_file(img_path)
        img = tf.image.decode_image(img_raw, channels=3)
        img = tf.cast(img, tf.float32)
        source_count += 1

        # Write five independently augmented variants per source image.
        for i in range(5):
            aug_img = augment_image(img)
            aug_img = tf.cast(aug_img, tf.uint8)

            new_filename = f"{os.path.splitext(filename)[0]}_aug{i}.jpg"
            output_path = os.path.join(target_dir, new_filename)

            encoded_img = tf.io.encode_jpeg(aug_img)
            tf.io.write_file(output_path, encoded_img)

# Make an empty run visible instead of reporting success for zero images.
if source_count == 0:
    print(f"Warning: no .jpg/.jpeg/.png images found under: {input_dir}")

print(f"Done. Augmented images saved to: {output_dir}")


In [None]:
# Split 80/20 at file level instead of with dataset.take()/skip().
# `dataset` is built with shuffle=True and is reshuffled each time it is
# iterated, so take/skip slices do not hold a fixed set of images: training
# images can show up in the validation slice on a later pass. Loading the two
# subsets with validation_split and the same seed keeps them disjoint.
split_kwargs = dict(
    validation_split=0.2,
    seed=42,   # must be identical for both subsets so they do not overlap
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    label_mode='binary',
    color_mode='rgb',
    shuffle=True,
)

train_ds = tf.keras.utils.image_dataset_from_directory(
    path_images, subset="training", **split_kwargs
)
val_ds = tf.keras.utils.image_dataset_from_directory(
    path_images, subset="validation", **split_kwargs
)

# Batch counts are kept under their previous names for any later cell that reads them.
train_batches = int(tf.data.experimental.cardinality(train_ds).numpy())
val_batches = int(tf.data.experimental.cardinality(val_ds).numpy())
total_batches = train_batches + val_batches

def count_labels(ds):
    """Count how many samples of each binary label a batched dataset yields.

    Args:
        ds: Iterable of ``(features, labels)`` batches in which ``labels``
            exposes ``.numpy()`` and contains only the values 0 and 1.

    Returns:
        dict mapping the labels 0 and 1 to their sample counts, as built-in
        ints. A label that never occurs keeps the count 0.
    """
    counts = {0: 0, 1: 0}
    for _, labels in ds:
        unique, counts_batch = np.unique(labels.numpy(), return_counts=True)
        for u, c in zip(unique, counts_batch):
            # np.unique returns NumPy scalars; cast the count as well as the
            # label so the result holds plain ints rather than np.int64.
            counts[int(u)] += int(c)
    return counts

# Report the per-class sample counts of both splits.
for split_label, split_ds in (("Train:", train_ds), ("Validation:", val_ds)):
    print(split_label, count_labels(split_ds))




In [None]:
# Held-out test images sit next to the train/val folder.
TEST_DIR = path_images_test = os.path.join(
    base_dir, "1_DatasetCharacteristics", "EruptionImages", "sorted images", "test"
)

# Same resolution and batch size as used for the training data.
IMG_SIZE, BATCH_SIZE = (224, 224), 32

# Load the test set with labels taken from the class sub-folders; the order is
# left unshuffled.
test_df = tf.keras.utils.image_dataset_from_directory(
    TEST_DIR,
    shuffle=False,
    batch_size=BATCH_SIZE,
    image_size=IMG_SIZE,
    label_mode='binary',
    labels='inferred',
)

print("Test dataset loaded")
print(f"Class names: {test_df.class_names}")

# Per-class sample counts of the test set. Reuse count_labels() (defined with
# the train/validation split) instead of repeating the tally loop, so all three
# splits are counted the same way and with integer label keys.
counts = count_labels(test_df)

print(f"Label counts: {counts}")



In [None]:
# # Load the dataset
# # Replace 'your_dataset.csv' with the path to your actual dataset
# df = pd.read_csv('your_dataset.csv')

# # Perform any feature engineering steps
# # Example: df['new_feature'] = df['feature1'] + df['feature2']

# # Feature and target variable selection
# X = df[['your', 'selected', 'features']]
# y = df['target_variable']

# # Split the dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Model

In [None]:
# Small CNN for binary classification: three conv/pool stages followed by a
# dense head with a single sigmoid output.
model = keras.Sequential([
    # Declare the input explicitly rather than passing input_shape= to the first
    # layer (deprecated in Keras 3), and derive it from IMG_SIZE so the model
    # always matches the resolution the datasets are loaded with.
    keras.Input(shape=IMG_SIZE + (3,)),
    layers.Rescaling(1./255),  # map 0-255 pixel values to 0-1
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D(),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # Binary output
])


In [None]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# tracked for both the training and the validation data.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 10 epochs; `history` keeps the per-epoch metrics for plotting.
history = model.fit(train_ds, epochs=10, validation_data=val_ds)

In [None]:
import matplotlib
matplotlib.use('Agg')  # Or 'TkAgg'
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit().
hist = history.history
acc, val_acc = hist['accuracy'], hist['val_accuracy']
loss, val_loss = hist['loss'], hist['val_loss']

epochs_range = range(len(acc))

# One row, two panels: accuracy on the left, loss on the right.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 6))

acc_ax.plot(epochs_range, acc, label='Training Accuracy')
acc_ax.plot(epochs_range, val_acc, label='Validation Accuracy')
acc_ax.legend()
acc_ax.set_title('Accuracy')

loss_ax.plot(epochs_range, loss, label='Training Loss')
loss_ax.plot(epochs_range, val_loss, label='Validation Loss')
loss_ax.legend()
loss_ax.set_title('Loss')

# The figure is written to disk rather than shown.
fig.savefig("training_plot.png")
print("✅ Saved: training_plot.png")


## Hyperparameter Tuning

[Discuss any hyperparameter tuning methods you've applied, such as Grid Search or Random Search, and the rationale behind them.]


In [None]:
# Implement hyperparameter tuning
# Example using GridSearchCV with a DecisionTreeClassifier
# param_grid = {'max_depth': [2, 4, 6, 8]}
# grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
# grid_search.fit(X_train, y_train)


## Implementation

[Implement the final model(s) you've selected based on the above steps.]


In [None]:
# Implement the final model(s)
# Example: model = YourChosenModel(best_hyperparameters)
# model.fit(X_train, y_train)


## Evaluation Metrics

[Clearly specify which metrics you'll use to evaluate the model performance, and why you've chosen these metrics.]


In [None]:
# Evaluate the model using your chosen metrics
# Example for classification
# y_pred = model.predict(X_test)
# print(classification_report(y_test, y_pred))

# Example for regression
# mse = mean_squared_error(y_test, y_pred)

# Your evaluation code here


## Comparative Analysis

[Compare the performance of your model(s) against the baseline model. Discuss any improvements or setbacks and the reasons behind them.]


In [None]:
# Comparative Analysis code (if applicable)
# Example: comparing accuracy of the baseline model and the new model
# print(f"Baseline Model Accuracy: {baseline_accuracy}, New Model Accuracy: {new_model_accuracy}")
