In [None]:
from google.colab import drive
import os

# Attach Google Drive to the Colab VM so the project folder becomes readable.
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Project root on Drive and the two image folders that live beneath it.
base_path = '/content/drive/MyDrive/DS Project 3'
train_path, test_path = (
    os.path.join(base_path, 'training_images'),
    os.path.join(base_path, 'test_images'),
)

# Echo the resolved locations so a wrong Drive layout is easy to spot.
print("Training path: {}".format(train_path))
print("Testing path: {}".format(test_path))


In [None]:
import tensorflow as tf
from sklearn.model_selection import GroupKFold

# Settings shared by every dataset built in this notebook.
batch_size = 32          # number of images per batch
image_size = (125, 125)  # target size passed to tf.image.resize for every image
# NOTE(review): the original comment calls 125x125 "compatible with MobileNetV2";
# confirm against the resolutions the pretrained weights were published for.

def load_images_and_labels(folder_path):
    """Collect image paths, zero-based labels and group ids from one folder.

    Only entries whose name ends in ``.jpg`` (case-insensitive) are considered.
    The label is the integer between the last underscore and the first
    following dot of the file name; it must lie in 1..5 and is stored shifted
    to 0..4. The group id is the first two underscore-separated fields of the
    file name (e.g. "Tam_d07").

    Entries are visited in sorted name order so the returned lists are
    identical from run to run; ``os.listdir`` alone gives no ordering
    guarantee.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        Tuple ``(filenames, labels, groups)`` of three parallel lists: full
        file paths, integer labels in 0..4, and group id strings.

    Files with an unparsable or out-of-range label are skipped with a printed
    message rather than raising.
    """
    filenames = []
    labels = []
    groups = []

    for file in sorted(os.listdir(folder_path)):
        if not file.lower().endswith('.jpg'):  # Only process .jpg files
            continue

        try:
            # Last number before the extension, e.g. "..._3.jpg" -> 3
            label = int(file.split('_')[-1].split('.')[0])
        except ValueError:
            print(f"Skipping file due to label extraction issue: {file}")
            continue

        if not 1 <= label <= 5:  # Validate label is within bounds
            print(f"Skipping file with invalid label: {file}")
            continue

        filenames.append(os.path.join(folder_path, file))
        labels.append(label - 1)  # Adjust labels to range [0, 4]
        # Group by prefix (e.g., "Tam_d07")
        groups.append("_".join(file.split('_')[:2]))

    print(f"Loaded {len(filenames)} images and labels from {folder_path}")
    return filenames, labels, groups

# Scan the training folder; the group ids are kept for GroupKFold later on.
print("Loading training dataset...")
train_filenames, train_labels, train_groups = load_images_and_labels(train_path)

# Scan the test folder; only paths and labels are needed, so the group list
# (third element of the returned tuple) is sliced off.
print("Loading testing dataset...")
test_filenames, test_labels = load_images_and_labels(test_path)[:2]

def preprocess_image(file_path, label):
    """Read one JPEG from disk and turn it into a MobileNetV2-ready tensor.

    The file is decoded to 3 channels, resized to the module-level
    ``image_size`` and passed through MobileNetV2's own ``preprocess_input``,
    which rescales pixel values to [-1, 1]. The ImageNet weights used in this
    notebook were trained on that range, so a plain ``/ 255.0`` ([0, 1])
    would feed the frozen backbone inputs it was not trained on.

    Args:
        file_path: Path of the JPEG file (string tensor when used in
            ``Dataset.map``).
        label: Class label; passed through untouched.

    Returns:
        Tuple ``(image, label)`` where ``image`` is the preprocessed tensor.
    """
    image = tf.io.read_file(file_path)
    image = tf.image.decode_jpeg(image, channels=3)
    # Resize first: preprocess_input expects values still on the 0-255 scale.
    image = tf.image.resize(image, image_size)
    image = tf.keras.applications.mobilenet_v2.preprocess_input(image)
    return image, label

def create_dataset(filenames, labels, batch_size):
    """Build a batched, prefetched ``tf.data`` pipeline of (image, label) pairs.

    Args:
        filenames: Sequence of image file paths.
        labels: Sequence of labels, parallel to ``filenames``.
        batch_size: Number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` that maps every path through
        ``preprocess_image``, groups the results into batches and prefetches.
        No shuffling is applied here, so examples come out in input order.
    """
    path_label_pairs = tf.data.Dataset.from_tensor_slices((filenames, labels))
    decoded = path_label_pairs.map(
        preprocess_image,
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    batched = decoded.batch(batch_size)
    return batched.prefetch(tf.data.AUTOTUNE)


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.applications import MobileNetV2

import random  # used only for the seeded per-fold shuffle below

# 5-fold cross-validation that keeps every group (file-name prefix) entirely
# inside either the training or the validation split of a fold.
gkf = GroupKFold(n_splits=5)
fold_results = []  # validation accuracy of each fold, in fold order

for fold, (train_idx, val_idx) in enumerate(gkf.split(train_filenames, train_labels, train_groups)):
    print(f"Processing Fold {fold + 1}...")

    # create_dataset() preserves the order it is given, so shuffle the
    # training indices here; otherwise batches follow directory order.
    # Seeding with the fold number keeps the run repeatable.
    train_order = list(train_idx)
    random.Random(fold).shuffle(train_order)

    # Split training and validation sets for this fold
    train_files = [train_filenames[i] for i in train_order]
    train_fold_labels = [train_labels[i] for i in train_order]
    val_files = [train_filenames[i] for i in val_idx]
    val_fold_labels = [train_labels[i] for i in val_idx]

    # Create datasets
    train_dataset = create_dataset(train_files, train_fold_labels, batch_size)
    val_dataset = create_dataset(val_files, val_fold_labels, batch_size)

    # Build MobileNetV2 model. The input shape must match what the datasets
    # produce (image_size + 3 channels); a hard-coded, different size makes
    # Keras reject the batches at fit time.
    base_model = MobileNetV2(
        weights='imagenet',
        include_top=False,
        input_shape=(*image_size, 3),
    )

    # Freeze the pretrained backbone so only the new head is trained.
    base_model.trainable = False

    x = GlobalAveragePooling2D()(base_model.output)
    x = Dropout(0.3)(x)
    predictions = Dense(5, activation='softmax')(x)  # one output per label 0..4
    model = Model(inputs=base_model.input, outputs=predictions)

    # Labels are plain integers, hence the sparse categorical loss.
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    history = model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=3,  # Reduced epochs for faster cross-validation
        verbose=1
    )

    # Evaluate the model on the validation set
    val_loss, val_accuracy = model.evaluate(val_dataset, verbose=1)
    fold_results.append(val_accuracy)

print(f"Cross-validation accuracies: {fold_results}")


In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import ttest_1samp

# Prepare the test dataset
test_dataset = create_dataset(test_filenames, test_labels, batch_size)

# Evaluate the model on the testing dataset.
# NOTE: `model` is the network left over from the last cross-validation fold,
# i.e. it was trained on that fold's training split only.
print("Evaluating the model on the testing dataset...")
test_loss, test_accuracy = model.evaluate(test_dataset, verbose=1)

# Generate predictions in one pass over the dataset instead of calling
# model.predict() once per batch inside a Python loop.
class_probabilities = model.predict(test_dataset, verbose=0)
predictions = np.argmax(class_probabilities, axis=1)

# create_dataset() does not shuffle, so predictions line up with test_labels.
true_labels = np.asarray(test_labels)
assert len(predictions) == len(true_labels), "prediction/label count mismatch"

# Metrics for testing set
print("Classification Report:")
print(classification_report(true_labels, predictions))

print("Confusion Matrix:")
print(confusion_matrix(true_labels, predictions))

# One-sample t-test comparing cross-validation accuracies to random guessing
# (20% for 5 classes). The printed claim is directional ("better than"), so the
# test must be one-sided; the default two-sided test would also report
# "significant" for a model that is reliably *worse* than chance.
random_guess_accuracy = 0.2
t_stat, p_value = ttest_1samp(fold_results, random_guess_accuracy, alternative='greater')
print(f"T-Test Results: t-stat={t_stat:.4f}, p-value={p_value:.4f}")

if p_value < 0.05:
    print("The model's performance is significantly better than random guessing.")
else:
    print("The model's performance is NOT significantly better than random guessing.")
