In [10]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from PIL import Image


In [11]:
# Configuration: every value a reader might want to tune lives in this cell.
TRAIN_LIMIT = 10000    # cap on training samples; None disables the cap
TEST_LIMIT = None      # cap on test samples; None disables the cap
BATCH_SIZE = 16
im_shape = (250, 250)  # spatial size images are resized to before training
seed = 10

# Seed both TensorFlow and NumPy so repeated runs draw the same random numbers.
tf.random.set_seed(seed)
np.random.seed(seed)


In [12]:
# Fetch the fruit-recognition dataset from the Hugging Face Hub
print("Loading dataset from Hugging Face Hub...")
dataset = load_dataset("ysif9/fruit-recognition")

# Report how many samples each split contains
print("Original dataset sizes:")
for split_name in ("train", "test"):
    print(f"  {split_name.capitalize()}: {len(dataset[split_name])} samples")


Loading dataset from Hugging Face Hub...
Original dataset sizes:
  Train: 25659 samples
  Test: 7070 samples


In [15]:
# Preprocessing function to handle images
def preprocess_image(example, im_shape=(250, 250)):
    """
    Preprocess one example: convert to RGB, resize, and normalize.
    Handles RGBA, grayscale, and palette images.

    Args:
        example: Mapping with an 'image' entry. Normally a PIL image; any
            other value is wrapped with ``Image.fromarray`` (assumes
            uint8-style pixel data -- TODO confirm if non-PIL inputs occur).
        im_shape: Target size as (height, width), i.e. the same order the
            notebook uses when building the model's input shape.

    Returns:
        The same ``example`` mapping with 'image' replaced by a float32 array
        of shape (height, width, 3) scaled to [0, 1].
    """
    # Kept as local imports so the function carries its own dependencies.
    from PIL import Image
    import numpy as np

    img = example['image']

    # Non-PIL input has no PIL-style resize/convert; wrap it first instead of
    # letting it fall through to an unrelated ``.resize`` method.
    if not isinstance(img, Image.Image):
        img = Image.fromarray(np.asarray(img))

    # Convert to RGB if needed (handles RGBA, P, L modes)
    if img.mode != 'RGB':
        img = img.convert('RGB')

    # PIL expects (width, height) while im_shape is (height, width); swap so
    # the resulting array is (height, width, 3) even for non-square sizes.
    height, width = im_shape
    img = img.resize((width, height))

    # Convert to numpy array and normalize to [0, 1]
    example['image'] = np.array(img, dtype=np.float32) / 255.0
    return example


In [None]:
# Optionally subsample each split, then run image preprocessing over it
print("Preprocessing datasets...")

# Apply limit to training data if specified.
# Shuffle (seeded) before truncating: taking the first N rows would give a
# class-biased subset if the dataset happens to be stored in class order
# (ordering not verified here).
train_dataset = dataset['train']
if TRAIN_LIMIT is not None and TRAIN_LIMIT < len(train_dataset):
    train_dataset = train_dataset.shuffle(seed=seed).select(range(TRAIN_LIMIT))
    print(f"Using {TRAIN_LIMIT} training samples")

# Apply limit to test data if specified (same seeded shuffle-then-truncate)
test_dataset = dataset['test']
if TEST_LIMIT is not None and TEST_LIMIT < len(test_dataset):
    test_dataset = test_dataset.shuffle(seed=seed).select(range(TEST_LIMIT))
    print(f"Using {TEST_LIMIT} test samples")

# Preprocess images. Pass the configured im_shape explicitly so changing it in
# the config cell actually changes the resize target (otherwise the function's
# own default would be used).
preprocess_kwargs = {'im_shape': im_shape}
train_dataset = train_dataset.map(preprocess_image, fn_kwargs=preprocess_kwargs, num_proc=4)
test_dataset = test_dataset.map(preprocess_image, fn_kwargs=preprocess_kwargs, num_proc=4)


Preprocessing datasets...
Using 10000 training samples


Map (num_proc=4):  40%|███▉      | 3985/10000 [00:24<01:30, 66.45 examples/s]  

In [None]:
# Get number of classes.
# Counting distinct labels in the (possibly truncated) training subset can
# undercount the label space, which would make the one-hot depth and the
# output layer too small. Prefer the class count declared on the label feature
# when it exposes one; otherwise size by the largest label id in either split.
label_feature = train_dataset.features['label']
num_classes = getattr(label_feature, 'num_classes', None)
if num_classes is None:
    # Assumes integer label ids starting at 0 -- TODO confirm for this dataset
    num_classes = max(max(train_dataset['label']), max(test_dataset['label'])) + 1
print(f"Number of classes: {num_classes}")


In [None]:
# Turn both Hugging Face splits into batched tf.data pipelines
print("Converting to TensorFlow datasets...")

# Arguments common to the train and test conversions
_tf_dataset_kwargs = dict(
    columns=['image'],
    label_cols=['label'],
    batch_size=BATCH_SIZE,
    collate_fn=None,
)

# Training data is shuffled; test data keeps its stored order
train_tf_dataset = train_dataset.to_tf_dataset(shuffle=True, **_tf_dataset_kwargs)
test_tf_dataset = test_dataset.to_tf_dataset(shuffle=False, **_tf_dataset_kwargs)


In [None]:
# Create validation split from training data (20%)
train_size = len(train_dataset)
val_size = int(0.2 * train_size)
train_size_adjusted = train_size - val_size

# Split at the example level BEFORE building the tf.data pipelines.
# Using take()/skip() on a pipeline created with shuffle=True is unsafe: the
# example order is re-drawn each time the pipeline is iterated, so the
# "validation" batches would be a different random subset on every pass and
# would overlap with examples the model trained on (data leakage).
split = train_dataset.train_test_split(test_size=val_size, seed=seed)

# Training part: shuffled each epoch. Validation part: fixed order, disjoint
# from the training part by construction.
train_tf_dataset_split = split['train'].to_tf_dataset(
    columns=['image'],
    label_cols=['label'],
    batch_size=BATCH_SIZE,
    shuffle=True
)
validation_tf_dataset = split['test'].to_tf_dataset(
    columns=['image'],
    label_cols=['label'],
    batch_size=BATCH_SIZE,
    shuffle=False
)

print(f"Training batches: ~{train_size_adjusted // BATCH_SIZE}")
print(f"Validation batches: ~{val_size // BATCH_SIZE}")
print(f"Test batches: ~{len(test_dataset) // BATCH_SIZE}")


In [None]:
# Label encoding helper applied to every (image, label) element
def one_hot_encode(image, label):
    """Pass `image` through unchanged and return `label` one-hot encoded with depth `num_classes`."""
    return image, tf.one_hot(indices=label, depth=num_classes)

# For each split: one-hot encode the labels, then prefetch for performance
train_tf_dataset_split = train_tf_dataset_split.map(one_hot_encode).prefetch(tf.data.AUTOTUNE)
validation_tf_dataset = validation_tf_dataset.map(one_hot_encode).prefetch(tf.data.AUTOTUNE)
test_tf_dataset = test_tf_dataset.map(one_hot_encode).prefetch(tf.data.AUTOTUNE)


In [None]:
# Assemble the CNN layer by layer: conv -> pool -> conv -> flatten -> dense head
model = Sequential()
model.add(Conv2D(20, kernel_size=(3, 3), activation='relu',
                 input_shape=(*im_shape[:2], 3)))  # RGB images of the configured size
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(40, kernel_size=(3, 3), activation='relu'))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))  # one probability per class

model.summary()

# Compile for one-hot targets: categorical cross-entropy, Adam with default settings
model.compile(
    optimizer=Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)


In [None]:
# Training
epochs = 1

# Write the model to disk only when validation loss improves
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath='models/model_improved.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)

# Stop training once validation loss has not improved for 10 epochs
early_stopping_cb = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=1
)

callbacks_list = [checkpoint_cb, early_stopping_cb]

history = model.fit(
    train_tf_dataset_split,
    validation_data=validation_tf_dataset,
    epochs=epochs,
    callbacks=callbacks_list,
    verbose=1
)


In [None]:
# Replace the in-memory model with the checkpoint file written during training
from tensorflow.keras.models import load_model

model = load_model(filepath='models/model_improved.h5')


In [None]:
# Score the reloaded model on the validation batches
print("Evaluating on validation set...")
val_loss, val_accuracy = model.evaluate(validation_tf_dataset)
print(f'Validation loss: {val_loss:.4f}\nValidation accuracy: {val_accuracy:.4f}')


In [None]:
# Score the reloaded model on the test batches
print("Evaluating on test set...")
test_loss, test_accuracy = model.evaluate(test_tf_dataset)
print(f'Test loss: {test_loss:.4f}\nTest accuracy: {test_accuracy:.4f}')


In [None]:
# Generate predictions for confusion matrix and classification report
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

print("Generating predictions...")
true_batches = []
pred_batches = []

# Walk the test pipeline once so each batch's labels and predictions stay
# aligned. predict_on_batch runs a single forward pass on the given batch;
# calling model.predict() per batch inside a Python loop sets up a fresh
# prediction loop on every call and is much slower.
for images, labels in test_tf_dataset:
    batch_probs = model.predict_on_batch(images)
    # Labels are one-hot here, so argmax recovers the integer class id
    true_batches.append(np.argmax(labels.numpy(), axis=1))
    pred_batches.append(np.argmax(batch_probs, axis=1))

# Concatenate per-batch results; fall back to empty integer arrays when the
# test pipeline yields no batches (np.concatenate rejects an empty list).
y_true = np.concatenate(true_batches) if true_batches else np.array([], dtype=int)
y_pred = np.concatenate(pred_batches) if pred_batches else np.array([], dtype=int)


In [None]:
# Get class names (assuming they're numeric labels 0, 1, 2, ...)
classes = [str(i) for i in range(num_classes)]

# Confusion matrix.
# Pass the full label list explicitly: without it the matrix only has rows and
# columns for classes that actually occur in y_true/y_pred, so its size can
# differ from len(classes) and the tick labels would no longer line up.
class_ids = list(range(num_classes))
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=classes, yticklabels=classes)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# Classification Report
# `labels` is given explicitly so the report always has one row per entry in
# `classes`; with target_names alone, scikit-learn raises a ValueError when the
# number of classes observed in y_true/y_pred differs from len(target_names).
print('Classification Report')
print(classification_report(
    y_true,
    y_pred,
    labels=list(range(num_classes)),
    target_names=classes
))
