In [8]:
import os
import shutil
import random
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore') 



In [9]:
# Function to split the dataset
def split_dataset(original_dir, base_dir, train_size=0.7, val_size=0.15, test_size=0.15,
                  classes=None, seed=42):
    """Copy a class-per-folder dataset into ``train``/``val``/``test`` sub-folders.

    For every class folder under ``original_dir`` the regular files are listed,
    shuffled, and copied into ``base_dir/<split>/<class>/``. Source files are
    never moved or deleted.

    Args:
        original_dir: Directory containing one sub-folder per class.
        base_dir: Directory under which ``train``, ``val`` and ``test`` are created.
        train_size: Fraction of each class copied to ``train``.
        val_size: Fraction of each class copied to ``val``.
        test_size: Fraction of each class copied to ``test``. When the three
            fractions add up to 1 (or more) the test split is everything left
            after train and val; when they add up to less than 1 the surplus
            files are not copied anywhere.
        classes: Iterable of class folder names. Defaults to '0'-'9' plus 'a'-'z'.
        seed: Seed for the shuffle. With a fixed seed, re-running the function on
            the same source folder reproduces the same partition, so a second
            run overwrites the same files instead of scattering an image across
            several splits. Pass ``None`` for a different split on every call.

    Raises:
        ValueError: If any of the three fractions is negative.
        FileNotFoundError: If a class folder is missing from ``original_dir``
            (raised by ``os.listdir``).
    """
    if classes is None:
        classes = [str(i) for i in range(10)] + [chr(i) for i in range(97, 123)]  # '0-9' + 'a-z'
    else:
        classes = list(classes)

    if min(train_size, val_size, test_size) < 0:
        raise ValueError("train_size, val_size and test_size must be non-negative")
    total_fraction = train_size + val_size + test_size

    # Create every split/class directory up front. exist_ok=True also repairs a
    # base_dir in which the split folder exists but its class folders do not.
    split_names = ('train', 'val', 'test')
    for split in split_names:
        for class_name in classes:
            os.makedirs(os.path.join(base_dir, split, class_name), exist_ok=True)

    # A private generator keeps the split reproducible without touching the
    # global `random` state used elsewhere.
    rng = random.Random(seed)

    for class_name in classes:
        class_dir = os.path.join(original_dir, class_name)
        # Sort first: os.listdir order is arbitrary, and a seeded shuffle is only
        # reproducible when it starts from a fixed order. Skip sub-directories,
        # which shutil.copy cannot handle.
        images = sorted(
            name for name in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, name))
        )
        rng.shuffle(images)

        count = len(images)
        train_idx = int(count * train_size)
        val_idx = int(count * (train_size + val_size))
        # The small tolerance absorbs float rounding in sums such as 0.7 + 0.15 + 0.15.
        if total_fraction >= 1 - 1e-9:
            test_idx = count
        else:
            test_idx = int(count * total_fraction)

        partition = {
            'train': images[:train_idx],
            'val': images[train_idx:val_idx],
            'test': images[val_idx:test_idx],
        }

        # Copy the images to respective directories
        for split, names in partition.items():
            for image in names:
                shutil.copy(os.path.join(class_dir, image),
                            os.path.join(base_dir, split, class_name, image))

# Dataset locations
# Source dataset: one folder per class ('0'-'9' and 'a'-'z').
original_data_dir = 'C:/Users/sab00/OneDrive/Documents/AB/ML/project/data'
# Destination root: the train/val/test folders are created underneath it.
base_data_dir = 'C:/Users/sab00/OneDrive/Documents/AB/ML/project/data_split'

# Materialise the split on disk using the function's default fractions (70% / 15% / 15%).
split_dataset(original_dir=original_data_dir, base_dir=base_data_dir)

# Per-split directories consumed by the data generators.
train_dir, val_dir, test_dir = (
    os.path.join(base_data_dir, split_name)
    for split_name in ('train', 'val', 'test')
)

# Image data generators


In [14]:
# Shared input settings for the generators and the network's Input layer.
IMAGE_SIZE = (64, 64)
BATCH_SIZE = 32
NUM_CLASSES = 36  # 36 classes (0-9 and a-z)

# Generators only rescale pixel values to [0, 1]; no augmentation at this stage.
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
    train_dir, target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, class_mode='categorical')
# shuffle=False is required for evaluation generators: flow_from_directory shuffles
# by default, and the reports built later compare model.predict(val_generator)
# row-by-row against val_generator.classes, which is in directory order.
# Shuffling does not change the validation metrics computed during fit/evaluate.
val_generator = val_datagen.flow_from_directory(
    val_dir, target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, class_mode='categorical', shuffle=False)
test_generator = test_datagen.flow_from_directory(
    test_dir, target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, class_mode='categorical', shuffle=False)

# CNN model with Input layer to remove the warning
model = Sequential([
    Input(shape=IMAGE_SIZE + (3,)),  # Use Input layer for the input shape (64x64, 3 channels)

    # Three conv/pool stages with a growing number of filters.
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    # Classifier head.
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(NUM_CLASSES, activation='softmax')  # one probability per class
])

model.summary()
# categorical_crossentropy matches the one-hot labels produced by class_mode='categorical'.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
def predict_on_validation_set(val_generator):
    """Run the module-level ``model`` over ``val_generator`` and return its predictions.

    The result is whatever ``model.predict`` returns for the generator; callers
    in this notebook take ``argmax`` over axis 1 to obtain class indices.
    """
    return model.predict(val_generator)


Found 1760 images belonging to 36 classes.
Found 360 images belonging to 36 classes.
Found 395 images belonging to 36 classes.


In [15]:
# Train for 15 epochs, validating on the held-out validation split after each one.
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=15,
)

# Final loss / accuracy on the validation split.
val_loss, val_acc = model.evaluate(val_generator)
print(f"Validation Accuracy: {val_acc}")

# Per-class diagnostics on the validation set.
# NOTE(review): the comparisons below pair predictions with val_generator.classes
# row by row, which is only meaningful if val_generator yields samples in that
# same order (i.e. it was created with shuffle=False) - confirm.
target_names = [str(digit) for digit in range(10)] + [chr(code) for code in range(ord('a'), ord('z') + 1)]  # 36 classes (0-9, a-z)
Y_val_pred = predict_on_validation_set(val_generator)  # class scores per sample
y_val_pred = tf.argmax(Y_val_pred, axis=1)  # index of the highest-scoring class

print('Confusion Matrix (Validation Set)')
print(confusion_matrix(val_generator.classes, y_val_pred))

print('Classification Report (Validation Set)')
print(classification_report(val_generator.classes, y_val_pred, target_names=target_names))

Epoch 1/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 116ms/step - accuracy: 0.0844 - loss: 3.3731 - val_accuracy: 0.6917 - val_loss: 1.2495
Epoch 2/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 95ms/step - accuracy: 0.5980 - loss: 1.3874 - val_accuracy: 0.8222 - val_loss: 0.5622
Epoch 3/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 103ms/step - accuracy: 0.7522 - loss: 0.7759 - val_accuracy: 0.8972 - val_loss: 0.3295
Epoch 4/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 105ms/step - accuracy: 0.8239 - loss: 0.5318 - val_accuracy: 0.9111 - val_loss: 0.2657
Epoch 5/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 101ms/step - accuracy: 0.8512 - loss: 0.4246 - val_accuracy: 0.8944 - val_loss: 0.2728
Epoch 6/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 95ms/step - accuracy: 0.8807 - loss: 0.3511 - val_accuracy: 0.9278 - val_loss: 0.1683
Epoch 7/15
[1m55/55[0m [32m

In [17]:
# Rebuild the training pipeline with on-the-fly augmentation
# (rotation, shifts, shear, zoom and horizontal flips) on top of rescaling.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
)
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(64, 64),
    batch_size=32,
    class_mode='categorical',
)

# NOTE(review): `model` is not rebuilt in this cell, so this fit call continues
# from whatever weights the model currently holds - confirm that is intended.
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=15,
)

# Final loss / accuracy on the (un-augmented) validation split.
val_loss, val_acc = model.evaluate(val_generator)
print(f"Validation Accuracy: {val_acc}")

# Per-class diagnostics on the validation set.
# NOTE(review): predictions are paired with val_generator.classes row by row,
# which requires val_generator to yield samples in that order - confirm.
target_names = [str(digit) for digit in range(10)] + [chr(code) for code in range(ord('a'), ord('z') + 1)]  # 36 classes (0-9, a-z)
Y_val_pred = predict_on_validation_set(val_generator)  # class scores per sample
y_val_pred = tf.argmax(Y_val_pred, axis=1)  # index of the highest-scoring class

print('Confusion Matrix (Validation Set)')
print(confusion_matrix(val_generator.classes, y_val_pred))

print('Classification Report (Validation Set)')
print(classification_report(val_generator.classes, y_val_pred, target_names=target_names))

Found 1760 images belonging to 36 classes.
Epoch 1/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 136ms/step - accuracy: 0.0597 - loss: 5.2282 - val_accuracy: 0.1917 - val_loss: 3.2264
Epoch 2/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 156ms/step - accuracy: 0.0734 - loss: 3.3886 - val_accuracy: 0.3139 - val_loss: 2.2895
Epoch 3/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 138ms/step - accuracy: 0.1106 - loss: 3.0933 - val_accuracy: 0.3639 - val_loss: 2.0837
Epoch 4/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 131ms/step - accuracy: 0.1351 - loss: 2.8813 - val_accuracy: 0.4222 - val_loss: 1.9108
Epoch 5/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 146ms/step - accuracy: 0.1845 - loss: 2.7563 - val_accuracy: 0.4056 - val_loss: 1.7692
Epoch 6/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 139ms/step - accuracy: 0.1773 - loss: 2.6500 - val_accuracy: 0.4778 - va