In [7]:
# 1_Data_Preprocessing.ipynb

import os
import numpy as np
import cv2
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Dataset location and preprocessing constants
DATA_PATH = '../data/GTSRB/Train'  # root folder; the loader appends one sub-directory per class id
IMG_HEIGHT, IMG_WIDTH = 30, 30     # target size handed to cv2.resize for every image
NUM_CLASSES = 43                   # class ids iterated by the loader are 0 .. NUM_CLASSES - 1

# Load images and labels
# Each class is read from its own sub-directory of DATA_PATH, named after the
# numeric class id. Missing directories and unreadable files are reported and
# skipped so that a single bad entry does not abort the whole load.
images = []
labels = []

for class_id in range(NUM_CLASSES):
    class_path = os.path.join(DATA_PATH, str(class_id))

    # Check if directory exists for the current class
    if not os.path.isdir(class_path):
        print(f"Skipping class {class_id} - No such directory")
        continue

    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    # Sorting fixes the order of `images`/`labels`, so the seeded train/test
    # split built from them picks the same samples on every machine and run.
    for img_file in sorted(os.listdir(class_path)):
        img_path = os.path.join(class_path, img_file)

        # Try loading the image
        try:
            # cv2.imread reports an unreadable file by returning None, not by raising
            image = cv2.imread(img_path)
            if image is None:
                print(f"Skipping image: {img_path} (corrupted or not found)")
                continue
            # cv2.resize expects the target size as (width, height)
            image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))
            images.append(image)
            labels.append(class_id)
        except Exception as e:
            # Best-effort load: report the failure and carry on with the next file
            print(f"Error loading image {img_path}: {e}")

# Convert to numpy arrays
X = np.array(images)
y = np.array(labels)

print(f"Loaded {len(X)} images.")

# Check if data is loaded correctly
if len(X) == 0:
    raise ValueError("No images were loaded. Check your dataset path.")

# Normalize and encode labels
# Cast to float32 before scaling: dividing the integer pixel array by 255.0
# directly promotes it to float64, which doubles the memory footprint (and the
# size of any array saved from it) without adding useful precision.
X = X.astype(np.float32) / 255.0
y = to_categorical(y, NUM_CLASSES)

# Split dataset
# 80/20 hold-out split; random_state pins the shuffle so reruns give the same partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")


Loaded 39209 images.
Training samples: 31367, Testing samples: 7842


In [8]:
# Persist the four splits as .npy files so later runs can load them directly
split_files = (
    ('../data/X_train.npy', X_train),
    ('../data/X_test.npy', X_test),
    ('../data/y_train.npy', y_train),
    ('../data/y_test.npy', y_test),
)
for target_path, split_array in split_files:
    np.save(target_path, split_array)
print("All file saved")

All file saved
