In [None]:
1.0-data-exploration-and-preprocessing.ipynb
This notebook is all about understanding your data, getting it squeaky clean, and preparing it for the machine learning model. Think of it as laying a solid foundation before building your house.

Notebook Sections

Introduction & Setup

Briefly state the notebook's purpose: explore the dataset, preprocess images, and prepare data for training.
Import the necessary libraries: pandas, numpy, matplotlib.pyplot, seaborn, os, shutil (copyfile), tensorflow.keras.preprocessing.image (ImageDataGenerator), and sklearn.model_selection (train_test_split).
Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from shutil import copyfile

# Set up paths
# Raw images live under RAW_DATA_DIR (adjust if your dataset is in a different
# subfolder); each split gets its own folder below PROCESSED_DATA_DIR.
RAW_DATA_DIR = 'data/raw/dataset'
PROCESSED_DATA_DIR = 'data/processed'
TRAIN_DIR, VAL_DIR, TEST_DIR = (
    os.path.join(PROCESSED_DATA_DIR, split_folder)
    for split_folder in ('train', 'validation', 'test')
)
Data Loading & Initial Inspection

Load the raw data (images). Since TrashNet is usually organized in folders by class, you'll iterate through directories.
Get a list of all image paths and their corresponding labels.
Display sample images from each class to get a visual sense of the data.
Python
# Collect all image paths and labels
# Only sub-directories of RAW_DATA_DIR count as classes. A stray file there
# (e.g. .DS_Store or a README) must not become a class name, because later
# cells sample one image per class and create one output folder per class.
class_names = sorted(
    entry for entry in os.listdir(RAW_DATA_DIR)
    if os.path.isdir(os.path.join(RAW_DATA_DIR, entry))
)

# Extensions treated as images.
# NOTE(review): chosen to mirror the formats Keras' directory iterator is
# documented to accept -- confirm against the installed Keras version.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.ppm', '.tif', '.tiff')

image_paths = []
labels = []
for class_name in class_names:
    class_path = os.path.join(RAW_DATA_DIR, class_name)
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of df (and therefore any seeded split of it) reproducible.
    for img_name in sorted(os.listdir(class_path)):
        # Skip hidden files and anything that is not an image file.
        if img_name.startswith('.') or not img_name.lower().endswith(IMAGE_EXTENSIONS):
            continue
        image_paths.append(os.path.join(class_path, img_name))
        labels.append(class_name)

df = pd.DataFrame({'image_path': image_paths, 'label': labels})
print(f"Total images found: {len(df)}")
print("Class distribution:\n", df['label'].value_counts())

# Display a few sample images
# Only classes that actually have rows in df can be sampled: DataFrame.sample(1)
# raises ValueError on an empty selection (e.g. an empty class folder), which
# would abort the whole cell.
displayable_classes = [name for name in class_names if (df['label'] == name).any()]

plt.figure(figsize=(10, 8))
for i, class_name in enumerate(displayable_classes[:6]):  # Display up to 6 classes (2 x 3 grid)
    # Pick one random image of this class.
    sample_img_path = df[df['label'] == class_name].sample(1)['image_path'].iloc[0]
    img = plt.imread(sample_img_path)
    plt.subplot(2, 3, i + 1)
    plt.imshow(img)
    plt.title(class_name)
    plt.axis('off')
plt.tight_layout()
plt.show()
Exploratory Data Analysis (EDA)

Class Distribution: Visualize the number of images per class using a bar plot. Check for class imbalance.
Image Dimensions: Analyze the distribution of image heights and widths. This helps in deciding on a target size for resizing.
Identify potential issues: Are there corrupted images? Very small/large images?
Python
# Plot class distribution
# One bar per waste category; bar height is the number of images with that label.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` -- confirm against the installed seaborn version.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='label', palette='viridis', ax=ax)
ax.set_title('Distribution of Waste Categories')
ax.set_xlabel('Waste Category')
ax.set_ylabel('Number of Images')
ax.tick_params(axis='x', labelrotation=45)  # tilt the category names so they do not overlap
plt.show()

# (Optional) Analyze image dimensions - might be slow for very large datasets
# from PIL import Image
# img_widths = []
# img_heights = []
# for img_path in df['image_path']:
#     try:
#         with Image.open(img_path) as img:
#             width, height = img.size
#             img_widths.append(width)
#             img_heights.append(height)
#     except Exception as e:
#         print(f"Error opening image {img_path}: {e}")
# print(f"Average image width: {np.mean(img_widths):.0f}, height: {np.mean(img_heights):.0f}")
Data Splitting

Split the dataset into training, validation, and test sets. A common split is 70% train, 15% validation, 15% test.
Crucially, stratify the split to ensure each subset has a similar class distribution as the original dataset.
Python
# Split the data into train (70%), validation (15%), and test (15%)
# Stage 1 holds out 30% of the rows; stage 2 halves that hold-out.
# Both stages stratify on the label column and use the same fixed seed.
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df['label'],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,  # 0.5 of 30% is 15%
    stratify=temp_df['label'],
    random_state=42,
)

print(f"Train samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Test samples: {len(test_df)}")

# Verify distribution in splits (optional)
for heading, subset in (
    ("\nTrain class distribution:\n", train_df),
    ("\nValidation class distribution:\n", val_df),
    ("\nTest class distribution:\n", test_df),
):
    print(heading, subset['label'].value_counts(normalize=True))
Data Preprocessing & Augmentation

Create directories for processed data (data/processed/train, data/processed/validation, data/processed/test).
Copy images to their respective new directories. This makes it easy for ImageDataGenerator to pick them up.
Define ImageDataGenerator for training (with augmentation) and validation/testing (only rescaling).
Rescaling: Normalize pixel values (e.g., to [0, 1]).
Augmentation: Apply transformations like rotation, zoom, flips, shifts. This helps the model generalize better by seeing diverse versions of the same image.
Python
# Create target directories if they don't exist
# Layout: <split root>/<class name>/ for each of the train, validation and test roots.
for split_root in (TRAIN_DIR, VAL_DIR, TEST_DIR):
    for label_name in class_names:
        class_folder = os.path.join(split_root, label_name)
        os.makedirs(class_folder, exist_ok=True)

# Function to copy images to their respective processed folders
def copy_images_to_processed(dataframe, target_dir):
    """Copy every image listed in *dataframe* into ``target_dir/<label>/``.

    Args:
        dataframe: DataFrame with an ``image_path`` column (path of the source
            file) and a ``label`` column (name of the class sub-folder).
        target_dir: Root folder of the split the images belong to.

    Each file keeps its base name. The class sub-folder is created on demand,
    so the function also works when the folder tree was not prepared
    beforehand. An existing destination file with the same name is overwritten.

    Raises:
        OSError: If a source file cannot be read or a destination cannot be
            written (propagated from ``os.makedirs`` / ``shutil.copyfile``).
    """
    prepared_dirs = set()  # class folders already ensured during this call
    # Iterate the two columns directly; no per-row Series objects are needed.
    for src_path, label in zip(dataframe['image_path'], dataframe['label']):
        class_dir = os.path.join(target_dir, label)
        if class_dir not in prepared_dirs:
            os.makedirs(class_dir, exist_ok=True)
            prepared_dirs.add(class_dir)
        dest_path = os.path.join(class_dir, os.path.basename(src_path))
        copyfile(src_path, dest_path)

# Copy each split into its own processed folder, announcing every step.
for banner, subset, destination in (
    ("Copying train images...", train_df, TRAIN_DIR),
    ("Copying validation images...", val_df, VAL_DIR),
    ("Copying test images...", test_df, TEST_DIR),
):
    print(banner)
    copy_images_to_processed(subset, destination)

# Define image size for the model
IMG_SIZE = (224, 224)  # Common size for pre-trained CNNs
BATCH_SIZE = 32

# Data Generators
# Random transformations applied to training images only.
augmentation_options = {
    'rotation_range': 20,        # Random rotation
    'width_shift_range': 0.2,    # Random horizontal shift
    'height_shift_range': 0.2,   # Random vertical shift
    'shear_range': 0.2,          # Shear transformations
    'zoom_range': 0.2,           # Random zoom
    'horizontal_flip': True,     # Random horizontal flips
    'fill_mode': 'nearest',      # Strategy for filling in new pixels created by transformations
}

# Training generator: rescale pixel values to [0, 1] and augment.
train_datagen = ImageDataGenerator(rescale=1./255, **augmentation_options)

# Validation and test generators should only rescale
val_test_datagen = ImageDataGenerator(rescale=1./255)

# Flow images from directories
# Settings shared by all three splits: same target size, batch size and
# categorical class mode.
shared_flow_options = {
    'target_size': IMG_SIZE,
    'batch_size': BATCH_SIZE,
    'class_mode': 'categorical',
}

# Training batches are shuffled.
train_generator = train_datagen.flow_from_directory(
    TRAIN_DIR, shuffle=True, **shared_flow_options
)

# No need to shuffle validation data
validation_generator = val_test_datagen.flow_from_directory(
    VAL_DIR, shuffle=False, **shared_flow_options
)

# No need to shuffle test data
test_generator = val_test_datagen.flow_from_directory(
    TEST_DIR, shuffle=False, **shared_flow_options
)

# Report what the training generator discovered.
print("\nData generators created successfully!")
print(f"Number of classes: {train_generator.num_classes}")
print(f"Class indices: {train_generator.class_indices}")

# Save class names mapping for later use (e.g., in predict.py)
# One row per class: its name and the integer index the generator assigned to it.
class_index_map = train_generator.class_indices
class_indices_df = pd.DataFrame({
    'class_name': list(class_index_map.keys()),
    'index': list(class_index_map.values()),
})
class_indices_csv = os.path.join(PROCESSED_DATA_DIR, 'class_indices.csv')
class_indices_df.to_csv(class_indices_csv, index=False)
Conclusion

Summarize the data preparation steps and the readiness of the data for model training.