# Exploration et Prétraitement des Données

Ce notebook explore le jeu de données Plant Disease, visualise les classes et met en place l'augmentation des données.

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Global plotting style: seaborn's white grid theme plus a larger default
# figure size for every figure that does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

## 1. Structure du Dataset et Distribution des Classes

In [None]:
# Locations of the training and validation splits, given relative to the
# directory the notebook is run from. The code below lists each of these
# directories and treats every non-hidden entry as one class folder.
TRAIN_DIR = '../Dataset/train'
VALID_DIR = '../Dataset/valid'

def get_class_distribution(dir_path):
    """Count the entries available for each class folder under ``dir_path``.

    Every non-hidden sub-directory of ``dir_path`` is treated as one class.
    Hidden entries (names starting with ``.``, e.g. ``.DS_Store``) are ignored
    both at the class level and inside each class folder, so they never
    inflate the counts. Regular files lying directly in ``dir_path`` are
    skipped instead of being mistaken for classes.

    Args:
        dir_path: Path of the dataset split to inspect.

    Returns:
        A ``pandas.DataFrame`` with one row per class, sorted by class name,
        and two columns: ``'Class'`` (folder name) and ``'Count'`` (number of
        non-hidden entries in that folder). The frame is empty when
        ``dir_path`` contains no class folder.

    Raises:
        FileNotFoundError: If ``dir_path`` does not exist.
    """
    classes = sorted(
        entry
        for entry in os.listdir(dir_path)
        if not entry.startswith('.')
        and os.path.isdir(os.path.join(dir_path, entry))
    )
    counts = [
        sum(
            1
            for name in os.listdir(os.path.join(dir_path, cls))
            if not name.startswith('.')
        )
        for cls in classes
    ]
    return pd.DataFrame({'Class': classes, 'Count': counts})

# Per-class count tables for the two splits.
train_dist = get_class_distribution(TRAIN_DIR)
valid_dist = get_class_distribution(VALID_DIR)

# Each table has one row per class, so its row count is the class count.
print(f"Total classes Entraînement : {train_dist.shape[0]}")
print(f"Total classes Validation : {valid_dist.shape[0]}")

# Horizontal bar chart of the training counts; barplot returns the axes it
# drew on, so the title is set directly on that object.
plt.figure(figsize=(20, 10))
sns.barplot(x='Count', y='Class', data=train_dist, color='skyblue').set_title(
    'Distribution des classes (Entraînement)'
)
plt.show()

## 2. Visualiser des Images Exemples

In [None]:
def _load_random_rgb_image(class_dir):
    """Return a random readable image of ``class_dir`` as an RGB array, or None."""
    candidates = [f for f in os.listdir(class_dir) if not f.startswith('.')]
    # Try the files in random order so that a single unreadable or non-image
    # file does not abort the whole figure.
    for idx in np.random.permutation(len(candidates)):
        img = cv2.imread(os.path.join(class_dir, candidates[idx]))
        # cv2.imread reports failure by returning None rather than raising.
        if img is not None:
            # OpenCV loads images as BGR; matplotlib expects RGB.
            return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return None


def visualize_samples(dir_path, num_samples=5):
    """Show one random image from each of ``num_samples`` random classes.

    Classes are the non-hidden sub-directories of ``dir_path``. The images
    are drawn side by side in a single row; each subplot is titled with the
    part of the class name after the last ``'___'`` separator. A class in
    which no file can be decoded is shown as an empty, titled subplot.

    Args:
        dir_path: Dataset split directory containing one folder per class.
        num_samples: Number of distinct classes to display. Values larger
            than the number of available classes are reduced to that number.

    Raises:
        ValueError: If ``dir_path`` contains no class folder, or if
            ``num_samples`` is negative (raised by ``np.random.choice``).
    """
    classes = [
        c for c in os.listdir(dir_path)
        if not c.startswith('.') and os.path.isdir(os.path.join(dir_path, c))
    ]
    if not classes:
        raise ValueError(f"Aucune classe trouvée dans {dir_path!r}")

    # Sampling without replacement fails when more items are requested than
    # exist, so cap the request at the number of classes.
    num_samples = min(num_samples, len(classes))
    random_classes = np.random.choice(classes, num_samples, replace=False)

    plt.figure(figsize=(15, 5))
    for i, c in enumerate(random_classes):
        img = _load_random_rgb_image(os.path.join(dir_path, c))

        plt.subplot(1, num_samples, i + 1)
        if img is not None:
            plt.imshow(img)
        plt.title(c.split('___')[-1], fontsize=10)
        plt.axis('off')
    plt.suptitle("Images Exemples du Set d'Entraînement")
    plt.show()

# Preview the training split: one random image from each of five random classes.
visualize_samples(TRAIN_DIR, num_samples=5)

## 3. Configuration de l'Augmentation des Données
Nous utiliserons ImageDataGenerator pour augmenter les images à la volée.

In [None]:
# Generator that produces randomly transformed copies of the images it is fed.
# NOTE(review): the per-parameter notes below follow the Keras
# ImageDataGenerator documentation — confirm against the installed version.
datagen = ImageDataGenerator(
    rescale=1./255,           # multiply pixel values by 1/255
    rotation_range=20,        # random rotation range (degrees per the docs)
    width_shift_range=0.2,    # random horizontal shift (fraction of width per the docs)
    height_shift_range=0.2,   # random vertical shift (fraction of height per the docs)
    shear_range=0.2,          # random shear intensity
    zoom_range=0.2,           # random zoom range
    horizontal_flip=True,     # randomly mirror images left/right
    fill_mode='nearest'       # how pixels exposed by a transform are filled
)

# Pick one image to demonstrate the augmentation.
# os.listdir() returns entries in arbitrary order, so sort at both levels to
# make the choice reproducible, and skip hidden files such as .DS_Store so the
# selected entry is not a metadata file.
classes = sorted(c for c in os.listdir(TRAIN_DIR) if not c.startswith('.'))
sample_class = classes[0]
sample_img_name = sorted(
    f for f in os.listdir(os.path.join(TRAIN_DIR, sample_class))
    if not f.startswith('.')
)[0]
sample_img_path = os.path.join(TRAIN_DIR, sample_class, sample_img_name)

img = cv2.imread(sample_img_path)
# cv2.imread returns None (it does not raise) when a file cannot be decoded;
# fail here with the offending path instead of an opaque error in cvtColor.
if img is None:
    raise ValueError(f"Impossible de lire l'image : {sample_img_path}")
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR, matplotlib wants RGB
img = np.expand_dims(img, 0)  # add the batch dimension

NUM_AUGMENTED = 5  # number of augmented variants shown side by side

plt.figure(figsize=(15, 5))
print("Images Augmentées :")
# The flow() iterator keeps producing batches (the original loop needed an
# explicit break), so pair it with a finite range to stop after NUM_AUGMENTED.
for i, batch in zip(range(NUM_AUGMENTED), datagen.flow(img, batch_size=1)):
    plt.subplot(1, NUM_AUGMENTED, i + 1)
    plt.imshow(batch[0])
    plt.axis('off')
plt.show()