In [None]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from PIL import Image
import hashlib
import os
import cv2

In [None]:
# Root directory of the competition dataset; later cells join file and folder
# names (label map, train.csv, train_images) onto it.
BASE_DIR = "/kaggle/input/cassava-leaf-disease-classification"

In [None]:
# Sanity check: fail fast with a clear message when the competition dataset is
# not attached. The path is derived from BASE_DIR (instead of being hard-coded a
# second time) and the file is only probed, not opened, since nothing is read.
label_map_path = os.path.join(BASE_DIR, "label_num_to_disease_map.json")
if not os.path.isfile(label_map_path):
    raise FileNotFoundError(
        f"Label map not found at {label_map_path}; is the dataset attached?"
    )
print("yes")

In [None]:
# Step 1: Load and inspect label map (mapping from numerical labels to disease names)
with open(os.path.join(BASE_DIR, "label_num_to_disease_map.json")) as file:
    # JSON object keys are always strings; convert them to int so the mapping
    # can be applied to the `label` column of train.csv later on.
    map_classes = {int(k): v for k, v in json.load(file).items()}

# Display the mapping
print("Class Mapping: ")
print(json.dumps(map_classes, indent=4))

# Peek at the train_images folder. Only the first few entries are shown:
# displaying the full listing would flood the cell output with every file name.
os.listdir(os.path.join(BASE_DIR, "train_images"))[:10]

In [None]:
# Step 2: Load training image filenames and display the count
# os.listdir yields the entry names found directly inside train_images; the
# list is kept in `input_files` and only its length is reported here.
input_files = os.listdir(os.path.join(BASE_DIR, "train_images"))
print(f"Number of train images: {len(input_files)}")

In [None]:
# Step 3: Load train.csv and add a human-readable class name based on the mapping
df_train = pd.read_csv(os.path.join(BASE_DIR, "train.csv"))
print(df_train.head())

# Map the numerical label to the actual disease name
df_train["class_name"] = df_train["label"].map(map_classes)

# Series.map leaves NaN wherever a label has no entry in the mapping. Catch that
# here instead of letting NaN class names leak into the plots and generators.
unmapped_labels = df_train.loc[df_train["class_name"].isna(), "label"].unique()
assert len(unmapped_labels) == 0, (
    f"Labels missing from label map: {unmapped_labels.tolist()}"
)

# Display the dataframe with the new column
df_train

In [None]:
# Number of rows in df_train for each disease name.
df_train['class_name'].value_counts()

In [None]:
# Step 4: Check class distribution
class_distribution = df_train['class_name'].value_counts()

# Draw the per-class counts as a bar chart on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 6))
class_distribution.plot(kind='bar', ax=ax)
ax.set_title('Class Distribution of Cassava Leaf Disease')
ax.set_ylabel('Number of Images')
ax.set_xlabel('Disease Class')
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the class names
plt.show()


In [None]:
# Same distribution drawn with seaborn: one horizontal bar per class name
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df_train, y="class_name", ax=ax)
ax.set_title('Class Distribution (Seaborn)')
plt.show()

In [None]:
print('Dataset info :')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df_train.info()

In [None]:
# Print the summary statistics pandas computes for df_train via describe().
print('\nDataset summary statistics :')
print(df_train.describe())


In [None]:
# Per-column count of missing entries in df_train
print('Missing values in each column :')
print(df_train.isna().sum())

In [None]:
# Count the rows flagged by DataFrame.duplicated() called with its default arguments.
print('No of duplicated rows:')
print(df_train.duplicated().sum())

In [None]:
# Step 7: Analyze image shapes (size dimensions) for a sample of images
# Single source of truth for the sample size, so the loop and the report agree.
SHAPE_SAMPLE_SIZE = 500

# Dictionary to store image shapes and their counts
img_shapes = {}
unreadable_images = []
train_images_dir = os.path.join(BASE_DIR, "train_images")
for image_name in os.listdir(train_images_dir)[:SHAPE_SAMPLE_SIZE]:
    image = cv2.imread(os.path.join(train_images_dir, image_name))
    if image is None:
        # cv2.imread returns None instead of raising when a file cannot be
        # decoded; record it and move on rather than crashing on `.shape`.
        unreadable_images.append(image_name)
        continue
    img_shapes[image.shape] = img_shapes.get(image.shape, 0) + 1

# Display image shapes, reporting the number of files actually inspected
n_inspected = sum(img_shapes.values()) + len(unreadable_images)
print(f"\nSample Image Shapes and their Frequencies (from {n_inspected} images):")
print(img_shapes)
if unreadable_images:
    print(f"Skipped {len(unreadable_images)} unreadable images: {unreadable_images}")

In [None]:
# Show the first rows of df_train as the cell output.
df_train.head()

In [None]:
import matplotlib.pyplot as plt
import os
from PIL import Image

# Step 8: Function to plot sample images from a specific class
def plot_images_from_class(class_id, num_images=9, random_state=None):
    """
    Plot randomly sampled images of one class in a grid with 3 columns.

    The number of grid rows grows with ``num_images``, so requesting more than
    9 images no longer overflows a fixed 3x3 layout.

    Parameters:
        class_id (int): The class label to filter images.
        num_images (int): Maximum number of images to plot; capped at the
            number of images available for the class.
        random_state (int | None): Optional seed forwarded to
            ``DataFrame.sample`` to make the selection reproducible.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Filter images for the specified class
    class_images = df_train[df_train['label'] == class_id]
    num_images = min(len(class_images), num_images)  # never ask for more than exist

    if num_images <= 0:
        # Nothing to draw (unknown class or non-positive request).
        print(f"No images to display for class {class_id}.")
        return

    images = class_images.sample(num_images, random_state=random_state)
    # Fall back to the raw id if the label map has no name for this class.
    class_title = map_classes.get(class_id, str(class_id))

    n_cols = 3
    n_rows = (num_images + n_cols - 1) // n_cols  # ceiling division
    plt.figure(figsize=(15, 5 * n_rows))  # 15x15 for the default 9 images

    for i, (_, row) in enumerate(images.iterrows()):
        img_path = os.path.join(BASE_DIR, "train_images", row['image_id'])
        plt.subplot(n_rows, n_cols, i + 1)
        # The context manager closes the file handle once the pixels have
        # been handed to matplotlib.
        with Image.open(img_path) as img:
            plt.imshow(img)
        plt.title(class_title)  # Use class name for the title
        plt.axis('off')  # Hide axis for better visualization

    plt.tight_layout()  # Adjust layout to prevent overlap
    plt.show()

In [None]:
# Show a grid of sample images for class label 0 (default number of images).
plot_images_from_class(0)

In [None]:
# Iterate over the labels actually present in the label map instead of a
# hard-coded range(5), so the loop stays correct if the set of classes changes.
for class_id in sorted(map_classes):
    print(f"Displaying sample images for class: {map_classes[class_id]}")
    plot_images_from_class(class_id)

In [None]:
def _image_shape(image_id):
    """Return the array shape OpenCV reports for one training image."""
    return cv2.imread(os.path.join(BASE_DIR, "train_images", image_id)).shape


# Record the shape of every training image next to its label
df_train['image_shape'] = df_train['image_id'].map(_image_shape)

# Rows: class names, columns: distinct shapes, cells: number of images
shape_class_dist = (
    df_train
    .groupby(['class_name', 'image_shape'])
    .size()
    .unstack(fill_value=0)
)

# One stacked bar per shape, split by class
ax = shape_class_dist.T.plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_title('Image Size Distribution by Class')
ax.set_xlabel('Image Shape')
ax.set_ylabel('Number of Images')
ax.legend(title='Class Name', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# NOTE(review): this stores the same label -> name mapping that the train.csv
# loading cell already stores in `class_name`; no cell visible in this notebook
# reads `label_names` — confirm whether the column is still needed.
df_train['label_names'] = df_train['label'].map(map_classes)

In [None]:
def get_image_hash(image_path):
    """Return the hexadecimal MD5 digest of the raw bytes stored at image_path."""
    digest = hashlib.md5()
    with open(image_path, "rb") as stream:
        # Feed the file to the hasher piece by piece; the digest is the same
        # as hashing the whole content at once.
        for chunk in iter(lambda: stream.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Dictionary mapping each content hash to the first file name seen with it
image_hashes = {}

# (duplicate_name, first_seen_name) pairs for byte-identical files
duplicate_images = []

# Note: BASE_DIR must be defined before running this loop
train_images_dir = os.path.join(BASE_DIR, "train_images")

# os.listdir returns names in arbitrary order; sorting makes it deterministic
# which file of a duplicate group is treated as the "original".
for image_name in sorted(os.listdir(train_images_dir)):
    image_hash = get_image_hash(os.path.join(train_images_dir, image_name))

    if image_hash in image_hashes:
        # Same content as an earlier file: report it against that first file.
        duplicate_images.append((image_name, image_hashes[image_hash]))
    else:
        image_hashes[image_hash] = image_name  # first time this content is seen

print(f"Found {len(duplicate_images)} exact duplicate images.")
for dup in duplicate_images:
    print(f"Duplicate pair: {dup[0]} and {dup[1]}")

In [None]:
# Number of distinct content hashes collected by the duplicate scan
# (one dictionary entry per unique file content).
len(image_hashes)

In [None]:
import tensorflow as tf
from tensorflow.keras import models, layers

# Baseline CNN: four Conv -> BatchNorm -> MaxPool blocks followed by a dense head.
model = models.Sequential()

# Declare the input with an explicit Input layer (same convention as
# create_cnn_model further down) instead of passing input_shape to Conv2D.
model.add(layers.Input(shape=(224, 224, 3)))

# 1st Convolutional block
model.add(layers.Conv2D(32, (3, 3), activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D((2, 2)))

# 2nd Convolutional block
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D((2, 2)))

# 3rd Convolutional block
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D((2, 2)))

# 4th Convolutional block
model.add(layers.Conv2D(256, (3, 3), activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D((2, 2)))

# Flatten the output to feed it into fully connected layers
model.add(layers.Flatten())

# Dense layer with 512 units
model.add(layers.Dense(512, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.5))

# Output layer for classification (assuming 5 classes)
model.add(layers.Dense(5, activation='softmax'))

# Compile the model
# categorical_crossentropy expects one-hot labels, which is what the
# flow_from_dataframe generators with class_mode="categorical" produce.
# (Integer labels would need sparse_categorical_crossentropy instead.)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Model summary
model.summary()

In [None]:
# Deprecated techniques

In [None]:
# Directory holding the training images, derived from BASE_DIR so the dataset
# location is defined in one place only (same value as the previous literal).
images_dir = os.path.join(BASE_DIR, "train_images")

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# 2. Create an ImageDataGenerator to preprocess images
# normalizing pixel values (1./255) and setting a validation split of 20%
datagen = ImageDataGenerator(rescale=1./255,
                             validation_split=0.2)

# Arguments shared by the training and validation generators, kept in one
# place so the two cannot drift apart.
flow_kwargs = dict(
    dataframe=df_train,
    directory=images_dir,
    x_col="image_id",        # Column containing file names
    y_col="class_name",      # Column containing labels (must match map_classes values)
    target_size=(224, 224),  # Resize images to match model input
    batch_size=32,
    class_mode="categorical",
    seed=42,                 # reproducible shuffling order
)

# 3. Generate Training Dataset
train_dataset = datagen.flow_from_dataframe(
    subset="training",
    shuffle=True,
    **flow_kwargs
)

# 4. Generate Validation Dataset
# Not shuffled: the metrics do not depend on order, and a fixed order keeps
# predictions aligned with val_dataset.classes / val_dataset.filenames.
val_dataset = datagen.flow_from_dataframe(
    subset="validation",
    shuffle=False,
    **flow_kwargs
)

In [None]:
# 5. Train the model for 10 epochs
# Fits whatever `model` currently refers to (under top-to-bottom execution, the
# Sequential model assembled with model.add(...) above) on the generator-based
# datasets. The object returned by fit() is kept in `history`.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10
)

In [None]:
#Using tensorflow dataset

In [None]:
# Turn the bare file names in 'image_id' into full paths inside train_images.
# os.path.basename() first strips any directory that is already present, so
# re-running this cell is safe: the prefix is never stacked twice.
train_images_dir = os.path.join(BASE_DIR, "train_images")
df_train['image_id'] = df_train['image_id'].map(
    lambda name: os.path.join(train_images_dir, os.path.basename(name))
)

# Display the first few rows to verify the updated paths
df_train.head()

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of the labelled rows held out for validation
VALIDATION_SPLIT = 0.2  # 20% for validation

# Stratify on the integer label so each disease class keeps the same share
# in both partitions; the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(
    df_train,
    random_state=42,
    stratify=df_train['label'],
    test_size=VALIDATION_SPLIT,
)

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")

In [None]:
# Print the per-label row counts of both partitions so the split can be
# checked by eye against the stratification requested in train_test_split.
print("Training class distribution:")
print(train_df['label'].value_counts())

print("\nValidation class distribution:")
print(val_df['label'].value_counts())

In [None]:
# 1. Configuration Constants
IMG_HEIGHT = 224   # target height used by process_image's resize and the model input shape
IMG_WIDTH = 224    # target width used by process_image's resize and the model input shape
CHANNELS = 3       # channel count requested from decode_jpeg and used in the model input shape
BATCH_SIZE = 32    # batch size applied to both train_ds and val_ds
BUFFER_SIZE = 1000 # buffer size passed to train_ds.shuffle()
AUTOTUNE = tf.data.AUTOTUNE  # passed as num_parallel_calls / prefetch argument in the tf.data pipeline

In [None]:
# 2. Define the Image Processing Function
def process_image(file_path, label):
    """
    Load one image file and turn it into a resized tensor scaled by 1/255.

    The file is read, decoded as JPEG with CHANNELS channels, resized to
    IMG_HEIGHT x IMG_WIDTH and divided by 255.0. The label is passed through
    untouched and returned next to the image.
    """
    raw_bytes = tf.io.read_file(file_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=CHANNELS)
    resized = tf.image.resize(decoded, [IMG_HEIGHT, IMG_WIDTH])
    return resized / 255.0, label

# 3. Create Training Dataset

In [None]:
# 3. Build the training pipeline: (path, label) pairs taken from train_df,
# each path loaded and preprocessed by process_image.
# Note: Assumes 'train_df' is already defined and has 'image_id' (full paths) and 'label' columns
train_ds = (
    tf.data.Dataset
    .from_tensor_slices((train_df['image_id'].values, train_df['label'].values))
    .map(process_image, num_parallel_calls=AUTOTUNE)
)

In [None]:
# Data Augmentation (optional but recommended)
data_augmentation = tf.keras.Sequential([
    layers.RandomFlip('horizontal'),
    layers.RandomRotation(0.1),
    layers.RandomZoom(0.1),
    layers.RandomContrast(0.1),
])

def augment(image, label):
    """
    Apply the random augmentation stack to one training image.

    Parameters:
        image: image tensor as produced by process_image (values in [0, 1]).
        label: label passed through unchanged.

    Returns:
        (augmented_image, label), with the image clipped back to [0, 1].
    """
    # The random layers are called outside model.fit(), so training=True is
    # passed explicitly to make sure the random transforms are active here.
    image = data_augmentation(image, training=True)
    # RandomContrast can push values slightly outside the normalized range;
    # clip so training inputs keep the same [0, 1] range as validation inputs.
    image = tf.clip_by_value(image, 0.0, 1.0)
    return image, label

# Apply augmentation to the training dataset
# Note: 'train_ds' must be defined from the previous step before running this
train_ds = train_ds.map(augment, num_parallel_calls=AUTOTUNE)

# Shuffle, batch, and prefetch the dataset for optimal performance
# Note: 'BUFFER_SIZE', 'BATCH_SIZE', and 'AUTOTUNE' should be defined previously
train_ds = train_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(AUTOTUNE)

In [None]:
# Validation pipeline: same loading/preprocessing as training, but without
# augmentation and without shuffling.
# Note: Assumes 'val_df', 'process_image', 'BATCH_SIZE' and 'AUTOTUNE' are already defined
val_ds = (
    tf.data.Dataset
    .from_tensor_slices((val_df['image_id'].values, val_df['label'].values))
    .map(process_image, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

In [None]:
from tensorflow.keras import regularizers

num_classes = 5

# Define the CNN model architecture
def create_cnn_model(input_shape=(IMG_HEIGHT, IMG_WIDTH, CHANNELS), num_classes=5):
    """
    Build (without compiling) the CNN classifier.

    Architecture: an Input layer, four Conv2D(3x3, relu, 'same') ->
    BatchNormalization -> MaxPooling2D(2x2) blocks with 32, 64, 128 and 256
    filters, then Flatten -> Dense(512, relu) -> BatchNormalization ->
    Dropout(0.5) -> Dense(num_classes, softmax).

    Parameters:
        input_shape: shape passed to the Input layer.
        num_classes: number of units in the softmax output layer.

    Returns:
        The assembled Sequential model.
    """
    stack = [layers.Input(shape=input_shape)]

    # Convolutional feature extractor: one block per filter count
    for n_filters in (32, 64, 128, 256):
        stack.append(layers.Conv2D(n_filters, (3, 3), activation='relu', padding='same'))
        stack.append(layers.BatchNormalization())
        stack.append(layers.MaxPooling2D((2, 2)))

    # Classifier head
    stack.append(layers.Flatten())
    stack.append(layers.Dense(512, activation='relu'))
    stack.append(layers.BatchNormalization())
    stack.append(layers.Dropout(0.5))
    stack.append(layers.Dense(num_classes, activation='softmax'))

    return models.Sequential(stack)

# Instantiate the model
# Note: IMG_HEIGHT, IMG_WIDTH, CHANNELS must be defined from previous steps
model = create_cnn_model(num_classes=num_classes, input_shape=(IMG_HEIGHT, IMG_WIDTH, CHANNELS))

# Display the model architecture
model.summary()

In [None]:
# Configure training: the sparse loss variant is used because the tf.data
# pipeline feeds integer class labels rather than one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Early stopping to prevent overfitting.
# Its patience is deliberately larger than ReduceLROnPlateau's below: both
# callbacks watch val_loss, and with equal patience the learning-rate
# reduction would fire on the very epoch training stops and never take effect.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1
)

# Model checkpoint to save the best model (by validation accuracy)
checkpoint = ModelCheckpoint(
    'best_cnn_model.keras',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1
)

# Reduce learning rate when val_loss has stopped improving
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
    verbose=1,
    min_lr=1e-6
)

callbacks = [early_stop , checkpoint , reduce_lr]

In [None]:
# Define the number of epochs
EPOCHS = 10  # Adjust based on your requirements

# Group the individual callbacks from the previous step into a list
# NOTE(review): the callbacks cell already assigns this same list to
# `callbacks`; this re-assignment is redundant but harmless.
callbacks = [early_stop, checkpoint, reduce_lr]

# Train the model on the tf.data pipelines; the object returned by fit() is
# stored in `history` and read by the plotting cell.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=callbacks
)

In [None]:
# Step 14: Evaluate the Model
# evaluate() is unpacked into two values: the loss plus the single 'accuracy'
# metric the model was compiled with.
# NOTE: the plotting cell below rebinds `val_loss` to the per-epoch history list.
val_loss, val_accuracy = model.evaluate(val_ds)
print(f"\nValidation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")

In [None]:
# Retrieve per-epoch metrics from the History object returned by model.fit()
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
# Number epochs from 1 so the x-axis matches the "Epoch k/N" lines Keras logs.
epochs_range = range(1, len(acc) + 1)

# Plot Accuracy
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

# Plot Loss
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')

# Render the figure explicitly, like the other plotting cells, so the cell
# output is the plot rather than the repr of the last plt.title() call.
plt.tight_layout()
plt.show()