In [None]:
!pip install kaggle

In [None]:
! mkdir ~/.kaggle

In [None]:
# Copy kaggle.json from the current working directory into ~/.kaggle/.
# NOTE(review): presumably this is the Kaggle API credentials file and has to
# be placed next to the notebook before running this cell — confirm.
! cp kaggle.json ~/.kaggle/

In [None]:
# Mode 600: only the owning user may read or write the copied kaggle.json.
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# List datasets through the Kaggle CLI.
# NOTE(review): looks like a smoke test that the CLI is installed and can
# authenticate; nothing in the visible cells consumes this output — confirm.
! kaggle datasets list

In [None]:
# Download the data of the cassava-leaf-disease-classification competition.
# The following cell unpacks cassava-leaf-disease-classification.zip from the
# working directory, so the download is expected to land there under that name.
!kaggle competitions download -c cassava-leaf-disease-classification

In [None]:
!unzip -q cassava-leaf-disease-classification.zip

In [None]:
import pandas as pd

# Training labels: one row per image, pairing the image file name
# (`image_id`) with its integer class (`label`).
df = pd.read_csv("train.csv")
print(f"train.csv shape: {df.shape}")

# Last expression of the cell -> rendered as a table preview.
df.head()


train.csv shape: (21397, 2)


Unnamed: 0,image_id,label
0,1000015157.jpg,0
1,1000201771.jpg,3
2,100042118.jpg,1
3,1000723321.jpg,1
4,1000812911.jpg,3


In [None]:
import json

# Lookup table from label number (stored as a string key) to disease name.
with open("label_num_to_disease_map.json") as map_file:
    label_map = json.load(map_file)

print(label_map)


{'0': 'Cassava Bacterial Blight (CBB)', '1': 'Cassava Brown Streak Disease (CBSD)', '2': 'Cassava Green Mottle (CGM)', '3': 'Cassava Mosaic Disease (CMD)', '4': 'Healthy'}


In [None]:
import os

# Root folder that will hold one sub-directory per class label.
class_root = "data/images_by_class"
os.makedirs(class_root, exist_ok=True)

# One folder per distinct label found in train.csv; exist_ok keeps the cell
# safe to re-run.
for label_int in df['label'].unique():
    os.makedirs(os.path.join(class_root, str(label_int)), exist_ok=True)


In [None]:
import shutil
from tqdm import tqdm

# Let's assume the original images are in "train_images/"
train_image_folder = "train_images"

# Sort every training image into data/images_by_class/<label>/<image_id> so
# that the folder name encodes the class.
# Iterating the two columns directly avoids building a Series per row, which
# is what df.iterrows() does.
for img_id, label_int in tqdm(zip(df['image_id'], df['label']), total=len(df)):
    src = os.path.join(train_image_folder, img_id)
    dst = os.path.join("data/images_by_class", str(label_int), img_id)

    # The file was already moved by an earlier run of this cell: skip it so
    # the cell is re-runnable. A file that is missing from BOTH places still
    # reaches shutil.move below and raises, exactly as before.
    if os.path.exists(dst) and not os.path.exists(src):
        continue

    shutil.move(src, dst)


In [None]:
import tensorflow as tf

batch_size = 32
img_size = (224, 224)  # We'll resize to 224x224, typical for many pretrained models
seed = 42

# Options shared by the training and validation loaders. Both calls receive
# the same seed and validation_split (0.2 -> 80% train, 20% validation); only
# `subset` differs between them.
split_options = dict(
    validation_split=0.2,
    seed=seed,
    image_size=img_size,
    batch_size=batch_size,
)

train_ds = tf.keras.utils.image_dataset_from_directory(
    "data/images_by_class",
    subset="training",
    **split_options,
)

val_ds = tf.keras.utils.image_dataset_from_directory(
    "data/images_by_class",
    subset="validation",
    **split_options,
)

In [None]:
# Class names as discovered by the directory loader.
class_names = train_ds.class_names
print(f"Classes: {class_names}")

# Pull a single batch to sanity-check tensor shapes and label values.
for image_batch, label_batch in train_ds.take(1):
    print(f"Image batch shape: {image_batch.shape}")
    print(f"Label batch shape: {label_batch.shape}")
    print(f"Labels: {label_batch.numpy()}")


In [None]:
import numpy as np

# Summarize the distribution in the training set.
# The labels yielded by the dataset are integer indices into `class_names`,
# so each one is translated through `class_names` instead of relying on
# str(index) happening to equal the folder name.
label_counts = {label: 0 for label in class_names}

# Iterate whole batches; there is no need to unbatch just to count labels.
for images, labels in train_ds:
    for class_index in labels.numpy().flatten():
        label_counts[class_names[int(class_index)]] += 1

print("Train Label Counts:", label_counts)

Train Label Counts: {'0': 871, '1': 1758, '2': 1895, '3': 10589, '4': 2005}


In [None]:
import matplotlib.pyplot as plt

# Show the first nine images of one training batch in a 3x3 grid, each
# titled with its integer label.
fig = plt.figure(figsize=(10, 10))
for images, labels in train_ds.take(1):
    for i in range(9):
        ax = fig.add_subplot(3, 3, i + 1)
        # Pixels arrive as floats; cast to uint8 so imshow treats them as 0-255.
        ax.imshow(images[i].numpy().astype("uint8"))
        ax.set_title(f"Class: {labels[i].numpy()}")
        ax.axis("off")
plt.show()


In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# Training pipeline: cache, then shuffle with a buffer of 1000 elements,
# then prefetch.
train_ds = train_ds.cache()
train_ds = train_ds.shuffle(1000)
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)

# Validation pipeline: cache and prefetch only, no shuffling.
val_ds = val_ds.cache()
val_ds = val_ds.prefetch(buffer_size=AUTOTUNE)

In [None]:
# Prints the repr of the dataset object returned by take(1) (its
# element_spec: image and label tensor specs), not the contents of a batch.
print(train_ds.take(1))

<_TakeDataset element_spec=(TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>


In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Random augmentations applied in sequence: horizontal flip, rotation with
# factor 0.1, zoom with factor 0.1.
augmentation_layers = [
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.1),
    layers.RandomZoom(0.1),
]

data_augmentation = keras.Sequential(augmentation_layers)

In [None]:
def _augment_batch(x, y):
    """Run the augmentation pipeline on the images; labels pass through unchanged."""
    return data_augmentation(x, training=True), y


# NOTE: this rebinds train_ds, so executing the cell a second time stacks a
# second augmentation pass on top of the first.
train_ds = train_ds.map(_augment_batch, num_parallel_calls=AUTOTUNE)
train_ds = train_ds.prefetch(AUTOTUNE)

In [None]:
# Confirm that batches still have the expected shapes after augmentation.
# take(1) yields at most one batch, so the loop body runs at most once.
for images, labels in train_ds.take(1):
    print(f"Image batch shape: {images.shape}")
    print(f"Label batch shape: {labels.shape}")


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

num_classes = 5  # (Cassava dataset has 5 classes: 0-4)

# Baseline CNN, assembled layer by layer in the order it is executed.
cnn_layers = [
    # Scale pixel values from [0,255] to [0,1]
    layers.Rescaling(1./255, input_shape=(224, 224, 3)),
]

# Three conv + max-pool stages with a growing number of filters.
for n_filters in (32, 64, 128):
    cnn_layers.append(layers.Conv2D(n_filters, 3, padding='same', activation='relu'))
    cnn_layers.append(layers.MaxPooling2D())

# Classification head: flatten -> dense -> dropout -> softmax over the classes.
cnn_layers.extend([
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),   # helps reduce overfitting
    layers.Dense(num_classes, activation='softmax'),
])

model = models.Sequential(cnn_layers)

model.summary()


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop after 3 epochs without improvement and restore the best weights seen.
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)

# Write the model to disk only when the monitored metric improves.
best_checkpoint = ModelCheckpoint('baseline_cnn_best.h5', save_best_only=True)

callbacks = [early_stopping, best_checkpoint]


In [None]:
# The model's final Dense layer already applies softmax, so the loss receives
# class probabilities, not raw logits. from_logits must therefore be False;
# with True the loss would apply a second softmax and train on the wrong
# objective.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [None]:
epochs = 10

# Train on the training pipeline and evaluate on val_ds after every epoch;
# the callbacks list is passed through to fit unchanged.
fit_options = dict(
    validation_data=val_ds,
    epochs=epochs,
    callbacks=callbacks,
)

history = model.fit(train_ds, **fit_options)


In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

# One x position per epoch that actually ran.
epochs_range = range(len(acc))

# Two panels side by side: accuracy on the left, loss on the right.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

acc_ax.plot(epochs_range, acc, label='Training Accuracy')
acc_ax.plot(epochs_range, val_acc, label='Validation Accuracy')
acc_ax.legend(loc='lower right')
acc_ax.set_title('Training and Validation Accuracy')

loss_ax.plot(epochs_range, loss, label='Training Loss')
loss_ax.plot(epochs_range, val_loss, label='Validation Loss')
loss_ax.legend(loc='upper right')
loss_ax.set_title('Training and Validation Loss')

plt.show()
