In [None]:
import os
import zipfile
import cv2
import numpy as np
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models

# Step 1: Provide Kaggle API credentials
# SECURITY: credentials must never be hard-coded in source. The API key that
# was previously embedded in this cell should be treated as leaked and
# revoked/regenerated from the Kaggle account settings.
# Supply credentials from outside the notebook instead, either through the
# KAGGLE_USERNAME / KAGGLE_KEY environment variables or a kaggle.json file.
_missing_kaggle_vars = [
    name for name in ('KAGGLE_USERNAME', 'KAGGLE_KEY') if not os.environ.get(name)
]
if _missing_kaggle_vars:
    # Not fatal by itself: the Kaggle client can also read a kaggle.json file.
    print(
        "Kaggle environment variables not set: "
        + ", ".join(_missing_kaggle_vars)
        + ". Falling back to the kaggle.json config file."
    )

# Step 2: Initialize Kaggle API
api = KaggleApi()
# Resolves credentials from the environment or the config file; expected to
# fail here if neither source provides them.
api.authenticate()

# Step 3: Download the dataset using the Kaggle API
dataset = 'tushar5harma/plant-village-dataset-updated'  # "<owner>/<dataset-slug>"
output_dir = './'  # Directory that receives the archive and the extracted files

# Download the dataset and keep it as a zip file (extraction happens below)
api.dataset_download_files(dataset, path=output_dir, unzip=False)

# Step 4: Unzip the downloaded dataset
# The archive name is derived from the dataset slug so the two cannot drift
# apart; for the dataset above this is 'plant-village-dataset-updated.zip'.
zip_file_path = os.path.join(output_dir, dataset.split('/')[-1] + '.zip')

# Verify the archive really exists before claiming it was found, so a failed
# download surfaces as a clear error instead of a confusing one from zipfile.
if not os.path.isfile(zip_file_path):
    raise FileNotFoundError(
        f"Expected dataset archive '{zip_file_path}' was not found after download."
    )

# Unzipping the dataset
print("Zip file found, extracting...")
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(output_dir)
print("Extraction completed.")

# Step 5: Check the current directory for the extracted folder
print("Current working directory:", os.getcwd())
print("Files and directories in the current directory:", os.listdir(output_dir))

# Step 6: Identify the correct extracted folder
# A candidate must (a) be a directory, (b) not be a known non-dataset folder,
# and (c) contain the 'Train' subdirectory that the loading step requires.
# Entries are visited in sorted order because os.listdir() returns them in
# arbitrary order, which would make the chosen folder vary between runs.
# NOTE: only the first matching folder is used; other extracted folders that
# also contain 'Train' are ignored by this script.
data_dir = None
ignored_dirs = {'.config', 'sample_data'}

for item in sorted(os.listdir(output_dir)):
    item_path = os.path.join(output_dir, item)
    if item in ignored_dirs or not os.path.isdir(item_path):
        continue
    if os.path.isdir(os.path.join(item_path, 'Train')):
        data_dir = item_path
        break

if not data_dir:
    raise FileNotFoundError("No valid dataset directory found.")

print("Using dataset directory:", data_dir)

# Step 7: Load images and labels
# Produces:
#   images     - list of 128x128 images as returned by cv2 (one per readable file)
#   labels     - list of integer class ids, parallel to `images`
#   categories - sorted list of class directory names; a label is an index into it
train_dir = os.path.join(data_dir, 'Train')  # Path for training images
images = []
labels = []

valid_extensions = ['.jpg', '.jpeg', '.png']  # Valid image formats

# Initialize categories list
categories = []

if not os.path.isdir(train_dir):
    raise FileNotFoundError(f"Training directory '{train_dir}' not found.")

# Only real directories count as categories. Stray files must not be included,
# because len(categories) is used later as the number of output classes.
# Sorting makes the name -> label id mapping reproducible across runs.
for entry in sorted(os.listdir(train_dir)):
    entry_path = os.path.join(train_dir, entry)
    if os.path.isdir(entry_path):
        categories.append(entry)
    else:
        print(f"Skipping non-directory: {entry_path}")

# str.endswith accepts a tuple, so one call checks every allowed extension.
image_suffixes = tuple(valid_extensions)

for label, category in enumerate(categories):
    category_path = os.path.join(train_dir, category)
    print(f"Loading images from category: {category}")  # Debug info

    # Sorted so the sample order (and hence the seeded split) is reproducible.
    for img_name in sorted(os.listdir(category_path)):
        if not img_name.lower().endswith(image_suffixes):
            continue
        img_path = os.path.join(category_path, img_name)
        img = cv2.imread(img_path)
        if img is None:  # cv2.imread returns None instead of raising
            print(f"Warning: {img_path} could not be read.")
            continue
        img = cv2.resize(img, (128, 128))  # Resize images for uniformity
        images.append(img)
        labels.append(label)

# Check if images were loaded
if not images:
    raise ValueError("No images were loaded. Please check the image paths and formats.")

# Stack the collected labels and images into NumPy arrays
y = np.array(labels)
X = np.array(images)

# Step 9: Normalize the image data
# Cast to float32 first, then rescale pixel values into the [0, 1] range.
X = X.astype('float32')
X = X / 255.0

# Step 10: Split the data into training and test sets
# 80% train / 20% test, with a fixed seed for a repeatable shuffle.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Step 11: Build a simple CNN model
# Three Conv2D + MaxPooling2D stages (32, 64, 128 filters) followed by a dense
# classifier head. The input is declared with an explicit Input layer rather
# than an `input_shape=` argument on the first Conv2D, which is the form Keras
# recommends for Sequential models and avoids the warning it emits otherwise.
num_classes = len(categories)  # One softmax output unit per category

model = models.Sequential([
    layers.Input(shape=(128, 128, 3)),  # Matches the 128x128 3-channel images
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_classes, activation='softmax'),  # Output layer for number of categories
])

# Compile the model
# The sparse loss is used because the labels are integer class ids,
# not one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Step 12: Train the model
# Fit for 10 epochs on the training split, evaluating on the held-out split
# after every epoch.
held_out = (X_test, y_test)
model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    validation_data=held_out,
)


Dataset URL: https://www.kaggle.com/datasets/tushar5harma/plant-village-dataset-updated
Zip file found, extracting...
Extraction completed.
Current working directory: /content
Files and directories in the current directory: ['.config', 'Tomato', 'Strawberry', 'plant-village-dataset-updated.zip', 'content', 'Potato', 'Corn (Maize)', 'Bell Pepper', 'Grape', 'Peach', 'Apple', 'Cherry', 'sample_data']
Using dataset directory: ./Tomato
Loading images from category: Healthy
Loading images from category: Early Blight
Loading images from category: Bacterial Spot
Loading images from category: Yellow Leaf Curl Virus
Loading images from category: Septoria Leaf Spot
Loading images from category: Late Blight


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 1s/step - accuracy: 0.5194 - loss: 1.1882 - val_accuracy: 0.8213 - val_loss: 0.4896
Epoch 2/10
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 1s/step - accuracy: 0.8483 - loss: 0.4181 - val_accuracy: 0.7821 - val_loss: 0.5518
Epoch 3/10
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 1s/step - accuracy: 0.8922 - loss: 0.2990 - val_accuracy: 0.8379 - val_loss: 0.4649
Epoch 4/10
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m295s[0m 1s/step - accuracy: 0.9238 - loss: 0.2086 - val_accuracy: 0.9199 - val_loss: 0.2331
Epoch 5/10
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m299s[0m 1s/step - accuracy: 0.9589 - loss: 0.1210 - val_accuracy: 0.9230 - val_loss: 0.2672
Epoch 6/10
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 1s/step - accuracy: 0.9683 - loss: 0.0936 - val_accuracy: 0.8915 - val_loss: 0.3128
Epoch 7/10
[1m278/278

<keras.src.callbacks.history.History at 0x784cd6f0d420>