In [1]:
import pandas as pd
import numpy as np
import cv2
import os
from sklearn.model_selection import train_test_split

In [2]:
def create_dataset(dataset_dir, classes=None, image_size=(128, 128)):
    """Load a folder-per-class image dataset as flattened pixel features.

    Each class is expected to live in its own sub-folder of ``dataset_dir``.
    Every readable image is resized to ``image_size``, flattened to one row,
    and labelled with the position of its class in ``classes``.

    Parameters
    ----------
    dataset_dir : str
        Directory containing one sub-folder per class name.
    classes : sequence of str, optional
        Class (sub-folder) names; the label of a class is its index in this
        sequence. Defaults to the five plant-disease classes used in this
        notebook.
    image_size : tuple of int, optional
        ``(width, height)`` passed to ``cv2.resize``. Defaults to ``(128, 128)``.

    Returns
    -------
    X : pandas.DataFrame
        One row per image, one column per flattened pixel value.
    y : pandas.Series
        Integer class labels named ``"target"``, aligned with the rows of ``X``.

    Raises
    ------
    FileNotFoundError
        If a class sub-folder does not exist (raised by ``os.listdir``).
    """
    import warnings

    if classes is None:
        classes = ["healthy", "early_blight", "late_blight", "bacterial_spot", "mosaic_virus"]

    image_array_list = []
    label_list = []
    skipped = []

    for label, class_name in enumerate(classes):
        path = os.path.join(dataset_dir, class_name)
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore any later seeded split) reproducible.
        for file_name in sorted(os.listdir(path)):
            file_path = os.path.join(path, file_name)
            img_array = cv2.imread(file_path)
            if img_array is None:
                # cv2.imread reports failure (non-image file, sub-folder,
                # corrupt image) by returning None rather than raising.
                skipped.append(file_path)
                continue

            img_resized = cv2.resize(img_array, image_size)
            image_array_list.append(img_resized.flatten())
            label_list.append(label)

    if skipped:
        warnings.warn(
            f"create_dataset skipped {len(skipped)} unreadable file(s), "
            f"first: {skipped[0]}"
        )

    if image_array_list:
        image_data = np.array(image_array_list)
    else:
        # Keep a well-formed (0, n_features) frame when nothing was loaded.
        # Assumes 3 channels, which is what cv2.imread yields by default.
        n_features = image_size[0] * image_size[1] * 3
        image_data = np.empty((0, n_features), dtype=np.uint8)
    label_data = np.asarray(label_list, dtype=int)

    X = pd.DataFrame(image_data)
    y = pd.Series(label_data, name="target")

    return X, y

### PlantDoc

In [3]:
# Root folder of the PlantDoc dataset. Set the PLANTDOC_DIR environment variable
# to point elsewhere instead of editing an absolute path in the notebook.
PLANTDOC_DIR = os.environ.get("PLANTDOC_DIR", "W:/PlantDoc")

# The train/ and test/ sub-folders are loaded separately, so no split is done here.
X_train, y_train = create_dataset(os.path.join(PLANTDOC_DIR, "train"))
X_test, y_test = create_dataset(os.path.join(PLANTDOC_DIR, "test"))

In [4]:
# Bundle the four PlantDoc arrays into one compressed archive; the keyword
# names become the keys under which the arrays are stored in the .npz file.
plantdoc_arrays = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
np.savez_compressed("PlantDoc_v2.npz", **plantdoc_arrays)

### PlantVillage

In [None]:
# Root folder of the PlantVillage dataset. Set the PLANTVILLAGE_DIR environment
# variable to point elsewhere instead of editing an absolute path in the notebook.
PLANTVILLAGE_DIR = os.environ.get("PLANTVILLAGE_DIR", "W:/PlantVillage")

# Loaded as a single set; X holds the flattened images and y the class labels.
X, y = create_dataset(PLANTVILLAGE_DIR)

In [None]:
# Hold out 20% of PlantVillage for testing. The split is stratified on the
# labels and uses a fixed seed so it can be reproduced.
plantvillage_split = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = plantvillage_split

# Store the four pieces in one compressed archive, keyed by their variable names.
plantvillage_arrays = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
np.savez_compressed("PlantVillage.npz", **plantvillage_arrays)