In [1]:
import os
import numpy as np
from PIL import Image
from sklearn.utils import resample

In [None]:
train_dir = "New Plant Diseases Dataset(Augmented)/train"

In [6]:
features = []
labels = []
for class_folder in os.listdir(train_dir):
    class_path = os.path.join(train_dir, class_folder)
    if os.path.isdir(class_path):  # Check if it's a directory
        for image_file in os.listdir(class_path):
            image_path = os.path.join(class_path, image_file)
            try:
                # Load and preprocess the image
                img = Image.open(image_path).resize((64, 64))  # Resize to 64x64
                img_array = np.array(img).flatten()  # Flatten the image

                # Append the image and its label
                features.append(img_array)
                labels.append(class_folder)  # Use folder name as the label
            except Exception as e:
                print(f"Error loading image {image_path}: {e}")

In [7]:
features = np.array(features)
labels = np.array(labels)

print(f"Loaded {features.shape[0]} images with {len(np.unique(labels))} classes.")

Loaded 60930 images with 33 classes.


In [8]:
n_samples = len(features)  # Number of samples in the dataset
n_bootstrap_samples = 100  # Adjust based on how many trees you want to train
X_bootstrap, y_bootstrap = resample(features, labels, n_samples=n_samples, replace=True, random_state=42)
print(f"Bootstrap sample size: {X_bootstrap.shape}")
print(f"Classes in bootstrap sample: {np.unique(y_bootstrap)}")

Bootstrap sample size: (60930, 12288)
Classes in bootstrap sample: ['Apple___Apple_scab' 'Apple___Black_rot' 'Apple___Cedar_apple_rust'
 'Apple___healthy' 'Cherry_(including_sour)___Powdery_mildew'
 'Cherry_(including_sour)___healthy'
 'Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot'
 'Corn_(maize)___Common_rust_' 'Corn_(maize)___Northern_Leaf_Blight'
 'Corn_(maize)___healthy' 'Grape___Black_rot'
 'Grape___Esca_(Black_Measles)'
 'Grape___Leaf_blight_(Isariopsis_Leaf_Spot)' 'Grape___healthy'
 'Peach___Bacterial_spot' 'Peach___healthy'
 'Pepper,_bell___Bacterial_spot' 'Pepper,_bell___healthy'
 'Potato___Early_blight' 'Potato___Late_blight' 'Potato___healthy'
 'Strawberry___Leaf_scorch' 'Strawberry___healthy'
 'Tomato___Bacterial_spot' 'Tomato___Early_blight' 'Tomato___Late_blight'
 'Tomato___Leaf_Mold' 'Tomato___Septoria_leaf_spot'
 'Tomato___Spider_mites Two-spotted_spider_mite' 'Tomato___Target_Spot'
 'Tomato___Tomato_Yellow_Leaf_Curl_Virus' 'Tomato___Tomato_mosaic_virus'
 'Tomato_

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest with OOB score enabled
rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)

# Train the Random Forest on the original training data
rf.fit(features, labels)

# Print the OOB score
print(f"OOB Score: {rf.oob_score_ * 100:.2f}%")


OOB Score: 66.72%


In [10]:
# Path to the validation folder
valid_dir = "New Plant Diseases Dataset(Augmented)/valid" 

# Load validation data (similar to loading training data)
features_valid = []
labels_valid = []

for class_folder in os.listdir(valid_dir):
    class_path = os.path.join(valid_dir, class_folder)
    if os.path.isdir(class_path):
        for image_file in os.listdir(class_path):
            image_path = os.path.join(class_path, image_file)
            try:
                img = Image.open(image_path).resize((64, 64))  # Resize
                img_array = np.array(img).flatten()  # Flatten

                features_valid.append(img_array)
                labels_valid.append(class_folder)  # Use folder name as label
            except Exception as e:
                print(f"Error loading image {image_path}: {e}")

# Convert to numpy arrays
features_valid = np.array(features_valid)
labels_valid = np.array(labels_valid)

# Evaluate the model on validation data
accuracy = rf.score(features_valid, labels_valid)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")


Validation Accuracy: 61.74%


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees
    'max_depth': [10, 20, None],     # Maximum tree depth
    'min_samples_split': [2, 5, 10], # Min samples to split a node
    'min_samples_leaf': [1, 2, 4],   # Min samples in a leaf node
    'max_features': ['sqrt', 'log2'] # Number of features to consider at each split
}

# Initialize the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Use GridSearchCV for fine-tuning
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',  # Evaluation metric
    cv=3,  # Cross-validation folds
    verbose=2,
    n_jobs=-1  # Use all available cores
)

# Perform the grid search on the training and validation data
grid_search.fit(features_valid, labels_valid)

# Print the best parameters and score
print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"Best Validation Accuracy: {grid_search.best_score_ * 100:.2f}%")

# Use the best model
best_rf = grid_search.best_estimator_


Fitting 3 folds for each of 162 candidates, totalling 486 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  42.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  42.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  42.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  42.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  42.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  42.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 1.3min
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; to



[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 4.8min
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 4.8min
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time= 2.2min
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time= 2.2min
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time= 2.2min
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 4.6min
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 4.6min
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 4.7min
[CV] END max_depth=20, max_features=s

  _data = np.array(data, dtype=dtype, copy=copy,


Best Hyperparameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Validation Accuracy: 62.91%
