In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Path to main dataset directory (with 4 subfolders, one per class)
dataset_dir = "../imgs_with_good_background"
final_combined_split_csv_name = "combined_split_updated.csv"
train_col_name = "train"
test_col_name = "test"
final_comb_split_csv_cols = ["filename", "split"]
list_of_bact_blight_folder_names = ["bkgd1", "bkgd2", "bkgd_none"]

image_paths = []
labels = []
list_of_training_only_filenames = ["soybean_rust_90.JPG",
                                   "bacterial_blight_233.jpg",
                                   "bacterial_blight_127.jpg",
                                   "healthy_34.jpg",
                                   "frogeye_1063.jpg",
                                   "bacterial_blight_60.jpg"]
img_file_exts_tuple = (".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff")
test_size_ratio = 0.3

# Collect image paths and labels from each subfolder
for class_name in os.listdir(dataset_dir):
    class_path = os.path.join(dataset_dir, class_name)
    if os.path.isdir(class_path):
        for img_file in os.listdir(class_path):
            if img_file.lower().endswith(img_file_exts_tuple):
                image_paths.append(img_file)
                labels.append(class_name)
            elif any([img_file == dir_name \
                          for dir_name in list_of_bact_blight_folder_names]):
                for sub_img_file in os.listdir(os.path.join(class_path, img_file)):
                    if sub_img_file.lower().endswith(img_file_exts_tuple):
                        image_paths.append(sub_img_file)
                        labels.append(class_name)

training_specific_in_test_paths = True
random_state = 11293

def do_test_paths_contain_training_points(test_paths):
    for path_to_be_checked in test_paths:
        if any([path_to_be_checked == train_only_filename \
                    for train_only_filename in list_of_training_only_filenames]):
            return True

while training_specific_in_test_paths:

    # Perform stratified split
    train_paths, test_paths, _, _ = train_test_split(
        image_paths,
        labels,
        test_size=test_size_ratio,
        stratify=labels,
        random_state=random_state
    )

    if do_test_paths_contain_training_points(test_paths):
        random_state += 1
    else:
        training_specific_in_test_paths = False

# Build DataFrame with only filename and split label
combined_data = []

for path in train_paths:
    filename = os.path.basename(path)
    combined_data.append([filename, train_col_name])

for path in test_paths:
    filename = os.path.basename(path)
    combined_data.append([filename, test_col_name])

# Create and save CSV
combined_df = pd.DataFrame(combined_data, columns=final_comb_split_csv_cols)
combined_df.to_csv(final_combined_split_csv_name, index=False)

print(f"✅ {final_combined_split_csv_name} created with columns: {final_comb_split_csv_cols}")
print(f"\nSplit seed ended up used: {random_state}\n")

✅ combined_split_updated.csv created with columns: ['filename', 'split']

Split seed ended up used: 11304



In [2]:
import os
import cv2
import numpy as np

base_path = dataset_dir
output_path = "./rohit_preprocessed_images_round_one"

# HSV threshold values to isolate green leaf regions
lower_green = np.array([25, 40, 40])
upper_green = np.array([85, 255, 255])
img_new_size = (256, 256)

# Create output directory
os.makedirs(output_path, exist_ok=True)

# Preprocessing function
def preprocess_image(img_path):
    img = cv2.imread(img_path)
    if img is None:
        return None

    # Resize image
    img = cv2.resize(img, img_new_size)

    # Convert to HSV
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # Apply CLAHE on V channel to enhance contrast
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    hsv[:, :, 2] = clahe.apply(hsv[:, :, 2])

    # Apply Gaussian Blur to reduce noise
    hsv_blurred = cv2.GaussianBlur(hsv, (5, 5), 0)

    # Create mask for green leaf area
    mask = cv2.inRange(hsv_blurred, lower_green, upper_green)

    # Apply the mask to the original image
    masked = cv2.bitwise_and(img, img, mask=mask)

    return masked

# Loop through each class folder
for label in os.listdir(base_path):
    class_folder = os.path.join(base_path, label)
    if not os.path.isdir(class_folder):
        continue

    # Create output subfolder
    output_class_folder = os.path.join(output_path, label)
    os.makedirs(output_class_folder, exist_ok=True)

    # Process each image in the class folder
    for filename in os.listdir(class_folder):
              
        if any([filename == dir_name \
                    for dir_name in list_of_bact_blight_folder_names]):
            subdir_path = os.path.join(class_folder, filename)
            for sub_filename in os.listdir(subdir_path):
                img_path = os.path.join(subdir_path, sub_filename)
                preprocessed = preprocess_image(img_path)
                if preprocessed is not None:
                    save_path = os.path.join(output_class_folder, sub_filename)
                    cv2.imwrite(save_path, preprocessed)
        else:
            img_path = os.path.join(class_folder, filename)
            preprocessed = preprocess_image(img_path)
            if preprocessed is not None:
                save_path = os.path.join(output_class_folder, filename)
                cv2.imwrite(save_path, preprocessed)

print("Images preprocessing successful!")

Images preprocessing successful!


In [3]:
import os
import cv2
import numpy as np
import pandas as pd

# Path to preprocessed images
input_dir = output_path
color_features_col_names = ["filename", "mean_H", "std_H", "mean_S", "std_S", "class"]
color_features_filename = "rohit_color_features2.csv"
features = []

for label in os.listdir(input_dir):
    class_dir = os.path.join(input_dir, label)
    for file in os.listdir(class_dir):
        path = os.path.join(class_dir, file)
        img = cv2.imread(path)
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        h, s, _ = cv2.split(hsv)

        mean_h = np.mean(h)
        std_h = np.std(h)
        mean_s = np.mean(s)
        std_s = np.std(s)

        features.append([file, mean_h, std_h, mean_s, std_s, label])

df = pd.DataFrame(features, columns=color_features_col_names)
df.to_csv(color_features_filename, index=False)
print(f"Saved color features to: {color_features_filename}")

Saved color features to: rohit_color_features2.csv


In [4]:
import os
import cv2
import numpy as np
import pandas as pd

# Paths
input_dir = output_path
output_csv = "rohit_lesion_shape_features2.csv"
shape_features_col_names = ["filename", "lesion_count",
                            "mean_lesion_area", "mean_eccentricity",
                            "mean_circularity", "infected_area_pct",
                            "class"]

# Output structure
features = []

def compute_circularity(area, perimeter):
    if perimeter == 0:
        return 0
    return (4 * np.pi * area) / (perimeter ** 2)

def compute_eccentricity(contour):
    if len(contour) < 5:
        return 0
    ellipse = cv2.fitEllipse(contour)
    major_axis = max(ellipse[1])
    minor_axis = min(ellipse[1])
    if major_axis == 0:
        return 0
    return np.sqrt(1 - (minor_axis / major_axis) ** 2)

# Process each image
for label in os.listdir(input_dir):
    class_path = os.path.join(input_dir, label)
    if not os.path.isdir(class_path):
        continue

    for file in os.listdir(class_path):
        path = os.path.join(class_path, file)
        img = cv2.imread(path)

        if img is None:
            continue

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Apply threshold to isolate lesions (tune if needed)
        _, binary = cv2.threshold(gray, 25, 255, cv2.THRESH_BINARY_INV)

        # Find contours (lesions)
        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        lesion_areas = []
        eccentricities = []
        circularities = []

        for cnt in contours:
            area = cv2.contourArea(cnt)
            if area < 30:  # skip tiny noise
                continue
            perimeter = cv2.arcLength(cnt, True)

            lesion_areas.append(area)
            eccentricities.append(compute_eccentricity(cnt))
            circularities.append(compute_circularity(area, perimeter))

        # Total leaf area (used for % infected)
        leaf_mask = cv2.inRange(img, (1, 1, 1), (255, 255, 255))
        leaf_area = cv2.countNonZero(leaf_mask)

        total_lesion_area = sum(lesion_areas)
        infected_pct = (total_lesion_area / (leaf_area + total_lesion_area)) * 100 if (leaf_area + total_lesion_area) > 0 else 0

        # Store aggregated stats
        features.append([
            file,
            len(lesion_areas),                           # Lesion count
            np.mean(lesion_areas) if lesion_areas else 0,
            np.mean(eccentricities) if eccentricities else 0,
            np.mean(circularities) if circularities else 0,
            infected_pct,
            label
        ])

# Save to CSV
df = pd.DataFrame(features, columns=shape_features_col_names)
df.to_csv(output_csv, index=False)
print(f"Saved lesion shape features to: {output_csv}")

Saved lesion shape features to: rohit_lesion_shape_features2.csv


In [5]:
import pandas as pd

# Load CSVs
color_df = pd.read_csv(color_features_filename)
lesion_df = pd.read_csv(output_csv)
combined_df_filename = "rohit_combined_features2.csv"

# Merge on filename
merged_df = pd.merge(color_df, lesion_df, on=["filename", "class"])

# Save combined dataset
merged_df.to_csv(combined_df_filename, index=False)
print(f"Merged features saved to {combined_df_filename}")

Merged features saved to rohit_combined_features2.csv
