In [1]:
import numpy as np
import os
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from matplotlib.patches import Patch
from ipywidgets import IntSlider, interact, Layout
from IPython.display import display
import zipfile

In [6]:
def unzip(zip_path, extract_to):
    # Ensure the extraction directory exists
    if not os.path.exists(extract_to):
        os.makedirs(extract_to)

    # Open the zip file
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Extract all the contents into the directory
        zip_ref.extractall(extract_to)
        print(f"Files extracted to {extract_to}")
unzip("C:/MEDS/EDS_232/EDS232-Discussion/data/plant_disease.zip", "C:/MEDS/EDS_232/EDS232-Discussion/data/plant_disease")


Files extracted to C:/MEDS/EDS_232/EDS232-Discussion/data/plant_disease


In [14]:
# Function to open and standardize images used in model

def load_images(base_path, max_per_folder=20):
    images = [] # Empty list to store images
    labels = [] # Empty list to store label of each images
    class_names = [] # Empty list to store the names of the folders for all images

    for i, folder in enumerate(sorted(os.listdir(base_path))):
        folder_path = os.path.join(base_path, folder) # Join base path with folders to iterate over
        if not os.path.isdir(folder_path):
            continue

        class_names.append(folder)
        print(f"Loading from {folder}...")

        count = 0
        for img_file in os.listdir(folder_path): # Iterate over each item in each folder
            if count >= max_per_folder: # Stop when counter gets to 20 images
                break

            if img_file.lower().endswith(('.png', '.jpg', '.jpeg')): # Ensure file in folder is correct format
                try:
                    img_path = os.path.join(folder_path, img_file)
                    with Image.open(img_path) as img: # Open image
                        img = img.convert('RGB') # Convert it to RGB to standardize color channels
                        img = img.resize((100, 100), Image.Resampling.LANCZOS) # Resize image using LANCZOS resampling method

                    images.append(np.array(img)) # Convert image to array and add to image list
                    labels.append(i) # Add label to label list 
                    count += 1
                except Exception as e: # Print error message if error with a file
                    print(f"Error with {img_file}: {e}")

    return np.array(images), np.array(labels), class_names

data_path = input("C:/MEDS/EDS_232/EDS232-Discussion/data/plant_disease")
images, labels, class_names = load_images(data_path)
print(f"Loaded {len(images)} images from {len(class_names)} disease classes")

FileNotFoundError: [WinError 3] The system cannot find the path specified: ''

In [None]:
# Fucntion to extract features and perform PCA

def extract_features_and_reduce(images):
    features = []
    for img_array in images:
        img = Image.fromarray(img_array) # COnver tback into an image
        histogram = np.array(img.histogram()).astype('float32')
        histogram = histogram / histogram.sum() # Normalize histogram
        features.append(histogram)
    features = np.array(features) # Convert back to array for PCA
    print(f"Features shape: {features.shape}")

    # Perform PCA
    pca = PCA(n_components = 2)
    reduced_features = pca.fit_transform(features)
    print(f"Reduced feature shape: {reduced_features.shape}")
    return reduced_features

features = extract_features_and_reduce(images)

In [None]:
# Function to create interactive widget for clustering with visualization

def create_interactive_widget(images, features, labels, class_names):
    slider = IntSlider(value = 3, min = 2, max = 15, description = "Clusters:", layout = Layout(width = '80%'))

    # Function that updates clusters based on the users slider input
    def update_clusters(cluster_num):

        plt.figure(figsize = (15, 7))

        kmeans = Kmeans(n_clusters = cluster_num, random_state = 42, n_init = 10)
        cluster_labels = kmeans.fit_predict(features)

        # Visualize clusters
        scatter = plt.scatter(features[:, 0], features[:, 1], c = cluster_labels, cmap='viridis')
        plt.title('PCA Reduced Features by Cluster')
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        plt.grid(True)

        # Update legend
        colors = plt.cm.viridis(np.linespace(0, 1, cluster_num))

        legends = [Patch(facecolor=clr, label = f'cluster {i + 1}') for i, clr in enumerate(colors)]
        plt.legend(handles = legends, title = 'Clusters')

        # Show plot
        plt.tight_layout()
        plt.show()

        # Print results for each cluster
        print(f"Results for {cluster_num} clusters:")
        for i in range(cluster_num): print(f"Cluster {i + 1}: {np.sum(cluster_labels == i)} samples")

        # Display a few images from each cluster
        for i in range(cluster_num):
            plt_figure(figsize = (15, 5))

            # Find the indices of all images that belong to current cluster
            cluster_indices = np.where(cluster_labels == i)[0]

            # Randomly select 5 indices
            selected_indices = np.random.choice(cluster_indices, min(len(cluster_indices), 5), replace = False)

            for j, idx in enumerate(selected_indices):

                plt.subplot(1, 5, j + 1)
                plt.imshow(images[idx])
                plt.title(f"Cluster{i+1}")
                plt.axis('off')
            plt.show()

    interact(update_clusters, cluster_num = slider)

knn_widget = create_interactive_widget(images, features, labels, class_names)
display(knn_widget)

            