### Dependencies:

In [1]:
import numpy as np
import pandas as pd
import cv2
import PIL
import os
import time
from PIL import Image
from IPython.display import display, clear_output

### Collecting Healthy Data:

In [2]:
#Edit the directory name to where the healthy tomato leaves are saved:
#-----------------------------------------------------#
directory = "Datasets/tomato/train/Tomato___healthy/"
#-----------------------------------------------------#

# Build X_healthy: one column per image, each column being the image's pixel
# values flattened to shape (height * width * channels, 1).
#
# The columns are gathered in a list and joined once after the loop. Calling
# np.concatenate on the growing matrix inside the loop would re-copy every
# previously loaded image on each iteration.
healthy_columns = []
skipped_files = []
count = 0

# sorted() gives a reproducible column order; os.listdir order is arbitrary.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".JPG"):
        continue
    image = cv2.imread(os.path.join(directory, filename))
    if image is None:
        # cv2.imread does not raise on an unreadable file, it returns None.
        skipped_files.append(filename)
        continue
    healthy_columns.append(image.reshape(-1, 1))
    count += 1
    clear_output(wait = True)
    print("Number of pictures processed = ", count)

# Reported after the loop so clear_output does not wipe the message.
if skipped_files:
    print("Skipped unreadable files:", skipped_files)

if not healthy_columns:
    raise FileNotFoundError("No readable .JPG images found in " + directory)

X_healthy = np.concatenate(healthy_columns, axis = 1)
X_healthy.shape

Number of pictures processed =  1000


(196608, 1000)

Assign an array of ones as output for healthy data:

In [3]:
# Label row for the healthy class: a 1.0 for every column (image) of X_healthy.
Y_healthy = np.full((1, X_healthy.shape[1]), 1.0)
Y_healthy.shape

(1, 1000)

Stack the label row on top of the pixel rows, so that row 0 holds the labels and each column is one image:

In [4]:
# Row 0 becomes the label row; the rows below it are the flattened pixels.
Healthy_Data = np.vstack((Y_healthy, X_healthy))

### Collecting Diseased Data:

In [5]:
#-----------------------------------------------------#
directory = "Datasets/tomato/train/" #Directory where all datasets are stored
num_pictures_per_disease = 250 #Number of pictures to take from each disease
#-----------------------------------------------------#

# Build X_diseased: up to num_pictures_per_disease images from every class
# directory except the healthy one, one flattened image per column.
healthy_class = "Tomato___healthy" #directory where healthy dataset is stored, so that it isn't added to unhealthy dataset

# Columns are gathered in a list and joined once after the loops, instead of
# re-copying the whole matrix with np.concatenate for every image.
diseased_columns = []
done = 0

# sorted() makes both the class order and the images picked from each class
# reproducible; os.listdir order is arbitrary.
for class_name in sorted(os.listdir(directory)):
    class_dir = os.path.join(directory, class_name)
    # Skip the healthy class and any stray non-directory entry in the root.
    if class_name == healthy_class or not os.path.isdir(class_dir):
        continue

    # The per-class counter restarts for every directory, so a class with
    # fewer images than the limit cannot reduce the quota of the next class.
    count = 0
    for file in sorted(os.listdir(class_dir)):
        if count >= num_pictures_per_disease:
            break
        image = cv2.imread(os.path.join(class_dir, file))
        if image is None:
            # cv2.imread returns None for files it cannot decode; skip them.
            continue
        diseased_columns.append(image.reshape(-1, 1))
        count += 1
        clear_output(wait = True)
        print("Number of pictures processed = ", count)

    done += 1
    print(done, "Directories done")

if not diseased_columns:
    raise FileNotFoundError("No readable diseased images found under " + directory)

X_diseased = np.concatenate(diseased_columns, axis = 1)

print("DONE")
X_diseased.shape

Number of pictures processed =  250
9 Directories done
DONE


(196608, 2250)

Assign an array of zeros as output for diseased data:

In [7]:
# Label row for the diseased class: a 0.0 for every column (image) of X_diseased.
Y_diseased = np.full((1, X_diseased.shape[1]), 0.0)
Y_diseased.shape

(1, 2250)

Stack the label row on top of the pixel rows, so that row 0 holds the labels and each column is one image:

In [8]:
# Row 0 becomes the label row; the rows below it are the flattened pixels.
Diseased_Data = np.vstack((Y_diseased, X_diseased))

### Forming the Dataset, Visualizing and Shuffling:

Stack the diseased dataset and healthy dataset side by side:

In [9]:
# Join the two classes column-wise: diseased images first, healthy images after.
Data = np.hstack((Diseased_Data, Healthy_Data))
Data.shape

(196609, 3250)

Convert it into a pandas DataFrame (one row per image), shuffle the rows and display a preview:

In [10]:
# Fixed seed so the shuffle, and therefore the saved dataset, is identical on
# every run of the notebook.
SHUFFLE_SEED = 42

# Data holds one image per column, so transpose to get one image per row;
# sample(frac = 1) then returns every row exactly once, in random order.
Data_pd = pd.DataFrame(Data.T)
Data_pd = Data_pd.sample(frac = 1, random_state = SHUFFLE_SEED)
Data_pd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,196599,196600,196601,196602,196603,196604,196605,196606,196607,196608
1823,0.0,137.0,118.0,133.0,134.0,115.0,130.0,101.0,82.0,97.0,...,132.0,134.0,117.0,131.0,132.0,115.0,129.0,130.0,113.0,127.0
1400,0.0,95.0,106.0,134.0,99.0,110.0,138.0,79.0,90.0,118.0,...,148.0,140.0,150.0,168.0,151.0,161.0,179.0,137.0,147.0,165.0
1493,0.0,137.0,124.0,132.0,141.0,128.0,136.0,146.0,133.0,141.0,...,91.0,104.0,98.0,115.0,75.0,69.0,86.0,102.0,96.0,113.0
1129,0.0,151.0,130.0,138.0,146.0,125.0,133.0,144.0,123.0,131.0,...,106.0,121.0,100.0,109.0,122.0,101.0,110.0,119.0,98.0,107.0
2085,0.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,...,147.0,153.0,156.0,154.0,155.0,158.0,156.0,155.0,158.0,156.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,0.0,122.0,122.0,136.0,107.0,107.0,121.0,112.0,112.0,126.0,...,157.0,148.0,143.0,158.0,150.0,145.0,160.0,152.0,147.0,162.0
1358,0.0,171.0,159.0,165.0,162.0,150.0,156.0,177.0,165.0,171.0,...,141.0,103.0,98.0,113.0,133.0,128.0,143.0,127.0,122.0,137.0
2728,1.0,125.0,115.0,127.0,106.0,96.0,108.0,140.0,130.0,142.0,...,140.0,185.0,167.0,174.0,144.0,126.0,133.0,161.0,143.0,150.0
166,0.0,169.0,169.0,175.0,167.0,167.0,173.0,165.0,165.0,171.0,...,118.0,104.0,105.0,119.0,106.0,107.0,121.0,109.0,110.0,124.0


Convert the shuffled dataset back into a numpy array:

In [11]:
# Back to a NumPy array, transposed so that each column is one image again.
Data_shuffled = Data_pd.to_numpy().T

### Save the Dataset:

In [12]:
# Write the shuffled dataset to disk; np.save appends the ".npy" extension
# because the given file name does not already end with it.
np.save(file = "tomato_data_1000yes_2250no", arr = Data_shuffled)