## First create a folder named 'new_dataset', then run the following code

The dataset contains 15 class directories:

['Pepper__bell___Bacterial_spot', 'Pepper__bell___healthy', 'Potato___Early_blight',
'Potato___Late_blight', 'Potato___healthy', 'Tomato_Bacterial_spot', 'Tomato_Early_blight',
'Tomato_Late_blight', 'Tomato_Leaf_Mold', 'Tomato_Septoria_leaf_spot', 'Tomato__Target_Spot',
'Tomato_Spider_mites_Two_spotted_spider_mite', 'Tomato__Tomato_YellowLeaf__Curl_Virus', 
'Tomato__Tomato_mosaic_virus', 'Tomato_healthy']

```
new_dataset
│
├───train
│   │   15 class directories
└───test
    │   15 class directories
```

In [218]:
import pathlib
from glob import glob
import fnmatch
import os
import pandas as pd
import shutil

# Project root folder; every other path below is derived from it.
PATH = '/content/drive/MyDrive/Colab Notebooks/Kaggle_Project/Plant Disease'

# Original directory, which holds all images (one sub-folder per class).
MAIN_DATASET_PATH = PATH + '/dataset/PlantVillage/'

# New directory, which will contain the train and test datasets.
NEW_DATASET_PATH = PATH + '/sample/'

# Train and test directories inside the new dataset directory.
TRAIN_DATASET_PATH = NEW_DATASET_PATH + 'train/'
TEST_DATASET_PATH = NEW_DATASET_PATH + 'test/'


# Names of the class folders (used as labels).
CLASSES = [
    'Pepper__bell___Bacterial_spot',
    'Pepper__bell___healthy',
    'Potato___Early_blight',
    'Potato___Late_blight',
    'Potato___healthy',
    'Tomato_Bacterial_spot',
    'Tomato_Early_blight',
    'Tomato_Late_blight',
    'Tomato_Leaf_Mold',
    'Tomato_Septoria_leaf_spot',
    'Tomato__Target_Spot',
    'Tomato_Spider_mites_Two_spotted_spider_mite',
    'Tomato__Tomato_YellowLeaf__Curl_Virus',
    'Tomato__Tomato_mosaic_virus',
    'Tomato_healthy',
]

# Names of the two split folders created under NEW_DATASET_PATH.
TRAIN_TEST = ['test', 'train']

class CreateDataset:
    '''
    Build a train/test image dataset from a folder of per-class image folders.

    Workflow:
    1. Create folder structure for train and test (done on construction)
    2. Create dataframe of paths from the main dataset
    3. Copy up to `img_count` randomly sampled images of each class to the
       train folder

    4. Create dataframe of paths from the newly copied train images
    5. Move up to `img_count` randomly sampled images of each class to the
       test folder
    '''

    def __init__(self):
        # Uses the module-level TRAIN_TEST, NEW_DATASET_PATH and CLASSES.
        self.create_folder_struct(TRAIN_TEST, NEW_DATASET_PATH, CLASSES)

    # create folder structure for train and test
    def create_folder_struct(self, train_test, path, classes):
        '''
        Create `<path>/<split>/<class>` for every split and class name.

        train_test: iterable of split folder names (e.g. 'train', 'test')
        path:       root folder of the new dataset
        classes:    iterable of class folder names

        Missing parent folders (including `path` itself) are created and
        folders that already exist are left untouched.
        '''
        for tt in train_test:
            # The split folder is created even when `classes` is empty.
            os.makedirs(os.path.join(path, tt), exist_ok=True)
            for clas in classes:
                os.makedirs(os.path.join(path, tt, clas), exist_ok=True)

    # create dataframe containing path and class name (label)
    def create_path_df(self, temp_path):
        '''
        Return a DataFrame with columns 'Path' and 'Label'.

        temp_path: glob pattern matching the image files. 'Path' holds every
        matching file and 'Label' the name of the folder containing it.
        '''
        # Glob once; sorting makes the row order independent of the order in
        # which the file system happens to list the files.
        paths = sorted(glob(temp_path))
        labels = [os.path.basename(os.path.dirname(p)) for p in paths]
        return pd.DataFrame({'Path': paths, 'Label': labels}, columns=['Path', 'Label'])

    # path
    def temp_path_fun(self, path):
        '''
        Return the glob pattern `<path>/*/*.JPG` for the dataset root `path`.
        '''
        # NOTE(review): the pattern only lists the upper-case '.JPG'
        # extension; on a case-sensitive file system '.jpg'/'.png' files would
        # be skipped - confirm the dataset contains none.
        data_dir_train = pathlib.Path(path)
        temp_path = os.path.join(data_dir_train, '*', '*.JPG')
        return temp_path

    #  copy and move images to train/test folder
    def copy_move_images(self, original_df, destination_path, classes, img_count=580,
                         move=False, copy=False, random_state=None):
        '''
        Copy or move a random sample of images of each class.

        original_df:      DataFrame with 'Path' and 'Label' columns
        destination_path: folder that contains (or will contain) one
                          sub-folder per class; a trailing slash is optional
        classes:          class names (labels) to process
        img_count:        maximum number of images per class; when a class
                          has fewer images, all of them are taken
        move / copy:      choose exactly one operation; when both are False
                          nothing is transferred
        random_state:     optional seed forwarded to `DataFrame.sample` to
                          make the selection reproducible

        Raises ValueError when both `copy` and `move` are True.
        '''
        if copy and move:
            # Copying and then moving the same file into the same folder
            # fails on the first image, so reject the combination up front.
            raise ValueError("Choose either copy=True or move=True, not both")

        for cls in classes:
            destination = os.path.join(destination_path, cls)
            # Make sure the target is a folder; otherwise shutil would write
            # every image to a single file named after the class.
            os.makedirs(destination, exist_ok=True)

            df_cls = original_df[original_df.Label == cls]
            # Never ask for more images than the class actually has.
            no = min(img_count, len(df_cls))
            df_cls = df_cls.sample(n=no, random_state=random_state)

            for source in df_cls.Path:
                if copy:
                    shutil.copy(source, destination)
                elif move:
                    shutil.move(source, destination)
            print(f"Operation done for {cls}: {len(df_cls)}")

    def create_train_dataset(self, classes, img_count, random_state=None):
        '''
        Copy up to `img_count` images per class from MAIN_DATASET_PATH into
        TRAIN_DATASET_PATH. `random_state` optionally seeds the sampling.
        '''
        # 1. Create dataframe of paths from the main dataset
        path = MAIN_DATASET_PATH
        main_path = self.temp_path_fun(path)
        main_df = self.create_path_df(main_path)

        # 2. Copy the sampled images of each class to the train folder
        destination_path = TRAIN_DATASET_PATH
        self.copy_move_images(main_df, destination_path, classes, img_count=img_count,
                              move=False, copy=True, random_state=random_state)

    def create_test_dataset(self, classes, img_count, random_state=None):
        '''
        Move up to `img_count` images per class from TRAIN_DATASET_PATH into
        TEST_DATASET_PATH. `random_state` optionally seeds the sampling.
        '''
        # 3. Create dataframe of paths from the newly copied train images
        path = TRAIN_DATASET_PATH
        main_path = self.temp_path_fun(path)
        train_df = self.create_path_df(main_path)

        # 4. Move the sampled images of each class to the test folder
        destination_path = TEST_DATASET_PATH
        self.copy_move_images(train_df, destination_path, classes, img_count=img_count,
                              move=True, copy=False, random_state=random_state)


In [219]:
# create class object; the constructor also creates the train/test folder
# structure (one sub-folder per class) under NEW_DATASET_PATH
data_object = CreateDataset()

In [209]:
# create train dataset: copy up to 600 randomly sampled images of each class
# from the main dataset into the train folder
data_object.create_train_dataset(CLASSES, img_count=600)

Operation done for Pepper__bell___Bacterial_spot: 10
Operation done for Pepper__bell___healthy: 10
Operation done for Potato___Early_blight: 10
Operation done for Potato___Late_blight: 10
Operation done for Potato___healthy: 10
Operation done for Tomato_Bacterial_spot: 10
Operation done for Tomato_Early_blight: 10
Operation done for Tomato_Late_blight: 10
Operation done for Tomato_Leaf_Mold: 10
Operation done for Tomato_Septoria_leaf_spot: 10
Operation done for Tomato__Target_Spot: 10
Operation done for Tomato_Spider_mites_Two_spotted_spider_mite: 10
Operation done for Tomato__Tomato_YellowLeaf__Curl_Virus: 10
Operation done for Tomato__Tomato_mosaic_virus: 10
Operation done for Tomato_healthy: 10


In [210]:
# create test dataset: move up to 100 randomly sampled images of each listed
# class from the train folder into the test folder.
# Only 13 of the 15 classes are listed; 'Potato___healthy' and
# 'Tomato__Tomato_mosaic_virus' are left out of this call.
# NOTE(review): this rebinds the module-level CLASSES list, so any cell run
# afterwards (including re-creating CreateDataset) sees the shortened list.
CLASSES = ['Pepper__bell___Bacterial_spot', 'Pepper__bell___healthy', 'Potato___Early_blight',
'Potato___Late_blight', 'Tomato_Bacterial_spot', 'Tomato_Early_blight',
'Tomato_Late_blight', 'Tomato_Leaf_Mold', 'Tomato_Septoria_leaf_spot', 'Tomato__Target_Spot',
'Tomato_Spider_mites_Two_spotted_spider_mite', 'Tomato__Tomato_YellowLeaf__Curl_Virus',  'Tomato_healthy']

data_object.create_test_dataset(CLASSES, img_count=100)

Operation done for Pepper__bell___Bacterial_spot: 4
Operation done for Pepper__bell___healthy: 4
Operation done for Potato___Early_blight: 4
Operation done for Potato___Late_blight: 4
Operation done for Tomato_Bacterial_spot: 4
Operation done for Tomato_Early_blight: 4
Operation done for Tomato_Late_blight: 4
Operation done for Tomato_Leaf_Mold: 4
Operation done for Tomato_Septoria_leaf_spot: 4
Operation done for Tomato__Target_Spot: 4
Operation done for Tomato_Spider_mites_Two_spotted_spider_mite: 4
Operation done for Tomato__Tomato_YellowLeaf__Curl_Virus: 4
Operation done for Tomato_healthy: 4


In [211]:
# move up to 20 randomly sampled images of each of these two classes from the
# train folder into the test folder
# NOTE(review): rebinds the module-level CLASSES list to just these two names
CLASSES = ['Potato___healthy', 'Tomato__Tomato_mosaic_virus']
data_object.create_test_dataset(CLASSES, img_count=20)

Operation done for Potato___healthy: 4
Operation done for Tomato__Tomato_mosaic_virus: 4
