### Part 1: Unzip the data and split it into train/val/test set

Since the dataset is zipped, we first need to unzip it using the [zipfile library](https://docs.python.org/3/library/zipfile.html).

We can use the [tqdm library](https://tqdm.github.io/), which is awesome, to show a progress bar while the files are being extracted.



In [1]:
from zipfile import ZipFile

from tqdm import tqdm

# Extract every member of the archive into the current working directory,
# showing a progress bar that advances once per extracted member.
with ZipFile('dataset-resized.zip') as zip_file:

    # Read the member list once; it drives both the loop and the bar's total
    members = zip_file.namelist()

    for member in tqdm(iterable=members, total=len(members)):

        # No path argument: extract() defaults to the current working directory
        zip_file.extract(member=member)


100%|██████████| 2540/2540 [00:02<00:00, 1014.43it/s]


### There are 6 classes (categories) of waste in this dataset:
`cardboard`,`glass`,`metal`,`paper`,`plastic`,`trash`.

***The dataset will be split roughly 70% / 15% / 15% into training, validation, and test sets.***

***The following code is adapted from the [blog post](https://towardsdatascience.com/how-to-build-an-image-classifier-for-waste-sorting-6d11d3c9c478).***

In [2]:
## helper functions ##
import os
import shutil
import random

def split_indices(folder,seed1,seed2):
    """Randomly split the indices 1..n of a folder into train, valid, and test lists.

    ``n`` is the number of entries ``os.listdir(folder)`` reports; every
    entry counts, whatever its name.

    Args:
        folder: path of the directory whose entries are counted.
        seed1: seed for the draw that picks the training indices.
        seed2: seed for the draw that picks the validation indices.

    Returns:
        A ``(train, valid, test)`` tuple of lists of 1-based indices:
        ``int(.7*n)`` training indices, half of the remainder (rounded
        down) for validation, and everything left over for test.
    """
    n = len(os.listdir(folder))
    full_set = list(range(1, n + 1))

    ## train indices
    ## A private generator seeded with seed1 yields the same draw that
    ## random.seed(seed1) followed by random.sample would, but leaves the
    ## module-level random state untouched for the rest of the program.
    train = random.Random(seed1).sample(full_set, int(.7 * n))

    ## indices not chosen for training
    ## NOTE: the order of this list comes from set iteration, and the
    ## validation draw below depends on that order.
    remain = list(set(full_set) - set(train))

    ## separate remaining into validation and test (15% for valid and another 15% for test)
    valid = random.Random(seed2).sample(remain, int(.5 * len(remain)))
    test = list(set(remain) - set(valid))

    return (train, valid, test)

def get_names(waste_type,indices):
    """Build the ``.jpg`` file names for one waste category.

    Args:
        waste_type: category name used as the file-name prefix.
        indices: iterable of indices; each is appended to the prefix.

    Returns:
        A list with one ``<waste_type><index>.jpg`` name per index, in the
        order the indices were given.
    """
    extension = ".jpg"
    file_names = []
    for index in indices:
        file_names.append(waste_type + str(index) + extension)
    return file_names

def move_files(source_files,destination_folder):
    """Move each of the given source files to the destination folder.

    Args:
        source_files: iterable of paths to move, handled in order.
        destination_folder: target passed to ``shutil.move`` for every file.

    Returns:
        None.
    """
    for source_path in source_files:
        shutil.move(source_path, destination_folder)

In [3]:
## paths will be data/train/cardboard, data/train/glass, etc...
data_split = ['train','valid','test']
waste_classes = ['cardboard','glass','metal','paper','plastic','trash']

## create destination folders for train, valid, test and each waste type
## (exist_ok=True leaves a folder that is already there untouched; creating
## data/<split>/<class> also creates data/<split>, so no separate step is
## needed for data/test)
for split in data_split:
    for waste_class in waste_classes:
        os.makedirs(os.path.join('data',split,waste_class), exist_ok=True)

## move files to destination folders for each waste class
for waste_class in waste_classes:
    source_folder = os.path.join('dataset-resized',waste_class)
    train_ind, valid_ind, test_ind = split_indices(source_folder,1,1)

    ## data_split is ordered train, valid, test, so it pairs up with the
    ## index lists in the same order
    for split, indices in zip(data_split,(train_ind,valid_ind,test_ind)):
        names = get_names(waste_class,indices)
        source_files = [os.path.join(source_folder,name) for name in names]
        move_files(source_files,os.path.join('data',split,waste_class))