In [1]:
import numpy as np
import pandas as pd
import os
import cv2
import sys
from tqdm import tqdm

In [3]:
# returns no. of files in the directory
def scan_dir(label, path):
    dirs = os.listdir( path )
    # print the files in given directory
    return dirs

In [5]:
# training path
path = 'dataset//train//'
dir_list = []
# shows the distribution of images of different categories
for i in range(15):
    label = str(i)
    dirs = scan_dir(label, path + label)
    print(label + ': ' + str(len(dirs)))
    dir_list.append(len(dirs))

0: 784
1: 6186
2: 7936
3: 8203
4: 8835
5: 683
6: 5083
7: 1248
8: 3752
9: 5301
10: 4560
11: 1248
12: 4848
13: 2935
14: 656


In [11]:
np.sum(dir_list)

62258

In [36]:
# training path
#path = 'dataset//train//'

min_h = 999999
min_w = 999999
# find out the minimum dimension out of all images
for i in range(15):
    label = str(i)
    dirs = scan_dir(label, path + label)
    
    # iterate through files
    for i in tqdm(range(len(dirs))):
        filename = dirs[i]
        img = cv2.imread(path + label + '//' + filename, -1)
        h, w, _ = img.shape
        # update minimum height and width if necessary
        if h < min_h:
            min_h = h
        if w < min_w:
            min_w = w
    print(label + ' done.')       
print(label + ': ' + 'min h: ' + str(min_h) + 'min w: ' + str(min_w) )

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 784/784 [00:00<00:00, 909.50it/s]


0 done.


100%|███████████████████████████████████████████████████████████████████████████████████████████| 6186/6186 [00:18<00:00, 335.93it/s]


1 done.


100%|████████████████████████████████████████████████████████████████████████████████████████████| 7936/7936 [01:35<00:00, 82.87it/s]


2 done.


100%|████████████████████████████████████████████████████████████████████████████████████████████| 8203/8203 [01:40<00:00, 81.96it/s]


3 done.


100%|████████████████████████████████████████████████████████████████████████████████████████████| 8835/8835 [01:37<00:00, 90.95it/s]


4 done.


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 683/683 [00:07<00:00, 96.13it/s]


5 done.


100%|████████████████████████████████████████████████████████████████████████████████████████████| 5083/5083 [01:07<00:00, 75.34it/s]


6 done.


100%|████████████████████████████████████████████████████████████████████████████████████████████| 1248/1248 [00:20<00:00, 59.74it/s]


7 done.


100%|████████████████████████████████████████████████████████████████████████████████████████████| 3752/3752 [00:51<00:00, 73.18it/s]


8 done.


100%|████████████████████████████████████████████████████████████████████████████████████████████| 5301/5301 [01:06<00:00, 79.45it/s]


9 done.


100%|████████████████████████████████████████████████████████████████████████████████████████████| 4560/4560 [01:16<00:00, 59.57it/s]


10 done.


100%|████████████████████████████████████████████████████████████████████████████████████████████| 1248/1248 [00:19<00:00, 63.49it/s]


11 done.


100%|████████████████████████████████████████████████████████████████████████████████████████████| 4848/4848 [01:24<00:00, 57.69it/s]


12 done.


100%|████████████████████████████████████████████████████████████████████████████████████████████| 2935/2935 [00:54<00:00, 53.50it/s]


13 done.


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 656/656 [00:16<00:00, 40.73it/s]


14 done.
14: min h: 92min w: 92


## Correcting Data Distribution
Data distribution is not good and so we will generate images to make it less skewed. Moreover we will create a validation dataset by augmenting images from Training.

In [54]:
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

datagen = ImageDataGenerator(
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest')

In [55]:
# generates given number of images for an image
def generate_image(label, filename, samples_per_img, dest):
    img = load_img('dataset/train//' + label + '//' + filename)  # this is a PIL image
    x = img_to_array(img)  
    x = x.reshape((1,) + x.shape)  
    i = 0
    for batch in datagen.flow(x, batch_size=1,
                              save_to_dir=dest + '//' + label + '//', save_prefix='sample', save_format='jpg'):
        i += 1
        if i > samples_per_img-1:
            break  # otherwise the generator would loop indefinitely

In [56]:
# now we will augment and save images for classes with less data
len(dir_list)

15

So we will augment data as following:<br>
0--> 5000<br>
1--> 0<br>
2--> 0<br>
3--> 0<br>
4 --> 0<br>
5 --> 5000<br>
6--> 1000<br>
7--> 5000<br>
8--> 3000<br>
9--> 1000<br>
10--> 5000<br>
11--> 2000<br>
12--> 3000<br>
13--> 5000<br>

In [63]:
# training path
path = 'dataset//train//'

dir_list = []
# iterate through different folders and get list of files in each folder
for i in range(15):
    label = str(i)
    dirs = scan_dir(label, path + label)
    dir_list.append(dirs)

In [64]:
# for augmenting images from a folder
def augment_images(dir_list, dest, samples_per_img, req_img):
    # generate required images for each folder
    for i, curr_dir in tqdm(enumerate(dir_list)):
        print(i)
        label = str(i)
        for _,filename in tqdm(enumerate(curr_dir)):
            if req_img[i] <= 0:
                break
            generate_image(label, filename, samples_per_img, dest)
            req_img[i] = req_img[i] - samples_per_img


0it [00:00, ?it/s]

0




0it [00:00, ?it/s]

2it [00:00, 18.03it/s]

4it [00:00, 17.70it/s]

6it [00:00, 17.29it/s]

8it [00:00, 17.62it/s]

11it [00:00, 18.54it/s]

13it [00:00, 17.26it/s]

15it [00:00, 16.98it/s]

17it [00:01, 16.68it/s]

19it [00:01, 16.36it/s]

21it [00:01, 15.06it/s]

24it [00:01, 15.54it/s]

26it [00:01, 15.59it/s]

28it [00:01, 15.51it/s]

30it [00:01, 15.31it/s]

32it [00:02, 15.21it/s]

34it [00:02, 15.35it/s]

36it [00:02, 15.04it/s]

38it [00:02, 14.66it/s]

40it [00:02, 14.66it/s]

42it [00:02, 14.81it/s]

44it [00:02, 14.78it/s]

46it [00:03, 14.86it/s]

49it [00:03, 15.12it/s]


1it [00:03,  3.40s/it]

1




0it [00:00, ?it/s]

1it [00:00,  6.66it/s]

3it [00:00, 11.06it/s]

6it [00:00, 15.10it/s]
Exception in thread Thread-8:
Traceback (most recent call last):
  File "C:\Users\SUSANTA\Anaconda3\envs\tensorflow\lib\threading.py", line 916, in _bootstrap_inner
    self.run()
  File "C:\Users\SUSANTA\Anaconda3\envs\tensorflow\lib\site-packages\tqdm\_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "C:\Users\SUSANTA\Anaconda3\envs\tensorflow\lib\_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration


8it [00:00, 14.75it/s]
10it [00:00, 13.48it/s]
12it [00:01, 10.94it/s]
14it [00:01, 11.68it/s]
16it [00:01, 12.04it/s]
18it [00:01, 12.59it/s]
21it [00:01, 13.41it/s]
23it [00:01, 13.40it/s]
25it [00:01, 13.53it/s]
28it [00:02, 13.73it/s]
30it [00:02, 13.36it/s]
33it [00:02, 13.80it/s]
35it [00:02, 13.73it/s]
37it [00:02, 13.41it/s]
39it [00:02, 13.47it/s]
41it [00:03, 13.59it/s]
43it [00:03, 13.66it/s]

2



0it [00:00, ?it/s]
2it [00:00, 16.87it/s]
4it [00:00, 15.88it/s]
6it [00:00, 16.57it/s]
9it [00:00, 17.91it/s]
11it [00:00, 18.08it/s]
13it [00:00, 17.55it/s]
15it [00:00, 17.23it/s]
17it [00:01, 16.56it/s]
19it [00:01, 16.28it/s]
21it [00:01, 15.74it/s]
24it [00:01, 16.44it/s]
26it [00:01, 16.45it/s]
28it [00:01, 16.38it/s]
30it [00:01, 15.82it/s]
33it [00:02, 16.23it/s]
36it [00:02, 16.39it/s]
38it [00:02, 15.14it/s]
40it [00:02, 15.04it/s]
42it [00:02, 14.36it/s]
45it [00:03, 14.72it/s]
47it [00:03, 14.56it/s]
49it [00:03, 14.66it/s]
3it [00:10,  3.49s/it]

3



0it [00:00, ?it/s]
3it [00:00, 19.69it/s]
5it [00:00, 18.49it/s]
7it [00:00, 18.91it/s]
10it [00:00, 21.08it/s]
12it [00:00, 20.25it/s]
14it [00:00, 19.43it/s]
16it [00:00, 18.53it/s]
18it [00:01, 16.75it/s]
20it [00:01, 16.90it/s]
22it [00:01, 16.95it/s]
24it [00:01, 16.82it/s]
26it [00:01, 16.62it/s]
28it [00:01, 15.91it/s]
30it [00:01, 15.06it/s]
33it [00:02, 15.43it/s]
36it [00:02, 15.75it/s]
38it [00:02, 15.64it/s]
40it [00:02, 15.50it/s]
42it [00:02, 15.32it/s]
44it [00:02, 15.39it/s]
46it [00:02, 15.46it/s]
48it [00:03, 15.45it/s]
50it [00:03, 14.68it/s]
4it [00:13,  3.47s/it]

4



0it [00:00, ?it/s]
3it [00:00, 27.52it/s]
6it [00:00, 25.63it/s]
8it [00:00, 18.86it/s]
11it [00:00, 20.17it/s]
13it [00:00, 20.03it/s]
16it [00:00, 20.79it/s]
19it [00:00, 21.05it/s]
22it [00:01, 20.34it/s]
25it [00:01, 20.61it/s]
28it [00:01, 20.79it/s]
31it [00:01, 20.82it/s]
34it [00:01, 20.54it/s]
37it [00:01, 20.52it/s]
40it [00:01, 20.63it/s]
43it [00:02, 20.61it/s]
46it [00:02, 20.64it/s]
49it [00:02, 20.49it/s]
5it [00:16,  3.27s/it]

5



0it [00:00, ?it/s]
3it [00:00, 23.98it/s]
6it [00:00, 24.98it/s]
9it [00:00, 23.19it/s]
11it [00:00, 22.40it/s]
13it [00:00, 17.30it/s]
15it [00:00, 17.15it/s]
17it [00:01, 16.89it/s]
19it [00:01, 15.49it/s]
21it [00:01, 14.35it/s]
23it [00:01, 12.22it/s]
25it [00:01, 12.54it/s]
27it [00:02, 12.64it/s]
29it [00:02, 12.54it/s]
31it [00:02, 12.53it/s]
33it [00:02, 12.82it/s]
35it [00:02, 13.03it/s]
37it [00:02, 12.97it/s]
39it [00:02, 13.21it/s]
41it [00:03, 13.33it/s]
43it [00:03, 13.27it/s]
45it [00:03, 13.30it/s]
47it [00:03, 12.84it/s]
49it [00:03, 12.59it/s]
6it [00:20,  3.40s/it]

6



0it [00:00, ?it/s]
1it [00:00,  5.46it/s]
3it [00:00,  8.66it/s]
6it [00:00, 12.16it/s]
8it [00:00, 13.41it/s]
11it [00:00, 15.76it/s]
13it [00:00, 16.04it/s]
16it [00:00, 16.83it/s]
18it [00:01, 16.89it/s]
21it [00:01, 17.42it/s]
23it [00:01, 17.49it/s]
26it [00:01, 15.60it/s]
28it [00:01, 15.45it/s]
30it [00:02, 13.75it/s]
34it [00:02, 14.72it/s]
36it [00:02, 14.93it/s]
39it [00:02, 15.51it/s]
42it [00:02, 15.29it/s]
44it [00:03, 14.29it/s]
47it [00:03, 14.65it/s]
50it [00:03, 14.98it/s]
7it [00:23,  3.39s/it]

7



0it [00:00, ?it/s]
3it [00:00, 19.34it/s]
5it [00:00, 17.97it/s]
6it [00:00, 15.74it/s]
8it [00:00, 16.05it/s]
10it [00:00, 14.98it/s]
12it [00:00, 15.39it/s]
14it [00:00, 15.17it/s]
17it [00:01, 11.47it/s]
19it [00:01, 10.64it/s]
21it [00:01, 10.85it/s]
23it [00:02, 10.19it/s]
24it [00:02, 10.03it/s]
25it [00:02,  9.89it/s]
26it [00:02,  9.87it/s]
28it [00:02,  9.71it/s]
29it [00:02,  9.71it/s]
31it [00:03,  9.81it/s]
33it [00:03, 10.07it/s]
35it [00:03, 10.23it/s]
37it [00:03, 10.43it/s]
39it [00:03, 10.02it/s]
41it [00:04, 10.04it/s]
43it [00:04, 10.06it/s]
45it [00:04, 10.21it/s]
47it [00:04, 10.30it/s]
49it [00:04, 10.43it/s]
8it [00:28,  3.57s/it]

8



0it [00:00, ?it/s]
1it [00:00,  9.80it/s]
3it [00:00, 12.87it/s]
5it [00:00, 12.10it/s]
7it [00:00, 12.65it/s]
9it [00:00, 13.07it/s]
11it [00:00, 12.65it/s]
13it [00:01, 12.71it/s]
15it [00:01, 12.62it/s]
17it [00:01, 12.93it/s]
20it [00:01, 13.95it/s]
23it [00:01, 14.79it/s]
25it [00:01, 15.10it/s]
27it [00:02, 12.44it/s]
29it [00:02, 12.28it/s]
31it [00:02, 12.42it/s]
33it [00:02, 12.26it/s]
36it [00:02, 12.73it/s]
39it [00:03, 11.61it/s]
41it [00:03, 11.80it/s]
43it [00:03, 11.68it/s]
45it [00:03, 11.72it/s]
47it [00:03, 11.82it/s]
49it [00:04, 12.02it/s]
9it [00:32,  3.64s/it]

9



0it [00:00, ?it/s]
2it [00:00,  9.79it/s]
4it [00:00, 11.13it/s]
5it [00:00, 10.18it/s]
7it [00:00, 10.88it/s]
9it [00:00, 11.32it/s]
12it [00:00, 12.75it/s]
14it [00:01, 12.83it/s]
16it [00:01, 12.73it/s]
19it [00:01, 13.64it/s]
22it [00:01, 14.13it/s]
24it [00:01, 14.25it/s]
27it [00:01, 14.76it/s]
29it [00:01, 15.00it/s]
32it [00:02, 15.06it/s]
34it [00:02, 15.14it/s]
36it [00:02, 14.78it/s]
38it [00:02, 14.90it/s]
40it [00:02, 15.05it/s]
43it [00:02, 14.95it/s]
45it [00:03, 14.73it/s]
47it [00:03, 14.56it/s]
49it [00:03, 14.72it/s]
10it [00:36,  3.61s/it]

10



0it [00:00, ?it/s]
1it [00:00,  9.00it/s]
3it [00:00, 10.37it/s]
5it [00:00,  9.48it/s]
8it [00:00, 11.88it/s]
10it [00:00, 12.88it/s]
12it [00:00, 13.66it/s]
15it [00:01, 14.99it/s]
17it [00:01, 14.75it/s]
19it [00:01, 14.73it/s]
21it [00:01, 14.51it/s]
23it [00:01, 14.48it/s]
25it [00:01, 14.70it/s]
27it [00:01, 14.79it/s]
29it [00:01, 14.95it/s]
31it [00:02, 15.05it/s]
33it [00:02, 14.74it/s]
35it [00:02, 14.78it/s]
37it [00:02, 14.70it/s]
40it [00:02, 15.13it/s]
43it [00:02, 15.45it/s]
46it [00:02, 15.73it/s]
48it [00:03, 15.76it/s]
50it [00:03, 15.53it/s]
11it [00:39,  3.58s/it]

11



0it [00:00, ?it/s]
3it [00:00, 20.52it/s]
4it [00:00, 16.21it/s]
6it [00:00, 16.49it/s]
9it [00:00, 16.46it/s]
11it [00:00, 16.87it/s]
13it [00:00, 16.81it/s]
15it [00:00, 17.09it/s]
17it [00:01, 15.94it/s]
19it [00:01, 15.36it/s]
21it [00:01, 15.23it/s]
23it [00:01, 14.91it/s]
25it [00:02, 12.25it/s]
27it [00:02, 12.35it/s]
29it [00:02, 12.50it/s]
31it [00:02, 12.18it/s]
33it [00:02, 12.06it/s]
35it [00:02, 12.09it/s]
37it [00:03, 11.95it/s]
40it [00:03, 12.22it/s]
42it [00:03, 12.30it/s]
44it [00:03, 12.42it/s]
46it [00:03, 12.53it/s]
48it [00:03, 12.44it/s]
50it [00:04, 12.47it/s]
12it [00:43,  3.62s/it]

12



0it [00:00, ?it/s]
2it [00:00, 18.51it/s]
4it [00:00, 17.01it/s]
6it [00:00, 17.13it/s]
8it [00:00, 15.92it/s]
10it [00:00, 15.45it/s]
12it [00:00, 14.99it/s]
15it [00:00, 16.00it/s]
17it [00:01, 15.57it/s]
19it [00:01, 15.74it/s]
21it [00:01, 15.63it/s]
23it [00:01, 14.54it/s]
25it [00:02, 11.86it/s]
27it [00:02, 12.17it/s]
29it [00:02, 12.28it/s]
32it [00:02, 12.80it/s]
34it [00:02, 12.97it/s]
37it [00:02, 13.43it/s]
39it [00:02, 13.57it/s]
42it [00:02, 14.07it/s]
45it [00:03, 14.49it/s]
48it [00:03, 14.61it/s]
13it [00:46,  3.60s/it]

13



0it [00:00, ?it/s]
2it [00:00, 14.17it/s]
4it [00:00, 16.38it/s]
7it [00:00, 18.84it/s]
10it [00:00, 19.97it/s]
12it [00:00, 16.95it/s]
14it [00:00, 17.15it/s]
17it [00:00, 17.91it/s]
20it [00:01, 18.77it/s]
23it [00:01, 19.52it/s]
26it [00:01, 19.54it/s]
29it [00:01, 19.20it/s]
32it [00:01, 18.87it/s]
34it [00:01, 18.87it/s]
37it [00:01, 19.05it/s]
40it [00:02, 19.31it/s]
43it [00:02, 19.27it/s]
46it [00:02, 18.10it/s]
48it [00:02, 17.47it/s]
14it [00:49,  3.55s/it]

14



0it [00:00, ?it/s]



IndexError: list index out of range

## For creating better data distribution

In [None]:
# no. of aygmented images to be generated from each image
samples_per_img = 8
# destination folder
dest = ''
# required no. of images for each class
req_img = [5000, 0, 0, 0, 0, 5000, 1000, 5000, 3000, 1000, 5000, 2000, 3000, 5000]

for i in range(15):
    augment_images(dir_list, dest, samples_per_img, req_img)

## For creating validation set

In [None]:
# no. of aygmented images to be generated from each image
samples_per_img = 8
# destination folder
dest = 'validation'
# required no. of images for each class
req_img = [500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500]

augment_images(dir_list, dest, samples_per_img, req_img)

In [None]:
# counts total number of files 
def count_files(path):
    # total files
    total = 0
    # iterate through different folders and get list of files in each folder
    for i in range(15):
        label = str(i)
        dirs = scan_dir(label, path + label)
        total += len(dirs)
    return total

In [None]:
# training path
path_train = 'dataset//train//'
# validation path
path_val = 'dataset//train//'

total = count_files(path_train)
print('Total Training images: ' + str(total))
total = count_files(path_val)
print('Total Validation images: ' + str(total))