# Download Dataset and arrange the data into folders

The purpose of this notebook is to download the dataset and arrange them properly into folders so that we can easily use them for image augmentation.

In [None]:
import os
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import keras

In [25]:
os.environ['PYTHONHASHSEED'] = '0'

In [None]:
# setting the seed for numpy
np.random.seed(2108)

In [None]:
# setting the seed for python random numbers
random.seed(2108)

In [None]:
# setting the seed for tensorflow
tf.set_random_seed(2108)

In [None]:
from keras import backend as K

# Force tensorflow to use a single thread
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph = tf.get_default_graph(), config=session_conf)
K.set_session(sess)

## Downloading the data from kaggle
No need to do this if data is downloaded manually. I used this for downloading data into google colab.

Upload the kaggle.json file 

In [None]:
from google.colab import files
files.upload()

Install kaggle api if it is not already installed

In [0]:
!pip install -q kaggle

Doing some arrangements for using kaggle api. Basically, we would place kaggle.json file in directory so that kaggle script could be used to download the data

In [0]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /content/.kaggle/kaggle.json

Now we download the data using the kaggle api

In [4]:
!kaggle competitions download -c invasive-species-monitoring -w

Downloading test.7z to .
100%|██████████████████████████████████████▉| 1.14G/1.14G [00:09<00:00, 111MB/s]
100%|███████████████████████████████████████| 1.14G/1.14G [00:09<00:00, 124MB/s]
Downloading train.7z to .
100%|█████████████████████████████████████▉| 1.98G/1.98G [00:23<00:00, 49.1MB/s]
100%|██████████████████████████████████████| 1.98G/1.98G [00:23<00:00, 89.3MB/s]
Downloading train_labels.csv.zip to .
  0%|                                               | 0.00/6.19k [00:00<?, ?B/s]
100%|██████████████████████████████████████| 6.19k/6.19k [00:00<00:00, 9.45MB/s]
Downloading sample_submission.csv.zip to .
  0%|                                               | 0.00/4.18k [00:00<?, ?B/s]
100%|██████████████████████████████████████| 4.18k/4.18k [00:00<00:00, 5.40MB/s]


## Extracting and moving the files into directories

Do not run this section if not working in google colab.

In [1]:
import os
# os.mkdir('input')
# os.mkdir('input/test')
# os.mkdir('input/train')

Now we extract the files and move the files into their respective directories for better organization

In [2]:
!7z e test.7z -oinput/test -r

'7z' is not recognized as an internal or external command,
operable program or batch file.


In [7]:
!7z e train.7z -oinput/train -r


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.30GHz (306F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Sca        1 file, 2126278821 bytes (2028 MiB)

Extracting archive: train.7z
--
Path = train.7z
Type = 7z
Physical Size = 2126278821
Headers Size = 24792
Method = LZMA2:24
Solid = +
Blocks = 2



      0%      0% 4 - train/1000.jp                        0% 8 - train/1004.jp                        0% 13 - train/1009.j                        0% 14 - train/101.jp                        0% 17 - train/1012.j                        1% 21 - train/1016.j                        1% 25 - train/102.jp                        1% 29 - train/1023.j                        1% 33 - train/1027.j                        1% 37 - train/1030.j                        1% 38 - train/1031.j                        1% 41 - train/1034.j                        2% 45 - train/1038.j                        2% 49 - train/1041.j                        2% 53 - train/1045.j                        2% 58 - train/105.jp                        2% 59 - train/1050.j                        2% 62 - train/1053.j                        3% 66 - train/1057.j                        3% 71 - train/1061.j                        3% 75 - train/1065.j                        3% 79 - train/1069.j                        3% 80 - train/107.

In [8]:
!7z e train_labels.csv.zip -oinput -r


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.30GHz (306F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 6341 bytes (7 KiB)

Extracting archive: train_labels.csv.zip
--
Path = train_labels.csv.zip
Type = zip
Physical Size = 6341

  0%    Everything is Ok

Folders: 1
Files: 2
Size:       17836
Compressed: 6341


In [5]:
os.listdir('input')

['sample_submission.csv', 'test', 'train', 'train_labels.csv']

## Preparing the training dataset
Here we will organize the images by their label name into folders. This way we can use keras for easy data augmentation.

In [8]:
import os
import pandas as pd
import shutil

TRAIN_DIR = 'input/train'
INVASIVE_DIR = 'input/train/invasive/'
NON_INVASIVE_DIR = 'input/train/non-invasive/'

os.mkdir(INVASIVE_DIR)
os.mkdir(NON_INVASIVE_DIR)

df = pd.read_csv('input/train_labels.csv')

for img in os.listdir(TRAIN_DIR):
    PATH = os.path.join(TRAIN_DIR, img)
    if os.path.isdir(PATH):
        continue
    label = df.invasive[int(img.split('.')[0])-1]
    if label == 0:
        shutil.copy2(PATH, NON_INVASIVE_DIR)
    elif label == 1:
        shutil.copy2(PATH, INVASIVE_DIR)

In [10]:
print(len(os.listdir(NON_INVASIVE_DIR))+len(os.listdir(INVASIVE_DIR)))
print(len(os.listdir(NON_INVASIVE_DIR)))
print(len(os.listdir(INVASIVE_DIR)))

2295
847
1448


In [11]:
os.listdir('input/train')

['1.jpg',
 '10.jpg',
 '100.jpg',
 '1000.jpg',
 '1001.jpg',
 '1002.jpg',
 '1003.jpg',
 '1004.jpg',
 '1005.jpg',
 '1006.jpg',
 '1007.jpg',
 '1008.jpg',
 '1009.jpg',
 '101.jpg',
 '1010.jpg',
 '1011.jpg',
 '1012.jpg',
 '1013.jpg',
 '1014.jpg',
 '1015.jpg',
 '1016.jpg',
 '1017.jpg',
 '1018.jpg',
 '1019.jpg',
 '102.jpg',
 '1020.jpg',
 '1021.jpg',
 '1022.jpg',
 '1023.jpg',
 '1024.jpg',
 '1025.jpg',
 '1026.jpg',
 '1027.jpg',
 '1028.jpg',
 '1029.jpg',
 '103.jpg',
 '1030.jpg',
 '1031.jpg',
 '1032.jpg',
 '1033.jpg',
 '1034.jpg',
 '1035.jpg',
 '1036.jpg',
 '1037.jpg',
 '1038.jpg',
 '1039.jpg',
 '104.jpg',
 '1040.jpg',
 '1041.jpg',
 '1042.jpg',
 '1043.jpg',
 '1044.jpg',
 '1045.jpg',
 '1046.jpg',
 '1047.jpg',
 '1048.jpg',
 '1049.jpg',
 '105.jpg',
 '1050.jpg',
 '1051.jpg',
 '1052.jpg',
 '1053.jpg',
 '1054.jpg',
 '1055.jpg',
 '1056.jpg',
 '1057.jpg',
 '1058.jpg',
 '1059.jpg',
 '106.jpg',
 '1060.jpg',
 '1061.jpg',
 '1062.jpg',
 '1063.jpg',
 '1064.jpg',
 '1065.jpg',
 '1066.jpg',
 '1067.jpg',
 '1068.jpg'

## Prepare the Validation dataset
Now we will prepare the validation dataset for our models. Here we will separate 200 images of each type, invasive and non-invasive, randomly, into our validation folder. The validation folder has two sub folders, names 'invasive' and 'non-invasive'. Same as our train folder

In [12]:
os.listdir()

['.ipynb_checkpoints',
 'basic_cnn.ipynb',
 'download',
 'Download_dataset_and_arrange.ipynb',
 'improved_cnn.ipynb',
 'input',
 'preprocessing_and_exploration.ipynb']

In [13]:
import random

VALIDATION_INVASIVE = 'input/validation/invasive/'
os.mkdir('input/validation/')
os.mkdir(VALIDATION_INVASIVE)

random.seed(2108)
images = random.sample(os.listdir(INVASIVE_DIR),200)

try:
    for img in images:
        PATH = os.path.join(INVASIVE_DIR, img)
        shutil.copy2(PATH, VALIDATION_INVASIVE)
except:
    print("Unexpected error. Retry.")
finally:
    for img in images:
        PATH = os.path.join(INVASIVE_DIR, img)
        os.remove(PATH)

In [14]:
print(len(os.listdir(NON_INVASIVE_DIR))+len(os.listdir(INVASIVE_DIR)))
print(len(os.listdir(NON_INVASIVE_DIR)))
print(len(os.listdir(INVASIVE_DIR)))

2095
847
1248


In [15]:
VALIDATION_NON_INVASIVE = 'input/validation/non-invasive/'
os.mkdir(VALIDATION_NON_INVASIVE)
random.seed(2108)
images = random.sample(os.listdir(NON_INVASIVE_DIR),200)

try:
    for img in images:
        PATH = os.path.join(NON_INVASIVE_DIR, img)
        shutil.copy2(PATH, VALIDATION_NON_INVASIVE)
except:
    print("Unexpected error. Retry.")
finally:
    for img in images:
        PATH = os.path.join(NON_INVASIVE_DIR, img)
        os.remove(PATH)

In [16]:
print(len(os.listdir(NON_INVASIVE_DIR))+len(os.listdir(INVASIVE_DIR)))
print(len(os.listdir(NON_INVASIVE_DIR)))
print(len(os.listdir(INVASIVE_DIR)))

1895
647
1248


In [17]:
print(len(os.listdir(VALIDATION_INVASIVE))+len(os.listdir(VALIDATION_NON_INVASIVE)))
print(len(os.listdir(VALIDATION_INVASIVE)))
print(len(os.listdir(VALIDATION_NON_INVASIVE)))

400
200
200


In [21]:
!rm input/train/*.jpg

In [22]:
os.listdir('input/train')

['invasive', 'non-invasive']

In [23]:
# os.rmdir('input/test/test')

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'input/test/test'

In [24]:
# os.rmdir('input/train/train')

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'input/train/train'