In [0]:
# Mount Google Drive inside the Colab VM; the project files are then reachable under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Every cell in this notebook has already been run, so there is no need to run any of it again.
Just start with one of the ModelStub notebooks to work with either the small dataset or the full dataset.

In [0]:
from os import getcwd, chdir
from pathlib import Path
# Project root on the mounted shared drive. It is kept with a trailing slash so that
# code appending a name by plain string concatenation still yields a valid path.
path = '/content/gdrive/Shared drives/IDS594/'
# Make the project root the working directory so relative file names resolve there.
chdir(path)
# Echo the working directory as a quick confirmation that the chdir succeeded.
print(getcwd())


/content/gdrive/Shared drives/IDS594


In [0]:
# This was our original unzipping method, until we shifted to using shell commands
# for increased performance.

from zipfile import ZipFile

# Names of the archives to unpack (without the .zip suffix).
# file_names = ['seg_train', 'seg_test', 'seg_pred']
file_names = ['seg_train', 'seg_test']

# Extract each archive only when its target directory does not exist yet.
# The archive location and the extraction target are both anchored at `path`, so the
# existence check and the extraction always refer to the same place, even when the
# kernel's working directory has been changed since the setup cell ran.
for file_name in file_names:
  target_dir = Path(path, file_name)
  if not target_dir.exists():
    # Bound to `archive` rather than `zip`: a top-level `as zip` would replace the
    # built-in zip() for every cell that runs afterwards.
    with ZipFile(str(Path(path, file_name + '.zip')), 'r') as archive:
      # Assumes each archive contains a top-level folder named after itself
      # (seg_train.zip -> seg_train/) -- TODO confirm.
      archive.extractall(path)

## Create Small Data
This data set will be used for faster model iteration and hyperparameter tuning, before shifting to the full data set for final fine-tuning.

In [0]:
# Collect every entry two levels below seg_train (<class folder>/<file>) as pathlib paths.
data_root = Path(path, 'seg_train')
all_image_paths = list(data_root.glob('*/*'))
print(all_image_paths[0])

# Class names are the names of the sub-directories of seg_train, in sorted order.
label_names = [folder.name for folder in data_root.glob('*/') if folder.is_dir()]
label_names.sort()
print(label_names)

# Map each class name to its position in the sorted list.
label_to_index = {name: index for index, name in enumerate(label_names)}
print(label_to_index)

# One integer label per path, looked up from the name of the folder that contains it.
all_image_labels = [label_to_index[image_path.parent.name]
                    for image_path in all_image_paths]

# The distinct label values that actually occur.
print(set(all_image_labels))

In [0]:
# Build the small_data tree: train / validation / test, each with one folder per class.
# pathlib is used rather than os because Path objects expose every part of a path
# (parent directory, file extension, file name without extension, ...) through
# attributes, with no string matching required.

base_dir = Path(path, 'small_data')
train_dir = base_dir / 'train'
validation_dir = base_dir / 'validation'
test_dir = base_dir / 'test'

# Parents first; exist_ok makes the cell safe to re-run.
for directory in (base_dir, train_dir, validation_dir, test_dir):
  directory.mkdir(exist_ok=True)

# One class folder inside each split.
for label in label_names:
  for split_name in ('train', 'test', 'validation'):
    (base_dir / split_name / label).mkdir(exist_ok=True)


In [0]:
# Copying with shutil turned out to be very slow compared to shell commands.

import shutil

# Images taken per class, so every class contributes the same number of files.
TRAIN_SIZE = 500

# Populate training data: the first TRAIN_SIZE .jpg files of each class folder,
# in whatever order glob lists them.
for label in label_names:
  class_dir = Path(data_root, label)
  for image_path in list(class_dir.glob('*.jpg'))[:TRAIN_SIZE]:
    # Each match is already data_root/label/<file name>, so it is the copy source.
    shutil.copyfile(image_path, Path(train_dir, label, image_path.name))
    

In [0]:
TRAIN_SIZE = 500
VAL_SIZE = 250
# Slice bounds used by the original selection (listing positions 500..749). They stay
# defined for anything that reads them, but the selection below does not depend on them.
start_index = TRAIN_SIZE
end_index = TRAIN_SIZE + VAL_SIZE

# Populate validation data.
# The directory listing is produced afresh here, and its order is not guaranteed to
# match the listing the training cell saw. Slicing by position could therefore hand
# the same image to both splits. Instead, take the first VAL_SIZE images that are not
# already present in the class's training folder. When the listing order is stable
# this picks exactly the images at positions start_index..end_index-1, as before.
# Requires the training folders to be populated first.
for label in label_names:
  train_names = {entry.name for entry in Path(train_dir, label).iterdir()}
  candidates = [image_path
                for image_path in Path(data_root, label).glob('*.jpg')
                if image_path.name not in train_names]
  for image_path in candidates[:VAL_SIZE]:
    shutil.copyfile(image_path, Path(validation_dir, label, image_path.name))

In [0]:
# Sanity check: number of entries in every class folder of each split.
import os

split_reports = (
  ('train images:', train_dir),
  ('validation images:', validation_dir),
  ('test images:', test_dir),
)
for position, (caption, split_dir) in enumerate(split_reports):
  if position > 0:
    print()  # blank line between the per-split listings
  for label in label_names:
    entry_count = sum(1 for _ in Path(split_dir, label).iterdir())
    print(label, caption, entry_count)

buildings train images: 500
forest train images: 500
glacier train images: 500
mountain train images: 500
sea train images: 500
street train images: 500

buildings validation images: 250
forest validation images: 250
glacier validation images: 250
mountain validation images: 250
sea validation images: 250
street validation images: 250

buildings test images: 0
forest test images: 0
glacier test images: 0
mountain test images: 0
sea test images: 0
street test images: 0


## Create Full_Data

In [0]:
# Every entry two levels below seg_train (<class folder>/<file>), as pathlib paths.
data_root = Path(path) / 'seg_train'
all_image_paths = [entry for entry in data_root.glob('*/*')]
print(all_image_paths[0])

# Sorted class names, taken from the directory names directly under seg_train.
class_dirs = (entry for entry in data_root.glob('*/') if entry.is_dir())
label_names = sorted(class_dir.name for class_dir in class_dirs)
print(label_names)

# class name -> integer index (position in the sorted list).
label_to_index = {name: position for position, name in enumerate(label_names)}
print(label_to_index)

# Integer label for each path, derived from the name of its parent folder.
all_image_labels = [label_to_index[image_path.parent.name]
                    for image_path in all_image_paths]

print(set(all_image_labels))

/content/gdrive/Shared drives/IDS594/seg_train/buildings/1940.jpg
['buildings', 'forest', 'glacier', 'mountain', 'sea', 'street']
{'buildings': 0, 'forest': 1, 'glacier': 2, 'mountain': 3, 'sea': 4, 'street': 5}
{0, 1, 2, 3, 4, 5}


In [0]:
# Build the full_data tree: train / validation / test, each with one folder per class.
base_dir = Path(path, 'full_data')
base_dir.mkdir(exist_ok=True)

train_dir, validation_dir, test_dir = (
  base_dir / split_name for split_name in ('train', 'validation', 'test')
)
# exist_ok keeps the cell re-runnable once the tree is in place.
for split_dir in (train_dir, validation_dir, test_dir):
  split_dir.mkdir(exist_ok=True)

# One class folder inside each split.
for label in label_names:
  for split_dir in (train_dir, test_dir, validation_dir):
    (split_dir / label).mkdir(exist_ok=True)

In [0]:
# This is the final training set before the model is run on the held-back test data,
# so the paths are shuffled and split 70/30 into train and validation. A later cell
# prints the per-class counts to check that the random split is still balanced.
# NOTE(review): no stratify= argument is passed, so class proportions are only
# preserved approximately -- confirm that is acceptable.
from sklearn.model_selection import train_test_split

x_train_paths, x_val_paths = train_test_split(
    all_image_paths,
    test_size=0.3,
    random_state=0,  # fixed seed so the split is reproducible
)

# Sizes of the full list, the train split, and the validation split.
for path_list in (all_image_paths, x_train_paths, x_val_paths):
  print(len(path_list))



14034
9823
4211


In [0]:
import shutil
# Load train split from seg_train into the full_data train dir class subdirs, laid out
# for the flow_from_directory generator.

# Populate training data.
# Each entry of x_train_paths is already the complete source path, so it is copied
# directly rather than rebuilt from `data_root`. A later cell rebinds `data_root` to
# seg_test, so rebuilding the source from it made a re-run of this cell read from the
# wrong tree. A single pass over the split also replaces the former scan of the whole
# list once per label.
known_labels = set(label_names)
for image_path in x_train_paths:
  label = image_path.parent.name
  # Same filter as before: only files whose parent folder is a known class are copied.
  if label in known_labels:
    shutil.copyfile(image_path, Path(train_dir, label, image_path.name))

In [0]:
import shutil
# Load val split from seg_train into the full_data validation dir class subdirs, laid out
# for the flow_from_directory generator.

# Populate validation data.
# Each entry of x_val_paths is already the complete source path, so it is copied
# directly rather than rebuilt from `data_root`. A later cell rebinds `data_root` to
# seg_test, so rebuilding the source from it made a re-run of this cell read from the
# wrong tree. A single pass over the split also replaces the former scan of the whole
# list once per label.
known_labels = set(label_names)
for image_path in x_val_paths:
  label = image_path.parent.name
  # Same filter as before: only files whose parent folder is a known class are copied.
  if label in known_labels:
    shutil.copyfile(image_path, Path(validation_dir, label, image_path.name))

In [0]:
# Copy the seg_test images into the full_data test dir class subdirs, laid out for the
# flow_from_directory generator.

# NOTE(review): this rebinds data_root from seg_train to seg_test; cells that read
# data_root will see the new value if they are re-run after this one.
data_root = Path(path, 'seg_test')
test_image_paths = list(data_root.glob('*/*'))

# Populate test data, one class at a time.
for label in label_names:
  label_matches = (candidate for candidate in test_image_paths
                   if candidate.parent.name == label)
  for image_path in label_matches:
    source = data_root / label / image_path.name
    target = test_dir / label / image_path.name
    shutil.copyfile(source, target)

In [0]:
# Print the number of entries in each class folder of every split to verify the copy.
# Nice and balanced!

need_separator = False
for caption, split_dir in (('train images:', train_dir),
                           ('validation images:', validation_dir),
                           ('test images:', test_dir)):
  if need_separator:
    print()  # blank line between the per-split listings
  need_separator = True
  for label in label_names:
    entries = list(Path(split_dir, label).iterdir())
    print(label, caption, len(entries))

buildings train images: 1560
forest train images: 1629
glacier train images: 1666
mountain train images: 1732
sea train images: 1586
street train images: 1657

buildings validation images: 638
forest validation images: 642
glacier validation images: 738
mountain validation images: 780
sea validation images: 688
street validation images: 725

buildings test images: 437
forest test images: 474
glacier test images: 553
mountain test images: 525
sea test images: 510
street test images: 501


In [0]:
# I'm leaving the following cells as an example of exploration with shell commands.
# Long listing, with human-readable sizes, of the current working directory.
!ls -lh

total 772M
-rw------- 1 root root  87K Aug  6 23:15  BrewerModels.ipynb
drwx------ 5 root root 4.0K Aug  3 17:48  full_data
-rw------- 1 root root 227M Aug  6 23:43  fulldata.tar.xz
-rw------- 1 root root 243M Aug  6 22:50  full_data.zip
-rw------- 1 root root  18K Jul 31 20:19 'IDS594 - Part 2 .ipynb'
-rw------- 1 root root 361K Aug  4 17:46 'IDS594 Project.ipynb'
-rw------- 1 root root 6.1K Aug  3 23:45  ModelStub_Full_Data.ipynb
-rw------- 1 root root 5.0K Aug  3 23:44  ModelStub_Small_Data.ipynb
-rw------- 1 root root  19K Aug  6 23:43  ProjectDataPrep.ipynb
drwx------ 2 root root 4.0K Aug  5 19:00  SavedModels
drwx------ 2 root root 4.0K Aug  3 17:43  seg_test
-rw------- 1 root root  43M Aug  3 17:37  seg_test.zip
drwx------ 2 root root 4.0K Jul 27 01:08  seg_train
-rw------- 1 root root 200M Jul 25 21:27  seg_train.zip
drwx------ 5 root root 4.0K Jul 28 22:24  small_data
-rw------- 1 root root  61M Aug  6 23:36  smalldata.tar.xz
drwx------ 2 root root 4.0K Jul 29 21:53  test_set


In [0]:
# Archive the full_data dir structure for easy transfer to the /content dir on the Colab VM,
# which speeds up file reads compared to gdrive.
# Using xz (LZMA) compression (the -J option) after reading about its good performance.
# This still took a while, but was easily 2-3x faster than using the zipfile library.
%cd /content/gdrive/Shared drives/IDS594/
!tar -cJf fulldata.tar.xz full_data

/content/gdrive/Shared drives/IDS594


In [0]:
# Copies the compressed small-data archive to the VM's /content dir and extracts it there.
# %cd changes the kernel's working directory, so /content stays the working directory
# for the cells that follow.
# NOTE(review): smalldata.tar.xz is created by a cell further down
# (`!tar -cJf smalldata.tar.xz small_data`); on a drive without the archive that cell
# has to run first.
%cd /content/gdrive/Shared drives/IDS594/
!cp smalldata.tar.xz /content
%cd /content
!tar -xf smalldata.tar.xz
%ls

/content
 BrewerModels.ipynb          ModelStub_Small_Data.ipynb   seg_train.zip
 [0m[01;34mfull_data[0m/                  ProjectDataPrep.ipynb        [01;34msmall_data[0m/
 full_data.zip               [01;34mSavedModels[0m/                 small_data.tar
'IDS594 - Part 2 .ipynb'     [01;34mseg_test[0m/                    [01;34mtest_set[0m/
'IDS594 Project.ipynb'       seg_test.zip
 ModelStub_Full_Data.ipynb   [01;34mseg_train[0m/
/content
[0m[01;34mfull_data[0m/  full_data.zip  [01;34mgdrive[0m/  [01;34msample_data[0m/


In [0]:
# Copies the compressed full-data archive from the shared drive to the VM's /content
# dir, extracts it there, and lists the result. The working directory is /content afterwards.
%cd /content/gdrive/Shared drives/IDS594/
!cp fulldata.tar.xz /content
%cd /content
!tar -xf fulldata.tar.xz
%ls

/content/gdrive/Shared drives/IDS594
/content
[0m[01;34mfull_data[0m/  fulldata.tar.xz  [01;34mgdrive[0m/  [01;34msample_data[0m/


In [0]:
# Switch the working directory back to the shared-drive project folder and echo it.
%cd /content/gdrive/Shared drives/IDS594/
%pwd

/content/gdrive/Shared drives/IDS594


'/content/gdrive/Shared drives/IDS594'

In [0]:
# Archive small_data with xz compression. Both paths are relative, so this acts on
# whatever the current working directory is.
!tar -cJf smalldata.tar.xz small_data

In [0]:
# Copy the small-data archive from the current working directory to the VM's /content dir.
!cp smalldata.tar.xz /content

In [0]:
# Print the current working directory.
# NOTE(review): the recorded output is /content, yet the last visible directory change
# before this cell is a %cd to the shared drive -- the cells were evidently run out of order.
!pwd


/content


In [0]:
# Extract the small-data archive on the VM's local disk.
# Absolute paths are used because, in notebook order, the last directory change before
# this cell points at the shared drive; a relative extract would then unpack onto the
# drive instead of /content, where the archive was copied.
!tar -xf /content/smalldata.tar.xz -C /content

In [0]:
# Long listing of the current working directory to check the extracted data.
# NOTE(review): the recorded output is a listing of /content (it contains gdrive and
# sample_data), so this cell assumes the working directory is /content.
!ls -lh

total 304M
drwx------ 5 root root 4.0K Aug  3 17:48 full_data
-rw------- 1 root root 243M Aug  6 23:04 full_data.zip
drwx------ 4 root root 4.0K Aug  6 19:11 gdrive
drwxr-xr-x 1 root root 4.0K Aug  2 16:06 sample_data
drwx------ 5 root root 4.0K Jul 28 22:24 small_data
-rw------- 1 root root  61M Aug  6 23:37 smalldata.tar.xz


In [0]:
# Return to the shared-drive project folder; the cell after this one uses relative paths.
%cd /content/gdrive/Shared drives/IDS594/

/content/gdrive/Shared drives/IDS594


In [0]:
# Archive the raw seg_train and seg_test folders with xz compression (relative to the
# current working directory).
# NOTE(review): the recorded run printed "tar: seg_train/buildings: file changed as we
# read it", so seg_train.tar.xz may not be a consistent snapshot -- verify the archive
# (e.g. compare file counts) before relying on it.
!tar -cJf seg_train.tar.xz seg_train
!tar -cJf seg_test.tar.xz seg_test

tar: seg_train/buildings: file changed as we read it
