In [6]:
%matplotlib inline    
#displays images in the notebook rather than a separate window
from scipy.misc import imread, imresize #reading and resizing images
import matplotlib.pyplot as plt #displaying images
import dask.bag as db  #For easy parallelization of applying a function to a list
import pandas as pd
import glob #getting the list of files in a directory
import numpy as np
import pickle #use pickle for serializing Python objects to disk

### Create a dataframe of image paths, labels and hold-out indicator

In [7]:
# "holdout" flags every image of a fixed set of subjects (drivers). Those rows
# are kept out of the training data and used as the test set further down, so
# evaluation happens on drivers that never appear in training.
DATA_PATH = r"N:\DATA\distracted_driver_classification"
HOLDOUT_SUBJECTS = ['p061', 'p064', 'p066', 'p072', 'p075', 'p081']  #6 Subjects for hold-out set
KEPT_CLASSES = ['c0', 'c1']  #Limiting it to two classes only

# Image list from the CSV; the code below relies on its `subject`,
# `classname` and `img` columns.
image_df = pd.read_csv(DATA_PATH + r'\driver_imgs_list.csv')

# Find every training image on disk and derive its bare file name so the
# paths can be joined to the CSV rows on `img`.
image_paths_df = pd.DataFrame(glob.glob(DATA_PATH + r'\imgs\train\*\*.jpg'), columns = ['full_path'])
image_paths_df['img'] = image_paths_df['full_path'].map(lambda x: x.split('\\')[-1])

# Inner join: only images present both in the CSV and on disk survive.
image_df = pd.merge(image_df, image_paths_df, how='inner', on = 'img')
image_df['holdout'] = image_df.subject.isin(HOLDOUT_SUBJECTS)
# 'c0' -> 0, 'c1' -> 1, ...
image_df['label'] =   [int(x.replace('c', '')) for x in image_df.classname]

# .copy() gives an independent frame, so adding `rowId` below does not
# trigger pandas' SettingWithCopyWarning on the filtered selection.
image_df = image_df[image_df['classname'].isin(KEPT_CLASSES)].copy()
# 0-based position of each row after filtering.
image_df['rowId'] = range(image_df.shape[0])

In [10]:
# Preview the first five rows to sanity-check the merged columns.
image_df.head(n=5)

Unnamed: 0,subject,classname,img,full_path,holdout,label,rowId
0,p002,c0,img_44733.jpg,N:\DATA\distracted_driver_classification\imgs\...,False,0,0
1,p002,c0,img_72999.jpg,N:\DATA\distracted_driver_classification\imgs\...,False,0,1
2,p002,c0,img_25094.jpg,N:\DATA\distracted_driver_classification\imgs\...,False,0,2
3,p002,c0,img_69092.jpg,N:\DATA\distracted_driver_classification\imgs\...,False,0,3
4,p002,c0,img_92629.jpg,N:\DATA\distracted_driver_classification\imgs\...,False,0,4


In [11]:
# Number of images per class name in the frame.
image_df.classname.value_counts()

c0    2489
c1    2267
Name: classname, dtype: int64

#### Parallel apply using dask.bag
`dask.bag` is a library that helps with simple parallelization tasks. It is a single-machine analogue of Spark.

In [17]:
def read_and_transform(path):
    """
    Load one image from ``path`` and reduce it to a small grayscale array.

    Steps, in order:
      1. read the image file,
      2. crop it to rows 40:400 and columns 120:640,
      3. resize the crop with a target size of (64, 64, 3),
      4. average over axis 2 with ``.mean(2)``; axis 2 holds the colour
         channels, so the average is the grayscale version of the image.

    The trailing 3 in the target size lines up with the colour-channel axis
    of the image (TODO confirm how ``imresize`` treats a 3-tuple size).

    NOTE(review): ``scipy.misc.imread`` / ``imresize`` were deprecated and
    later removed from SciPy, so this needs an old SciPy release; confirm the
    pinned version.
    """
    cropped = imread(path)[40:400, 120:640]
    downsampled = imresize(cropped, (64, 64, 3))
    # Collapse the channel axis -> one 2-D array per image.
    return downsampled.mean(2)

In [18]:
# Wrap the file paths in a dask bag and register the transform on it. The map
# is lazy: no image is read until .compute() runs the whole pipeline (in
# parallel) and returns the transformed images.
images_bag = db.from_sequence(image_df['full_path'])
transformed_images = (
    images_bag
    .map(read_and_transform)   #apply the function to all the paths
    .compute()
)

In [19]:
# Training split: every row that is NOT flagged as hold-out.
train_rows = image_df[~image_df.holdout]

# np.stack joins the per-image 2-D arrays along a new leading axis, giving
# (n_images, height, width). Indexing with np.newaxis then appends a length-1
# trailing axis, giving (n_images, height, width, 1) -- presumably the single
# channel axis expected downstream; confirm against the model code.
X_train = np.stack(
    [transformed_images[row_id] for row_id in train_rows.rowId],
    axis=0,
)
X_train = X_train[..., np.newaxis]

# Labels as a plain int64 NumPy array, in the same row order as X_train.
y_train = train_rows['label'].astype(np.int64).values

In [20]:
# Test split: the rows flagged as hold-out.
test_rows = image_df[image_df.holdout]

# Stack the per-image 2-D arrays into (n_images, height, width), then append
# a length-1 trailing axis -> (n_images, height, width, 1).
X_test = np.stack(
    [transformed_images[row_id] for row_id in test_rows.rowId],
    axis=0,
)
X_test = X_test[..., np.newaxis]

# Labels as a plain int64 NumPy array, in the same row order as X_test.
y_test = test_rows['label'].astype(np.int64).values

In [21]:
# Serialize the processed arrays to disk so they can be loaded directly later
# instead of re-reading and transforming every image (presumably the reason
# for pickling -- confirm with the consuming notebook).
# The `with` block guarantees the file is flushed and closed after the dump;
# the bare open(...) used before left the handle open.
with open('driver_images_processed.pckl', 'wb') as out_file:
    pickle.dump(((X_train, y_train), (X_test, y_test)), out_file)