In [1]:
from __future__ import print_function
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from os.path import isfile, isdir, join, split, splitext, exists
from os import listdir, makedirs
import json
from pprint import pprint

## Method Declaration Cell

In [2]:
def calculate_SAD(arr1, arr2):
    """Calculate sum of absolute distance"""
    z = abs(arr1/100 - arr2/100)
    z = z * 100
    return np.sum(z)

def calculate_SSD(arr1, arr2):
    """Calculate sum of squared distance"""
    z = abs(arr1/100 - arr2/100)
    z = z * 100
    return np.sum(z**2)

timer = []
def stopwatch_start():
    if len(timer) == 0:
        timer.append(time.time())
    else:
        print("Stopwatch already started, please stop the stopwatch using stopwatch_stop() function")

def stopwatch_stop():
    if len(timer) > 0:
        return time.time() - timer.pop()
    else:
        print("Please start a stopwatch using time_start() function")

In [3]:
# Require the following library:
# import matplotlib.pyplot as plt

# Receive np.array, show the image
def show_image(arr, title=""):
    fig, ax = plt.subplots(figsize=(10, 8))
    plt.imshow(np.asarray(arr))
    plt.title(title)

# Question 1
Read all images and convert to grayscale images

In [4]:
my_path = "CroppedYale/"

directories = [join(my_path, d) for d in listdir(my_path) if isdir(join(my_path, d))]
image_files = [join(d, f) for d in directories for f in listdir(d) 
               if isfile(join(d, f)) and splitext(f)[-1] != ".bad"]

# Per question specified, take the first 35 as train_images and the rest as
train_image = [join(d, f)for d in directories for n, f in enumerate(listdir(d)) 
               if isfile(join(d, f)) and n < 35 and splitext(f)[-1] != ".bad"]
test_image = [join(d, f)for d in directories for n, f in enumerate(listdir(d)) 
               if isfile(join(d, f)) and n >= 35 and splitext(f)[-1] != ".bad"]

print("Total train_image: {}".format(len(train_image)))
print("Total test_image: {}".format(len(test_image)))
print("Total image files found: {}".format(len(image_files)))

# print("{:^3} - {}".format("#", "Filename"))
# ["{:03} - {}".format(n+1, f) for n, f in enumerate(image_files)]

Total train_image: 1322
Total test_image: 1093
Total image files found: 2415


## Code within above cell is equal to the following code:
```
directories = []
for d in listdir(my_path):
    if isdir(join(my_path, d)):
        directories.append(join(my_path, d))
directories[0:5], len(directories)

image_files = []
for d in directories:
    for f in listdir(d):
        if isfile(join(d, f)):
            image_files.append(join(d, f))
image_files[0:5]
```

In [5]:
grayscale_path = "CroppedYale_Grayscale/"

for n, d in enumerate(directories):
    _d = d.split('/')
    _dir = join(grayscale_path, _d[1])
    if not exists(_dir):
#         print("{} Not exist".format(_dir))
#         print("Creating directory {}".format(_dir))
        makedirs(_dir)
        print("Created '{}' directory\n".format(_dir))
    else:
#         print("'{}' folder exist".format(_dir))
        continue

The following code converts the images into grayscale images<br>
Logical operations as following:
1. `Line 3~5`: Since the program may takes some times and computational resources, we define a timer for measurement
2. `Line 8~11`: Through the previous run, we found some images have .bad extension. We used `os.path.splitext` to filter out the extension of the file and `continue` the loop upon finding `.pgm` extension.
3. `Line 13~17`: Since the grayscale images may already exists, we did a check-up before actually converting the images and save them into grayscale image
4. `Line 19~22`: Open image, convert image into grayscale with `L` parameter, save it into `gray_img_name`, add `converted` counter by 1

In [6]:
import time

start = time.time()
converted = 0
file_existed = 0

for i in image_files:
    ext = splitext(i)
    if ext[-1] != ".pgm":
        print("Excluding image {}".format(i))
        continue

    img_name = i.split('/')
    gray_img_name = join(grayscale_path, img_name[1], img_name[2])
    if exists(gray_img_name):
        file_existed = file_existed + 1
        continue
    
    img = Image.open(i)
    img = img.convert("L")
    img.save(gray_img_name)
    converted = converted + 1
print("{} of grayscale images already existed.".format(file_existed))
print("Finished converting {} images to grayscale image in {:.3f}s".format(converted, time.time() - start))

2415 of grayscale images already existed.
Finished converting 0 images to grayscale image in 0.033s


# Question 2
Split the images into training set / test set
- First 35 images as training, the rest 30 images as testing

Slicing up the first 35 identical person images as training image and the rest 30 images as testing

In [7]:
# train_image = image_files[:35]
# test_image  = image_files[35:64]
print("len(train_image): {}\n" \
      "len(test_image) : {}".format(len(train_image), len(test_image)))

len(train_image): 1322
len(test_image) : 1093


## The following part is not necessary
Introduce two `np.array` variables as `train_arr` and `test_arr` to contain the array

```
train_arr = np.array([], ndmin=1, dtype=np.int64)
test_arr = np.array([], ndmin=1, dtype=np.int64)

start = time.time()
for a_train_image in train_image:
    train_arr = np.append(train_arr,
                         np.array(Image.open(a_train_image)))
print("Finished appending train_image to train_arr in {:.3f}s".format(time.time() - start))

start = time.time()
for a_test_image in test_image:
    test_arr = np.append(test_arr,
                         np.array(Image.open(a_test_image)))
print("Finished appending test_image to test_arr in {:.3f}s".format(time.time() - start))

print("train_arr shape : {}".format(train_arr.shape))
print("test_arr shape  : {}".format(test_arr.shape))
```
Takes around 240s to append train_arr<br>
Takes around 150s to append test_arr

Example above does not use `np.ravel` on its implementation because `np.append` append values to the end of an array.

In [8]:
gs_train_image = [train.replace("CroppedYale", "CroppedYale_Grayscale") for train in train_image]
gs_test_image  = [test.replace("CroppedYale", "CroppedYale_Grayscale") for test in test_image]

print("Total gs_train_image: {}".format(len(gs_train_image)))
print("Total gs_test_image : {}".format(len(gs_test_image)))


Total gs_train_image: 1322
Total gs_test_image : 1093


Declaring dictionary variables where the dictionary has a `key` with the **name of the test image** and the `value` representing a `list` with only two members. The first members is the `train_image` with the lowest `calculate_SAD` or lowest `calculate_SSD` score.

```
sad_score_dict["test_image_1"] = ["train_image_n", 10]
ssd_score_dict["test_image_1"] = ["train_image_n", 10]
```

In [9]:
sad_score_dict = {}
ssd_score_dict = {}

# The following code cell will run around 9~10hrs in Macbook Pro 15" 2015
The following code cell will be marked-down

```
print("Total gs_train_image: {}".format(len(gs_train_image)))
print("Total gs_test_image : {}".format(len(gs_test_image)))

stopwatch_stop()
stopwatch_start()

sad_score_dict = {}
ssd_score_dict = {}

for i, test in enumerate(gs_test_image):
    print("\nTest image {}\t".format(i))
    sad_score_dict[test] = {}
    ssd_score_dict[test] = {}
    min_sad_image = ""
    for n, train in enumerate(gs_train_image):
        if n % 50 == 0:
            print("\tProgress: comparing with train_image {:>4}.....".format(n))
        te = np.array(Image.open(test), dtype=np.int64)
        tr = np.array(Image.open(train), dtype=np.int64)
        
        # Set an initial value for the SAD/SSD score and assign the first train image
        if n==0:
            sad_score_dict[test] = {train: calculate_SAD(te, tr)}
            ssd_score_dict[test] = {train: calculate_SSD(te, tr)}
            min_sad_image = train
            print("\t\tSetting initial SAD/SSD value...")
            print("\t\tKey   : {}".format(sad_score_dict[test].keys()[0]))
            print("\t\tValue : {}".format(sad_score_dict[test].keys()[0]))
            continue
        
        # Comparing the current image with the minimum SAD/SSD score
        _sad = calculate_SAD(te, tr)
        if _sad < sad_score_dict[test][min_sad_image]:
            print("\t\tFound a lower SAD/SSD value...")
            sad_score_dict[test] = {train: _sad}
            ssd_score_dict[test] = {train: calculate_SSD(te, tr)}
            min_sad_image = train
            print("\t\tUpdating SAD/SSD value...")
            print("\t\tKey   : {}".format(sad_score_dict[test].keys()[0]))
            print("\t\tValue : {}".format(sad_score_dict[test].keys()[0]))

print("Total elapsed time: {}".format(stopwatch_stop()))

# Time elapsed: 34618.692924
```

## The following code writes all the SAD/SSD calculation into JSON file
It will be marked down since the cell above is marked-down

```
# Writing to JSON file to eliminate the need for calculation

stopwatch_stop()
stopwatch_start()
with open('sad_result.json', 'w') as fp:
    json.dump(sad_score_dict, fp)
print("Writing sad_result.json finished in {:.3f}".format(stopwatch_stop()))

stopwatch_start()
with open('ssd_result.json', 'w') as fp:
    json.dump(ssd_score_dict, fp)
print("Writing ssd_result.json finished in {:.3f}".format(stopwatch_stop()))
```

# Reading through JSON.files written on previous calculation

In [10]:
sad_score_dict = json.load(open('sad_result.json'))
ssd_score_dict = json.load(open('ssd_result.json'))

In [35]:
pprint(sad_score_dict.items()[0:5], indent=4)
pprint(ssd_score_dict.items()[0:5], indent=4)

[   (   u'CroppedYale_Grayscale/yaleB36/yaleB36_P00A-060E-20.pgm',
        {   u'CroppedYale_Grayscale/yaleB36/yaleB36_P00A-070E+00.pgm': 282300}),
    (   u'CroppedYale_Grayscale/yaleB35/yaleB35_P00A-110E+65.pgm',
        {   u'CroppedYale_Grayscale/yaleB32/yaleB32_P00A-110E+65.pgm': 13900}),
    (   u'CroppedYale_Grayscale/yaleB25/yaleB25_P00A+010E-20.pgm',
        {   u'CroppedYale_Grayscale/yaleB25/yaleB25_P00A+025E+00.pgm': 714200}),
    (   u'CroppedYale_Grayscale/yaleB04/yaleB04_P00A+110E+40.pgm',
        {   u'CroppedYale_Grayscale/yaleB01/yaleB01_P00A+110E+40.pgm': 239400}),
    (   u'CroppedYale_Grayscale/yaleB06/yaleB06_P00A-010E-20.pgm',
        {   u'CroppedYale_Grayscale/yaleB06/yaleB06_P00A+000E-20.pgm': 512200})]
[   (   u'CroppedYale_Grayscale/yaleB36/yaleB36_P00A-060E-20.pgm',
        {   u'CroppedYale_Grayscale/yaleB36/yaleB36_P00A-070E+00.pgm': 28290000}),
    (   u'CroppedYale_Grayscale/yaleB35/yaleB35_P00A-110E+65.pgm',
        {   u'CroppedYale_Grayscale/yaleB32/

## Measuring the accuracy

```
# pseudocode
    accuracy = how many test correctly labeled / how many test images

# algorithm
    1. check if the test's key == test's value[0], where value[0] is the train_image and value[1] is its SAD/SSD score


```

In [53]:
sad_correct = 0
ssd_correct = 0
total_test_images = len(test_image)
for k in sad_score_dict.keys():            # k stands for key
    for pk in sad_score_dict[k].keys():    # pk stands predicted key
        if k.split('/')[1] in pk:
            sad_correct += 1

for k in ssd_score_dict.keys():            # k stands for key
    for pk in ssd_score_dict[k].keys():    # pk stands predicted key
        if k.split('/')[1] in pk:
            ssd_correct += 1

print("SAD correct : {:>5} SAD accuracy : {:>.5f}\n".format(sad_correct, 1. * sad_correct / total_test_images))
print("SSD correct : {:>5} SSD accuracy : {:>.5f}\n".format(ssd_correct, 1. * ssd_correct / total_test_images))

SAD correct :   686 SAD accuracy : 0.62763

SSD correct :   686 SSD accuracy : 0.62763

