In [1]:
import os
import sys
import io
import cv2
import numpy as np
import pandas as pd
import boto3
import mxnet as mx
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from scipy.spatial.distance import cdist
import seaborn as sns
from time import gmtime, strftime
from IPython.display import clear_output

In [195]:
files = os.listdir('train')
dogs = [x for x in files if x.find('dog') != -1]
cats = [x for x in files if x.find('cat') != -1]
targets = [dogs, cats]

### Dataset
**Link:**

There are 12,500 images of each species contained. Images are larger than thumbnails and are of a variety of breeds. Angles, colors, position are incidental to each photograph.

In [243]:
# Helper Functions
def to_gray(color_img):
    gray = cv2.cvtColor(color_img, cv2.COLOR_BGR2GRAY)
    return gray

def generate_sift(gray_img):
    sift = cv2.xfeatures2d.SIFT_create()
    kp, desc = sift.detectAndCompute(gray_img, None)
    return kp, desc

### Extracting keypoint descriptors from images

In [244]:
def extract_desc(file, path='train/'):
    
    img = cv2.imread(path + file)
    img = to_gray(img)
    
    kp, desc = generate_sift(img)
    
    num_desc = desc.shape[0]
    images = np.full((num_desc,1), file)
    
    return images, kp, desc

In [245]:
def run_extract(file_list):
    
    num_of_files = len(file_list)
    i = 1
    desc_total = None
    kp_total = None
    img_total = None
    
    for file in file_list:
        
        # Extract all SIFT keypoints and descriptors
        images, kp, desc = extract_desc(file)
        
        if i == 1:
            kp_total = kp
            desc_total = desc
            img_total = images
            clear_output(wait=True)
            print(i, "/", num_of_files, "completed")
            i = i + 1
            continue
            
        else:
            kp_total = np.append(kp_total, kp)
            desc_total = np.vstack((desc_total, desc))
            img_total = np.vstack((img_total, images))
            clear_output(wait=True)
            print(i, "/", num_of_files, "completed")
            i = i + 1
    
    
    return img_total, kp_total, desc_total

In [8]:
dog_img, dog_kp, dog_desc = run_extract(dogs[:500])

500 / 500 completed


In [9]:
cat_img, cat_kp, cat_desc = run_extract(cats[:500])

500 / 500 completed


In [10]:
dog_kp[0].angle

129.7286376953125

The keypoint object has several attributes, angle being one of them. These could be incorporated in to the model at some point. There may be some benefit to using these attributes to filter descriptors or add bias to weights.

## Save/Load Data
Comment out if not needed


In [8]:
# Save Cats
#np.save('data/20190120-sampledesc-cats', cat_desc)
#np.save('data/20190120-img-cats', cat_img)

# Save Dogs
#np.save('data/20190120-sampledesc-dogs', dog_desc)
#np.save('20190120-kmeans-samplelabels-dogs', dog_labels)
#np.save('20190120-kmeans-samplecenters-dogs', centers)
#np.save('data/20190120-img-dogs', dog_img)

# Load Dogs
dog_desc = np.load('data/sampledesc-dogs.npy')
dog_img = np.load('data/20190120-img-dogs.npy')

## K-Means Clustering

After using the cv2 method below, it was far too slow on my personal macbook, engineering laptop, and lab desktop computer. I opted to use AWS Sagemaker for clustering.

In [None]:
# Define criteria = ( type, max_iter = 10 , epsilon = 1.0 )
#criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)

# Set flags (Just to avoid line break in the code)
#flags = cv2.KMEANS_RANDOM_CENTERS

# Apply KMeans
#compactness, dog_lab, dog_cen = cv2.kmeans(dog_desc ,1000,None,criteria,10,flags)

### AWS SageMaker

Create an S3 instance and load data

In [65]:
# S3 bucket
bucket = 'sagemaker-catsvsdogs-east-1'
prefix = 'sagemaker/DEMO-kmeans'

In [66]:
def get_s3_data(filename):
    s3 = boto3.resource('s3')
    s3.Bucket('cats-vs-dogs-descriptors').download_file(filename, '.desc.npy')
    a = np.load('.desc.npy')
    return a

In [23]:
FILE = 'sampledesc-dogs.npy'
dog_desc = get_s3_data(FILE)

In [24]:
dog_desc.shape

(436862, 128)

The next step is to try a standard configuration of AWS SageMaker (K-Means) with 1000 clusters. This will give me feel for how much time is saved by using AWS and also if any cost in incurred. 

Further efforts could focus around tuning the hyperparameters and building a more robust dictionary (i.e. more images and descriptors).

### Convert training data to bytes for SageMaker

In [67]:
def write_s3_data(bucket, prefix, channel, X):
    buf = io.BytesIO()
    smac.write_numpy_to_dense_tensor(buf, X.astype('float32'))
    buf.seek(0)
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, channel + '.data')).upload_fileobj(buf)

In [68]:
write_s3_data(bucket, prefix, 'train', dog_desc)

### SageMaker Low-Level SDK method

In [74]:
role = ''

In [70]:
job_name = 'kmeans-lowlevel-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Training job", job_name)

Training job kmeans-lowlevel-2019-01-30-08-36-33


In [73]:
from sagemaker.amazon.amazon_estimator import get_image_uri
image = get_image_uri(boto3.Session().region_name, 'kmeans')

output_location = 's3://{}/kmeans_lowlevel_example/output'.format(bucket)
print('training artifacts will be uploaded to: {}'.format(output_location))

k = '1000'
features = '128'
create_training_params = \
{
    "AlgorithmSpecification": {
        "TrainingImage": image,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": output_location
    },
    "ResourceConfig": {
        "InstanceCount": 2,
        "InstanceType": "ml.c4.8xlarge",
        "VolumeSizeInGB": 50
    },
    "TrainingJobName": job_name,
    "HyperParameters": {
        "k": k,
        "feature_dim": features,
        "mini_batch_size": "500"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 60 * 60
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/train.data".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None"
        }
    ]
}

sagemaker = boto3.client('sagemaker')

sagemaker.create_training_job(**create_training_params)

status = sagemaker.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
print(status)

try:
    sagemaker.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=job_name)
finally:
    status = sagemaker.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
    print("Training job ended with status: " + status)
    if status == 'Failed':
        message = sagemaker.describe_training_job(TrainingJobName=job_name)['FailureReason']
        print('Training failed with the following error: {}'.format(message))
        raise Exception('Training job failed')

training artifacts will be uploaded to: s3://sagemaker-catsvsdogs-east-1/kmeans_lowlevel_example/output
InProgress
Training job ended with status: Completed


Overall, this took about 10 minutes to train and the cost  was $1.38. I tried several times though and was getting an error when it tried to write the file due to region mismatch between the training job and the bucket it was trying to write to. Thus the cost for one training run is likely lower.

### Inspect SageMaker output

In [76]:
path = 'kmeans_lowlevel_example/output/kmeans-lowlevel-2019-01-30-08-36-33/output/model.tar.gz'

In [77]:
s3_client = boto3.client('s3')
s3_client.download_file(bucket, path, 'model.tar.gz')
!tar -xvf model.tar.gz

x state_fecf1206-3e2f-4519-bdee-fe419ad22b6e
x state_95f27bc9-db6e-4e7c-b5e6-63b649e135f0
x model_algo-1


NameError: name 'mx' is not defined

In [80]:
kmeans_model = mx.ndarray.load('model_algo-1')

In [88]:
kmeans_numpy = kmeans_model[0].asnumpy()

In [86]:
kmeans_numpy.shape

(1000, 128)

The SageMaker output is a list of cluster centers; labels will have to be determined manually.

In [2]:
# Save to local disk
#np.save('data/kmeans_numpy', kmeans_numpy)

### Build Term Frequency Matrix

In [None]:
# Load from local
kmeans_numpy = np.load('data/kmeans_numpy.npy')

In [9]:
# Labels
y = np.argmin(cdist(dog_desc, kmeans_numpy, 'euclidean'), axis=1)

In [50]:
def desc_to_visual_words(labels, K=1000):
    label_counts = np.zeros(1000)
    for k in range(1,K-1):
        count = np.where(labels == k)[0].shape[0]
        label_counts[k-1] = count
    return label_counts

In [282]:
def build_code_book(img_list, labels):
    unique_img = np.unique(img_list)
    #print(unique_img.shape)
    code_book = np.zeros((1000,1000))
    i = 0
    for img in unique_img:
        ix = np.where(np.in1d(img_list, img))[0]
        label_hist = desc_to_visual_words(labels[ix])
        #print(label_hist)
        code_book[i] = label_hist
        i = i + 1
    
    return code_book

In [174]:
X = build_code_book(dog_img, y)
X.shape

(500, 1000)

Inspecting one image label frequency

In [179]:
print(X[333])

[1. 4. 6. 0. 1. 1. 0. 3. 0. 2. 4. 1. 4. 4. 1. 1. 0. 2. 0. 4. 2. 5. 2. 4.
 3. 0. 3. 2. 3. 1. 3. 1. 2. 0. 2. 0. 2. 1. 2. 0. 2. 0. 1. 1. 1. 0. 1. 2.
 3. 3. 3. 3. 1. 4. 2. 0. 0. 4. 3. 0. 2. 4. 1. 3. 0. 3. 2. 1. 0. 6. 1. 0.
 2. 0. 4. 4. 2. 1. 0. 1. 1. 2. 2. 4. 5. 2. 3. 0. 1. 0. 3. 4. 0. 4. 6. 3.
 3. 1. 1. 0. 1. 2. 0. 2. 2. 0. 3. 6. 2. 2. 1. 3. 0. 3. 6. 0. 2. 0. 0. 1.
 1. 2. 1. 1. 1. 0. 2. 2. 1. 0. 0. 2. 2. 2. 1. 0. 5. 2. 6. 2. 3. 3. 1. 2.
 2. 0. 3. 0. 5. 1. 2. 4. 2. 1. 1. 1. 3. 2. 1. 2. 0. 2. 2. 1. 0. 2. 1. 3.
 2. 1. 0. 2. 1. 3. 2. 4. 3. 3. 0. 2. 2. 3. 3. 4. 1. 1. 4. 9. 1. 2. 2. 0.
 2. 3. 0. 0. 2. 3. 3. 0. 1. 3. 0. 3. 3. 1. 7. 3. 2. 5. 3. 1. 0. 4. 2. 2.
 0. 3. 1. 1. 2. 2. 1. 5. 1. 2. 1. 1. 1. 1. 1. 1. 2. 0. 0. 1. 4. 0. 2. 1.
 1. 4. 4. 4. 2. 1. 1. 4. 2. 1. 1. 0. 4. 1. 0. 1. 2. 1. 6. 0. 0. 1. 0. 5.
 2. 2. 0. 3. 3. 1. 3. 4. 1. 3. 2. 2. 2. 2. 2. 0. 1. 0. 1. 2. 2. 3. 0. 5.
 3. 2. 0. 4. 2. 2. 1. 1. 0. 1. 2. 4. 5. 0. 0. 2. 3. 3. 4. 3. 1. 1. 2. 0.
 1. 3. 0. 0. 0. 0. 2. 3. 0. 0. 3. 1. 4. 2. 2. 2. 3.

### Transform to Term Frequency - Inverse Document Frequency Matrix

In [186]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(norm="l2")
tfidf.fit(X)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [194]:
X_train = tfidf.transform(X).todense()

### Append additional categories to training set

In [198]:
# Code to covert cat descriptors to histogram and append to training data

# load cat descriptors 
cat_desc = np.load('data/20190120-sampledesc-cats.npy')
cat_img = np.load('data/20190120-img-cats.npy')

In [199]:
y = np.argmin(cdist(cat_desc, kmeans_numpy, 'euclidean'), axis=1)

In [200]:
X_cats = build_code_book(cat_img, y)
X_cats.shape

(500, 1000)

In [205]:
tfidf.fit(X_cats)
X_train_cats = tfidf.transform(X_cats).todense()
X_train_cats.shape

(500, 1000)

In [207]:
X_train = np.vstack((X_train, X_train_cats))

In [208]:
X_train.shape

(1000, 1000)

## Train Model

In [221]:
# Generate targets

y_train = np.empty(0)
y_train = np.append(y_train, np.ones(500))
y_train = np.append(y_train, np.zeros(500))

In [223]:
from sklearn.utils import shuffle

X_train, y_train = shuffle(X_train, y_train, random_state=0)

In [228]:
from sklearn import svm

clf = svm.SVC(gamma='scale')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [230]:
pred = clf.predict(X_train)

In [237]:
print("Training Acc:", (y_train == pred).sum() / 1000)

Training Acc: 0.646


### Generate testing set descriptors

In [246]:
dog_img, dog_kp, dog_desc = run_extract(dogs[500:1000])

500 / 500 completed


In [263]:
cat_img, cat_kp, cat_desc = run_extract(cats[500:1000])

500 / 500 completed


In [275]:
desc = np.vstack((dog_desc, cat_desc))
img = np.vstack((dog_img, cat_img))

In [276]:
y = np.argmin(cdist(desc, kmeans_numpy, 'euclidean'), axis=1)

In [283]:
X_test = build_code_book(img, y)
X_test.shape

(1000, 1000)

In [288]:
tfidf.fit(X_test)
X_test = tfidf.transform(X_test).todense()

In [290]:
y_pred = clf.predict(X_test)

In [291]:
# Generate targets
y_test = np.append(np.ones(500), np.zeros(500))

In [292]:
print("Testing Acc:", (y_test == y_pred).sum() / 1000)

Testing Acc: 0.367


# Reflection

# Tasks

- Remove redundant codeblocks here
- Redo training by running tdidif on entire training set rather separately
- Choose better descriptors before clustering
- More data
- Less clusters
    - Read about large vs. small cluster sizes; I think it should be less than 500.
- Hyperparameter tuning 
    - AWS job to run through several K's
- Need AWS credits!


## Packages Needed:
- **pandas** for using scikit
- **scikit-learn** for kmeans & SVM

## DAN2 Research:
- How can we reduce the dictionary?
- Should we pre-process images more than grey-scaling? (e.g., HOG, edge detectiony, feature detection)
    - explore opencv4 for latest offerings

### Addtional Info
http://aishack.in/tutorials/sift-scale-invariant-feature-transform-introduction/