In [1]:
import os
from glob import glob

import numpy as np
import pandas as pd
from skimage.io import imread, imshow
from skimage.feature import hog
from skimage.color import rgb2gray
from skimage.transform import resize
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
# DATA_DIR contains "training"
DATA_DIR = os.environ.get('DATA_DIR', '')

# 1 Preprocessing

Given the image data, we first want all images to be of the same dimension. Also, since our dataset is small, we will reduce the size to 64x64 pixels and convert them to grayscale.

In [3]:
def read_dataset(data_dir):
    categories = ['good', 'flare']
    X = []
    Y = []
    for i, category in enumerate(categories):
        for img in glob(data_dir+'training/'+category+'/*'):
            raw_image = rgb2gray(imread(img))
            X.append(resize(raw_image, (64,64)))
            Y.append(i)
            
    return np.array(X),np.array(Y)

In [4]:
X,y = read_dataset(DATA_DIR)

# 2 Feature extraction and selection

Now, we would like to extract features from the images which will be fed into our classifiers. For simplicity, we chose HOG as our feature. We also need to perform feature selection to avoid having more features than data points. 

In [5]:
hog_features = [hog(img) for img in X]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(hog_features, y, 
                                                    test_size=0.2, 
                                                    random_state=14, 
                                                    shuffle=True)

In [7]:
# Set up for PCA feature selection
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
pca = PCA(.95)

In [8]:
# No. of features before
X_train[0].shape

(2916,)

In [9]:
pca.fit(X_train)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [10]:
# No. of features after
pca.n_components_

51

In [11]:
X_train = pca.transform(X_train)

In [12]:
X_test = pca.transform(X_test)

# 3 Classification and Model selection

No models are better than others in general so we will try different models and choose the best one according to our criteria which is accuracy. We will use three different models - SVM, Knn and random forest. 

In [13]:
models = {
    'SVC': GridSearchCV(SVC(), {
        'kernel': ('linear', 'rbf', 'sigmoid'),
        'C': [0.1, 1, 10]
    }),
    'KNN': GridSearchCV(KNeighborsClassifier(), {
        'n_neighbors': [3,5,7,9],
        'weights': ('uniform', 'distance')
    }),
    'RF': GridSearchCV(RandomForestClassifier(), {
        'n_estimators': [50, 100, 200, 300, 500],
    }),
}

In [14]:
for model in models.values():
    model.fit(X_train, y_train)

# 4 Results

In [15]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(name, 'Accuracy', accuracy_score(y_test, y_pred))
    print(name, 'Precision', precision_score(y_test, y_pred))
    print(name, 'Recall', recall_score(y_test, y_pred))

SVC Accuracy 0.8125
SVC Precision 1.0
SVC Recall 0.5714285714285714
KNN Accuracy 0.75
KNN Precision 1.0
KNN Recall 0.42857142857142855
RF Accuracy 0.75
RF Precision 0.7142857142857143
RF Recall 0.7142857142857143
