## Exploration of Data

In [21]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import scipy.io as sio
import os

# Load image data
data_load_1 = sio.loadmat('Proj2FeatVecsSet1.mat')
data_load_2 = sio.loadmat('Proj2TargetOutputsSet1.mat')
data_set = data_load_1['Proj2FeatVecsSet1']
data_target = data_load_2['Proj2TargetOutputsSet1']

In [22]:
# Divide my target data into nice 1-D classifier
number_labels = []
for ars in data_target:
    if np.all(ars == [1,-1,-1,-1,-1]):
        ars = 1
        number_labels.append(ars)
    elif np.all(ars == [-1,1,-1,-1,-1]):
        ars = 2
        number_labels.append(ars)
    elif np.all(ars == [-1,-1,1,-1,-1]):
        ars = 3
        number_labels.append(ars)
    elif np.all(ars == [-1,-1,-1,1,-1]):
        ars = 4
        number_labels.append(ars)
    elif np.all(ars == [-1,-1,-1,-1,1]):
        ars = 5
        number_labels.append(ars)
        
number_labels = np.asarray(number_labels)

## Define how many components we should use and run PCA

In [23]:
from sklearn.decomposition import PCA
pca = PCA().fit(data_set)
cum_var = np.cumsum(pca.explained_variance_ratio_)
eigenvalues = pca.explained_variance_

count = 0
for var in cum_var:
    count += 1
    if var >= 0.95:
        n_components = count
#         answer = "We need about "+ str(n_components) + " components to retain 95% of the variance"
#         print(answer)
        break
        
# plt.figure(1)
# plt.plot(cum_var)
# plt.xlabel('Number of Components')
# plt.ylabel('Cumulative Explained Variance')
# plt.figure(2)
# plt.plot(eigenvalues)
# plt.xlabel('Number of Components')
# plt.ylabel('Eigenvalues')
# plt.show()

# Minumum Noise Factor --> Similar to PCA but removes noise from bands

In [24]:
from sklearn.decomposition import PCA
from skimage.transform import rescale
from sklearn.cluster import KMeans
import numpy as np
import time

#Using PCA
pca = PCA(n_components=n_components)
reduced_data = pca.fit_transform(data_set)

## Creation of Data Folds - Training and Testing

In [25]:
from sklearn.model_selection import train_test_split
# #############################################################################
# Split into a training set and a test set using a stratified k fold
X_train, X_test, y_train, y_test = train_test_split(
    reduced_data, number_labels, test_size=0.20, stratify = number_labels)

## Data breakdown - For X_train_1 and y_train_1, take sample of 1000 out of 5000

In [26]:
# This can be done nicely with the kfold function
X_train_1 = X_train[:1000]
X_train_2 = X_train[5000:9999]
X_train_3 = X_train[10000:14999]
X_train_4 = X_train[15000:20000]
y_train_1 = y_train[:1000]
y_train_2 = y_train[5000:9999]
y_train_3 = y_train[10000:14999]
y_train_4 = y_train[15000:20000]

## TrainMyClassifierParameters Function

In [27]:
def TrainMyClassifierParameters(Algorithm):
    if Algorithm == 'SVM':
        Parameters = {
            'C' : [0.1],
            'gamma' : [0.1]
        }
    elif Algorithm == 'RVM':
        Parameters = {
            'alpha' : [1e-06],
            'beta' : [1e-06]
        }
    elif Algorithm == 'GP':
        Parameters = {
            'length_scale' : [1]             
        }
    return Parameters, Algorithm

## TrainMyClassifier Function

In [28]:
# Not sure if this is the correct type of function we need
def TrainMyClassifier(XEstimate, YEstimate, XValidate, TrainMyClassifierParameters):
    from sklearn.svm import SVC
    from skrvm import RVC
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.multiclass import OneVsOneClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.gaussian_process.kernels import RBF
    from time import time
    t0 = time()
    # Paramaters should have this shape in order for it to work ==>  Parameters = {'C': [1e3, 1e4, 1e5], 'gamma': [0.001, 0.01, 0.1] }
    if TrainMyClassifierParameters[1] == 'SVM':
        # ################################################
        # Train a SVM classification model
        print("Fitting the classifier to the training set")
        param_grid = TrainMyClassifierParameters[0]
        clf = OneVsOneClassifier(GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid))
        clf = clf.fit(XEstimate, YEstimate)
        y_pred = clf.predict(XValidate)
        scores = clf.score(XEstimate,YEstimate)
        print("done in %0.3fs" % (time() - t0))
        return y_pred, scores
    elif TrainMyClassifierParameters[1] == 'RVM':
        # #############################################################################
        # Train a RVM classification model
        print("Fitting the classifier to the training set")
        t0 = time()
        param_grid = TrainMyClassifierParameters[0]
        clf = OneVsOneClassifier(GridSearchCV(RVC(kernel='rbf',n_iter=1), param_grid))
        clf.fit(XEstimate, YEstimate)
        y_pred = clf.predict(XValidate)
        scores = clf.score(XEstimate,YEstimate)
        print("done in %0.3fs" % (time() - t0))
        return y_pred, scores
    elif TrainMyClassifierParameters[1] == 'GP':
       # #############################################################################
        # Train a GP classification model
        print("Fitting the classifier to the training set")
        t0 = time()
        param_grid = TrainMyClassifierParameters[0]['length_scale']
        k_rbf = 1 * RBF(length_scale=param_grid)
        clf = OneVsOneClassifier(GaussianProcessClassifier(kernel = k_rbf))
        clf.fit(X_train_1, y_train_1)
        y_pred = clf.predict(XValidate)
        scores = clf.score(XEstimate,YEstimate)
        print("done in %0.3fs" % (time() - t0))
        return y_pred, scores
    else:
        print("Incorrect type of algorithm, please use only one of the supported classifiers SVM, RVM, GP")

In [29]:
test = TrainMyClassifier(X_train_1, y_train_1, X_test,TrainMyClassifierParameters('SVM'))

Fitting the classifier to the training set
done in 1.007s


## MyConfusionMatrix Function 

In [30]:
def MyConfusionMatrix(Y,YValidate,ClassNames):
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import accuracy_score
    import pandas as pd
    c_r = classification_report(YValidate, Y)
    c_m = confusion_matrix(YValidate, Y)
    a_s = accuracy_score(YValidate, Y)
    # labels = ['One','Two','Three','Four','Five'] - This is the format of the labels
    labels = ClassNames
    df = pd.DataFrame(c_m, dtype='str', index=labels)
    df.columns = ClassNames
    return c_m, df, a_s

In [31]:
MyConfusionMatrix(y_test, test[0],['One','Two','Three','Four','Five'])

(array([[975,  77, 221,  27,  80],
        [  4, 364,   1,  46,   1],
        [ 14,  10, 627,   7,  17],
        [  4, 546, 101, 910, 246],
        [  3,   3,  50,  10, 656]], dtype=int64),
        One  Two Three Four Five
 One    975   77   221   27   80
 Two      4  364     1   46    1
 Three   14   10   627    7   17
 Four     4  546   101  910  246
 Five     3    3    50   10  656,
 0.7064)

## TestMyClassifier Function

In [32]:
def TestMyClassifier(XTest, Parameters, EstParameters):
    # Do similar to trainmyclassifer but with the data from MyCrossValidation
    return YTest