## Exploration of Data

In [1]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import scipy.io as sio
import os

# Load image data
data_load_1 = sio.loadmat('Proj2FeatVecsSet1.mat')
data_load_2 = sio.loadmat('Proj2TargetOutputsSet1.mat')
data_set = data_load_1['Proj2FeatVecsSet1']
data_target = data_load_2['Proj2TargetOutputsSet1']

In [2]:
# Divide my target data into nice 1-D classifier
number_labels = []
for ars in data_target:
    if np.all(ars == [1,-1,-1,-1,-1]):
        ars = 1
        number_labels.append(ars)
    elif np.all(ars == [-1,1,-1,-1,-1]):
        ars = 2
        number_labels.append(ars)
    elif np.all(ars == [-1,-1,1,-1,-1]):
        ars = 3
        number_labels.append(ars)
    elif np.all(ars == [-1,-1,-1,1,-1]):
        ars = 4
        number_labels.append(ars)
    elif np.all(ars == [-1,-1,-1,-1,1]):
        ars = 5
        number_labels.append(ars)
        
number_labels = np.asarray(number_labels)

## Define how many components we should use and run PCA

In [3]:
from sklearn.decomposition import PCA
pca = PCA().fit(data_set)
cum_var = np.cumsum(pca.explained_variance_ratio_)
eigenvalues = pca.explained_variance_

count = 0
for var in cum_var:
    count += 1
    if var >= 0.95:
        n_components = count
#         answer = "We need about "+ str(n_components) + " components to retain 95% of the variance"
#         print(answer)
        break
        
# plt.figure(1)
# plt.plot(cum_var)
# plt.xlabel('Number of Components')
# plt.ylabel('Cumulative Explained Variance')
# plt.figure(2)
# plt.plot(eigenvalues)
# plt.xlabel('Number of Components')
# plt.ylabel('Eigenvalues')
# plt.show()

# Minumum Noise Factor --> Similar to PCA but removes noise from bands

In [4]:
from sklearn.decomposition import PCA
from skimage.transform import rescale
from sklearn.cluster import KMeans
import numpy as np
import time

#Using PCA
pca = PCA(n_components=n_components)
reduced_data = pca.fit_transform(data_set)

## Creation of Data Folds - Training and Testing

In [5]:
from sklearn.model_selection import train_test_split
# #############################################################################
# Split into a training set and a test set using a stratified k fold
X_train, X_test, y_train, y_test = train_test_split(
    reduced_data, number_labels, test_size=0.20, stratify = number_labels)

## Data breakdown - For X_train_1 and y_train_1, take sample of 1000 out of 5000

In [6]:
# This can be done nicely with the kfold function
X_train_1 = X_train[:1000]
X_train_2 = X_train[5000:9999]
X_train_3 = X_train[10000:14999]
X_train_4 = X_train[15000:20000]
y_train_1 = y_train[:1000]
y_train_2 = y_train[5000:9999]
y_train_3 = y_train[10000:14999]
y_train_4 = y_train[15000:20000]

## TrainMyClassifierParameters Function

In [7]:
def TrainMyClassifierParameters(Algorithm):
    if Algorithm == 'SVM':
        Parameters = {
            'C' : [0.1],
            'gamma' : [0.1]
        }
    elif Algorithm == 'RVM':
        Parameters = {
            'alpha' : [1e-06],
            'beta' : [1e-06]
        }
    elif Algorithm == 'GP':
        Parameters = {
            'length_scale' : [1]             
        }
    return Parameters, Algorithm

## TrainMyClassifier Function

In [10]:
# Not sure if this is the correct type of function we need
def TrainMyClassifier(XEstimate, YEstimate, XValidate, TrainMyClassifierParameters):
    from sklearn.svm import SVC
    from skrvm import RVC
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.multiclass import OneVsOneClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.gaussian_process.kernels import RBF
    from time import time
    t0 = time()
    # Paramaters should have this shape in order for it to work ==>  Parameters = {'C': [1e3, 1e4, 1e5], 'gamma': [0.001, 0.01, 0.1] }
    if TrainMyClassifierParameters[1] == 'SVM':
        # ################################################
        # Train a SVM classification model
        print("Fitting the classifier to the training set")
        param_grid = TrainMyClassifierParameters[0]
#         clf = OneVsOneClassifier(GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid))
        clf = OneVsOneClassifier(SVC(kernel='rbf', class_weight='balanced'))
        clf = clf.fit(XEstimate, YEstimate)
        y_pred = clf.predict(XValidate)
        scores = clf.score(XEstimate,YEstimate)
        print("done in %0.3fs" % (time() - t0))
        return y_pred, scores
    elif TrainMyClassifierParameters[1] == 'RVM':
        # #############################################################################
        # Train a RVM classification model
        print("Fitting the classifier to the training set")
        t0 = time()
        param_grid = TrainMyClassifierParameters[0]
        clf = OneVsOneClassifier(RVC(kernel='rbf',n_iter=1))
        clf.fit(XEstimate, YEstimate)
        y_pred = clf.predict(XValidate)
        scores = clf.score(XEstimate,YEstimate)
        print("done in %0.3fs" % (time() - t0))
        return y_pred, scores
    elif TrainMyClassifierParameters[1] == 'GP':
       # #############################################################################
        # Train a GP classification model
        print("Fitting the classifier to the training set")
        t0 = time()
        param_grid = TrainMyClassifierParameters[0]['length_scale']
        k_rbf = 1 * RBF(length_scale=param_grid)
        clf = OneVsOneClassifier(GaussianProcessClassifier(kernel = k_rbf))
        clf.fit(X_train_1, y_train_1)
        y_pred = clf.predict(XValidate)
        scores = clf.score(XEstimate,YEstimate)
        print("done in %0.3fs" % (time() - t0))
        return y_pred, scores
    else:
        print("Incorrect type of algorithm, please use only one of the supported classifiers SVM, RVM, GP")

In [11]:
test = TrainMyClassifier(X_train_1, y_train_1, X_test,TrainMyClassifierParameters('SVM'))

Fitting the classifier to the training set
done in 0.512s


## MyConfusionMatrix Function 

In [20]:
def MyConfusionMatrix(Y,YValidate,ClassNames):
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import accuracy_score
    import pandas as pd
    c_r = classification_report(YValidate, Y)
    c_m = confusion_matrix(YValidate, Y)
    a_s = accuracy_score(YValidate, Y)
    # labels = ['One','Two','Three','Four','Five'] - This is the format of the labels
    labels = ClassNames
    df = pd.DataFrame(c_m, dtype='str', index=labels)
    df.columns = ClassNames
    return c_m, df, a_s

In [14]:
MyConfusionMatrix(y_test, test[0],['One','Two','Three','Four','Five'])

(array([[959,  25,  29,  22,  21],
        [ 14, 817,   0,  52,   2],
        [ 16,   5, 935,  12,  14],
        [  2, 150,  10, 883,  37],
        [  9,   3,  26,  31, 926]]),        One  Two Three Four Five
 One    959   25    29   22   21
 Two     14  817     0   52    2
 Three   16    5   935   12   14
 Four     2  150    10  883   37
 Five     9    3    26   31  926, 0.90400000000000003)

## MyCrossValidate Function

In [46]:
def MyCrossValidate(XTrain,YTrain2,Nf,Algorithm): #Why do we use a YTrain with '2' index?
    from sklearn.model_selection import KFold
    from sklearn.model_selection import train_test_split
    dict = {} 
    
    pca = PCA(n_components=9)
    reduced_data = pca.fit_transform(XTrain)
    #print reduced_data.shape
    kf = KFold(n_splits=Nf)
    kf.get_n_splits(XTrain)
    EstParameters = []
    EstConfMatrices = []
    ConfMatrix=[]
    YTrain = []
    YPreds = []
    YValids = []
    i=0        
    for train_index, test_index in kf.split(XTrain):
        # print("TRAIN:", train_index, "TEST:", test_index)
        XEst1 = XTrain[train_index]
        YEst1 = YTrain2[train_index]      
        XValid = XTrain[test_index]
        YValid = YTrain2[test_index]          
        XEst = XEst1[:4000]
        YEst = YEst1[:4000]
        TrainMyClassifierParameters = []
        TrainMyClassifierParameters.append({'C': [1e3], 'gamma': [0.001] })
        TrainMyClassifierParameters.append(Algorithm)
        y_pred1, scores1 = TrainMyClassifier(XEst,YEst,XValid,TrainMyClassifierParameters)
        TrainMyClassifierParameters[0] = {'C': [1e4], 'gamma': [0.01]}
        y_pred2, scores2 = TrainMyClassifier(XEst,YEst,XValid,TrainMyClassifierParameters)
        TrainMyClassifierParameters[0] = {'C': [1e5], 'gamma': [0.1]}
        y_pred3, scores3 = TrainMyClassifier(XEst,YEst,XValid,TrainMyClassifierParameters)
        if max(scores1,scores2,scores3)==scores1:
            y_pred = y_pred1
            dict[i]= {'scores':[scores1,scores2,scores3],'C': [1e3], 'gamma': [0.001]}
        elif max(scores1,scores2,scores3)==scores2:
            y_pred = y_pred2
            dict[i]= {'scores':[scores1,scores2,scores3],'C': [1e4], 'gamma': [0.01]}
        else:
            y_pred = y_pred3  
            dict[i]= {'scores':[scores1,scores2,scores3],'C': [1e5], 'gamma': [0.1]}
        confMatrix, df, a_s = MyConfusionMatrix(y_pred,YValid,['One','Two','Three','Four','Five'])
        EstConfMatrices.append(confMatrix)
#         EstParameters.append(params)
        YTrain.append(y_pred)
        YPreds = np.concatenate((YPreds,y_pred))
        YValids = np.concatenate((YValids,YValid))
        i=i+1
#         y_pred, params = TrainMyClassifier(XEst,YEst,XValid,Algorithm,{'C': [1], 'gamma': [1] })
#         confMatrix = MyConfusionMatrix(y_pred,YValid)
#         EstConfMatrices.append(confMatrix)
#         EstParameters.append(params)
#         YTrain.append(y_pred)
    ConfMatrix, df, a_s = MyConfusionMatrix(YPreds,YValids,['One','Two','Three','Four','Five'])
    np.save(Algorithm+'.npy',dict)
    return YTrain,EstParameters,EstConfMatrices,ConfMatrix

In [48]:
YTrain,EstParameters,EstConfMatrices,ConfMatrix = MyCrossValidate(X_train,y_train,5,'RVM')
print (ConfMatrix)

Fitting the classifier to the training set
done in 45.897s
Fitting the classifier to the training set
done in 45.450s
Fitting the classifier to the training set
done in 48.215s
Fitting the classifier to the training set
done in 57.116s
Fitting the classifier to the training set
done in 55.931s
Fitting the classifier to the training set
done in 49.076s
Fitting the classifier to the training set
done in 49.007s
Fitting the classifier to the training set
done in 47.890s
Fitting the classifier to the training set
done in 49.402s
Fitting the classifier to the training set
done in 47.682s
Fitting the classifier to the training set
done in 48.003s
Fitting the classifier to the training set
done in 46.977s
Fitting the classifier to the training set
done in 48.770s
Fitting the classifier to the training set
done in 52.184s
Fitting the classifier to the training set
done in 49.523s
[[3872   48   43   17   20]
 [  65 3414   28  477   16]
 [  72   13 3748   43  124]
 [  45  352   44 3457  102]
 [ 

## TestMyClassifier Function

In [19]:
def TestMyClassifier(XTest, Parameters, EstParameters):
    # Do similar to trainmyclassifer but with the data from MyCrossValidation
    return YTest