# Homework #3 - Labeled Faces in the Wild

Matriculation Numbers: A0124772E, A0136070R, A0121299A

Email Addresses: a0124772@u.nus.edu, e0005572@u.nus.edu, e0008742@u.nus.edu

## Boilerplate

In [None]:
###################
##### IMPORTS #####
###################

# Numpy
import numpy as np

# SciKit
from sklearn.model_selection import KFold
from sklearn.svm import SVC

In [None]:
#####################################
##### MODEL SELECTION FRAMEWORK #####
#####################################

class ValidatedModel():
    '''
        Base class for evaluating a classifier with k-fold cross-validation.

        Subclasses must override train() so that it returns a fitted classifier
        exposing a predict(points) method. preprocess() and postprocess() are
        optional hooks that trainAll() calls before and after training.
    '''

    def __init__(self):
        self.X              = None    # original points
        self.y              = None    # original classifications
        self.trained_models = None    # list of (model, E_in, E_out); filled by trainAll()

    def supplyDataset(self, all_samples, all_labels):
        '''
            Ensures there are as many labels as samples.

            Sets the training samples X.
            Sets the training labels y.

            Raises ValueError if the number of samples and labels differ.
        '''
        # Raise explicitly rather than assert: asserts are stripped under `python -O`.
        if all_samples.shape[0] != all_labels.shape[0]:
            raise ValueError(
                'number of samples (%d) does not match number of labels (%d)'
                % (all_samples.shape[0], all_labels.shape[0])
            )

        self.X = all_samples
        self.y = all_labels

    def trainAll(self, cross_validation_param=5):
        '''
            For each split of training and test sets (using k-fold),
                * train the model on the training set
                * compute E_in  (error on the training fold)
                * compute E_out (error on the held-out fold)
            Store a list of the (model, E_in, E_out) tuples in self.trained_models.

            cross_validation_param is the number of folds passed to KFold.

            Raises RuntimeError if no dataset is available after preprocess() has run.
        '''
        self.preprocess()

        # Checked after preprocess() so that a subclass hook may still set up X and y.
        if self.X is None or self.y is None:
            raise RuntimeError('no dataset available: call supplyDataset() before trainAll()')

        # NOTE: KFold is used with its defaults, so the folds are consecutive slices
        # of the dataset in its given order.
        kf = KFold(n_splits=cross_validation_param)
        self.trained_models = []

        for train_idx, test_idx in kf.split(self.X):
            trained_model = self.train(self.X[train_idx], self.y[train_idx])

            E_in  = self.getError(trained_model, self.X[train_idx], self.y[train_idx])
            E_out = self.getError(trained_model, self.X[test_idx], self.y[test_idx])

            self.trained_models.append((trained_model, E_in, E_out))

        self.postprocess()

    def getErrors(self):
        '''
            Computes the average generalisation error (|E_out - E_in|) over all pairs of training and test sets.
            Returns a pair of the average generalisation error and the average in-sample error.

            Raises RuntimeError if no models have been trained yet.
        '''
        if not self.trained_models:
            raise RuntimeError('no trained models: call trainAll() before getErrors()')

        num_models = len(self.trained_models)

        sum_generalisation_error = sum(abs(E_in - E_out) for _, E_in, E_out in self.trained_models)
        sum_E_in                 = sum(E_in for _, E_in, _ in self.trained_models)

        return (sum_generalisation_error / num_models, sum_E_in / num_models)

    def getError(self, classifier, points, classifications):
        '''
            Calculate the error of a classifier given a sample dataset and the labels for it.
            0/1-loss is used as this is a classification problem.

            Returns the fraction of points that are misclassified, as a float in [0, 1].

            Raises ValueError if the number of predictions differs from the number
            of labels, or if there are no points to evaluate.
        '''

        # use the model to predict the classifications of all points in the test set
        predicted_classifications = np.asarray(classifier.predict(points)).ravel()
        true_classifications      = np.asarray(classifications).ravel()

        if predicted_classifications.shape != true_classifications.shape:
            raise ValueError(
                'got %d predictions for %d labels'
                % (predicted_classifications.shape[0], true_classifications.shape[0])
            )

        N = predicted_classifications.shape[0]
        if N == 0:
            raise ValueError('cannot compute an error over an empty set of points')

        # calculate the error using 0/1 loss: one vectorised comparison instead of a Python loop
        num_misclassifications = int(np.count_nonzero(predicted_classifications != true_classifications))

        return num_misclassifications / N

    def predict(self, points):
        '''
            Get the modal prediction across all the models.

            Every trained model votes on each point; the most frequent label wins.
            Ties are resolved in favour of the smallest label.

            Returns an array with one label per point.

            Raises RuntimeError if no models have been trained yet.
        '''
        if not self.trained_models:
            raise RuntimeError('no trained models: call trainAll() before predict()')

        # row i holds the predictions of model i for all points
        predictions_by_model = np.array(
            [model.predict(points) for model, _, _ in self.trained_models]
        )

        # ith element is the modal prediction of point i by all models
        predictions_by_point = []
        for votes in predictions_by_model.T:
            # np.unique handles negative and non-integer labels, which np.bincount rejects;
            # it returns labels sorted, so argmax picks the smallest label among ties.
            labels, counts = np.unique(votes, return_counts=True)
            predictions_by_point.append(labels[np.argmax(counts)])

        return np.array(predictions_by_point)

    def preprocess(self):
        '''
            Optional hook run at the start of trainAll(). Does nothing by default.
        '''
        pass

    def train(self, points, classifications):
        '''
            Train a classifier on the given points and labels and return it.

            Must be overridden by subclasses; the returned object needs a
            predict(points) method, which getError() and predict() call.
        '''
        # Fail here with a clear message rather than returning None and
        # crashing later when getError() calls .predict on it.
        raise NotImplementedError('subclasses of ValidatedModel must implement train()')

    def postprocess(self):
        '''
            Optional hook run at the end of trainAll(). Does nothing by default.
        '''
        pass




In [None]:
class CrappySVM(ValidatedModel):
    '''
        Baseline model: a support vector classifier with a linear kernel.
    '''

    def train(self, samples, labels):
        '''
            Fit a fresh linear-kernel SVC on the given samples and labels and return it.
        '''
        classifier = SVC(kernel='linear')
        # fit() returns the fitted estimator itself
        return classifier.fit(samples, labels)

# Load the full training set from disk.
X = np.load('X_train.npy')
y = np.load('y_train.npy')

# Work with the first 400 examples only.
X_train, y_train = X[:400], y[:400]

# Cross-validate the baseline SVM (trainAll defaults to 5 folds) and report
# the pair (average generalisation error, average in-sample error).
example = CrappySVM()
example.supplyDataset(X_train, y_train)
example.trainAll()
print(example.getErrors())


## Statement of (Team-Level) Individual Work

Please initial (between the square brackets) one of the following statements.

[X] We, A0124772E, A0136070R and A0121299A, certify that we have followed the CS 3244 Machine Learning class guidelines for homework assignments. In particular, we expressly vow that we have followed the Facebook rule in discussing with others (outside our team) while doing the assignment, and did not take notes (digital or printed) from the discussions.

[ ] I, <*substitute your matric number here*>, did not follow the class rules regarding the homework assignment, because of the following reason:

<*Please fill in*>

I suggest that I should be graded as follows:

<*Please fill in*>

### References

