In [None]:
import numpy as np
from sklearn.svm import SVC

In [None]:
# Training features / labels and the test features, read from .npy files.
X = np.load('X_train.npy')
y = np.load('y_train.npy')
Z = np.load('X_test.npy')

# Dataset dimensions: rows and columns of the training matrix, rows of the test matrix.
samples, features = X.shape
tests = Z.shape[0]

# Classification: Support Vector Machine (linear and polynomial kernels)

In [None]:
def svmTrain(data, labels, cost, kernel, gamma, degree):
    """Fit an SVC with the given hyper-parameters on (data, labels).

    Returns a tuple (fitted model, total), where total is the sum of the
    entries of the fitted model's ``n_support_`` attribute.
    """
    estimator = SVC(C=cost, kernel=kernel, degree=degree, gamma=gamma)
    fitted = estimator.fit(data, labels)
    return fitted, sum(fitted.n_support_)

def svmPredict(data, svmModel):
    """Return whatever ``svmModel.predict`` yields for ``data``."""
    predictions = svmModel.predict(data)
    return predictions

In [None]:
'''
SVM hyper-parameters explored by the grid search:
cost    Penalty parameter C of the error term.

kernel  Kernel type used by the algorithm: one of 'linear', 'poly', 'rbf', 'sigmoid',
        'precomputed' or a callable. 'rbf' is used when none is given.

gamma   Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.

degree  Degree of the polynomial kernel function ('poly'). Ignored by all other kernels.
'''

# Powers of ten from 10**-2 up to 10**2.
costs = [10 ** exponent for exponent in range(-2, 3)]
# 'rbf' is currently left out of the search.
kernels = ['linear', 'poly']
# Powers of ten from 10**-4 up to 10**0.
gammas = [10 ** exponent for exponent in range(-4, 1)]
degrees = [2, 3, 4]

In [None]:
# File that the SVM testAll() opens for writing.
fileName = 'hw3-lfw-svm-models-labels.txt'


def isLinear(kernel):
    """Tell whether ``kernel`` equals 'linear'."""
    return kernel == 'linear'


def isPoly(kernel):
    """Tell whether ``kernel`` equals 'poly'."""
    return kernel == 'poly'

def testAll(costs = [1], kernels = ['linear'], gammas = ['auto'], degrees = [1]):
    f = open(fileName, 'w')
    # f.write("Rows:\t%d,\tColumns:\t%d\n" % (samples, features))
    f.write("['SVM'") # for eval
    for K in kernels:
        for C in costs:   
            if isLinear(K):
                testOne(C, K) # LINEAR
            else:
                for G in gammas:
                    if isPoly(K):
                        for D in degrees:
                            testOne(C, K, G, D) # POLY
                    else:
                        testOne(C, K, G) # RBF
    f.write("]") # for eval
    f.close()
                
def testOne(cost = 1, kernel = 'linear', gamma = 'auto', degree = 1):
    svmModel, totalSV = svmTrain(X, y, cost, kernel, gamma, degree)
    a = svmPredict(Z, svmModel).tolist()
    '''
    f.write("Cost:\t%s\nKernel:\t%s\n" % (str(cost), str(kernel))) # LINEAR, POLY, RBF
    if not isLinear(kernel):
        f.write("Gamma:\t%s\n" % str(gamma)) # POLY, RBF
        if isPoly(kernel):
            f.write("Degree: %s\n" % str(degree)) # POLY
    f.write("Number of Support Vectors:\t%s\n" % str(totalSV))
    f.write("Labels:\t%s\n" % str(a))
    f.write("\n")
    '''
    f.write(", %s" % str(a)) # for eval

In [None]:
# Run the SVM grid search over the hyper-parameter lists defined above.
testAll(costs = costs, kernels = kernels, gammas = gammas, degrees = degrees)

In [None]:
from collections import Counter
from functools import reduce
import math

In [None]:
import ast

# Predictions of every SVM model, in the list-literal format written by the
# SVM grid search: ['SVM', labels_of_model_1, labels_of_model_2, ...].
modelsFileName = 'hw3-lfw-svm-models-labels.txt'
# Common stem of the averaged outputs; 'txt' / 'csv' is appended below.
fileName = 'hw3-lfw-svm-models-labels-average.'
# Majority-vote label list: written by getAverage, read by toCsv.
# NOTE(review): this name was referenced but never defined; the value chosen
# here follows the 'average.' stem -- confirm it matches the intended layout.
labelListFileName = fileName + 'txt'


def getAverage(modelsFile = None, labelsFile = None):
    """Combine the predictions of all models by per-sample majority vote.

    Parameters:
        modelsFile -- file holding the list literal of per-model label lists
                      (first element is a tag and is skipped); defaults to
                      ``modelsFileName``.
        labelsFile -- file that receives ``str(list_of_winning_labels)``;
                      defaults to ``labelListFileName``.

    The number of samples is taken from the prediction lists themselves.
    On a tie the label met first among the models wins.
    """
    if modelsFile is None:
        modelsFile = modelsFileName
    if labelsFile is None:
        labelsFile = labelListFileName
    with open(modelsFile, 'r') as f:
        # literal_eval only accepts Python literals, unlike eval, which would
        # execute arbitrary code found in the file.
        models = ast.literal_eval(f.read())[1:]
    # zip(*models) regroups the predictions sample by sample.
    labels = [Counter(votes).most_common(1)[0][0] for votes in zip(*models)]
    with open(labelsFile, 'w') as g:
        g.write(str(labels))


def toCsv(labelsFile = None, csvFile = None):
    """Convert the majority-vote label list into an 'ImageId,PredictedClass' CSV.

    Parameters:
        labelsFile -- file holding the label list literal; defaults to
                      ``labelListFileName``.
        csvFile    -- destination CSV; defaults to ``fileName + 'csv'``.

    ImageId is the zero-based position of the label in the list.
    """
    if labelsFile is None:
        labelsFile = labelListFileName
    if csvFile is None:
        csvFile = fileName + 'csv'
    # Read first, so that a missing or malformed label list does not leave a
    # half-written CSV (or an open handle) behind.
    with open(labelsFile, 'r') as g:
        array = ast.literal_eval(g.read())
    with open(csvFile, 'w') as f:
        f.write('ImageId,PredictedClass\n')
        for i, prediction in enumerate(array):
            f.write('%d,%d\n' % (i, prediction))

In [None]:
# Rebuilding the majority-vote label list is disabled here, so toCsv() relies on
# a label list that already exists on disk. getAverage needs no argument; to
# re-enable it, call it bare before toCsv():
# getAverage()
toCsv()

# Classification: Neural networks

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
def nnTrain(data, labels, solver, alpha, learningRateInit, learningRateType):
    """Build an MLPClassifier from the given hyper-parameters and fit it.

    Returns the value produced by ``fit(data, labels)``.
    """
    settings = dict(solver=solver,
                    alpha=alpha,
                    learning_rate_init=learningRateInit,
                    learning_rate=learningRateType)
    network = MLPClassifier(**settings)
    return network.fit(data, labels)

def nnPredict(data, nnModel):
    """Return whatever ``nnModel.predict`` yields for ``data``."""
    predictions = nnModel.predict(data)
    return predictions

In [None]:
'''
PARAMETERS:
solver              {'lbfgs', 'sgd', 'adam'}, default 'adam'
                    The solver for weight optimization.
                    'lbfgs' optimizer in the family of quasi-Newton methods
                    'sgd'   stochastic gradient descent
                    'adam'  stochastic gradient-based optimizer

alpha               float, optional, default 0.0001
                    L2 penalty (regularization term) parameter.

learning_rate       {'constant', 'invscaling', 'adaptive'}, default 'constant'
                    'adaptive'  Keeps the learning rate constant to 'learning_rate_init' (below) as long as
                    the training loss keeps decreasing by 'tol' (tolerance) or, when
                    'early_stopping' = True, the validation score keeps increasing by 'tol'.
                    Otherwise, the current learning rate is divided by 5.
                    # Only for solver = 'sgd'.

learning_rate_init  double, optional, default 0.001
                    The initial learning rate used. It controls the step-size in updating the weights.
                    # Not for solver = 'lbfgs'.

early_stopping      Automatically sets aside 10% of training data as validation
                    and terminates training when the validation score is not improving
                    by at least tol for two consecutive epochs.
                    # Not for solver = 'lbfgs'.
'''

# Only 'adam' is searched; 'lbfgs' and 'sgd' are switched off.
solvers = ['adam']

# Disabled wider grids, kept for reference:
'''
alphas = [10**p for p in range(-4, -1)] # 1)]
learningRateInits = [0.0001, 0.001] # 10**(-4) and 10**(-3)
'''
alphas = [0.01]
learningRateInits = [0.001]
learningRateTypes = ['constant', 'invscaling', 'adaptive']

In [None]:
# File that the neural-network testAll() opens for writing.
fileName = 'hw3-lfw-nn.txt'


def isLbfgs(solver):
    """Tell whether ``solver`` equals 'lbfgs'."""
    return solver == 'lbfgs'


def isSgd(solver):
    """Tell whether ``solver`` equals 'sgd'."""
    return solver == 'sgd'

def testAll(solvers = ['lbfgs'], alphas = [0.0001], learningRateInits = [0.001], learningRateTypes = ['constant']):
    f = open(fileName, 'w')
    # f.write("Rows:\t%d,\tColumns:\t%d\n" % (samples, features))
    for solver in solvers:
        for alpha in alphas:
            if isLbfgs(solver):
                testOne(solver, alpha) # LBFGS
            else:
                for learnRate in learningRateInits:
                    if isSgd(solver):
                        for learnType in learningRateTypes:
                            testOne(solver, alpha, learnRate, learnType) # SGD
                    else:
                        testOne(solver, alpha, learnRate) # ADAM
    f.close()
                
def testOne(solver = 'lbfgs', alpha = 0.0001, learningRateInit = 0.001, learningRateType = 'constant'):
    nnModel = nnTrain(X, y, solver, alpha, learningRateInit, learningRateType)
    a = nnPredict(Z, nnModel).tolist()
    '''
    f.write("Solver:\t%s\nAlpha:\t%s\n" % (str(solver), str(alpha))) # LBFGS, SGD, ADAM
    if not isLbfgs(solver):
        f.write("Learning Rate:\t%s\n" % str(learningRateInit))
        if isSgd(solver):
            f.write("Learning Rate Type:\t%s\n" % str(learningRateType))
    f.write("Labels:\t%s\n" % str(a))
    f.write("\n")
    '''
    g = open('hw3-lfw-nn-labels.csv', 'w')
    g.write('ImageId,PredictedClass\n')
    i = 0
    for label in a:
        g.write("%d,%d\n" %(i, label))
        i += 1

In [None]:
# Run the neural-network search over the hyper-parameter lists defined above.
testAll(solvers = solvers, alphas = alphas,
        learningRateInits = learningRateInits, learningRateTypes = learningRateTypes)

# Dimensionality reduction: Feature selection

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
'''
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))
'''

In [None]:
'''
PARAMETERS (recursive feature elimination with cross-validation):
estimator   A supervised learning estimator with a fit method.
step        If greater than or equal to 1, the (integer) number of features to remove at each iteration.
            If within (0.0, 1.0), the percentage (rounded down) of features to remove at each iteration.
cv          Determines the cross-validation splitting strategy.
'''

# Kernel of the SVC estimator, features dropped per iteration, and number of CV folds.
kernel, step, folds = 'linear', 1, 2

In [None]:
svc = SVC(kernel = kernel)
# Create the RFE object and compute a cross-validated score.
# The "accuracy" scoring is proportional to the number of correct classifications.
rfecv = RFECV(estimator = svc, step = step, cv = StratifiedKFold(folds), scoring = 'accuracy')
# Fit on the loaded arrays directly. X and y used to be rebound to Python lists
# here, which made this cell fail when run a second time (lists have no
# .tolist()) and changed the type seen by every other cell that uses X and y.
rfecv.fit(X, y)

In [None]:
# Show the selected features as integer indices instead of a boolean mask.
print(rfecv.get_support(indices = True))

In [None]:
# Inside a notebook __doc__ is the interactive module's docstring; this line is
# kept from the original example script.
print(__doc__)

import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification

# Synthetic alternative to the loaded data (disabled):
# X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,
#                            n_redundant=2, n_repeated=0, n_classes=8,
#                            n_clusters_per_class=1, random_state=0)

# Create the RFE object and compute a cross-validated score.
svc = SVC(kernel="linear")
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2),
              scoring='accuracy')
rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)

# Mean cross-validation score per candidate feature count. Newer scikit-learn
# releases expose it through cv_results_; grid_scores_ only exists in older
# releases, so fall back to it when cv_results_ is missing.
if hasattr(rfecv, 'cv_results_'):
    cvScores = rfecv.cv_results_['mean_test_score']
else:
    cvScores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(cvScores) + 1), cvScores)
plt.show()