In [1]:
from scipy.optimize import minimize

import utils
import numpy as np

class DataLoader(object):
    """Load a CSV training file and expose it as a sequence of equal-size batches.

    The first line of the file is treated as a header and skipped. Every
    remaining non-blank line must hold comma-separated numbers: the last
    column is the target, the preceding columns are the features. A constant
    column of ones is appended to the features so that the bias is learned as
    an ordinary weight.

    ``len(loader)`` is the number of batches and ``loader[i]`` returns the
    ``(inputs, targets)`` pair for batch ``i``. Indexing past the last batch
    raises ``IndexError``, so a plain ``for`` loop over the loader also works.

    Attributes:
        num_features: number of input columns, including the appended ones column.
        current_batch_index: initialised to 0; not used by this class itself.
        input_batches: list of ``(batch_size, num_features)`` arrays.
        target_batches: list of ``(batch_size,)`` arrays.
    """

    def __init__(self, data_file, batch_size):
        """Read ``data_file`` and split it into batches of ``batch_size`` rows.

        Args:
            data_file: path of a CSV file with one header line.
            batch_size: number of rows per batch; must be positive.

        Raises:
            ValueError: if ``batch_size`` is not positive, if the file holds
                fewer than ``batch_size`` data rows, or if a field cannot be
                parsed as a float.
        """
        if batch_size <= 0:
            raise ValueError(
                "batch_size must be positive, got {!r}".format(batch_size))

        with open(data_file, 'r') as df:
            # Skip the header and ignore blank lines (typically a trailing
            # newline at the end of the file), which float() cannot parse.
            rows = [line for line in df.readlines()[1:] if line.strip()]

        num_batches = len(rows) // batch_size
        if num_batches == 0:
            raise ValueError(
                "{!r} has {} data row(s), fewer than batch_size={}".format(
                    data_file, len(rows), batch_size))

        # Only whole batches are kept. The truncation happens before the
        # shuffle, so it is always the last rows of the file that are dropped.
        rows = rows[:num_batches * batch_size]
        np.random.shuffle(rows)

        data = np.array([[float(col) for col in row.split(',')] for row in rows])
        input_data, targets = data[:, :-1], data[:, -1]
        # Append the constant column that acts as the bias input.
        input_data = np.hstack(
            [input_data, np.ones((len(input_data), 1), dtype=np.float32)])

        self.num_features = input_data.shape[1]
        self.current_batch_index = 0
        self.input_batches = np.split(input_data, num_batches)
        self.target_batches = np.split(targets, num_batches)

    def __len__(self):
        """Return the number of batches."""
        return len(self.input_batches)

    def __getitem__(self, i):
        """Return ``(inputs, targets)`` for batch ``i``."""
        return self.input_batches[i], self.target_batches[i]

def classify(inputs, weights):
    """Return the linear scores ``inputs . weights`` for a batch.

    ``weights`` is flattened to one dimension first, so a column vector and a
    flat vector give the same result. For 2-D ``inputs`` of shape
    ``(batch_size, num_features)`` the result is 1-D with ``batch_size`` entries.
    """
    flat_weights = np.reshape(weights, (-1,))
    return np.dot(inputs, flat_weights)

def get_objective_function(trainx,trainy,loss_type, regularizer_type, loss_weight):
    """Build the objective ``f(weights)`` for one batch.

    The optimizer can only pass ``weights`` to the function it minimises, so
    the batch and the hyper-parameters are captured in a closure here.

    Args:
        trainx: inputs of the current batch.
        trainy: targets of the current batch.
        loss_type: key into ``utils.loss_functions``.
        regularizer_type: key into ``utils.regularizer_functions``, or ``None``
            for no regularisation term.
        loss_weight: multiplier applied to the data loss (the "C" constant).

    Returns:
        A function mapping ``weights`` to
        ``loss_weight * loss(trainy, classify(trainx, weights))`` plus the
        regulariser value when a regulariser is selected.

    Raises:
        KeyError: if ``loss_type`` or ``regularizer_type`` is not a known key.
    """
    loss_function = utils.loss_functions[loss_type]
    # Resolve the regulariser once; None means "no regularisation term".
    regularizer_function = None
    if regularizer_type is not None:
        regularizer_function = utils.regularizer_functions[regularizer_type]

    def objective_function(weights):
        outputs = classify(trainx, weights)
        loss = loss_weight * loss_function(trainy, outputs)
        if regularizer_function is not None:
            loss = loss + regularizer_function(weights)
        return loss
    return objective_function

def get_gradient_function(trainx,trainy,loss_type, regularizer_type, loss_weight):
    """Build the gradient ``g(weights)`` of the batch objective.

    ``scipy.optimize.minimize`` calls the ``jac`` callable with ``weights``
    only, so the batch and the hyper-parameters are captured in a closure.

    Args:
        trainx: inputs of the current batch.
        trainy: targets of the current batch.
        loss_type: key into ``utils.loss_grad_functions``.
        regularizer_type: key into ``utils.regularizer_grad_functions``, or
            ``None`` for no regularisation term.
        loss_weight: multiplier applied to the data-loss gradient (the "C"
            constant).

    Returns:
        A function mapping ``weights`` to
        ``loss_weight * loss_grad(weights, trainx, trainy, outputs) / len(trainx)``
        plus the regulariser gradient when a regulariser is selected.

    Raises:
        KeyError: if ``loss_type`` or ``regularizer_type`` is not a known key.

    NOTE(review): the data term is divided by ``len(trainx)`` here, while
    ``get_objective_function`` applies no such division. Whether the two are
    consistent depends on whether the loss functions in ``utils`` return a
    mean or a sum - confirm against ``utils``.
    """
    loss_grad_function = utils.loss_grad_functions[loss_type]
    # Resolve the regulariser gradient once; None means "no regularisation".
    regularizer_grad_function = None
    if regularizer_type is not None:
        regularizer_grad_function = utils.regularizer_grad_functions[regularizer_type]

    def gradient_function(weights):
        outputs = classify(trainx, weights)
        gradient = loss_weight * loss_grad_function(weights, trainx, trainy, outputs) / len(trainx)
        if regularizer_grad_function is not None:
            gradient += regularizer_grad_function(weights)
        return gradient
    return gradient_function

def train(data_loader, loss_type, regularizer_type, loss_weight,
          num_epochs=1000, verbose=True):
    """Fit the weight vector by running one conjugate-gradient step per batch.

    Weights start from uniform random values in [0, 1). In every epoch each
    batch is visited in order; for each batch ``scipy.optimize.minimize`` is
    run with ``method="CG"`` and ``maxiter=1``, starting from the weights
    produced by the previous batch.

    Args:
        data_loader: object supporting ``len()``, integer indexing that
            returns ``(inputs, targets)``, and a ``num_features`` attribute.
        loss_type: loss key forwarded to ``get_objective_function`` and
            ``get_gradient_function``.
        regularizer_type: regulariser key forwarded likewise, or ``None``.
        loss_weight: multiplier on the data loss, forwarded likewise.
        num_epochs: number of passes over all batches (default 1000).
        verbose: when true (default), print the summed loss after every epoch
            and the final optimizer result.

    Returns:
        The weight vector after the last batch of the last epoch.

    Raises:
        ValueError: if ``data_loader`` has no batches or ``num_epochs < 1``.
    """
    num_batches = len(data_loader)
    if num_batches == 0:
        raise ValueError("data_loader contains no batches")
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1, got {!r}".format(num_epochs))

    start_parameters = np.random.random((data_loader.num_features))
    trained_model_parameters = None

    for _ in range(num_epochs):
        # Sum over batches of the objective evaluated right after that
        # batch's update (an epoch total, not a single-batch loss).
        epoch_loss = 0
        for j in range(num_batches):
            trainx, trainy = data_loader[j]
            objective_function = get_objective_function(trainx, trainy, loss_type,
                                                        regularizer_type, loss_weight)
            gradient_function = get_gradient_function(trainx, trainy, loss_type,
                                                      regularizer_type, loss_weight)
            # A single CG iteration per batch: the result seeds the next batch.
            trained_model_parameters = minimize(objective_function,
                                                start_parameters,
                                                method="CG",
                                                jac=gradient_function,
                                                options={'disp': False,
                                                         'maxiter': 1})
            epoch_loss += objective_function(trained_model_parameters.x)
            start_parameters = trained_model_parameters.x
        if verbose:
            print("loss is  ", epoch_loss)

    if verbose:
        print("Optimizer information:")
        print(trained_model_parameters)
    return trained_model_parameters.x
            

def test(inputs, weights):
    """Predict 0/1 labels for ``inputs`` by rounding the sigmoid of the score.

    Args:
        inputs: feature rows, in the layout ``classify`` expects.
        weights: weight vector.

    Returns:
        Array of 0.0 / 1.0 values, one per input row. (Map to -1 / 1 at the
        call site if that label convention is needed.)
    """
    outputs = classify(inputs, weights)
    # Numerically stable sigmoid: exp(-|z|) is always in (0, 1], so it never
    # overflows the way exp(-z) does for large negative scores.
    decay = np.exp(-np.abs(outputs))
    probs = np.where(outputs >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    return np.round(probs)

def write_csv_file(outputs, output_file):
    """Write ``outputs`` to ``output_file`` as ``ID, Output`` rows.

    IDs are 1-based positions in ``outputs``; any existing file is overwritten.
    """
    with open(output_file, "w") as sink:
        sink.write("ID, Output\n")
        sink.writelines(
            "{}, {}".format(position + 1, str(outputs[position])) + "\n"
            for position in range(len(outputs))
        )
def get_data(data_file):
    """Load an unlabeled CSV file as a feature matrix with a trailing ones column.

    The first line is treated as a header and skipped, and blank lines are
    ignored. Every remaining line must hold comma-separated numbers. A
    constant column of ones is appended so the rows line up with a weight
    vector that includes a bias term.

    Args:
        data_file: path of the CSV file.

    Returns:
        2-D float array with one row per data line and one extra column of ones.

    Raises:
        ValueError: if the file has no data rows or a field is not a number.
    """
    with open(data_file, 'r') as df:
        # Skip the header; drop blank lines (e.g. a trailing newline), which
        # float() cannot parse.
        rows = [line for line in df.readlines()[1:] if line.strip()]

    if not rows:
        raise ValueError("no data rows found in {!r}".format(data_file))

    data = np.array([[float(col) for col in row.split(',')] for row in rows])
    input_data = np.hstack([data, np.ones((len(data), 1), dtype=np.float32)])

    return input_data



In [2]:
# Training data is split into batches of 64 rows; the test features are loaded whole.
train_data_loader = DataLoader(data_file="train.csv", batch_size=64)
test_data = get_data(data_file="test.csv")
print("Got files")

Got files


In [None]:
# Display the predicted labels.
# NOTE(review): `test_data_output` is only assigned inside the training loop
# cell further down, so on a fresh kernel this cell raises NameError unless
# that cell has been run first.
test_data_output

In [3]:
import pandas as pd
# Ground-truth rows for the test set, converted straight to a NumPy array.
actual_targets = pd.read_csv("targets.csv").values

In [None]:
# Fraction of test rows whose prediction equals column 1 of targets.csv.
# np.mean divides by the actual number of rows instead of a hard-coded 300.
# NOTE(review): relies on `test_data_output`, which is assigned in the
# training loop cell further down.
np.mean(actual_targets[:,1]==test_data_output)

In [4]:
import matplotlib.pyplot as plt

In [6]:
# Train one model per loss weight in c_list and record its test accuracy.
accuracies=[]
c_list=[100]
for c in c_list:
    print("Started training for c = ",c)
    trained_model_parameters = train(train_data_loader, "square_hinge_loss", "L2", c)
    print("Predicting outputs")
    test_data_output = test(test_data, trained_model_parameters)
    # np.mean divides by the real number of test rows rather than a fixed 300.
    accuracies.append(np.mean(actual_targets[:,1]==test_data_output))

Started training for c =  100
loss is   60549.361006675055
loss is   48699.356154363886
loss is   43305.12098788081
loss is   40143.47050626809
loss is   38050.62114844968
loss is   36413.665967636814
loss is   35086.71720576822
loss is   33986.1041987825
loss is   33064.181237251716
loss is   32258.597455616018
loss is   31603.28441350245
loss is   31005.128144387774
loss is   30544.924276204114
loss is   30112.64203267228
loss is   30265.81583500657
loss is   29436.117789910473
loss is   28996.33621456001
loss is   28745.10642283185
loss is   29087.546076535487
loss is   28953.593201528114
loss is   28851.382251260555
loss is   28714.584174439475
loss is   28593.70688808258
loss is   28485.892355620857
loss is   28386.948773055694
loss is   28300.290186224935
loss is   28222.808634662255
loss is   28153.481415698716
loss is   28091.416451758596
loss is   28011.716076918063
loss is   27984.87638401128
loss is   27932.351023005245
loss is   27886.041611359266
loss is   27848.7775321580

loss is   27542.975022561193
loss is   27542.975022560757
loss is   27542.975022560368
loss is   27542.97502255996
loss is   27542.975022559673
loss is   27542.97502255939
loss is   27542.975022559116
loss is   27542.97502255887
loss is   27542.97502255864
loss is   27542.975022558458
loss is   27542.975022558276
loss is   27542.97502255811
loss is   27542.975022557956
loss is   27542.975022557835
loss is   27542.975022557694
loss is   27542.975022557603
loss is   27542.975022557504
loss is   27542.97502255741
loss is   27542.975022557337
loss is   27542.97502255725
loss is   27542.975022557166
loss is   27542.975022557108
loss is   27542.97502255707
loss is   27542.975022557006
loss is   27542.975022556973
loss is   27542.97502255694
loss is   27542.97502255692
loss is   27542.97502255687
loss is   27542.975022556828
loss is   27542.9750225568
loss is   27542.975022556773
loss is   27542.975022556748
loss is   27542.97502255673
loss is   27542.97502255671
loss is   27542.97502255672
l

loss is   27542.97502255651
loss is   27542.975022556508
loss is   27542.97502255651
loss is   27542.975022556497
loss is   27542.975022556522
loss is   27542.9750225565
loss is   27542.97502255649
loss is   27542.975022556508
loss is   27542.975022556504
loss is   27542.975022556533
loss is   27542.975022556548
loss is   27542.975022556508
loss is   27542.97502255651
loss is   27542.975022556504
loss is   27542.975022556515
loss is   27542.975022556497
loss is   27542.975022556493
loss is   27542.97502255649
loss is   27542.975022556537
loss is   27542.975022556515
loss is   27542.975022556504
loss is   27542.975022556504
loss is   27542.975022556497
loss is   27542.97502255652
loss is   27542.975022556504
loss is   27542.975022556508
loss is   27542.97502255651
loss is   27542.975022556504
loss is   27542.975022556493
loss is   27542.975022556497
loss is   27542.9750225565
loss is   27542.975022556508
loss is   27542.9750225565
loss is   27542.9750225565
loss is   27542.975022556533


loss is   27542.9750225565
loss is   27542.975022556522
loss is   27542.975022556515
loss is   27542.9750225565
loss is   27542.975022556508
loss is   27542.975022556508
loss is   27542.9750225565
loss is   27542.975022556504
loss is   27542.975022556508
loss is   27542.9750225565
loss is   27542.975022556493
loss is   27542.975022556548
loss is   27542.975022556493
loss is   27542.975022556526
loss is   27542.9750225565
loss is   27542.97502255651
loss is   27542.975022556504
loss is   27542.97502255651
loss is   27542.9750225565
loss is   27542.9750225565
loss is   27542.975022556486
loss is   27542.97502255648
loss is   27542.975022556526
loss is   27542.975022556504
loss is   27542.97502255649
loss is   27542.975022556508
loss is   27542.97502255651
loss is   27542.975022556497
loss is   27542.9750225565
loss is   27542.975022556508
loss is   27542.975022556497
loss is   27542.97502255653
loss is   27542.97502255649
loss is   27542.975022556515
loss is   27542.97502255652
loss is  

In [7]:
# Test accuracy recorded for each value in c_list, in the same order.
accuracies

[0.9233333333333333]

In [None]:
# Test accuracy as a function of the loss weight c, on a log-scaled x axis.
# Markers are drawn so that a single (c, accuracy) pair is still visible.
fig, ax = plt.subplots()
ax.set_xscale('log')
ax.plot(c_list, accuracies, marker='o')
ax.set_xlabel("loss weight c")
ax.set_ylabel("test accuracy")
ax.set_title("Test accuracy vs. loss weight c")
plt.show()

## Perceptron Loss with L4 regularization: best accuracy at c=1 (0.937)
## Square Hinge Loss with L2 regularization: best accuracy at c=100 (0.923)