## Assignment 4 - Classification

Follow the instructions in <i>Assignment 4.pdf</i> to implement the classifier of your choice in both basic and ensemble forms below. Pay close attention to the required output and be sure to print it for <b>both</b> the basic and the ensemble methods. Failure to provide the correct outputs will result in 0 points for your code!! <p>

Class skeletons have been provided for you for each of the classifiers. Fill in each of the 4 methods in the 2 classifiers, then fill in the method to read in the training and test data. <p>
    
As before run the code below before anything else to download the datasets and import the needed modules. <b>DO NOT</b> use any module not defined below without permission from the TA, doing so will result in a 0 for your code!


In [9]:
from gcsfs import GCSFileSystem
import numpy as np  # for any array related operations
import math
import random

In [10]:
# Download datasets - may take a few seconds
fs = GCSFileSystem(project='csci4800-dm', token='anon', access='read_only')
fs.get('csci4800-data/assignment_4/breast_cancer_train.libsvm', './breast_cancer_train.libsvm')
fs.get('csci4800-data/assignment_4/breast_cancer_test.libsvm', './breast_cancer_test.libsvm')
fs.get('csci4800-data/assignment_4/led_train.libsvm', './led_train.libsvm')
fs.get('csci4800-data/assignment_4/led_test.libsvm', './led_test.libsvm')
fs.get('csci4800-data/assignment_4/poker_train.libsvm', './poker_train.libsvm')
fs.get('csci4800-data/assignment_4/poker_test.libsvm', './poker_test.libsvm')

In [254]:
"""
Dataset Paths - define the names of the datasets you will read in for training/testing here
"""

train_set = "breast_cancer_train.libsvm"
test_set = "breast_cancer_test.libsvm"

train_set = "poker_train.libsvm"
test_set = "poker_test.libsvm"

train_set = "led_train.libsvm"
test_set = "led_test.libsvm"



### Basic Method
Implement your basic method below. It must contain a train and a test method that builds the classifier and assigns labels to unseen data respectively.


In [255]:
class BasicMethod:
    summaries = {}
    def __init__(self):
        pass
    
    def separateByClass(self, dataset):
        separated = {}
        for i in range(len(dataset)):
            vector = dataset[i]
            if (vector[0] not in separated):
                separated[vector[0]] = []
            separated[vector[0]].append(vector)
        return separated
        
    def mean(self, numbers):
        return float(np.mean(numbers))
        
    def stdev(self, numbers):
        return float(np.std(numbers, ddof=1))
        
    def summarize(self, dataset):
        summaries = [(self.mean(attribute), self.stdev(attribute)) for attribute in zip(*dataset)]
        del summaries[0]
        return summaries

    def summarizeByClass(self, dataset):
        separated = self.separateByClass(dataset)
        summaries = {}
        for classValue, instances in separated.items():
            summaries[classValue] = self.summarize(instances)
        return summaries
    
    def calculateProbability(self, x, mean, stdev):
        exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
        return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent
    
    def calculateClassProbabilities(self, summaries, inputVector):
        probabilities = {}
        for classValue, classSummaries in summaries.items():
            # ******** Weight for each classifiction ************* #
            if classValue == 1:
                probabilities[classValue] = 1.7
            elif classValue == -1:
                probabilities[classValue] = 1
            # ******** Weight for each classifiction ************* #
            for i in range(len(classSummaries)):
                mean, stdev = classSummaries[i]
#                 print(str(mean) + "****" + str(stdev))
                x = inputVector[i]
#                 print(x)
                probabilities[classValue] *= self.calculateProbability(x, mean, stdev)
        return probabilities

    def predict(self, summaries, inputVector):
        probabilities = self.calculateClassProbabilities(summaries, inputVector[1:])
        bestLabel, bestProb = None, -1
        for classValue, probability in probabilities.items():
            if bestLabel is None or probability > bestProb:
                bestProb = probability
                bestLabel = classValue
        return bestLabel
    
    def getPredictions(self, summaries, testSet):
        predictions = []
        for i in range(len(testSet)):
            result = self.predict(summaries, testSet[i])
            predictions.append(result)
        return predictions
 
    def getAccuracy(self, testSet, predictions):
        correct = 0
        for i in range(len(testSet)):
            if testSet[i][0] == predictions[i]:
                correct += 1
        return (correct/float(len(testSet))) * 100.0
    
    def confusionMatrix(self, testSet, predictions):
        truePositives = 0
        falsePositives = 0
        falseNegatives = 0
        trueNegatives = 0
        for i in range(len(testSet)-1):
            if testSet[i][0] == predictions[i]:
                if testSet[i][0] == 1:
                    truePositives += 1
                elif testSet[i][0] == -1:
                    trueNegatives += 1
            elif testSet[i][0] != predictions[i]:
                if testSet[i][0] == 1:
                    falseNegatives += 1
                elif testSet[i][0] == -1:
                    falsePositives += 1
        total = len(testSet)
        accuracy = ((truePositives + trueNegatives) / total)
        error = 1 - accuracy
        return accuracy, error, falsePositives, falseNegatives, truePositives, trueNegatives, total
    
    def train(self, X, y):
        """
        Define the code to train your classifier here. 
        
        X - matrix of attributes (1 row per example)
        y - array of labels (1 row per example)
        """ 
        self.summaries = self.summarizeByClass(X)
#         print(self.summarizeByClass(X))

#         inputVector = [2, 10, 2, 8, 1, 8, 3, 2, 2, 13]
#         probabilities = self.calculateClassProbabilities(self.summaries, inputVector)
#         print('Probabilities for each class: ' + str(probabilities))
        pass
    
    def test(self, X, y):
        """
        Define the code to test your classifier here. 
        
        X - matrix of attributes (1 row per example)
        y - array of labels (1 row per example)
        """
        
        predictions = self.getPredictions(self.summaries, X)
        accuracy = self.getAccuracy(X, predictions)
        accuracy, error, falsePositives, falseNegatives, truePositives, trueNegatives, total = self.confusionMatrix(X, predictions)
        sensitivity = truePositives / (truePositives + falseNegatives)
        specifivity = trueNegatives / (trueNegatives + falsePositives)
        precision =  truePositives / (truePositives + falsePositives)
        recall = truePositives / (truePositives + falseNegatives)
        Fm = (2 * precision * recall) / (recall + precision)
        Fb = (1)
            
        
        print(accuracy)
        print(error)
        print(sensitivity)
        print(specifivity)
        print(precision)
        print(Fm)
        print(Fb)
        pass

### Ensemble Method
Implement your ensemble method below using the basic method you implmented above. It must also contain a train and a test method similar to those defined in the basic method.

In [256]:
class EnsembleMethod:
    summaries = {}
    k = 10 # **********************Number of sub datasets*********************
    Xi = []
    Yi = []
    Wi = []
    def __init__(self):
        pass
    
    def separateByClass(self, dataset):
        separated = {}
        for i in range(len(dataset)):
            vector = dataset[i]
            if (vector[0] not in separated):
                separated[vector[0]] = []
            separated[vector[0]].append(vector)
        return separated
        
    def mean(self, numbers):
        return float(np.mean(numbers))
        
    def stdev(self, numbers):
        return float(np.std(numbers, ddof=1))
        
    def summarize(self, dataset):
        summaries = [(self.mean(attribute), self.stdev(attribute)) for attribute in zip(*dataset)]
        del summaries[0]
        return summaries

    def summarizeByClass(self, dataset):
        separated = self.separateByClass(dataset)
        summaries = {}
        for classValue, instances in separated.items():
            summaries[classValue] = self.summarize(instances)
        return summaries
    
    def calculateProbability(self, x, mean, stdev):
        exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
        return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent
    
    def calculateClassProbabilities(self, summaries, inputVector):
        probabilities = {}
        for classValue, classSummaries in summaries.items():
            # ******** Weight for each classifiction ************* #
            if classValue == 1:
                probabilities[classValue] = 1.6
            elif classValue == -1:
                probabilities[classValue] = 1
            # ******** Weight for each classifiction ************* #
            for i in range(len(classSummaries)):
                mean, stdev = classSummaries[i]
#                 print(str(mean) + "****" + str(stdev))
                x = inputVector[i]
#                 print(x)
                probabilities[classValue] *= self.calculateProbability(x, mean, stdev)
        return probabilities

    def predict(self, summaries, inputVector):
        probabilities = self.calculateClassProbabilities(summaries, inputVector[1:])
        bestLabel, bestProb = None, -1
        for classValue, probability in probabilities.items():
            if bestLabel is None or probability > bestProb:
                bestProb = probability
                bestLabel = classValue
        return bestLabel
    
    def getPredictions(self, summaries, testSet):
        predictions = []
        for i in range(len(testSet)):
            result = self.predict(summaries, testSet[i])
            predictions.append(result)
        return predictions
 
    def getAccuracy(self, testSet, predictions):
        correct = 0
        for i in range(len(testSet)):
            if testSet[i][0] == predictions[i]:
                correct += 1
        return (correct/float(len(testSet))) * 100.0
    
    def confusionMatrix(self, testSet, predictions):
        truePositives = 0
        falsePositives = 0
        falseNegatives = 0
        trueNegatives = 0
        for i in range(len(testSet)-1):
            if testSet[i][0] == predictions[i]:
                if testSet[i][0] == 1:
                    truePositives += 1
                elif testSet[i][0] == -1:
                    trueNegatives += 1
            elif testSet[i][0] != predictions[i]:
                if testSet[i][0] == 1:
                    falseNegatives += 1
                elif testSet[i][0] == -1:
                    falsePositives += 1
        total = len(testSet)
        accuracy = ((truePositives + trueNegatives) / total)
        error = 1 - accuracy
        return accuracy, error, falsePositives, falseNegatives, truePositives, trueNegatives, total
    
    def newSample(self, X, y, k):
        Xi = []
        Yi = []
        Wi = []
        Ei = []
        i = 0
        while i != k:
            idx = np.random.randint(len(X), size=int(len(X)*.8)) #size of sample compared to original **************************************************
            x = X[idx,:]
            y = x[:,0]
            self.summaries = self.summarizeByClass(x)
            predictions = self.getPredictions(self.summaries, x)
            accuracy, error, falsePositives, falseNegatives, truePositives, trueNegatives, total = self.confusionMatrix(x, predictions)
            if error < 0.5:
                Xi.append(x)
                Yi.append(y)
                Wi.append(1/k)
                Ei.append(error)
                i += 1
                if i == (k):
                    for j in range(0, len(Wi)):
                        Wi[j] = Ei[j]/(1-Ei[j])
                    normalizedWi = Wi / np.linalg.norm(Wi)
                    return Xi, Yi, normalizedWi

    def train(self, X, y):
        """
        Define the code to train your classifier here. 
        
        X - matrix of attributes (1 row per example)
        y - array of labels (1 row per example)
        """ 
        self.Xi, self.Yi, self.Wi = self.newSample(X, y, self.k)
        pass
    
    def test(self, X, y):
        """
        Define the code to test your classifier here. 
        
        X - matrix of attributes (1 row per example)
        y - array of labels (1 row per example)
        """
        predictions = []
        classWeight = {"-1": 0, "1": 0}
        finalPredictions = []
        for i in range(0, self.k):
            prediction = self.getPredictions(self.summaries, self.Xi[i])
            predictions.append(prediction)

        for j in range(0, len(X)):
            for i in range(0, self.k):
                if predictions[i][j] == 1:
                    classWeight["1"] += self.Wi[i]
                elif predictions[i][j] == -1:
                    classWeight["-1"] += self.Wi[i]
            if classWeight["1"] > classWeight["-1"]:
                value = 1
            elif classWeight["1"] < classWeight["-1"]:
                value = -1
            finalPredictions.append(value)
            classWeight["1"] = 0
            classWeight["-1"] = 0
                 
        accuracy, error, falsePositives, falseNegatives, truePositives, trueNegatives, total = self.confusionMatrix(X, finalPredictions)
        
        sensitivity = truePositives / (truePositives + falseNegatives)
        specifivity = trueNegatives / (trueNegatives + falsePositives)
        precision =  truePositives / (truePositives + falsePositives)
        recall = truePositives / (truePositives + falseNegatives)
        Fm = (2 * precision * recall) / (recall + precision)
        Fb = (1)
            
        print(accuracy)
        print(error)
        print(sensitivity)
        print(specifivity)
        print(precision)
        print(Fm)
        print(Fb)
        pass

### Main Loop
Implement a function to read in the data sets, then run both the basic and ensemble classifiers below. 

In [257]:
"""
Define your method to read in the data here.
"""
def read_LIBSVMs():
    """
    Fill in the ... with code to read in the training file and testing file respectively. 
    It may help to use Numpy arrays to store the data, but you are not required to do so.
    """
    trainDataY = []
    trainDataX = []
    
    testDataY = []
    testDataX = []

    for line in open(train_set, 'r').readlines():
        line = line.strip().split(" ")
        trainDataY.append(int(line[0]))
        lineName = line.pop(0)
        arr = [0] * int(line[-1].split(":")[0])  # fills the array with zeros

        for element in line:
            index,value = element.split(":")
            arr[int(index)-1] = int(value)

        entry = (lineName + " " + " ".join(map(str, arr))) 
        array = entry.split(" ")
        array = list(map(int, array))
        trainDataX.append(np.array(array))
    
    lens = np.array([len(item) for item in trainDataX])
    mask = lens[:,None] > np.arange(lens.max())
    out = np.zeros(mask.shape,dtype=int)
    out[mask] = np.concatenate(trainDataX)
    trainDataX = out

    trainData = [trainDataX,trainDataY]

    for line in open(test_set, 'r').readlines():
        line = line.strip().split(" ")
        testDataY.append(int(line[0]))
        lineName = line.pop(0)
        arr = [0] * int(line[-1].split(":")[0])  # fills the array with zeros

        for element in line:
            index,value = element.split(":")
            arr[int(index)-1] = int(value)

        entry = (lineName + " " + " ".join(map(str, arr))) 
        array = entry.split(" ")
        
        testDataX.append(np.array(array))

    lens = np.array([len(item) for item in testDataX])
    mask = lens[:,None] > np.arange(lens.max())
    out = np.zeros(mask.shape,dtype=int)
    out[mask] = np.concatenate(testDataX)
    testDataX = out
    
    testData = [testDataX,testDataY]

    train = trainData
    test = testData

    return train, test

In [258]:
"""
Run basic classifier and output.
"""
train_data, test_data = read_LIBSVMs()
basic = BasicMethod()
basic.train(train_data[0], train_data[1])
basic.test(test_data[0], test_data[1])

0.8765432098765432
0.12345679012345678
0.8623853211009175
0.87890625
0.43119266055045874
0.5749235474006117
1


In [259]:
"""
Run ensemble classifier and output.
"""
train_data, test_data = read_LIBSVMs()
ensemble = EnsembleMethod()
ensemble.train(train_data[0], train_data[1])
ensemble.test(test_data[0], test_data[1])

0.8941798941798942
0.10582010582010581
0.009174311926605505
0.9892578125
0.08333333333333333
0.01652892561983471
1
