In [91]:
import numpy
import random
from time import time
from sklearn.cross_validation import KFold
from sklearn import cluster
from scipy.stats import mode
from scipy.spatial.distance import euclidean

class OutputGen(object):
    """Evaluate a K-means based two-class classifier with k-fold cross-validation.

    Every data matrix handled by this class holds the feature columns first
    and the class label in the last column. The confidence table has exactly
    two columns, so class labels are expected to be 0 or 1.
    """

    def calculateAccuracy(self, samplingMethod, samplingType, count, k, data):
        """Run k-fold cross-validation over ``data`` and log the results.

        Per-fold predictions and confidences are appended to
        ``output/<samplingMethod>_<samplingType>_<count>_<k>.csv``; one summary
        line (mean accuracy and elapsed seconds) is appended to
        ``output/accuracy.csv``.

        Args:
            samplingMethod: label used in the file name and the summary line.
            samplingType: label used in the file name and the summary line.
            count: number of rows handed to ``KFold``; callers pass the number
                of rows in ``data``.
            k: number of folds. The same value is forwarded to
                ``getConfidenceTable`` where it is the number of clusters.
            data: 2-D array, features first, class label in the last column.
        """
        finalAccuracy = 0
        startTime = time()
        outputPath = "output/%s_%s_%s_%s.csv" % (samplingMethod, samplingType, count, k)
        kf = KFold(count, n_folds=k, shuffle=True)
        for train, test in kf:
            confidenceTable = self.getConfidenceTable(data[train, :], data[test, :], k)
            output, accuracy = self.getAccuracy(data[test, :], confidenceTable)
            # Append mode: every fold adds its rows to the same file.
            with open(outputPath, "a") as f:
                numpy.savetxt(f, numpy.hstack([output, confidenceTable]))
            finalAccuracy += accuracy
        # Mean accuracy over the k folds (accuracy is a float, so this is
        # true division on Python 2 as well).
        finalAccuracy /= k
        totalTime = time() - startTime
        with open("output/accuracy.csv", "a") as f:
            f.write("%s, %s, %s, %s, %s, %s, %s\n" % ("Kmeans", samplingMethod, samplingType, count, k, finalAccuracy, totalTime))

    def getConfidenceTable(self, training, testing, k):
        """Build a per-class score table for the test rows.

        K-means with ``k`` clusters is fit on the training features. Each
        non-empty cluster is summarised by the mean of its members and the
        most frequent class label among them. For every test row the
        Euclidean distances to those cluster means are summed per class and
        each row is then normalised to sum to 1.

        Args:
            training: 2-D array, features first, class label (0 or 1) last.
            testing: 2-D array with the same column layout.
            k: number of clusters passed to ``cluster.KMeans``.

        Returns:
            Float array of shape ``(len(testing), 2)``; column ``c`` is the
            share of the summed distance that belongs to class ``c``.
        """
        trainingData = training[:, :-1]
        trainingClass = training[:, -1]
        testingData = testing[:, :-1]

        # NOTE(review): an earlier revision of this method carried a
        # "number of clusters = 25" constant that was never passed to KMeans;
        # the fit has always used k (which the caller also uses as the fold
        # count). Confirm which cluster count is actually intended.
        kMeans = cluster.KMeans(k)
        kMeans.fit(trainingData)

        # Summarise every cluster that received at least one training row.
        # Iterating over the labels actually produced (instead of a fixed
        # range) never drops a cluster, whatever k is.
        labels = kMeans.labels_
        centroids = []
        for clusterId in numpy.unique(labels):
            members = labels == clusterId
            # mode() returns a length-1 array in older SciPy and a scalar in
            # newer releases; ravel() accepts both. The label is cast to int
            # because it is used as a column index below.
            majorityClass = int(numpy.ravel(mode(trainingClass[members])[0])[0])
            centroids.append((trainingData[members].mean(axis=0), majorityClass))

        results = numpy.zeros(shape=(len(testingData), 2), dtype=float)
        for center, majorityClass in centroids:
            # Euclidean distance from every test row to this cluster mean.
            distances = numpy.sqrt(((testingData - center) ** 2).sum(axis=1))
            results[:, majorityClass] += distances

        # NOTE(review): the score is a *distance* share, so the class whose
        # centroids are farther away gets the larger value, and getAccuracy
        # predicts the class with the larger value. Confirm this is intended.
        results = results / results.sum(axis=1).reshape(-1, 1)
        return results

    # Calculate accuracy based on confidence table
    def getAccuracy(self, test, confidenceTable):
        """Compare predictions derived from ``confidenceTable`` with the labels.

        A row is predicted as class 1 when its second confidence value is
        strictly greater than its first, otherwise as class 0.

        Args:
            test: 2-D array whose last column is the true class label.
            confidenceTable: array-like of shape ``(len(test), 2)``.

        Returns:
            Tuple ``(outputs, accuracy)``: ``outputs`` is an ``(n, 2)`` array
            of ``[predicted, actual]`` rows and ``accuracy`` is the fraction
            of rows where both agree.

        Raises:
            ZeroDivisionError: if ``test`` has no rows.
        """
        confidenceTable = numpy.asarray(confidenceTable)
        predicted = numpy.where(confidenceTable[:, 1] > confidenceTable[:, 0], 1, 0)
        actual = test[:, -1]
        # column_stack gives an (n, 2) array on Python 2 and 3 alike, unlike
        # numpy.array(zip(...)), which needs zip to return a list.
        outputs = numpy.column_stack((predicted, actual))
        correctCount = int((outputs[:, 0] == outputs[:, 1]).sum())
        return outputs, float(correctCount) / float(len(outputs))
    
if __name__ == "__main__":
    # Driver: for every fold/cluster count in ks and every sample size listed
    # in input.txt (one integer per line, blank lines ignored), draw a simple
    # random sample without and with replacement and evaluate each one.
    preprocessedData = numpy.loadtxt("data/preprocessed_data.csv", delimiter=',', dtype=int)
    totalRows = preprocessedData.shape[0]
    classifier = OutputGen()
    ks = [2, 4, 5]
    for k in ks:
        # Single-argument print calls produce the same text on Python 2 and 3.
        print("For K =  %s" % k)
        with open("input.txt", "r") as f:
            for inputLine in f:
                inputLine = inputLine.rstrip()
                if not inputLine:
                    continue
                # Each non-blank line of the input file is one sample size.
                size = int(inputLine)
                print("Starting for %s" % size)

                # Sampling type: simple, without replacement.
                # random.sample never repeats an index (it raises ValueError
                # if size exceeds the number of rows).
                randomDataPoints = random.sample(range(totalRows), size)
                preprocessedRandomData = preprocessedData[randomDataPoints, :]
                classifier.calculateAccuracy("Simple", "Without Replacement", size, k, preprocessedRandomData)
                print("Process completed for simple sampling without replacement")

                # Sampling type: simple, with replacement.
                # Indices are drawn from the whole dataset; drawing from
                # range(size) would only ever pick the first `size` rows.
                randomDataPoints = [random.randint(0, totalRows - 1) for _ in range(size)]
                preprocessedRandomData = preprocessedData[randomDataPoints, :]
                # The label below (including its spelling) is part of the
                # output file name and is therefore left untouched.
                classifier.calculateAccuracy("Simple", "With Replacemnt", size, k, preprocessedRandomData)
                print("Process completed for simple sampling with replacement")

For K =  2
Starting for 7290
Process completed for simple sampling with replacement
Process completed for simple sampling without replacement
