In [209]:
from numpy import genfromtxt
from numpy import mean
from collections import Counter
from collections import defaultdict
from random import sample
from random import randint
from random import seed
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split

# Fix the state of the stdlib `random` generator so that the random session
# sampling done further down is repeatable from run to run.
seed(12345678)

# CSV inputs to evaluate, in the order they are processed and reported.
dataFiles = [
    "basic_event_frequency_count.csv",
    "basic_event_frequency_count_with_session_min_size_500.csv",
    "basic_event_tfidf.csv",
    "basic_event_tfidf_with_session_min_size_500.csv",
    "complex_event_frequency_count.csv",
    "complex_event_frequency_count_with_session_min_size_500.csv",
    "complex_event_tfidf.csv",
    "complex_event_tfidf_with_session_min_size_500.csv",
    "extra_command_event_frequency_count_min_size_500.csv",
    "extra_command_event_tfidf_min_size_500.csv",
    "extra_command_event_tfidf.csv",
    "extra_command_event_frequency_count.csv",
]

def readData(fileName):
    """Load a comma-separated file and split it into features and labels.

    The first parsed row is discarded (presumably the CSV header line --
    confirm against the data files). For every remaining row, column 0 is
    converted to an ``int`` and used as the programmer id, and columns 2
    onward are kept as the feature vector. Column 1 is not used.

    Returns:
        A tuple ``(x, y)`` where ``x`` is a list of per-row feature slices
        and ``y`` is the list of integer programmer ids, in file order.
    """
    rows = genfromtxt(fileName, delimiter=',')[1:]

    features = []
    progIds = []
    for row in rows:
        features.append(row[2:])
        progIds.append(int(row[0]))
    return (features, progIds)

def numberOfProgrammersWithXSessions(y):
    """Print how many programmers have more than i sessions, for i = 1..30.

    ``y`` is a sequence of programmer ids, one entry per session. One line
    of the form ``"<i> <count>"`` is printed for each i; the comparison is
    strict, so a programmer with exactly i sessions is not counted.
    """
    sessionsPerProgrammer = Counter(y)
    for minimum in range(1, 31):
        qualifying = sum(
            1 for total in sessionsPerProgrammer.values() if total > minimum
        )
        print(str(minimum) + " " + str(qualifying))

def removeProgrammersWithLessThanThreshholdSessions(x, y, threshhold):
    """Drop every session belonging to a programmer with too few sessions.

    Args:
        x: sequence of session feature vectors.
        y: sequence of programmer ids, parallel to ``x``.
        threshhold: session-count cut-off. A programmer is kept only when
            they have strictly more than ``threshhold`` sessions, so a
            programmer with exactly ``threshhold`` sessions is removed too.

    Returns:
        A tuple ``(newX, newY)`` of new lists holding the surviving
        sessions and their programmer ids, in the original order.
    """
    # Count sessions from the labels passed in, so the function is
    # self-contained and always filters on the data it was given.
    countOfProgId = Counter(y)
    # A set keeps the per-session membership test O(1).
    progIdToKeep = {
        progId for progId, count in countOfProgId.items() if count > threshhold
    }

    newX = []
    newY = []
    for i, progId in enumerate(y):
        if progId in progIdToKeep:
            newX.append(x[i])
            newY.append(progId)
    return (newX, newY)

def randomlySelectXSessionsForEachProgrammer(x, y, threshhold):
    """Randomly pick ``threshhold`` sessions for every programmer.

    Args:
        x: sequence of session feature vectors.
        y: sequence of programmer ids, parallel to ``x``.
        threshhold: number of sessions to draw (without replacement) for
            each distinct programmer id.

    Returns:
        A tuple ``(newX, newY)``: the sampled sessions grouped programmer
        by programmer, and the matching programmer id for each one.

    Raises:
        ValueError: raised by ``random.sample`` when a programmer has fewer
            than ``threshhold`` sessions.
    """
    # Group the sessions by programmer id.
    dataGroupedByProg = defaultdict(list)
    for i, session in enumerate(x):
        dataGroupedByProg[y[i]].append(session)

    # Draw the same number of sessions from every programmer's group so the
    # resulting data set is balanced across programmers.
    newX = []
    newY = []
    for progId, sessions in dataGroupedByProg.items():
        sampledSessions = sample(sessions, threshhold)
        newX.extend(sampledSessions)
        newY.extend([progId] * len(sampledSessions))

    return (newX, newY)

def processLinearSVCCrossVal(dataFiles):
    """Print 5-fold cross-validation accuracy of LinearSVC over a grid of C.

    For every file in ``dataFiles`` the data is loaded, restricted to
    programmers with more than 10 sessions, balanced to 10 randomly chosen
    sessions per programmer and split 80/20. For each candidate ``C`` the
    mean cross-validation score on the training split is printed, preceded
    by a ``"<C> <fileName>"`` line. The test split is not used here.
    """
    candidateCs = (0.001, 0.0005, 0.0004, 0.0003, 0.0002, 0.0001,
                   0.00009, 0.00008, 0.00005, 0.00001, 0.000001)

    for fileName in dataFiles:
        rawX, rawY = readData(fileName)
        keptX, keptY = removeProgrammersWithLessThanThreshholdSessions(rawX, rawY, 10)
        x, y = randomlySelectXSessionsForEachProgrammer(keptX, keptY, 10)
        xTrain, xTest, yTrain, yTest = train_test_split(
            x, y, test_size=0.2, random_state=45)

        for c in candidateCs:
            clf = LinearSVC(C=c)
            # NOTE(review): this fit looks redundant if cross_val_score
            # refits its own copy of the estimator -- confirm before removing.
            clf.fit(xTrain, yTrain)
            print(str(c) + " " + fileName)
            foldScores = cross_val_score(clf, xTrain, yTrain, cv=5)
            print(mean(foldScores))
            print("\n")

def processKNNCrossVal(dataFiles):
    """Print mean 5-fold cross-validation accuracy of k-NN for k = 1..8.

    For every file in ``dataFiles`` the data is loaded, restricted to
    programmers with more than 10 sessions, balanced to 10 randomly chosen
    sessions per programmer and split 80/20. For each k the cross-validation
    on the training split is run 10 times and the averages are collected;
    the file name and the list of 8 averages are then printed.
    """
    for fileName in dataFiles:
        rawX, rawY = readData(fileName)
        keptX, keptY = removeProgrammersWithLessThanThreshholdSessions(rawX, rawY, 10)
        x, y = randomlySelectXSessionsForEachProgrammer(keptX, keptY, 10)
        xTrain, xTest, yTrain, yTest = train_test_split(
            x, y, test_size=0.2, random_state=45)

        averagedScores = []
        for k in range(1, 9):
            repeatScores = []
            for _ in range(10):
                nbrs = KNeighborsClassifier(n_neighbors=k)
                # NOTE(review): fits on the full (train + test) data; this
                # looks redundant if cross_val_score refits its own copy of
                # the estimator -- confirm before removing.
                nbrs.fit(x, y)
                foldScores = cross_val_score(nbrs, xTrain, yTrain, cv=5)
                repeatScores.append(mean(foldScores))
            averagedScores.append(mean(repeatScores))

        print(fileName)
        print(averagedScores)

#processLinearSVCCrossVal(dataFiles)
#processKNNCrossVal(dataFiles)
# Hold-out evaluation, run once per data file:
#   1. load the file and keep only programmers with more than 15 sessions,
#   2. randomly sample 15 sessions per programmer so classes are balanced,
#   3. split 80/20 into train/test with a fixed random_state,
#   4. print the test-split score of 1-nearest-neighbour and of
#      LinearSVC(C=0.0001).
for fileName in dataFiles:
    print(fileName)
    data = readData(fileName)
    data = removeProgrammersWithLessThanThreshholdSessions(data[0], data[1], 15)
    (x, y) = randomlySelectXSessionsForEachProgrammer(data[0], data[1], 15)
    xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.2, random_state=45)

    # Both classifiers are trained on the training split only and scored on
    # the held-out test split.
    nbrs = KNeighborsClassifier(n_neighbors = 1).fit(xTrain,yTrain)
    print("knn: " + str(nbrs.score(xTest, yTest)))
    clf = LinearSVC(C=0.0001).fit(xTrain,yTrain)
    print("Linear SVC: " + str(clf.score(xTest, yTest)))
    print("\n")


basic_event_frequency_count.csv
knn: 0.123456790123
Linear SVC: 0.111111111111


basic_event_frequency_count_with_session_min_size_500.csv
knn: 0.135802469136
Linear SVC: 0.358024691358


basic_event_tfidf.csv
knn: 0.172839506173
Linear SVC: 0.0246913580247


basic_event_tfidf_with_session_min_size_500.csv
knn: 0.234567901235
Linear SVC: 0.0123456790123


complex_event_frequency_count.csv
knn: 0.172839506173
Linear SVC: 0.16049382716


complex_event_frequency_count_with_session_min_size_500.csv
knn: 0.135802469136
Linear SVC: 0.567901234568


complex_event_tfidf.csv
knn: 0.246913580247
Linear SVC: 0.0123456790123


complex_event_tfidf_with_session_min_size_500.csv
knn: 0.395061728395
Linear SVC: 0.0246913580247


extra_command_event_frequency_count_min_size_500.csv
knn: 0.308641975309
Linear SVC: 0.728395061728


extra_command_event_tfidf_min_size_500.csv
knn: 0.518518518519
Linear SVC: 0.0246913580247


extra_command_event_tfidf.csv
knn: 0.320987654321
Linear SVC: 0.037037037037


ext