testUtils.py
"""Utility functions for testing and evaluating different learning algorithms."""
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import numpy
from loadData import TCGAData  # TCGAData objects are what print_genes_nonzero_coeff expects

def print_genes_nonzero_coeff(data, coeffs):
    """Return the names of the genes whose coefficients are nonzero.

    data should be a TCGAData object; coeffs should hold one coefficient per gene.
    """
    names = data.get_gene_names()
    assert len(coeffs) == len(names)
    nonzeroNames = []
    for i in range(len(names)):
        if coeffs[i] != 0:
            nonzeroNames.append(names[i])
    return nonzeroNames
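
# A minimal usage sketch for print_genes_nonzero_coeff (not part of the original
# module).  It assumes the caller already has a feature matrix X and labels Y for
# the same TCGAData object; the Lasso model and alpha value are illustrative
# assumptions, not anything prescribed by this file.
def _example_print_genes_nonzero_coeff(data, X, Y):
    from sklearn.linear_model import Lasso
    # fit a sparse linear model and report which genes received nonzero weights
    model = Lasso(alpha=0.1)
    model.fit(X, Y)
    return print_genes_nonzero_coeff(data, model.coef_)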

def kFoldCrossValid(X, Y, learningAlgo, k=4, names=None, selection='none'):
    """Run k-fold cross-validation and return one evaluation result per fold.

    Expects a matrix of feature vectors X, labels Y, a learning algorithm, and
    (optionally) k, the feature names, and a feature selection method
    (currently 'chi2', 'random', or 'none').  learningAlgo is an object, not a
    function, and X and Y are assumed to already be numpy arrays.  The learning
    algorithm needs a "fit" method that takes a matrix of feature vectors and
    labels, and a "predict" method that takes a single-row 2-D array and returns
    a sequence of length 1 with the prediction (a sequence rather than a single
    value because that is what sklearn's predict functions return).

    Returns a list containing the output of evaluateClassifications for each fold.
    """
    kf = KFold(n_splits=k, shuffle=True)  # TODO vary k
    accuracy = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
        y_train, y_test = [Y[i] for i in train_index], [Y[i] for i in test_index]
        if selection == 'chi2':
            # keep the 50 features with the highest chi2 scores
            # (note: the selector is fit on the full data set, not just this fold)
            fs = SelectKBest(chi2, k=50)
            fs.fit(numpy.array(X) * 1000, Y)
            indices = fs.get_support()  # boolean mask of the selected features
            names = numpy.array(names)
            print(names[indices])
            X_train = numpy.array(X_train)[:, indices]
            X_test = numpy.array(X_test)[:, indices]
        elif selection == 'random':
            # keep 50 distinct features chosen uniformly at random
            numFeatures = len(X_train[0])
            indices = numpy.random.choice(numFeatures, size=50, replace=False)
            names = numpy.array(names)
            print(names[indices])
            X_train = numpy.array(X_train)[:, indices]
            X_test = numpy.array(X_test)[:, indices]
        learningAlgo.fit(X_train, y_train)
        predictions = []
        for x_vec in X_test:
            # predict expects a 2-D array, so wrap the single feature vector
            predictions.append(learningAlgo.predict(numpy.asarray(x_vec).reshape(1, -1))[0])
        accuracy.append(evaluateClassifications(predictions, y_test))
    return accuracy
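
# A minimal usage sketch for kFoldCrossValid (not in the original file), assuming
# that any sklearn classifier satisfies the fit/predict interface described above.
# LogisticRegression and the random toy data here are illustrative assumptions.
def _example_kFoldCrossValid():
    from sklearn.linear_model import LogisticRegression
    rng = numpy.random.RandomState(0)
    X = rng.rand(40, 100)            # 40 samples, 100 nonnegative features (chi2 needs nonnegative)
    Y = rng.randint(0, 2, 40)        # binary labels
    names = ['gene%d' % i for i in range(100)]
    clf = LogisticRegression()
    return kFoldCrossValid(X, Y, clf, k=4, names=names, selection='chi2')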

def evaluateClassifications(predicted, testLabels):
    """Return a list [accuracy, precision, recall, fp, fn, tp, tn].

    Yes, it would be nicer to return an object with these attributes;
    complain to me if you want me to change it to that.
    """
    fp = 0.0
    fn = 0.0
    tp = 0.0
    tn = 0.0
    for i in range(len(predicted)):
        if predicted[i] == testLabels[i] and predicted[i]:
            tp += 1
        elif predicted[i] == testLabels[i] and not predicted[i]:
            tn += 1
        elif predicted[i] != testLabels[i] and predicted[i]:
            fp += 1
        else:
            fn += 1
    if fp + tp == 0:
        precision = float("inf")
    else:
        precision = tp / (fp + tp)
    if tp + fn == 0:
        recall = float("inf")
    else:
        recall = tp / (tp + fn)
    accuracy = (tp + tn) / len(predicted)
    return [accuracy, precision, recall, fp, fn, tp, tn]
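
# A small illustrative check of evaluateClassifications (added as a sketch, not
# part of the original module); the toy labels below are made up.
if __name__ == "__main__":
    # 3 true positives, 1 true negative, 1 false positive, 1 false negative
    predicted = [1, 1, 1, 0, 1, 0]
    actual = [1, 1, 1, 0, 0, 1]
    # expected: accuracy 4/6, precision 3/4, recall 3/4
    print(evaluateClassifications(predicted, actual))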