In [2]:
'''
make_submission() generates predictions for the Kaggle Painter by Numbers competition
using simple features (image size, aspect ratio and bits/pixel^2)
author: Swaroop Krothapalli - extended code of small yellow duck
https://github.com/swaroop7/painters
'''
import time
import os
import random
import numpy as np
import pandas as pd
from PIL import Image
from matplotlib import pyplot as plt
from sklearn.cross_validation import KFold
import cv2
import numpy as np
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score  
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc
from scipy.stats import itemfreq
from sklearn import neighbors, linear_model
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn import cross_validation
np.set_printoptions(precision=3, linewidth=100)

# Prepend the MinGW-w64 bin directory to PATH for this process.
# NOTE(review): presumably needed so the xgboost native library can be loaded on Windows -- confirm.
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

# Switch the working directory to the Wyeths image folder (machine-specific absolute path).
os.chdir('C:\\Users\\swaroop\\Downloads\\painters\\ImageSets\\Wyeths')

def getEntropy(signal):
    """Return the Shannon entropy, in bits, of the values in ``signal``.

    Parameters
    ----------
    signal : array-like
        Discrete symbols (e.g. grey levels). Inputs of any shape are
        flattened before counting.

    Returns
    -------
    float
        sum(p * log2(1 / p)) over the distinct symbols, where p is each
        symbol's relative frequency. An empty signal yields 0.0.
    """
    values = np.asarray(signal).ravel()
    if values.size == 0:
        # No symbols, no information; also avoids dividing by zero below.
        return 0.0
    # One sorted pass gives all symbol counts, instead of re-scanning the
    # whole signal once per distinct symbol.
    _, counts = np.unique(values, return_counts=True)
    probabilities = counts / float(values.size)
    return np.sum(probabilities * np.log2(1.0 / probabilities))

def calculateEntropyNeighbourhood(artwork, neighbourhood):
    """Return the mean local grey-level entropy of an image file.

    For every pixel, the entropy (see getEntropy) of the grey values in a
    window around it is computed; the function returns the average of those
    per-pixel entropies.

    Parameters
    ----------
    artwork : str
        Path of the image file, read with cv2.imread.
    neighbourhood : int
        Window half-size N. The window for pixel (row, col) is
        rows [row-N, row+N) x cols [col-N, col+N), clipped to the image.

    Returns
    -------
    float
        Mean of the per-pixel window entropies.

    Raises
    ------
    IOError
        If the image cannot be read.
    """
    image = cv2.imread(artwork)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise IOError('could not read image: %s' % artwork)
    grayIm = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    N = neighbourhood
    rows, cols = grayIm.shape
    # Float buffer: storing entropies in a copy of the uint8 image would
    # truncate each value to an integer before averaging.
    E = np.zeros((rows, cols), dtype=np.float64)

    # NOTE(review): the upper bounds exclude row+N / col+N, so the window is
    # not centred on the pixel (N=1 gives at most 2x2). Kept as in the
    # original; confirm whether +N+1 was intended.
    for row in range(rows):
        Ly = max(0, row - N)
        Uy = min(rows, row + N)
        for col in range(cols):
            Lx = max(0, col - N)
            Ux = min(cols, col + N)
            # Flatten the window to 1-D before counting symbols.
            E[row, col] = getEntropy(grayIm[Ly:Uy, Lx:Ux].flatten())

    return np.mean(E)

def getDTM(artwork, neighbourhood):
    """Return the mean local standard deviation of an image file.

    The local standard deviation is sqrt(E[x^2] - E[x]^2), where both
    expectations are box-filter (cv2.blur) averages over a
    neighbourhood x neighbourhood window, evaluated per pixel and channel.

    Parameters
    ----------
    artwork : str
        Path of the image file, read with cv2.imread.
    neighbourhood : int
        Side length of the square averaging window. The original code
        ignored this argument and always used 5, so every dtm* feature was
        identical; neighbourhood=5 reproduces that result.

    Returns
    -------
    float
        Mean of the local standard deviation over all pixels and channels.

    Raises
    ------
    IOError
        If the image cannot be read.
    """
    image = cv2.imread(artwork)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise IOError('could not read image: %s' % artwork)
    image32f = np.float32(image)
    ksize = (int(neighbourhood), int(neighbourhood))
    mu = cv2.blur(image32f, ksize)
    mu2 = cv2.blur(cv2.multiply(image32f, image32f), ksize)
    # Rounding can push the variance slightly below zero in flat regions;
    # clamp so the square root never yields NaN.
    variance = np.maximum(mu2 - cv2.multiply(mu, mu), 0)
    sigma = cv2.sqrt(variance)
    return np.mean(sigma)

#image_info_test = get_image_info(test_info, 'test')
def get_image_info(test_info, dir):
	if dir == 'test':
		images = list(set(list(test_info.image1.unique()) + list(test_info.image2.unique())))
		info = pd.DataFrame(np.array(images).reshape((-1, 1)), columns = ['filename'])
		#print info
	else:
		info = test_info
	
	info['pixelsx'] = np.nan
	info['pixelsy'] = np.nan
	info['size_bytes'] = np.nan
	info['entropy1'] = np.nan
	info['entropy5'] = np.nan
	info['entropy10'] = np.nan
	info['entropy15'] = np.nan
# 	info['entropy20'] = np.nan
#	info['dtm'] = np.nan
	info['dtm1'] = np.nan
	info['dtm5'] = np.nan
	info['dtm10'] = np.nan
	info['dtm15'] = np.nan

    
	
	j = 0
	for i in info.index.values:
		j += 1        
		try:
			#print i
			#fil = 'C:\\Users\\swaroop\\Downloads\\painters\\ImageSets\\Wyeths\\'+dir+'\\'+info.loc[i, 'filename']
			fil = 'C:\\Users\\swaroop\\Downloads\\painters\\ImageSets\\Wyeths\\'+info.loc[i, 'filename']
			#print fil
			im = Image.open('C:\\Users\\swaroop\\Downloads\\painters\\ImageSets\\Wyeths\\'+info.loc[i, 'filename'])
			#print im
			#print im.size
			info.loc[i, 'pixelsx'], info.loc[i, 'pixelsy'] = im.size
			#im = cv2.imread(dir+'/'+info.loc[i, 'new_filename'])
			#info.loc[i, 'pixelsx'], info.loc[i, 'pixelsy'] = im.shape[0:2]
			info.loc[i, 'size_bytes'] = os.path.getsize(info.loc[i, 'filename'])
			#info.loc[i, 'entropy'] = calculateEntropyNeighbourhood(fil, 1)
			#print calculateEntropyNeighbourhood(fil, 1)
			#info.loc[i, 'dtm'] = getDTM(fil)
			info.loc[i, 'entropy1'] = calculateEntropyNeighbourhood(fil, 1)
			info.loc[i, 'entropy5'] = calculateEntropyNeighbourhood(fil, 5)
			info.loc[i, 'entropy10'] = calculateEntropyNeighbourhood(fil, 10)
			info.loc[i, 'entropy15'] = calculateEntropyNeighbourhood(fil, 15)
# 			info['entropy20'] = calculateEntropyNeighbourhood(fil, 20)
			info.loc[i, 'dtm1'] = getDTM(fil, 1)
			info.loc[i, 'dtm5'] = getDTM(fil, 5)
			info.loc[i, 'dtm10'] = getDTM(fil, 10)
			info.loc[i, 'dtm15'] = getDTM(fil, 15)

            
            
		except:
			print dir+'\\'+info.loc[i, 'filename']
		if (j%10 == 0):
			print '',
	info=info.dropna()
	print 'info shape',info.shape
	return info.rename(columns={'filename' : 'new_filename'})

#t = make_pairs(train_info)	
def make_pairs(train_info):
	print "make pairs train info shape",train_info.shape
	artists = train_info.artist.unique()

	n = train_info.groupby('artist').size()
	n = (2*n**2).sum() 
	t = pd.DataFrame(np.zeros((n, 4)), columns=['artist1', 'image1', 'artist2', 'image2'])
	i = 0
	j = 0
	for m in artists:
		
		a = train_info[train_info.artist==m][['artist', 'new_filename']].values
		use = train_info[train_info.artist != m].index.values
		print "a and use shapes", a.shape, use.shape
		np.random.shuffle(use)
		#print a.shape, use.shape
		nm = np.mean([a.shape[0]**2, train_info[train_info.artist != m].shape[0] ])
		print nm
		use = use[0:nm]
		print "use.shape",use.shape
		b = train_info[train_info.artist!=m][['artist', 'new_filename']].ix[use, :].values
		#print nm, use.shape, b.shape
		a2 = pd.DataFrame(np.concatenate([np.repeat(a[:, 0], a.shape[0]).reshape((-1,1)), np.repeat(a[:, 1], a.shape[0]).reshape((-1,1)), np.tile(a, (a.shape[0], 1))], axis=1), columns=['artist1', 'image1', 'artist2', 'image2'])
		a2 = a2.loc[0:nm, :]
		b2 = pd.DataFrame(np.concatenate([np.tile(a, (a.shape[0], 1))[0:b.shape[0], :], b], axis=1), columns=['artist1', 'image1', 'artist2', 'image2'])
		print j, i, a2.shape[0], b2.shape[0]
		#print b2
		t.iloc[i:i+a2.shape[0], :] = a2.values
		t.iloc[i+a2.shape[0]:i+a2.shape[0]+b2.shape[0], :] = b2.values
		i += a2.shape[0] +b2.shape[0]
		j += 1
	
	t = t[~t.image2.isin([np.nan, 0])]
	print t.shape, t[t.image1 > t.image2].shape
	print t.columns.values
	#print t
	print "hi1",t.drop_duplicates(subset=['artist1', 'artist2','image1', 'image2'], keep=False).shape
	#return t[t.image1 > t.image2]	
	return t.drop_duplicates(subset=['artist1', 'artist2','image1', 'image2'], keep=False)


#x_train, y_train, x_cv, y_cv = prep_data([train_info, None], 'cv')	
#x_test, y_test = prep_data([None, submission_info], 'test')	
def prep_data(input, split):
	info = input[0]
	data = input[1]
	
	if split=='cv':
		#artists = info.artist.unique()
		artists = info.artist
		#print artists
		#print 'hi', artists
		np.random.shuffle(artists)
		
		info = get_image_info(info, 'train')
		info['bytes_per_pixel'] = 1.0*info['size_bytes']/(info['pixelsx']*info['pixelsy'])
		info['aspect_ratio'] = 1.0*info['pixelsx']/info['pixelsy']
		#train_artists = artists[0:int(0.8*len(artists))]
		#test_artists = artists[int(0.8*len(artists)):]
		#print artists
		#print 'hi',info[info.artist.isin(artists)].shape
		#train = make_pairs(info[info.artist.isin(artists)])
		#test = make_pairs(info[info.artist.isin(test_artists)])
		#print train.shape
		#train['in_train'] = True
		#test['in_train'] = True
		#data = train
		#data['sameArtist'] = data['artist1'] == data['artist2']
		print info.columns

        
	if split=='test':

		info = get_image_info(data, 'test')
		info['bytes_per_pixel'] = 1.0*info['size_bytes']/(info['pixelsx']*info['pixelsy'])
		info['aspect_ratio'] = 1.0*info['pixelsx']/info['pixelsy']	
		
		data['in_train'] = False
	
		if 'artist1' in data.columns:
			data['sameArtist'] = data['artist1'] == data['artist2']


	info['artist'] = info['artist'].map({'watercolor': 1, 'tempura': 0})
	y_train = info['artist']
	info = info.drop(['Name', 'new_filename', 'year', 'artist', 'artist_orig'], axis=1)
	x_train = info
# 	data2 = pd.merge(data, info[['new_filename', 'pixelsx', 'pixelsy', 'size_bytes', 'bytes_per_pixel', 'aspect_ratio']], how='left', left_on='image1', right_on='new_filename')
# 	data2.drop('new_filename', 1, inplace=True)
	
# 	data2 = pd.merge(data2, info[['new_filename', 'pixelsx', 'pixelsy', 'size_bytes', 'bytes_per_pixel', 'aspect_ratio']], how='left', left_on='image2', right_on='new_filename')
# 	data2.drop('new_filename', 1, inplace=True)
	
# 	x_train = data2[data2.in_train==True][['pixelsx_x', 'pixelsy_x', 'size_bytes_x', 'bytes_per_pixel_x', 'aspect_ratio_x', 'pixelsx_y', 'pixelsy_y', 'size_bytes_y', 'bytes_per_pixel_y', 'aspect_ratio_y']].values
# 	x_test = data2[data2.in_train==False][['pixelsx_x', 'pixelsy_x', 'size_bytes_x', 'bytes_per_pixel_x', 'aspect_ratio_x', 'pixelsx_y', 'pixelsy_y', 'size_bytes_y', 'bytes_per_pixel_y', 'aspect_ratio_y']].values
	
	
# 	if 'artist1' in data.columns: 
# 		y_train = data2[data2.in_train==True]['sameArtist'].values
# 		y_test = data2[data2.in_train==False]['sameArtist'].values
# 	else:
# 		y_test = None	
	
	if split=='cv':		
		return x_train, y_train, x_train, y_train  
 	if split=='test':
		return x_test, y_test




In [None]:
# Feature-extraction cell: reads the image metadata CSV, computes per-image
# features through prep_data, and reports how long that took.
import pandas as pd
import numpy as np
import time

start_time = time.time()
# Machine-specific absolute path to the metadata table.
train_info = pd.read_csv('C:\\Users\\swaroop\\Downloads\\painters\\ImageSets\\wyeths_medium.csv')
#submission_info = pd.read_csv('submission_info.csv')
print 'prepping training and cv data'
# With split 'cv', prep_data returns the same features/labels twice, so
# x_cv / y_cv are not an independent hold-out set.
x_train, y_train, x_cv, y_cv = prep_data([train_info, None], 'cv')

print x_train.shape

#np.savetxt('x_train_wyeth.txt', x_train, fmt = '%1.3f' )
#np.savetxt('y_train_wyeth.txt', y_train, fmt = '%1.3f' )

# Elapsed wall-clock time in minutes.
print (time.time() - start_time)/60 , "minutes"

#print 'prepping test data'
#x_test, y_test = prep_data([None, submission_info], 'test')

prepping training and cv data
train\Above the Orchard-1957.jpg


In [6]:
# Derived cross-scale features, added to x_train in place.
# For each (larger, smaller) scale pair of a base feature three columns are
# created, group by group so the column order is stable:
#   <tag><hi>_<lo>  = feature[hi] - feature[lo]
#   <tag><hi>d<lo>  = feature[hi] / feature[lo]
#   <tag><lo>d<hi>  = feature[lo] / feature[hi]
# Entropy uses scales 1, 5, 10, 15; dtm uses 5, 10, 15 only.
_feature_specs = (
    ('ent', 'entropy', [(5, 1), (10, 1), (15, 1), (10, 5), (15, 5), (15, 10)]),
    ('dtm', 'dtm', [(10, 5), (15, 5), (15, 10)]),
)

for _tag, _base, _pairs in _feature_specs:
    # Differences: larger scale minus smaller scale.
    for _hi, _lo in _pairs:
        x_train['%s%d_%d' % (_tag, _hi, _lo)] = x_train['%s%d' % (_base, _hi)] - x_train['%s%d' % (_base, _lo)]

    # Ratios: larger scale over smaller scale.
    for _hi, _lo in _pairs:
        x_train['%s%dd%d' % (_tag, _hi, _lo)] = x_train['%s%d' % (_base, _hi)] / x_train['%s%d' % (_base, _lo)]

    # Inverse ratios: smaller scale over larger scale.
    for _hi, _lo in _pairs:
        x_train['%s%dd%d' % (_tag, _lo, _hi)] = x_train['%s%d' % (_base, _lo)] / x_train['%s%d' % (_base, _hi)]

def print_results(clf, y_test, y_pred, y_pred_prob):
    #y_pred_prob = clf.predict_proba(y_test)[:,1]
    #y_pred = clf.predict(y_test)
    print 'ROC - ',roc_auc_score(y_test, y_pred_prob)
    print 'Accuracy - ',accuracy_score(y_test, y_pred)
    print 'Confusion Matrix - ', confusion_matrix(y_test, y_pred)
    #print 'Precision - ',precision_score(y_test, y_pred ),'Recall - ',recall_score(y_test, y_pred),'F1- Score',f1_score(y_test, y_pred),'\n'
    target_names = ['class 0', 'class 1']
    print classification_report(y_test, y_pred, target_names=target_names)

  if __name__ == '__main__':


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [7]:
# Replace x_train / y_train with arrays loaded from text files via np.loadtxt
# (machine-specific absolute paths). From here on they are NumPy arrays.
x_train = np.loadtxt('C:\\Users\\swaroop\\Downloads\\painters\\ImageSets\\Wyeths\\x_train_wyeth.txt')
y_train = np.loadtxt('C:\\Users\\swaroop\\Downloads\\painters\\ImageSets\\Wyeths\\y_train_wyeth.txt')

def print_results(clf, y_test, y_pred, y_pred_prob):
    #y_pred_prob = clf.predict_proba(y_test)[:,1]
    #y_pred = clf.predict(y_test)
    print 'ROC - ',roc_auc_score(y_test, y_pred_prob)
    print 'Accuracy - ',accuracy_score(y_test, y_pred)
    print 'Confusion Matrix - ', confusion_matrix(y_test, y_pred)
    #print 'Precision - ',precision_score(y_test, y_pred ),'Recall - ',recall_score(y_test, y_pred),'F1- Score',f1_score(y_test, y_pred),'\n'
    target_names = ['class 0', 'class 1']
    print classification_report(y_test, y_pred, target_names=target_names)

In [8]:
# Two nested stratified 80/20 splits with a fixed seed:
#   x_train  -> training / testing
#   training -> actual_training / validation
print x_train.shape, y_train.shape
training, testing, y_training, y_testing = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=42)
actual_training, validation, y_actual_training, y_validation = train_test_split(training, y_training, test_size=0.2, stratify=y_training, random_state=42)
print training.shape, testing.shape
#print y_training, y_testing
# Class counts per subset (itemfreq prints value/count pairs).
print itemfreq(y_training)
print itemfreq(y_testing)
print itemfreq(y_validation)
print itemfreq(y_train)

print y_validation
# print y_actual_training
# print actual_training


(50L, 28L) (50L,)
(40L, 28L) (10L, 28L)
[[  0.  22.]
 [  1.  18.]]
[[ 0.  5.]
 [ 1.  5.]]
[[ 0.  4.]
 [ 1.  4.]]
[[  0.  27.]
 [  1.  23.]]
[ 0.  0.  1.  1.  0.  1.  0.  1.]


In [30]:
#Random Forest
# Fits a random forest on actual_training, then prints 5-fold CV accuracy
# (mean, std) on that subset plus accuracy on validation and testing.
print "ACCURACY RESUTLS"    
print "Mean Accuracy of 5 fold CV","\t", "STD","\t", "Validation ","\t", "Test"        

# xrange(1): the loop body runs exactly once (i == 0).
for i in xrange(1):
    clf = RandomForestClassifier(n_estimators=20, class_weight='balanced', n_jobs = -1, random_state = 42)
    clf.fit(actual_training, y_actual_training)
    scores = cross_validation.cross_val_score(clf, actual_training, y_actual_training, cv=5, n_jobs = -1)
    y_pred_valid = clf.predict(validation)
    print i, scores.mean(), scores.std(), accuracy_score(y_validation, y_pred_valid), accuracy_score(y_testing, clf.predict(testing))
    
# Detailed report on the testing subset; column 1 of predict_proba is used as the score.
print_results(clf, y_testing, clf.predict(testing), clf.predict_proba(testing)[:,1] )

ACCURACY RESUTLS
Mean Accuracy of 5 fold CV 	STD 	Validation  	Test
0 0.549523809524 0.202066422253 0.625 0.3
ROC -  0.18
Accuracy -  0.3
Confusion Matrix -  [[2 3]
 [4 1]]
             precision    recall  f1-score   support

    class 0       0.33      0.40      0.36         5
    class 1       0.25      0.20      0.22         5

avg / total       0.29      0.30      0.29        10



In [10]:
#XGB
# Same evaluation protocol as the other model cells, with an XGBoost classifier.
    
# NOTE: learning_r_col is defined but not referenced in this cell; the
# learning rate below is fixed at 0.05.
learning_r_col = [0.01, 0.02, 0.03, 0.04,0.05,0.06,0.07,0.08,0.09,0.1]    
    
print "ACCURACY RESUTLS"    
print "Mean Accuracy of 5 fold CV","\t", "STD","\t", "Validation ","\t", "Test"        

    
# xrange(1): the loop body runs exactly once (i == 0).
for i in xrange(1):
    clf = xgb.XGBClassifier(max_depth=2, n_estimators=48, learning_rate=0.05, nthread = -1) #objective='multi:softprob'
    clf.fit(actual_training, y_actual_training)
    scores = cross_validation.cross_val_score(clf, actual_training, y_actual_training, cv=5, n_jobs = -1)
    y_pred_valid = clf.predict(validation)
    print i, "\t",scores.mean(),"\t", scores.std(), "\t",accuracy_score(y_validation, y_pred_valid), "\t",accuracy_score(y_testing, clf.predict(testing))

# Detailed report on the testing subset; column 1 of predict_proba is used as the score.
print_results(clf, y_testing, clf.predict(testing), clf.predict_proba(testing)[:,1] )

ACCURACY RESUTLS
Mean Accuracy of 5 fold CV 	STD 	Validation  	Test
0 	0.359047619048 	0.177705200605 	0.5 	0.6
ROC -  0.32
Accuracy -  0.6
Confusion Matrix -  [[4 1]
 [3 2]]
             precision    recall  f1-score   support

    class 0       0.57      0.80      0.67         5
    class 1       0.67      0.40      0.50         5

avg / total       0.62      0.60      0.58        10



In [18]:
#Logistic
# Same evaluation protocol as the other model cells, with class-weighted
# logistic regression.
    
print "ACCURACY RESUTLS"    
print "Mean Accuracy of 5 fold CV","\t", "STD","\t", "Validation ","\t", "Test"    
    
# xrange(1): the loop body runs exactly once (i == 0).
for i in xrange(1):  # , max_iter = i
    clf = linear_model.LogisticRegression( class_weight='balanced')
    clf.fit(actual_training, y_actual_training)
    scores = cross_validation.cross_val_score(clf, actual_training, y_actual_training, cv=5, n_jobs = -1)
    y_pred_valid = clf.predict(validation)
    print i, "\t",scores.mean(),"\t", scores.std(), "\t",accuracy_score(y_validation, y_pred_valid), "\t",accuracy_score(y_testing, clf.predict(testing))
    
# Detailed report on the testing subset; column 1 of predict_proba is used as the score.
print_results(clf, y_testing, clf.predict(testing), clf.predict_proba(testing)[:,1] )

ACCURACY RESUTLS
Mean Accuracy of 5 fold CV 	STD 	Validation  	Test
0 	0.606666666667 	0.237477293341 	0.375 	0.4
ROC -  0.16
Accuracy -  0.4
Confusion Matrix -  [[3 2]
 [4 1]]
             precision    recall  f1-score   support

    class 0       0.43      0.60      0.50         5
    class 1       0.33      0.20      0.25         5

avg / total       0.38      0.40      0.38        10



In [12]:
#Linear SVC
# Sweeps max_iter for a class-weighted LinearSVC and prints, per setting,
# 5-fold CV accuracy (mean, std) plus validation and testing accuracy.
    
print "ACCURACY RESUTLS of 5 fold CV"    
print "Mean Accuracy ","\t", "STD","\t", "Validation ","\t", "Test"    

# NOTE: this name shadows the built-in iter() for the rest of the session.
iter = [10,100,500,1000]

for i in iter:
    clf = LinearSVC(class_weight = 'balanced', dual = False, max_iter = i)
    clf.fit(actual_training, y_actual_training)
    scores = cross_validation.cross_val_score(clf, actual_training, y_actual_training, cv=5, n_jobs = -1)
    y_pred_valid = clf.predict(validation)
    print i, "\t",scores.mean(),"\t", scores.std(), "\t",accuracy_score(y_validation, y_pred_valid), "\t",accuracy_score(y_testing, clf.predict(testing))

# The report below uses clf from the last loop iteration (max_iter = 1000).
target_names = ['class 0', 'class 1']
print 'Confusion Matrix - ', confusion_matrix(y_testing, clf.predict(testing))
print classification_report(y_testing, clf.predict(testing), target_names=target_names)

ACCURACY RESUTLS of 5 fold CV
Mean Accuracy  	STD 	Validation  	Test
10 	0.618095238095 	0.14662955622 	0.375 	0.4
100 	0.578095238095 	0.212950336475 	0.375 	0.4
500 	0.578095238095 	0.212950336475 	0.375 	0.4
1000 	0.578095238095 	0.212950336475 	0.375 	0.4
Confusion Matrix -  [[3 2]
 [4 1]]
             precision    recall  f1-score   support

    class 0       0.43      0.60      0.50         5
    class 1       0.33      0.20      0.25         5

avg / total       0.38      0.40      0.38        10



In [14]:
'''
def train_classifier(x_train, y_train, x_cv, y_cv):    
    clf1 = SVC(kernel = 'sigmoid',probability=True, tol = 0.01)
    clf2 = SVC(kernel = 'rbf',  probability=True, tol = 0.01)
    clf3 = RandomForestClassifier(n_estimators=20,class_weight='balanced', n_jobs = -1)
    clf4 = GaussianNB()
    clf5 = BernoulliNB() 
    clf6 = LinearSVC(class_weight='balanced',dual = False)
    clf7 = SVC(kernel = 'poly', probability=True, tol = 0.1, degree=2, C=1.0)
    clf8 = neighbors.KNeighborsClassifier()
    clf9 = linear_model.LogisticRegression(class_weight = 'balanced')
    clf10 = xgb.XGBClassifier(max_depth=3, n_estimators=30, learning_rate=0.05) #objective='multi:softprob'
    print 'starting fit'
    
    print x_train.shape, y_train.shape
    training, testing, y_training, y_testing = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=42)
    
    print training.shape, testing.shape
    #print y_training, y_testing
    print itemfreq(y_training)
    print itemfreq(y_testing)
    
    clf1.fit(training, y_training)
#     scores = cross_validation.cross_val_score(clf1, training, y_training, cv=5, scoring='roc_auc', n_jobs = -1)
#     print "SVM - Sigmoid Scores",
#     print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
#     clf2.fit(training, y_training)
#     scores = cross_validation.cross_val_score(clf2, training, y_training, cv=5, scoring='roc_auc', n_jobs = -1)
#     print "SVM - RBF Kernel", 
#     print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    clf3.fit(training, y_training)
    scores = cross_validation.cross_val_score(clf3, training, y_training, cv=5,  n_jobs = -1)
    print "Random Forest", 
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
#     clf4.fit(training, y_training)
#     scores = cross_validation.cross_val_score(clf4, training, y_training, cv=5, scoring='roc_auc', n_jobs = -1)
#     print "Gaussian NB", 
#     print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
#     clf5.fit(training, y_training) 
#     scores = cross_validation.cross_val_score(clf5, training, y_training, cv=5, scoring='roc_auc', n_jobs = -1)
#     print "Bernoulli NB", 
#     print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    clf6.fit(training, y_training)
    scores = cross_validation.cross_val_score(clf6, training, y_training, cv=5,  n_jobs = -1)
    print "SVM - Linear Kernel", 
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
#     clf7.fit(training, y_training) 
#     clf8.fit(training, y_training) 
#     scores = cross_validation.cross_val_score(clf8, training, y_training, cv=5, scoring='roc_auc', n_jobs = -1)
#     print "KNN", 
#     print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    clf9.fit(training, y_training) 
    scores = cross_validation.cross_val_score(clf9, training, y_training, cv=5,  n_jobs = -1)
    print "Logistic", 
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    clf10.fit(training, y_training)
    scores = cross_validation.cross_val_score(clf10, training, y_training, cv=5,  n_jobs = -1)
    print "XGBoost", 
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
#     print 'SVM - Sigmoid Kernel'
#     print_results(clf1, y_testing, clf1.predict(testing), clf1.predict_proba(testing)[:,1] )
#     print 'SVM - rbf Kernel'
#     print_results(clf2, y_testing, clf2.predict(testing), clf2.predict_proba(testing)[:,1] )
    print 'Random Forest'
    print_results(clf3, y_testing, clf3.predict(testing), clf3.predict_proba(testing)[:,1] )
#     print 'Gaussian NB'
#     print_results(clf4, y_testing, clf4.predict(testing), clf4.predict_proba(testing)[:,1] )
#     print 'Bernoulli NB'
#     print_results(clf5, y_testing, clf5.predict(testing), clf5.predict_proba(testing)[:,1] )
    print 'SVM Linear Kernel'    
    prob_pos = clf6.decision_function(testing)
    prob_pos = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
    print_results(clf6, y_testing, clf6.predict(testing), prob_pos )
#     print 'SVM Polynomial Kernel'
#     print_results(clf7, y_testing, clf7.predict(testing), clf7.predict_proba(testing)[:,1] )
#     print 'Nearest Neighbors'
#     print_results(clf8, y_testing, clf8.predict(testing), clf8.predict_proba(testing)[:,1] )
    print 'Logistic Regression'
    print_results(clf9, y_testing, clf9.predict(testing), clf9.predict_proba(testing)[:,1] )
    print 'XG Boost'
    print_results(clf10, y_testing, clf10.predict(testing), clf10.predict_proba(testing)[:,1] )
'''

In [15]:
# NOTE: in this notebook train_classifier appears only inside a triple-quoted
# string in the previous cell, so it is not defined on a fresh kernel and this
# call depends on a definition left over from an earlier session state.
train_classifier(actual_training, y_actual_training, validation, y_validation)

starting fit
(32L, 28L) (32L,)
(25L, 28L) (7L, 28L)
[[  0.  14.]
 [  1.  11.]]
[[ 0.  4.]
 [ 1.  3.]]
Random Forest Accuracy: 0.36 (+/- 0.39)
SVM - Linear Kernel Accuracy: 0.51 (+/- 0.34)
Logistic Accuracy: 0.47 (+/- 0.42)
XGBoost Accuracy: 0.46 (+/- 0.30)
Random Forest
ROC -  0.416666666667
Accuracy -  0.428571428571
Confusion Matrix -  [[2 2]
 [2 1]]
             precision    recall  f1-score   support

    class 0       0.50      0.50      0.50         4
    class 1       0.33      0.33      0.33         3

avg / total       0.43      0.43      0.43         7

SVM Linear Kernel
ROC -  1.0
Accuracy -  0.857142857143
Confusion Matrix -  [[4 0]
 [1 2]]
             precision    recall  f1-score   support

    class 0       0.80      1.00      0.89         4
    class 1       1.00      0.67      0.80         3

avg / total       0.89      0.86      0.85         7

Logistic Regression
ROC -  1.0
Accuracy -  0.857142857143
Confusion Matrix -  [[4 0]
 [1 2]]
             precision    recal