In [25]:
'''
make_submission() generates predictions for the Kaggle Painter by Numbers competition
using simple features (image size, aspect ratio and bits/pixel^2)
author: Swaroop Krothapalli - extends code by small yellow duck
https://github.com/swaroop7/painters
'''
import time
import os
import random
import numpy as np
import pandas as pd
from PIL import Image
from matplotlib import pyplot as plt
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
import cv2
import numpy as np
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score  
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc
from scipy.stats import itemfreq
from sklearn import neighbors, linear_model
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition, pipeline, metrics, grid_search
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation
from sklearn.metrics import accuracy_score
# Print numpy arrays with 3 decimal places and wrap rows at 100 characters.
np.set_printoptions(precision=3, linewidth=100)
# Seeds only Python's built-in `random` module; numpy's np.random generator
# is not seeded by this call.
random.seed(7)

# Prepend the MinGW-w64 bin directory to this process's PATH.
# NOTE(review): presumably needed so a compiled dependency can find its
# runtime DLLs on this Windows machine -- confirm.
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

# Make the image directory the working directory so that relative file
# names resolve inside it.
os.chdir('C:\\Users\\swaroop\\Downloads\\painters\\ImageSets\\Pilot\\Artwork_for_Testing')


def getEntropy(signal):
    """Return the Shannon entropy, in bits, of the values in ``signal``.

    Each distinct value is treated as a symbol; its probability is its
    relative frequency in ``signal``.

    Parameters
    ----------
    signal : array-like
        Values to analyse. Any shape is accepted; the data is flattened.

    Returns
    -------
    float
        ``sum(p * log2(1 / p))`` over the distinct values, or 0.0 for an
        empty input.
    """
    values = np.asarray(signal).ravel()
    if values.size == 0:
        return 0.0
    # One sorted pass yields every symbol count, instead of rescanning the
    # whole signal once per distinct symbol.
    counts = np.unique(values, return_counts=True)[1]
    probabilities = counts / float(values.size)
    return np.sum(probabilities * np.log2(1.0 / probabilities))

def calculateEntropyNeighbourhood(artwork, neighbourhood):
    """Return the mean local entropy of an image's grayscale version.

    For every pixel, the entropy (see ``getEntropy``) of the gray levels in
    a window around that pixel is computed; the result is the average of
    those per-pixel entropies.

    Parameters
    ----------
    artwork : str
        Path of the image file, read with ``cv2.imread``.
    neighbourhood : int
        Window half-size N. The window spans rows ``[row - N, row + N)`` and
        columns ``[col - N, col + N)``, clipped to the image bounds.
        NOTE(review): the upper bounds are exclusive, so the window is 2N
        wide and not centred on the pixel -- confirm this is intended.

    Returns
    -------
    float
        Mean of the per-pixel window entropies.

    Raises
    ------
    IOError
        If the image cannot be read.
    """
    image = cv2.imread(artwork)
    if image is None:
        # cv2.imread reports an unreadable or missing file by returning None.
        raise IOError('could not read image: %s' % artwork)
    grayIm = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    N = neighbourhood
    height, width = grayIm.shape
    # The entropies must go into a float buffer: storing them in a copy of
    # the uint8 image would truncate every value to an integer.
    E = np.zeros((height, width), dtype=np.float64)

    for row in range(height):
        Ly = max(0, row - N)
        Uy = min(height, row + N)
        for col in range(width):
            Lx = max(0, col - N)
            Ux = min(width, col + N)
            # Flatten the window to 1-D before measuring its entropy.
            E[row, col] = getEntropy(grayIm[Ly:Uy, Lx:Ux].flatten())

    return np.mean(E)

def getDTM(artwork, neighbourhood):
    """Return the mean local standard deviation of an image's pixel values.

    The local mean and the local mean of squares are obtained with a
    ``neighbourhood`` x ``neighbourhood`` box filter (``cv2.blur``); the
    local standard deviation is ``sqrt(E[x^2] - E[x]^2)``, averaged over all
    pixels and channels.

    Parameters
    ----------
    artwork : str
        Path of the image file, read with ``cv2.imread``.
    neighbourhood : int
        Side length of the square box filter.

    Returns
    -------
    float
        Mean of the local standard deviations.

    Raises
    ------
    IOError
        If the image cannot be read.
    """
    image = cv2.imread(artwork)
    if image is None:
        # cv2.imread reports an unreadable or missing file by returning None.
        raise IOError('could not read image: %s' % artwork)
    image32f = np.float32(image)
    ksize = (neighbourhood, neighbourhood)
    mu = cv2.blur(image32f, ksize)
    mu2 = cv2.blur(cv2.multiply(image32f, image32f), ksize)
    # float32 rounding can push the variance of flat regions slightly below
    # zero; clamp it so the square root cannot produce NaN (a single NaN
    # would make the returned mean NaN).
    variance = np.maximum(mu2 - cv2.multiply(mu, mu), 0)
    sigma = cv2.sqrt(variance)
    return np.mean(sigma)

def get_image_info(test_info, dir, image_dir=None):
    """Build a per-image feature table (size, entropy and DTM features).

    Parameters
    ----------
    test_info : pandas.DataFrame
        When ``dir == 'test'``: a frame with ``image1`` and ``image2``
        columns; the distinct file names from both columns are used.
        Otherwise: a frame with a ``filename`` column, one row per image.
    dir : str
        ``'test'`` selects the pair-table handling above; any other value
        treats ``test_info`` as the image table. It is also used as the
        prefix of the message printed for images that fail.
    image_dir : str, optional
        Directory prefix (including the trailing separator) prepended to
        each file name. Defaults to the pilot image directory.

    Returns
    -------
    pandas.DataFrame
        The image table with columns ``pixelsx``, ``pixelsy``,
        ``size_bytes``, ``entropy1/5/10/15`` and ``dtm1/5/10/15`` added,
        rows containing any NaN removed, and ``filename`` renamed to
        ``new_filename``.
    """
    if image_dir is None:
        image_dir = 'C:\\Users\\swaroop\\Downloads\\painters\\ImageSets\\Pilot\\Artwork_for_Testing\\'

    if dir == 'test':
        images = list(set(list(test_info.image1.unique()) + list(test_info.image2.unique())))
        info = pd.DataFrame(np.array(images).reshape((-1, 1)), columns=['filename'])
    else:
        # Work on a copy so the caller's frame does not grow extra columns.
        info = test_info.copy()

    windows = (1, 5, 10, 15)
    feature_columns = (['pixelsx', 'pixelsy', 'size_bytes']
                       + ['entropy%d' % w for w in windows]
                       + ['dtm%d' % w for w in windows])
    for column in feature_columns:
        info[column] = np.nan

    total = len(info.index)
    for done, i in enumerate(info.index.values, 1):
        try:
            # A single absolute path is used for every access to the file.
            fil = image_dir + info.loc[i, 'filename']
            im = Image.open(fil)
            info.loc[i, 'pixelsx'], info.loc[i, 'pixelsy'] = im.size
            info.loc[i, 'size_bytes'] = os.path.getsize(fil)
            for w in windows:
                info.loc[i, 'entropy%d' % w] = calculateEntropyNeighbourhood(fil, w)
            for w in windows:
                info.loc[i, 'dtm%d' % w] = getDTM(fil, w)
        except Exception as err:
            # Best effort: a failing image is reported and later removed by
            # dropna(). KeyboardInterrupt is deliberately not caught, so a
            # long run can still be aborted.
            print('%s\\%s (%s)' % (dir, info.loc[i, 'filename'], err))
        if done % 10 == 0:
            print('processed %d of %d images' % (done, total))

    info = info.dropna()
    print('info shape %s' % (info.shape,))
    return info.rename(columns={'filename': 'new_filename'})

def make_pairs(train_info):
    """Build a table of image pairs labelled with the artist of each image.

    For every artist, two groups of rows are produced:

    * same-artist pairs: the cross product of that artist's images with
      themselves, cut to the first ``nm + 1`` rows;
    * different-artist pairs: up to ``nm`` randomly chosen images by other
      artists, each paired with one of this artist's images (cycling
      through them),

    where ``nm`` is the integer part of the mean of (number of this
    artist's images squared, number of images by other artists).

    Parameters
    ----------
    train_info : pandas.DataFrame
        Must contain ``artist`` and ``new_filename`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``artist1, image1, artist2, image2``. Rows whose ``image2``
        is NaN or 0 are removed, and every row that occurs more than once
        is dropped entirely (``keep=False``).
    """
    columns = ['artist1', 'image1', 'artist2', 'image2']
    chunks = []

    for m in train_info.artist.unique():
        a = train_info[train_info.artist == m][['artist', 'new_filename']].values
        k = a.shape[0]
        if k == 0:
            # A NaN artist label never compares equal to itself; there is
            # nothing to pair.
            continue
        others = train_info[train_info.artist != m][['artist', 'new_filename']]

        # np.mean returns a float, but slice bounds must be integers.
        nm = int(np.mean([k ** 2, others.shape[0]]))

        # permutation() shuffles a copy, leaving the frame's index untouched.
        use = np.random.permutation(others.index.values)[:nm]
        b = others.loc[use, :].values

        same = np.concatenate([np.repeat(a[:, 0], k).reshape((-1, 1)),
                               np.repeat(a[:, 1], k).reshape((-1, 1)),
                               np.tile(a, (k, 1))], axis=1)
        # nm + 1 rows: matches an inclusive label slice 0..nm.
        chunks.append(same[:nm + 1])

        # Cycle through this artist's images so the left side always has as
        # many rows as b, even when b is longer than k * k.
        left = a[np.arange(b.shape[0]) % k]
        chunks.append(np.concatenate([left, b], axis=1))

    if not chunks:
        return pd.DataFrame(columns=columns)

    t = pd.DataFrame(np.concatenate(chunks, axis=0), columns=columns)
    t = t[~t.image2.isin([np.nan, 0])]
    return t.drop_duplicates(subset=['artist1', 'artist2', 'image1', 'image2'], keep=False)


def prep_data(input, split):
    """Compute image features and assemble model inputs.

    Parameters
    ----------
    input : sequence
        ``input[0]`` is the training image table (needs ``artist`` and
        ``filename`` columns); ``input[1]`` is the pair table used when
        ``split == 'test'`` (needs ``image1`` and ``image2`` columns).
    split : str
        ``'cv'`` or ``'test'``.

    Returns
    -------
    tuple
        ``'cv'``: ``(x_train, y_train, x_train, y_train)`` -- the same
        features and labels are returned for both the training and the cv
        slots. Labels are 1 for ``'hudsonriver'`` and 0 for
        ``'impressionist'``.
        ``'test'``: ``(x_test, y_test)`` where ``x_test`` is the per-image
        feature table and ``y_test`` is the ``sameArtist`` column of the
        pair table, or ``None`` when the pair table has no ``artist1``
        column.
        NOTE(review): the 'test' return values were never defined in the
        earlier version (the branch raised NameError); the shapes chosen
        here are an assumption -- confirm against the intended consumer.

    Raises
    ------
    ValueError
        If ``split`` is neither ``'cv'`` nor ``'test'``.
    """
    info = input[0]
    data = input[1]

    if split == 'cv':
        # Distinct labels, used only for the diagnostic row count below.
        # The label column itself must not be shuffled: an in-place shuffle
        # of info.artist can permute the labels relative to their images.
        artists = info.artist.unique()
        info = get_image_info(info, 'train')
        info['bytes_per_pixel'] = 1.0 * info['size_bytes'] / (info['pixelsx'] * info['pixelsy'])
        info['aspect_ratio'] = 1.0 * info['pixelsx'] / info['pixelsy']
        print('hi %s' % (info[info.artist.isin(artists)].shape,))
        print(info.columns)
        info['artist'] = info['artist'].map({'hudsonriver': 1, 'impressionist': 0})
        y_train = info['artist']
        x_train = info.drop(['artist', 'new_filename'], axis=1)
        print(x_train.columns)
        print(y_train)
        print(x_train)
        return x_train, y_train, x_train, y_train

    if split == 'test':
        info = get_image_info(data, 'test')
        info['bytes_per_pixel'] = 1.0 * info['size_bytes'] / (info['pixelsx'] * info['pixelsy'])
        info['aspect_ratio'] = 1.0 * info['pixelsx'] / info['pixelsy']
        data['in_train'] = False
        y_test = None
        if 'artist1' in data.columns:
            data['sameArtist'] = data['artist1'] == data['artist2']
            y_test = data['sameArtist']
        x_test = info
        return x_test, y_test

    raise ValueError("split must be 'cv' or 'test', got %r" % (split,))


In [None]:
import pandas as pd
import numpy as np
import time

# Time the whole feature-extraction run.
start_time = time.time()

csv_path = 'C:\\Users\\swaroop\\Downloads\\painters\\ImageSets\\pilot_medium.csv'
train_info = pd.read_csv(csv_path)

print('prepping training and cv data')
# No pair table is needed for the 'cv' split, hence the None.
x_train, y_train, x_cv, y_cv = prep_data([train_info, None], 'cv')

print(x_train.shape)

elapsed_minutes = (time.time() - start_time) / 60
print('%s %s' % (elapsed_minutes, "minutes"))

prepping training and cv data
  train\Monet1880b.jpg
train\Renoir-1873.jpg
train\Renoir-1874.jpg
train\Renoir-1880.jpg


In [2]:
# Reload the previously saved feature matrix and labels as plain numpy arrays.
_pilot_dir = 'C:\\Users\\swaroop\\Downloads\\painters\\ImageSets\\Pilot\\Artwork_for_Testing\\'
x_train = np.loadtxt(_pilot_dir + 'x_train_pilot.txt')
y_train = np.loadtxt(_pilot_dir + 'y_train_pilot.txt')

In [3]:
# Derived features: for each pair of window sizes (larger, smaller), add the
# difference, the ratio larger/smaller and the ratio smaller/larger of the
# corresponding columns. Columns are created in the order: all differences,
# all larger/smaller ratios, all smaller/larger ratios -- entropy first,
# then dtm.
_ent_pairs = [(5, 1), (10, 1), (15, 1), (10, 5), (15, 5), (15, 10)]
_dtm_pairs = [(10, 5), (15, 5), (15, 10)]

for _short, _column, _pairs in (('ent', 'entropy', _ent_pairs),
                                ('dtm', 'dtm', _dtm_pairs)):
    for _hi, _lo in _pairs:
        x_train['%s%d_%d' % (_short, _hi, _lo)] = (
            x_train['%s%d' % (_column, _hi)] - x_train['%s%d' % (_column, _lo)])

    for _hi, _lo in _pairs:
        x_train['%s%dd%d' % (_short, _hi, _lo)] = (
            x_train['%s%d' % (_column, _hi)] / x_train['%s%d' % (_column, _lo)])

    for _hi, _lo in _pairs:
        x_train['%s%dd%d' % (_short, _lo, _hi)] = (
            x_train['%s%d' % (_column, _lo)] / x_train['%s%d' % (_column, _hi)])

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [3]:
def print_results(clf, y_test, y_pred, y_pred_prob):
    """Print ROC AUC, the confusion matrix and a classification report.

    ``clf`` is accepted but not used. ``y_pred`` holds the predicted labels
    and ``y_pred_prob`` the scores passed to ``roc_auc_score``.
    """
    auc_value = roc_auc_score(y_test, y_pred_prob)
    print('%s %s' % ('ROC - ', auc_value))

    matrix = confusion_matrix(y_test, y_pred)
    print('%s %s' % ('Confusion Matrix - ', matrix))

    class_labels = ['class 0', 'class 1']
    print(classification_report(y_test, y_pred, target_names=class_labels))
    

from operator import itemgetter

def report(grid_scores, n_top):
    """Print the ``n_top`` best-scoring parameter settings of a search.

    Parameters
    ----------
    grid_scores : iterable
        Score records that are indexable (item 1 is used as the sort key)
        and expose ``mean_validation_score``, ``cv_validation_scores`` and
        ``parameters`` attributes.
    n_top : int
        Number of best records to print.
    """
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for rank, score in enumerate(top_scores, 1):
        print("Model with rank: {0}".format(rank))
        # Report the spread across folds too: the std was already being
        # computed and passed in, but the format string never printed it.
        print("Mean validation score: {0:.4f} (std: {1:.4f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [4]:
print('%s %s' % (x_train.shape, y_train.shape))

# Two successive stratified 80/20 splits with the same settings:
# first train/test, then the training part into actual-training/validation.
_split_opts = dict(test_size=0.2, random_state=42)
training, testing, y_training, y_testing = train_test_split(
    x_train, y_train, stratify=y_train, **_split_opts)
actual_training, validation, y_actual_training, y_validation = train_test_split(
    training, y_training, stratify=y_training, **_split_opts)
print('%s %s' % (training.shape, testing.shape))

# Class counts of each label subset.
for _labels in (y_training, y_testing, y_validation, y_train, y_actual_training):
    print(itemfreq(_labels))

print(y_validation)

(31L, 22L) (31L,)
(24L, 22L) (7L, 22L)
[[  0.  13.]
 [  1.  11.]]
[[ 0.  4.]
 [ 1.  3.]]
[[ 0.  3.]
 [ 1.  2.]]
[[  0.  17.]
 [  1.  14.]]
[[  0.  10.]
 [  1.   9.]]
[ 1.  1.  0.  0.  0.]


In [6]:
#Random Forest
# random.seed only seeds Python's built-in generator; it does not control
# which candidates the randomized search samples (see random_state below).
random.seed(9001)

clf = RandomForestClassifier()

param_dist = {"max_depth": [3, 6, 9,12,20],
              "max_features": [5,10,15,20,None],
              "min_samples_split": [2, 5, 10, 15,20,50,100],
              "min_samples_leaf": [ 1, 2, 5, 10],
              "class_weight": ['balanced'],
              "n_estimators": [10,20,50,100, 150, 200],
              "oob_score": [True],
              "random_state": [9001]}

# random_state fixes the sampled parameter candidates, so re-running the
# cell evaluates the same settings.
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, cv=5,
                                   scoring='accuracy', n_jobs=-1, verbose=1,
                                   random_state=9001)

random_search.fit(training, y_training)

best_est = random_search.best_estimator_

print(best_est)

# Predict once and reuse the labels for both the accuracy and the report.
test_pred = best_est.predict(testing)

print(" Test Accuracy: %.2f%%" % (accuracy_score(y_testing, test_pred) * 100.0))
report(random_search.grid_scores_, 4)

print_results(best_est, y_testing, test_pred, best_est.predict_proba(testing)[:,1] )

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    8.1s finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=9, max_features=5,
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=True, random_state=9001, verbose=0, warm_start=False)
 Test Accuracy: 57.14%
Model with rank: 1
Mean validation score: 0.6250)
Parameters: {'oob_score': True, 'min_samples_leaf': 5, 'n_estimators': 50, 'max_features': 5, 'random_state': 9001, 'min_samples_split': 2, 'max_depth': 9, 'class_weight': 'balanced'}

Model with rank: 2
Mean validation score: 0.5417)
Parameters: {'oob_score': True, 'min_samples_leaf': 10, 'n_estimators': 100, 'max_features': None, 'random_state': 9001, 'min_samples_split': 15, 'max_depth': 6, 'class_weight': 'balanced'}

Model with rank: 3
Mean validation score: 0.5000)
Parameters: {'oob_score': True, 'min_samples_leaf': 10, 'n_estimators':



In [16]:
#XGBoost Classifier with RandomizedSearchCV

params={
    'max_depth': [2,3,4], #[3,4,5,6,7,8,9], # 5 is good 
    'subsample': [0.4,0.5,0.6,0.7,0.8,0.9,1.0], #[0.4,0.5,0.6,0.7,0.8,0.9,1.0],
    'colsample_bytree': [0.5,0.6,0.7,0.8,0.9,1.0], #[0.5,0.6,0.7,0.8],
    'n_estimators': [3, 10, 22, 25, 40, 100], #[1000,2000,3000]
    #"n_estimators": st.randint(3, 40),
    'reg_alpha': [0, 0.03, 0.1, 0.5, 1.0] #[0.01, 0.02, 0.03, 0.04]
}



xgb_clf = xgb.XGBClassifier()
# random_state fixes the sampled parameter candidates, so re-running the
# cell evaluates the same settings.
rs = RandomizedSearchCV(xgb_clf,
                  params,
                  cv=5,
                  scoring="accuracy",
                  n_jobs=-1,
                  verbose=1,
                  random_state=9001)
rs.fit(training, y_training)
best_est = rs.best_estimator_
print(best_est)

# Predict once and reuse the labels for both the accuracy and the report.
test_pred = best_est.predict(testing)

print(" Test Accuracy: %.2f%%" % (accuracy_score(y_testing, test_pred) * 100.0))

report(rs.grid_scores_, 5)

print_results(best_est, y_testing, test_pred, best_est.predict_proba(testing)[:,1] )

Fitting 5 folds for each of 10 candidates, totalling 50 fits
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=10, nthread=-1,
       objective='binary:logistic', reg_alpha=1.0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)
 Test Accuracy: 57.14%
Model with rank: 1
Mean validation score: 0.5833)
Parameters: {'n_estimators': 10, 'subsample': 0.8, 'reg_alpha': 1.0, 'colsample_bytree': 0.9, 'max_depth': 3}

Model with rank: 2
Mean validation score: 0.5000)
Parameters: {'n_estimators': 22, 'subsample': 0.6, 'reg_alpha': 0, 'colsample_bytree': 1.0, 'max_depth': 3}

Model with rank: 3
Mean validation score: 0.5000)
Parameters: {'n_estimators': 100, 'subsample': 0.9, 'reg_alpha': 1.0, 'colsample_bytree': 0.5, 'max_depth': 2}

Model with rank: 4
Mean validation score: 0.5000)
Parameters: {'n_estimators': 100, 'subsample': 

[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    2.8s finished


In [24]:
#Logistic Regression

# Single-step pipeline; its 'logistic' step name is the prefix of the grid keys.
logistic = LogisticRegression(random_state=1)
pipe = Pipeline(steps=[('logistic', logistic)])

a = 'ovr'

# Exhaustive grid over regularisation strength and tolerance; random_state
# and class_weight are pinned to a single value each.
logistic_grid = {
    'logistic__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'logistic__random_state': [999],
    'logistic__tol': [0.1, 0.01, 0.001, 0.0001],
    'logistic__class_weight': ['balanced'],
}
estimator = GridSearchCV(pipe, logistic_grid)
estimator.fit(training, y_training)

best_model = estimator.best_estimator_
print(best_model)

test_pred = best_model.predict(testing)
print(" Test Accuracy: %.2f%%" % (accuracy_score(y_testing, test_pred) * 100.0))

report(estimator.grid_scores_, 4)

print_results(best_model, y_testing, test_pred, best_model.predict_proba(testing)[:,1] )

Pipeline(steps=[('logistic', LogisticRegression(C=0.001, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=999,
          solver='liblinear', tol=0.01, verbose=0, warm_start=False))])
 Test Accuracy: 57.14%
Model with rank: 1
Mean validation score: 0.5417)
Parameters: {'logistic__C': 0.001, 'logistic__random_state': 999, 'logistic__tol': 0.01, 'logistic__class_weight': 'balanced'}

Model with rank: 2
Mean validation score: 0.5417)
Parameters: {'logistic__C': 0.001, 'logistic__random_state': 999, 'logistic__tol': 0.001, 'logistic__class_weight': 'balanced'}

Model with rank: 3
Mean validation score: 0.5417)
Parameters: {'logistic__C': 0.001, 'logistic__random_state': 999, 'logistic__tol': 0.0001, 'logistic__class_weight': 'balanced'}

Model with rank: 4
Mean validation score: 0.5417)
Parameters: {'logistic__C': 0.01, 'logistic__random_state': 999, 'logistic__tol': 0.01, 

In [15]:
# SVC on standardized features, tuned with a grid search over C.
# NOTE(review): the original title said "SVC with SVD", but this pipeline
# contains no SVD/decomposition step -- confirm whether one was intended.
    
# Scale features to zero mean / unit variance before the SVM.
scl = StandardScaler()
    
# probability=True is required for the predict_proba call below.
svm_model = SVC(random_state=1, probability=True)
    
clf = pipeline.Pipeline([('scl', scl), ('svm', svm_model)])
    
    
# Only the SVM penalty C is tuned; keys are prefixed with the 'svm' step name.
param_grid = {'svm__C': [0.1, 1, 9,10, 12, 100]}
              #, 'svm__decision_function_shape': ['ovr']}
    
model = grid_search.GridSearchCV(estimator = clf, param_grid=param_grid, scoring='accuracy', verbose=0, n_jobs=-1, iid=True, refit=True, cv=10)
                                     

# NOTE(review): this cell fits on actual_training, whereas the other model
# cells in this file fit on training -- confirm the difference is intended.
model.fit(actual_training, y_actual_training)
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    
# refit=True above means best_estimator_ has been refit on all the data given to fit().
best_model = model.best_estimator_
    
print best_model

print(" Test Accuracy: %.2f%%" % (accuracy_score(y_testing, best_model.predict(testing)) * 100.0))

# Show the five best C values with their mean validation scores.
report(model.grid_scores_, 5)

# Column 1 of predict_proba is passed as the score for ROC AUC.
print_results(best_model, y_testing, best_model.predict(testing), best_model.predict_proba(testing)[:,1] )



Best score: 0.789
Best parameters set:
	svm__C: 1
Pipeline(steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm', SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=1, shrinking=True, tol=0.001,
  verbose=False))])
 Test Accuracy: 71.43%
Model with rank: 1
Mean validation score: 0.7895)
Parameters: {'svm__C': 1}

Model with rank: 2
Mean validation score: 0.7368)
Parameters: {'svm__C': 9}

Model with rank: 3
Mean validation score: 0.7368)
Parameters: {'svm__C': 10}

Model with rank: 4
Mean validation score: 0.7368)
Parameters: {'svm__C': 12}

Model with rank: 5
Mean validation score: 0.7368)
Parameters: {'svm__C': 100}

ROC -  0.75
Confusion Matrix -  [[2 2]
 [0 3]]
             precision    recall  f1-score   support

    class 0       1.00      0.50      0.67         4
    class 1       0.60      1.00      0.75         3

avg / total       