In [1]:
%matplotlib notebook
import numpy as np

# digit dataset
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt

# measure time execution
import time

# scikit-learn classifiers
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# look for best hyperparamters by cross validation
from sklearn import grid_search

# compute performances classifier
from sklearn.metrics import confusion_matrix,accuracy_score

# split data into training and test
from sklearn.cross_validation import train_test_split

np.random.seed(seed=42)

# Import data

In [2]:
#Load labelled digit data
digits = load_digits()
X, y = digits.data, digits.target

# Bench classifiers

In [54]:
# define a fonction that takes estimator as argument and outputs time execution and test score
def bench(name,clf,Xtrain,ytrain,Xtest,ytest):
    # train
    t0 = time.time()
    clf.fit(Xtrain,ytrain)
    time_train = time.time()-t0
    # test
    t0 = time.time()
    ypred = clf.predict(Xtest)
    time_test = time.time()-t0
    # score
    score = accuracy_score(ytest, ypred)
    cf_matrix = confusion_matrix(ytest, ypred)*1.0/np.bincount(ytest)[:,np.newaxis]
    return name,score,time_train,time_test,cf_matrix

# find best k for knn using grid_search
clf_knn_grid = grid_search.GridSearchCV(KNeighborsClassifier(), {'n_neighbors': [1,2,3,4,5,6,7,8,9,10] },cv=5)
clf_knn_grid.fit(X,y)

k_scores = [x[1] for x in clf_knn_grid.grid_scores_]
print "*** knn cross-validation scores for different k : "
print k_scores
k_best = clf_knn_grid.best_params_['n_neighbors']
print "*** knn cross-validation best score for k = ",k_best,' ( score=',clf_knn_grid.best_score_,')'

# launch bench() on various estimators for 21% test data
clf_list = dict()
clf_list['LDA               '] = LinearDiscriminantAnalysis()
clf_list['LogisticRegression'] = LogisticRegression()
clf_list['QDA               '] = QuadraticDiscriminantAnalysis()
clf_list['KNN               '] = KNeighborsClassifier()
clf_list['KNN best          '] = clf_knn_grid.best_estimator_
results = []

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.21, random_state=0)

results = []
for name in clf_list:
    clf = clf_list[name]
    print "*** currently processing ",name,"..."
    results.append(bench(name,clf,Xtrain,ytrain,Xtest,ytest))


*** knn cross-validation scores for different k : 
[0.96494156928213692, 0.96661101836393992, 0.96605453533667218, 0.96382860322760155, 0.96271563717306619, 0.95937673900946019, 0.95993322203672793, 0.95770728992765719, 0.95603784084585419, 0.95492487479131882]
*** knn cross-validation best score for k =  2  ( score= 0.966611018364 )
*** currently processing  LDA                ...
*** currently processing  LogisticRegression ...
*** currently processing  KNN                ...
*** currently processing  QDA                ...
*** currently processing  KNN best           ...


In [56]:
# display results
print "*"*68
print "* classifier          * score     * train time (s) * test time (s) *"
print "*"*68
for r in results:
    print "*"*68
    print "*",r[0]," * %0.5f   * %0.5f        * %0.5f       *" % (r[1],r[2],r[3])
print "*"*68


********************************************************************
* classifier          * score     * train time (s) * test time (s) *
********************************************************************
********************************************************************
* LDA                 * 0.95503   * 0.01290        * 0.00024       *
********************************************************************
* LogisticRegression  * 0.94974   * 0.19728        * 0.00020       *
********************************************************************
* KNN                 * 0.97619   * 0.00265        * 0.06111       *
********************************************************************
* QDA                 * 0.88624   * 0.03902        * 0.00394       *
********************************************************************
* KNN best            * 0.98148   * 0.00402        * 0.08657       *
********************************************************************


- Le **KNN** a le temps de training le plus court, ce qui n'est pas etonnant car ce classifieur n'a aucun calcul de paramètres à réaliser. En revanche, il a le temps de test le plus long car il calcule un grand nombre de distance (Ntrain) qu'il doit en plus partiellement mettre dans l'ordre pour selectionner la plus petite.
- Le temps de training du **LDA** est plus court que celui du **QDA**, car le LDA inverse une seule matrice (celle de variance-covariance supposé commune à toutes les classes) alors que le QDA inverse 10 matrices (celles propre à chaque classe) 
- La **Regression Logistique** a le temps de training le plus long car face à un probleme multi classe, scikit fait une descente de gradient par la methode de Newton (*'newton-cg'*) ou de quasi-Newton (*'lbfgs'*) où il faut approximer un Hessien (très couteux) à chaque itération. En revanche, ce classifieur a le temps de test le plus court car il s'agit d'une simple convolution suivi d'un logit.

# Confusion matrix

In [57]:
scores = np.asarray([r[1] for r in results])
ind_min = scores.argmin()
ind_max = scores.argmax()
np.set_printoptions(precision=3)
print "\n*** confusion matrix for worst score (",results[ind_min][0],",%0.5f) : \n" % results[ind_min][1]
print results[ind_min][4]
print "\n*** confusion matrix for best score (",results[ind_max][0],",%0.5f) : \n" % results[ind_max][1]
print results[ind_max][4]


*** confusion matrix for worst score ( QDA                ,0.88624) : 

[[ 1.     0.     0.     0.     0.     0.     0.     0.     0.     0.   ]
 [ 0.     0.886  0.029  0.     0.     0.     0.     0.     0.086  0.   ]
 [ 0.     0.108  0.757  0.     0.     0.     0.     0.     0.135  0.   ]
 [ 0.     0.     0.     0.812  0.     0.     0.     0.     0.188  0.   ]
 [ 0.     0.03   0.     0.     0.727  0.     0.     0.242  0.     0.   ]
 [ 0.     0.     0.     0.     0.     1.     0.     0.     0.     0.   ]
 [ 0.     0.     0.     0.     0.     0.     1.     0.     0.     0.   ]
 [ 0.     0.     0.     0.025  0.     0.     0.     0.975  0.     0.   ]
 [ 0.     0.024  0.024  0.     0.     0.     0.     0.024  0.929  0.   ]
 [ 0.     0.     0.     0.     0.     0.048  0.     0.095  0.119  0.738]]

*** confusion matrix for best score ( KNN best           ,0.98148) : 

[[ 1.     0.     0.     0.     0.     0.     0.     0.     0.     0.   ]
 [ 0.     1.     0.     0.     0.     0.     0.    

Le mauvais classifieur se trompe majortairement sur le chiffre "9", qu'il confond avec un "8" ou un "7" :

In [60]:
fig = plt.figure()
for i,l in enumerate((9,8,7)):
    ax = fig.add_subplot(1,3,i+1)
    ax.imshow(np.reshape(np.mean(X[y==l,:],axis=0),(8,8)), cmap=plt.cm.gray, interpolation='nearest')
    ax.set_title("mean for label "+str(l))

<IPython.core.display.Javascript object>

# Graphes

In [76]:
def plot_bivariate(sigma,mu):
    step = 50
    xx = np.linspace(mu[0]-2*sigma[0,0], mu[0]+2*sigma[0,0], step)
    yy = np.linspace(mu[1]-2*sigma[1,1], mu[1]+2*sigma[1,1], step)
    Xg, Yg = np.meshgrid(xx, yy)
    print Xg.shape
    X = np.concatenate((Xg.flatten()[np.newaxis,:],Yg.flatten()[np.newaxis,:]),axis=0)
    invSigma = np.linalg.det(sigma)
    mu = np.reshape(mu,(2,1))
    print np.dot(invSigma,(X-mu)).shape
    print np.dot(invSigma,(X-mu)).shape
    Z = np.dot(np.transpose(X-mu),np.dot(invSigma,(X-mu)))
    print Z.shape
    Z = np.reshape(Z,(step**2,step**2))
    print Z.shape
    #Z = 1/(2*np.pi*np.linalg.det(sigma))*np.exp( -0.5*np.dot(np.transpose(X-mu),np.dot(invSigma,(X-mu))) )
    fig = plt.figure(figsize=(9, 6), dpi = 90)
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(Xg, Yg, Z, cmap='Oranges',rstride=3, cstride=3, alpha=0.9, linewidth=0.5)
    return plt

mu = np.asarray([0,1])
sigma = np.asarray([[0.2,0.1],[0.1,0.2]])
plot_bivariate(sigma,mu)
plt.show()
    

(2, 2500)
(2, 2500)
(2500, 2500)
(2500, 2500)


<IPython.core.display.Javascript object>

ValueError: Unknown projection '3d'

# Courbe d'erreurs de test en fonction d'hyperparamètres

In [8]:
from sklearn.ensemble import RandomForestClassifier

param_grid_C = {'C': [0.01,0.1,1,10,100,150]}
param_grid_depth = {'max_depth': [10,50,100,150,200,250,300]}
param_grid_min_samples_split = {'min_samples_split': [2,5,8,10,15]} #The minimum number of samples required to split an internal node.
param_grid_min_samples_leaf = {'min_samples_leaf': [1,2,3,4,5,6]} # The minimum number of samples required to be at a leaf node.
param_grid_min_weight_fraction_leaf = {'min_weight_fraction_leaf': [0,0.5]} # The minimum weighted fraction of the input samples required to be at a leaf node.
param_grid_max_leaf_nodes = {'max_leaf_nodes': [10,50,75,100,150,200,250,300,350,400]} # Grow a tree with max_leaf_nodes in best-first fashion
param_grid_n_estimators = {'n_estimators' : [5,10,15,20,30,40,50,60,70,80,100,200,250,300,350,400]}

clf_logreg = grid_search.GridSearchCV(LogisticRegression(), param_grid_C)
clf_tree_1 = grid_search.GridSearchCV(DecisionTreeClassifier(), param_grid_depth)
clf_tree_2 = grid_search.GridSearchCV(DecisionTreeClassifier(), param_grid_min_samples_split)
clf_tree_3 = grid_search.GridSearchCV(DecisionTreeClassifier(), param_grid_min_samples_leaf)
clf_tree_4 = grid_search.GridSearchCV(DecisionTreeClassifier(), param_grid_min_weight_fraction_leaf)
clf_tree_5 = grid_search.GridSearchCV(DecisionTreeClassifier(), param_grid_max_leaf_nodes)
clf_svc = grid_search.GridSearchCV(SVC(gamma=0.001), param_grid_C)
clf_rd = grid_search.GridSearchCV(RandomForestClassifier(), param_grid_n_estimators)

clf_list = dict()
clf_list["Logistique Regression"] = clf_logreg
clf_list["Decision Tree 1"] = clf_tree_1
clf_list["Decision Tree 2"] = clf_tree_2
clf_list["Decision Tree 3"] = clf_tree_3
clf_list["Decision Tree 4"] = clf_tree_4
clf_list["Decision Tree 5"] = clf_tree_5
clf_list["SVM"] = clf_svc
clf_list["Random Forest"] = clf_rd

param_list = dict()
param_list["Logistique Regression"] = 'C'
param_list["Decision Tree 1"] = 'max_depth'
param_list["Decision Tree 2"] = 'min_samples_split'
param_list["Decision Tree 3"] = 'min_samples_leaf'
param_list["Decision Tree 4"] = 'min_weight_fraction_leaf'
param_list["Decision Tree 5"] = 'max_leaf_nodes'
param_list["SVM"] = 'C'
param_list["Random Forest"] = 'n_estimators'


for name_clf in clf_list:
    print name_clf
    clf = clf_list[name_clf]
    name_param = param_list[name_clf]
    clf.fit(Xscl,y)
    param_best = clf.best_params_[name_param]
    scores = [x[1] for x in clf.grid_scores_]
    errors = 1 - np.asarray(scores)
    print "regularization parameter list : ",clf.param_grid[name_param]
    print "errors : ",errors
    print "best regularization parameter : ",param_best
    fig4 = plt.figure()
    plt.grid()
    plt.plot(clf.param_grid[name_param], errors, 'o-', color="g",
                 label="Cross-validation error")
    plt.legend(loc="best")
    plt.xlabel(name_param)
    plt.ylabel('error')
    plt.legend()
    plt.title(name_clf)
    plt.show()
    

Decision Tree 5
regularization parameter list :  [10, 50, 75, 100, 150, 200, 250, 300, 350, 400]
errors :  [ 0.37173066  0.23316639  0.22982749  0.22982749  0.22648859  0.22481914
  0.21814135  0.22648859  0.21814135  0.22481914]
best regularization parameter :  250


<IPython.core.display.Javascript object>

Decision Tree 4
regularization parameter list :  [0, 0.5]
errors :  [ 0.21368948  0.21869783]
best regularization parameter :  0


<IPython.core.display.Javascript object>

SVM
regularization parameter list :  [0.01, 0.1, 1, 10, 100, 150]
errors :  [ 0.8425153   0.45464663  0.08569839  0.05063996  0.04730106  0.04785754]
best regularization parameter :  100


<IPython.core.display.Javascript object>

Decision Tree 1
regularization parameter list :  [10, 50, 100, 150, 200, 250, 300]
errors :  [ 0.22426266  0.23149694  0.21758486  0.22092376  0.213133    0.22760156
  0.22648859]
best regularization parameter :  200


<IPython.core.display.Javascript object>

Decision Tree 3
regularization parameter list :  [1, 2, 3, 4, 5, 6]
errors :  [ 0.22092376  0.22760156  0.23149694  0.23260991  0.23094046  0.23761825]
best regularization parameter :  1


<IPython.core.display.Javascript object>

Decision Tree 2
regularization parameter list :  [2, 5, 8, 10, 15]
errors :  [ 0.22036728  0.23094046  0.23483584  0.22593211  0.24429605]
best regularization parameter :  2


<IPython.core.display.Javascript object>

Logistique Regression
regularization parameter list :  [0.01, 0.1, 1, 10, 100, 150]
errors :  [ 0.0951586   0.07846411  0.07735114  0.08124652  0.08625487  0.08736784]
best regularization parameter :  1


<IPython.core.display.Javascript object>

Random Forest
regularization parameter list :  [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 100, 200, 250, 300, 350, 400]
errors :  [ 0.15136338  0.09293267  0.07957707  0.07512521  0.07122983  0.06733445
  0.065665    0.05954368  0.06789093  0.06010017  0.06121313  0.05731775
  0.05620479  0.05843072  0.05731775  0.05676127]
best regularization parameter :  250


<IPython.core.display.Javascript object>