# Training of the SVM

## Load the methods for feature vector generation

In [None]:
# Load the library that calculates feature vectors.
# %run executes the script and loads the names it defines into the notebook
# namespace, so they can be used by the cells below.
# NOTE(review): get_pickled_file, get_feature_vectors_from_directory and
# get_feature_vector_from_image_file are used later without being defined in
# this notebook; presumably they come from this script -- confirm.
%run scripts/feature_vector.py


## Let's have a look at the import statements we need

In [None]:
# Python 3 syntax in Python 2
from __future__ import division
from __future__ import print_function

# Python Imaging Library adds image processing capabilities
from PIL import Image
from StringIO import StringIO

# Scikit learn
from sklearn import cross_validation
from sklearn import grid_search
from sklearn import svm
from sklearn import metrics

# Some basic Python libs
import sys
import os
import pickle

## Create all the feature vectors

In [None]:

# Image directories of the two classes:
# class A = logos, class B = non-logos
training_path_a = 'logos/'
training_path_b = 'non-logos/mixed/'

# Option 1 (disabled): compute the feature vectors from the images
# (takes 5 minutes)
#training_a = get_feature_vectors_from_directory(training_path_a)
#training_b = get_feature_vectors_from_directory(training_path_b)

# Option 2 (active): load previously generated feature vectors from the cache
# (alternative cache for class B: 'cache/training-non-logos.pickle')
training_a, training_b = (
    get_pickled_file('cache/training-logos.pickle'),
    get_pickled_file('cache/training-non-logos-all.pickle'),
)

# data holds all the training data: the feature vectors of class A
# followed by those of class B
data = training_a + training_b

# target holds the class of each feature vector, in the same order as data:
# '1' for class A and '0' for class B
target = len(training_a) * [1] + len(training_b) * [0]


In [None]:
# Report how many feature vectors were loaded per set.
# Note: "set 0" is training_a (target class 1) and "set 1" is training_b
# (target class 0) -- the set number is not the class label.
size_a, size_b = len(training_a), len(training_b)
print("Array of feature vector set 0, size:", size_a)
print("Array of feature vector set 1, size:", size_b)

## Train the SVM

In [None]:
# In order to measure the quality of the classifier
# we split the data into a training set and a test set;
# the test set will contain 20% of the data
train, test, target_train, target_test = \
    cross_validation.train_test_split(
        data,
        target,
        test_size=0.20)


# SVM needs some initial configuration parameters
# ("hyperparameters")

# Good hyperparameters can be obtained
# by using grid search & "cross validation"

# But we need to define the parameter search space.
#
# The search space is given as a list of grids, one per kernel:
# 'gamma' is the coefficient of the rbf kernel and is ignored by the
# linear kernel, so it is only varied for 'rbf'. Putting everything into
# one dict would fit every linear candidate once per gamma value
# (60 candidates instead of 36) without changing any linear model.

parameters = [
    {'kernel': ['linear'],
     'C': [1, 5, 10, 50, 100, 1000]},
    {'kernel': ['rbf'],
     'C': [1, 5, 10, 50, 100, 1000],
     'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
]

#   rbf = radial basis function = gauss kernel
#   Other kernels are rarely needed, said Andrew Ng
#   (Associate Professor of Computer Science at Stanford, co-founder of Coursera)


# Search for the best classifier within the search space
# A search consists of:
#   an estimator (regressor or classifier such as sklearn.svm.SVC());
#   a parameter space;
#   ...
# Documentation:
#   http://scikit-learn.org/stable/modules/grid_search.html

clf = grid_search.GridSearchCV(svm.SVC(), parameters)
clf.fit(train, target_train)

# The estimator with the best cross-validation score
classifier = clf.best_estimator_

print()
print('Parameters:', clf.best_params_)
print()
print('Best classifier score')

# Evaluate the chosen classifier on the held-out test set,
# which was not used during the grid search
print(metrics.classification_report(
        target_test,
        classifier.predict(test)))


Explanation:
* precision = true positives / (true positives + false positives)
* recall = true positives / (true positives + false negatives)
* f1-score = 2 \* precision \* recall / (precision + recall)

For all three metrics: the higher the better; 1 = perfect.

In [None]:
# save classifier for later
#with open('classifiers/classifier5-logos-static-training-set.pickle', "w") as fp:
#    pickle.dump(classifier, fp)

### Add some styles for classification result visualisation

In [None]:
%%html
<style>
/* Frame around every classified thumbnail: red unless overridden below */
.res {
    margin: 0 0 0 10px;
    border: 5px solid red;
}
/* Thumbnails that also carry the class "cl1" get a grey frame instead */
.cl1 {
    border-color: #ccc;
}
</style>

## Display classification result

In [None]:
from IPython.core.display import display, HTML

# Load classifier from file
#classifier = get_pickled_file('classifiers/classifier3-logos-static-training-set.pickle')

def _iter_file_paths(directory):
    """Yield the path of every file below `directory`, in os.walk order."""
    for root, _, files in os.walk(directory):
        for file_name in files:
            yield os.path.join(root, file_name)


def test_directory(classifier, directory, limit=100):
    """Classify the files below `directory` and show them as thumbnails.

    Each file is turned into a feature vector with
    get_feature_vector_from_image_file(), classified, and rendered as a
    100px-wide image inside a <div class="res cl<label>">, where <label> is
    the predicted class. Thumbnails are sent to the notebook four at a time.

    Parameters:
        classifier: object whose predict() accepts a list of feature vectors
        directory:  root directory; it is walked recursively
        limit:      maximum number of files to classify (default: 100)

    Returns:
        None. The result is displayed in the notebook output.
    """
    images_per_row = 4
    row = []
    for number, file_path in enumerate(_iter_file_paths(directory), 1):
        # Stop before classifying a file that would not be shown anyway
        if number > limit:
            break

        img_feature = get_feature_vector_from_image_file(file_path)
        result = classifier.predict([img_feature])

        # NOTE: file_path is inserted into the markup unescaped
        row.append('<div class="res cl' + str(result[0]) + '" style="float:left">'
                   '<img src="' + file_path + '" width="100px" /></div>')

        if len(row) == images_per_row:
            display(HTML(''.join(row)))
            row = []

    # Show the last, incomplete row; without this up to three thumbnails
    # would be silently dropped when the file count is not a multiple of 4
    if row:
        display(HTML(''.join(row)))

# Other directories to try:
#   test_directory(classifier, 'logos/')
#   test_directory(classifier, 'non-logos/car')
# Active run: the 'schwierig' (German for "difficult") non-logo images
test_directory(classifier, directory='non-logos/schwierig')
