# Support Vector Classifier Model

In [1]:
execution_mode = 'full'

## Table of Contents

- [Data Takeover](#Data-Takeover)
- [Support Vector Classifier](#Support-Vector-Classifier)
    - [Train/Test Split for Support Vector Classifier](#Train/Test-Split-for-Support-Vector-Classifier)
    - [Model Training for Support Vector Classifier](#Model-Training-for-Support-Vector-Classifier)
    - [Performance Measurement for Support Vector Classifier](#Performance-Measurement-for-Support-Vector-Classifier)
- [Support Vector Classifier with Cross-Validation](#Support-Vector-Classifier-with-Cross-Validation)
    - [Train/Test Split for Support Vector Classifier CV](#Train/Test-Split-for-Support-Vector-Classifier-CV)
    - [Model Training for Support Vector Classifier CV](#Model-Training-for-Support-Vector-Classifier-CV)
    - [Performance Measurement of Support Vector Classifier CV](#Performance-Measurement-of-Support-Vector-Classifier-CV)
- [Results Handover](#Results-Handover)

## Data Takeover

Read in DataFrame from chapter [Feature Matrix Generation](./3_FeatureMatrixGeneration.ipynb) as input for processing in this chapter.

In [2]:
import os
import pandas as pd

path_goldstandard = './daten_goldstandard'

# Restore results so far
df_labelled_feature_matrix = pd.read_pickle(os.path.join(path_goldstandard,
                                                         'labelled_feature_matrix.pkl'),
                                 compression=None)

df_attribute_with_sim_feature = pd.read_pickle(os.path.join(
    path_goldstandard, 'labelled_feature_matrix_full.pkl'), compression=None
                                              )

df_labelled_feature_matrix.head()

Unnamed: 0,coordinate_E_delta,coordinate_N_delta,corporate_full_delta,doi_delta,edition_delta,exactDate_delta,format_prefix_delta,format_postfix_delta,isbn_delta,ismn_delta,...,part_delta,person_100_delta,person_700_delta,person_245c_delta,pubinit_delta,scale_delta,ttlfull_245_delta,ttlfull_246_delta,volumes_delta,duplicates
0,-0.1,-0.1,-0.1,-0.1,-0.1,0.75,1.0,1.0,1.0,-0.1,...,1.0,1.0,1.0,1.0,1.0,-0.1,1.0,-0.1,1.0,1
1,-0.1,-0.1,-0.1,-0.1,-0.1,0.75,1.0,1.0,1.0,-0.1,...,1.0,1.0,-0.05,0.818905,0.848485,-0.1,0.787879,-0.1,1.0,1
2,-0.1,-0.1,-0.1,-0.1,-0.1,0.75,1.0,1.0,1.0,-0.1,...,1.0,1.0,-0.05,0.69774,0.848485,-0.1,1.0,-0.1,1.0,1
3,-0.1,-0.1,-0.1,-0.1,-0.1,0.75,1.0,1.0,1.0,-0.1,...,1.0,1.0,-0.05,0.818905,0.848485,-0.1,0.787879,-0.1,1.0,1
4,-0.1,-0.1,-0.1,-0.1,-0.1,0.75,1.0,1.0,1.0,-0.1,...,1.0,1.0,-0.1,1.0,1.0,-0.1,1.0,-0.1,1.0,1


In [3]:
print('Part of duplicates (1) on uniques (2) in units of [%]')
print(df_labelled_feature_matrix.duplicates.value_counts(normalize=True)*100)

Part of duplicates (1) on uniques (2) in units of [%]
0    99.432212
1     0.567788
Name: duplicates, dtype: float64


## Support Vector Classifier

A Support Vector Machine allows the use of regularization.

### Train/Test Split for Support Vector Classifier

The train/test split will be implemented here as a general function to be called in the models chapters.

In [4]:
import classifier_fitting_funcs as cff

X_tr, X_val, X_te, y_tr, y_val, y_te, idx_tr, idx_val, idx_te = cff.split_feature_target(
    df_labelled_feature_matrix, 'train_validation_test')

X_tr[:5], y_tr[:5], idx_tr[:5]

(array([[-0.1       , -0.1       , -0.1       , -0.05      , -0.1       ,
          0.625     ,  0.        ,  0.42857143,  1.        , -0.1       ,
         -0.05      , -0.05      ,  0.49267677, -0.05      ,  0.54033531,
         -0.05      , -0.1       ,  0.57608486, -0.1       , -0.05      ],
        [-0.1       , -0.1       , -0.1       , -0.1       , -0.05      ,
          0.5       ,  0.        ,  0.42857143,  0.        , -0.1       ,
         -0.1       ,  0.        , -0.05      , -0.05      ,  0.50978836,
         -0.05      , -0.1       ,  0.56688312, -0.1       ,  0.51111111],
        [-0.05      , -0.05      ,  0.06      , -0.1       , -0.1       ,
          0.5       ,  0.        ,  0.42857143,  0.        , -0.1       ,
         -0.1       , -0.05      , -0.1       , -0.1       , -0.05      ,
         -0.05      , -0.05      ,  0.46245348, -0.05      , -0.05      ],
        [-0.1       , -0.1       , -0.05      , -0.1       , -0.1       ,
          0.625     ,  0.        , 

In [5]:
print(X_tr.shape, y_tr.shape, X_val.shape, y_val.shape, X_te.shape, y_te.shape)

(166033, 20) (166033,) (41509, 20) (41509,) (51886, 20) (51886,)


### Model Training for Support Vector Classifier

In [6]:
if execution_mode == 'full' :
    # Kernel 'rbf' has long calculation times, but does not generate
    #  the best accuracy : Ommit in grid search.
    parameter_dictionary = {
        'kernel' : ['linear', 'poly'],
        'degree' : [2, 3, 4],
        'gamma' : [1.5, 2, 2.5, 'auto'],
        'C' : [0.5, 0.7, 0.8, 0.9, 1.0],
        'class_weight' : [None]
    }
elif execution_mode == 'restricted' :
    parameter_dictionary = {
        'kernel' : ['poly'],
        'degree' : [3],
        'gamma' : [2.0],
        'C' : [0.5],
        'class_weight' : [None]
    }

# Grid of values
grid = cff.generate_parameter_grid(parameter_dictionary)

The grid parameters are ...
kernel ['linear', 'poly']
degree [2, 3, 4]
gamma [1.5, 2, 2.5, 'auto']
C [0.5, 0.7, 0.8, 0.9, 1.0]
class_weight [None]
 => Number of combinations : 120


In [7]:
from sklearn.svm import SVC

sv = SVC(random_state=0)

# Save accuracy on test set
test_scores = []
for params_dict in grid :
    test_scores.append(cff.fit_model_measure_scores(sv, params_dict, X_tr, y_tr, X_val, y_val))

# Save measured accuracies
df_test_scores_sv = pd.DataFrame(test_scores).sort_values('accuracy_val', ascending=False)

Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 2, 'gamma': 1.5, 'kernel': 'linear'}


 => validation score 99.805%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 2, 'gamma': 1.5, 'kernel': 'poly'}


 => validation score 99.863%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 2, 'gamma': 2, 'kernel': 'linear'}


 => validation score 99.805%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 2, 'gamma': 2, 'kernel': 'poly'}


 => validation score 99.865%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 2, 'gamma': 2.5, 'kernel': 'linear'}


 => validation score 99.805%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 2, 'gamma': 2.5, 'kernel': 'poly'}


 => validation score 99.865%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 2, 'gamma': 'auto', 'kernel': 'linear'}


 => validation score 99.805%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly'}


 => validation score 99.713%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 3, 'gamma': 1.5, 'kernel': 'linear'}


 => validation score 99.805%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 3, 'gamma': 1.5, 'kernel': 'poly'}


 => validation score 99.882%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 3, 'gamma': 2, 'kernel': 'linear'}


 => validation score 99.805%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 3, 'gamma': 2, 'kernel': 'poly'}


 => validation score 99.889%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 3, 'gamma': 2.5, 'kernel': 'linear'}


 => validation score 99.805%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 3, 'gamma': 2.5, 'kernel': 'poly'}


 => validation score 99.904%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 3, 'gamma': 'auto', 'kernel': 'linear'}


 => validation score 99.805%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}


 => validation score 99.665%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 4, 'gamma': 1.5, 'kernel': 'linear'}


 => validation score 99.805%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 4, 'gamma': 1.5, 'kernel': 'poly'}


 => validation score 99.908%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 4, 'gamma': 2, 'kernel': 'linear'}


 => validation score 99.805%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 4, 'gamma': 2, 'kernel': 'poly'}


 => validation score 99.889%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 4, 'gamma': 2.5, 'kernel': 'linear'}


 => validation score 99.805%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 4, 'gamma': 2.5, 'kernel': 'poly'}


 => validation score 99.901%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 4, 'gamma': 'auto', 'kernel': 'linear'}


 => validation score 99.805%
Fitting with parameters {'C': 0.5, 'class_weight': None, 'degree': 4, 'gamma': 'auto', 'kernel': 'poly'}


 => validation score 99.598%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 2, 'gamma': 1.5, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 2, 'gamma': 1.5, 'kernel': 'poly'}


 => validation score 99.867%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 2, 'gamma': 2, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 2, 'gamma': 2, 'kernel': 'poly'}


 => validation score 99.865%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 2, 'gamma': 2.5, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 2, 'gamma': 2.5, 'kernel': 'poly'}


 => validation score 99.863%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 2, 'gamma': 'auto', 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly'}


 => validation score 99.725%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 3, 'gamma': 1.5, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 3, 'gamma': 1.5, 'kernel': 'poly'}


 => validation score 99.889%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 3, 'gamma': 2, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 3, 'gamma': 2, 'kernel': 'poly'}


 => validation score 99.901%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 3, 'gamma': 2.5, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 3, 'gamma': 2.5, 'kernel': 'poly'}


 => validation score 99.899%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 3, 'gamma': 'auto', 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}


 => validation score 99.694%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 4, 'gamma': 1.5, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 4, 'gamma': 1.5, 'kernel': 'poly'}


 => validation score 99.904%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 4, 'gamma': 2, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 4, 'gamma': 2, 'kernel': 'poly'}


 => validation score 99.901%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 4, 'gamma': 2.5, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 4, 'gamma': 2.5, 'kernel': 'poly'}


 => validation score 99.901%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 4, 'gamma': 'auto', 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.7, 'class_weight': None, 'degree': 4, 'gamma': 'auto', 'kernel': 'poly'}


 => validation score 99.641%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 2, 'gamma': 1.5, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 2, 'gamma': 1.5, 'kernel': 'poly'}


 => validation score 99.867%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 2, 'gamma': 2, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 2, 'gamma': 2, 'kernel': 'poly'}


 => validation score 99.865%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 2, 'gamma': 2.5, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 2, 'gamma': 2.5, 'kernel': 'poly'}


 => validation score 99.865%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 2, 'gamma': 'auto', 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly'}


 => validation score 99.733%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 3, 'gamma': 1.5, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 3, 'gamma': 1.5, 'kernel': 'poly'}


 => validation score 99.884%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 3, 'gamma': 2, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 3, 'gamma': 2, 'kernel': 'poly'}


 => validation score 99.899%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 3, 'gamma': 2.5, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 3, 'gamma': 2.5, 'kernel': 'poly'}


 => validation score 99.899%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 3, 'gamma': 'auto', 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}


 => validation score 99.701%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 4, 'gamma': 1.5, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 4, 'gamma': 1.5, 'kernel': 'poly'}


 => validation score 99.896%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 4, 'gamma': 2, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 4, 'gamma': 2, 'kernel': 'poly'}


 => validation score 99.899%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 4, 'gamma': 2.5, 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 4, 'gamma': 2.5, 'kernel': 'poly'}


 => validation score 99.904%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 4, 'gamma': 'auto', 'kernel': 'linear'}


 => validation score 99.814%
Fitting with parameters {'C': 0.8, 'class_weight': None, 'degree': 4, 'gamma': 'auto', 'kernel': 'poly'}


 => validation score 99.646%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 2, 'gamma': 1.5, 'kernel': 'linear'}


 => validation score 99.812%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 2, 'gamma': 1.5, 'kernel': 'poly'}


 => validation score 99.870%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 2, 'gamma': 2, 'kernel': 'linear'}


 => validation score 99.812%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 2, 'gamma': 2, 'kernel': 'poly'}


 => validation score 99.863%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 2, 'gamma': 2.5, 'kernel': 'linear'}


 => validation score 99.812%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 2, 'gamma': 2.5, 'kernel': 'poly'}


 => validation score 99.863%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 2, 'gamma': 'auto', 'kernel': 'linear'}


 => validation score 99.812%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly'}


 => validation score 99.740%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 3, 'gamma': 1.5, 'kernel': 'linear'}


 => validation score 99.812%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 3, 'gamma': 1.5, 'kernel': 'poly'}


 => validation score 99.889%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 3, 'gamma': 2, 'kernel': 'linear'}


 => validation score 99.812%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 3, 'gamma': 2, 'kernel': 'poly'}


 => validation score 99.901%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 3, 'gamma': 2.5, 'kernel': 'linear'}


 => validation score 99.812%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 3, 'gamma': 2.5, 'kernel': 'poly'}


 => validation score 99.904%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 3, 'gamma': 'auto', 'kernel': 'linear'}


 => validation score 99.812%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}


 => validation score 99.708%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 4, 'gamma': 1.5, 'kernel': 'linear'}


 => validation score 99.812%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 4, 'gamma': 1.5, 'kernel': 'poly'}


 => validation score 99.896%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 4, 'gamma': 2, 'kernel': 'linear'}


 => validation score 99.812%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 4, 'gamma': 2, 'kernel': 'poly'}


 => validation score 99.899%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 4, 'gamma': 2.5, 'kernel': 'linear'}


 => validation score 99.812%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 4, 'gamma': 2.5, 'kernel': 'poly'}


 => validation score 99.904%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 4, 'gamma': 'auto', 'kernel': 'linear'}


 => validation score 99.812%
Fitting with parameters {'C': 0.9, 'class_weight': None, 'degree': 4, 'gamma': 'auto', 'kernel': 'poly'}


 => validation score 99.658%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 2, 'gamma': 1.5, 'kernel': 'linear'}


 => validation score 99.807%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 2, 'gamma': 1.5, 'kernel': 'poly'}


 => validation score 99.867%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 2, 'gamma': 2, 'kernel': 'linear'}


 => validation score 99.807%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 2, 'gamma': 2, 'kernel': 'poly'}


 => validation score 99.863%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 2, 'gamma': 2.5, 'kernel': 'linear'}


 => validation score 99.807%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 2, 'gamma': 2.5, 'kernel': 'poly'}


 => validation score 99.865%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 2, 'gamma': 'auto', 'kernel': 'linear'}


 => validation score 99.807%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly'}


 => validation score 99.745%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 3, 'gamma': 1.5, 'kernel': 'linear'}


 => validation score 99.807%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 3, 'gamma': 1.5, 'kernel': 'poly'}


 => validation score 99.889%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 3, 'gamma': 2, 'kernel': 'linear'}


 => validation score 99.807%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 3, 'gamma': 2, 'kernel': 'poly'}


 => validation score 99.906%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 3, 'gamma': 2.5, 'kernel': 'linear'}


 => validation score 99.807%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 3, 'gamma': 2.5, 'kernel': 'poly'}


 => validation score 99.906%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 3, 'gamma': 'auto', 'kernel': 'linear'}


 => validation score 99.807%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}


 => validation score 99.718%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 4, 'gamma': 1.5, 'kernel': 'linear'}


 => validation score 99.807%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 4, 'gamma': 1.5, 'kernel': 'poly'}


 => validation score 99.896%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 4, 'gamma': 2, 'kernel': 'linear'}


 => validation score 99.807%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 4, 'gamma': 2, 'kernel': 'poly'}


 => validation score 99.896%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 4, 'gamma': 2.5, 'kernel': 'linear'}


 => validation score 99.807%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 4, 'gamma': 2.5, 'kernel': 'poly'}


 => validation score 99.906%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 4, 'gamma': 'auto', 'kernel': 'linear'}


 => validation score 99.807%
Fitting with parameters {'C': 1.0, 'class_weight': None, 'degree': 4, 'gamma': 'auto', 'kernel': 'poly'}


 => validation score 99.660%


In [8]:
best_params = cff.get_best_parameters(test_scores, parameter_dictionary)

# Create a decision tree
sv_best = SVC(gamma=best_params['gamma'], kernel=best_params['kernel'],
              C=best_params['C'], class_weight=best_params['class_weight'],
              degree=best_params['degree'], random_state=0
             )

# Fit estimator
sv_best.fit(X_tr, y_tr)
y_pred_sv = sv_best.predict(X_te)

The parameters for the best model are ...
kernel = poly
degree = 4
gamma = 1.5
C = 0.5
class_weight = None


### Performance Measurement for Support Vector Classifier

In [9]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_te, y_pred_sv)

array([[51568,    23],
       [   22,   273]])

In [10]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

print('Score {:.1f}%'.format(100*sv_best.score(X_te, y_te)))
print('Area under the curve {:.1f}% - accuracy {:.1f}% - precision {:.1f}% - recall {:.1f}%'.format(100*roc_auc_score(y_te, y_pred_sv),
                100*accuracy_score(y_te, y_pred_sv),
                100*precision_score(y_te, y_pred_sv),
                100*recall_score(y_te, y_pred_sv)
               ))

Score 99.9%
Area under the curve 96.2% - accuracy 99.9% - precision 92.2% - recall 92.5%


In [11]:
import results_analysis_funcs as raf

In [12]:
df_feature_base_full_te = df_attribute_with_sim_feature.iloc[idx_te]
df_feature_base_full_tr = df_attribute_with_sim_feature.iloc[idx_tr]

# Extend display to number of columns of DataFrame
pd.options.display.max_columns = len(df_feature_base_full_te.columns)

df_feature_base_full_te.sort_index().sample(n=20)

Unnamed: 0,duplicates,coordinate_E_delta,coordinate_E_x,coordinate_E_y,coordinate_N_delta,coordinate_N_x,coordinate_N_y,corporate_full_delta,corporate_full_x,corporate_full_y,doi_delta,doi_x,doi_y,edition_delta,edition_x,edition_y,exactDate_delta,exactDate_x,exactDate_y,format_postfix_delta,format_postfix_x,format_postfix_y,format_prefix_delta,format_prefix_x,format_prefix_y,isbn_delta,isbn_x,isbn_y,ismn_delta,ismn_x,ismn_y,musicid_delta,musicid_x,musicid_y,part_delta,part_x,part_y,person_100_delta,person_100_x,person_100_y,person_245c_delta,person_245c_x,person_245c_y,person_700_delta,person_700_x,person_700_y,pubinit_delta,pubinit_x,pubinit_y,scale_delta,scale_x,scale_y,ttlfull_245_delta,ttlfull_245_x,ttlfull_245_y,ttlfull_246_delta,ttlfull_246_x,ttlfull_246_y,volumes_delta,volumes_x,volumes_y
172870,0,-0.1,,,-0.1,,,-0.1,,,-0.1,,,0.0,3.0,2.0,0.5,2015aaaa,2000uuuu,1.0,20000,20000,1.0,bk,bk,0.0,"[978-3-648-07838-9, 3-648-07838-0]",[0-582-41794-5],-0.1,,,-0.1,,,-0.05,208,,0.623131,basuandreas,austenjane,0.659791,andreas basu ; liane faust,jane austen ; retold by annette barnes,0.530256,faustliane,barnesannette,0.469841,haufe,pearson education ltd,-0.1,,,0.526667,gewaltfreie kommunikation,emma,-0.1,,,0.0,128,59
169433,0,-0.1,,,-0.1,,,-0.05,les arts florissants,,-0.1,,,-0.1,,,0.375,1996aaaa,1840uuuu,0.428571,40100,10200,1.0,mu,mu,1.0,[],[],-0.1,,,-0.05,630.0,,-0.1,,,1.0,mozartwolfgang amadeus,mozartwolfgang amadeus,0.642359,wolfgang amadeus mozart ; libretto: emanuel sc...,von w.a. mozart ; [die deutsche dichtung ist v...,0.592593,"mozartwolfgang amadeus, schikanederemanuel, ch...",schikanederemanuel,-0.1,,,-0.1,,,0.700673,"die zauberflöte, the magic flute : opera in tw...","die zauberflöte, oper : [kv 620]",-0.1,,,0.0,2,1 132
92703,0,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.05,,144.0,0.25,1880aaaa,20151475,1.0,20000,20000,1.0,bk,bk,0.0,[],[978-88-7922-121-4],-0.1,,,-0.1,,,0.0,21,1,0.61221,mozartwolfgang amadeus,petrarcafrancesco,0.545104,von emanuel schikaneder ; musik von w.a. mozart,francesco petrarca ; direttore editoriale: luc...,0.530983,schikanederemanuel,"lapinibernardo, sommarugaluciana, valsangiacom...",-0.05,breitkopf & härtel,,-0.1,,,0.554674,"die zauberflöte, il flauto magico : deutsche o...","trionfi, bologna, annibale, malpigli, 1475",-0.1,,,0.0,34,2
85054,0,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.05,5.0,,0.625,2009aaaa,2002uuuu,0.428571,20000,20053,1.0,bk,bk,0.0,[978-3-13-127285-0],"[978-3-598-31515-2 (print), 978-3-11-097839-1]",-0.1,,,-0.1,,,-0.05,,35 35,0.668386,schusterhans-peter,mortzfeldpeter,0.569707,"hans-peter schuster, hans-joachim trappe","mortzfeld, peter; raabe, paul",0.644444,trappehans-joachim,raabepaul,-0.05,,de gruyter saur,-0.1,,,0.544154,ekg-kurs für isabel,katalog der graphischen porträts in der herzog...,-0.1,,,0.511111,312,1 442
46946,0,-0.1,,,-0.1,,,-0.1,,,-0.05,,10.5169/seals-515356,-0.1,,,0.625,2005aaaa,2015uuuu,0.428571,10300,10053,0.0,vm,bk,1.0,[],[],-0.1,,,-0.05,99064.0,,-0.05,,294 2015,-0.05,,bührerwalter,0.394444,ein film von luc jacquet,[walter bührer],-0.05,jacquetluc,,-0.05,bonne pioche,,-0.1,,,0.456755,"die reise der pinguine, die natur schreibt die...",blick in die welt,-0.1,,,-0.05,1,
236099,0,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.05,,6.0,0.625,2016aaaa,2015uuuu,1.0,20000,20000,1.0,bk,bk,0.0,[978-2-226-31734-6],"[978-3-13-128546-1, 3-13-128546-X]",-0.1,,,-0.1,,,-0.1,,,0.589995,moriartyliane,möllerhans-jürgen,0.584254,liane moriarty ; trad. de l'anglais (australie...,"hans-jürgen möller, gerd laux, arno deister ; ...",0.533056,taupeaubéatrice,"lauxgerd, deisterarno, schulte-körnegerd",-0.05,albin michel,,-0.1,,,0.59886,"petits secrets, grands mensonges, roman","psychiatrie, psychosomatik und psychotherapie",-0.1,,,0.555556,475,670
222144,0,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.1,,,0.5,2016aaaa,2007uuuu,1.0,20000,20000,1.0,bk,bk,0.0,[978-2-07-046833-1],[978-3-15-020008-7],-0.1,,,-0.1,,,0.374074,3870 3870,20008,0.514722,voltaire,austenjane,0.611745,voltaire ; éd. établie et annotée par jacques ...,jane austen ; nachwort und anmerkungen von chr...,0.524812,"van den heuveljacques, sollersphilippe",grawechristian,0.5,gallimard,reclam,-0.1,,,0.489405,"traité sur la tolérance, à l'occasion de la mo...","emma, roman",-0.1,,,0.0,153,600
63888,0,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.05,,11.0,0.5,aaaaaaaa,1988uuuu,0.428571,10200,20000,0.0,mu,bk,1.0,[],[],-0.1,,,-0.05,245.0,,-0.1,,,0.581818,mozartwolfgang amadeus,austenjane,0.52887,von emanuel schikaneder ; [musik von] wolfgang...,jane austen,-0.05,"kienzlwilhelm, schikanederemanuel",,-0.1,,,-0.1,,,0.490741,"die zauberflöte (il flauto magico), oper in zw...",emma,-0.05,"die zauberflöte, ausgabe für gesang und klavier",,0.0,1,367
85471,0,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.1,,,0.25,2007aaaa,1996uuuu,1.0,20000,20000,1.0,bk,bk,0.0,"[978-3-7815-1531-4, 3-7815-1531-1]",[3-596-22191-9],-0.1,,,-0.1,,,-0.05,,2191,-0.1,,,0.463513,hrsg. von michaela gläser-zikuda und tina hascher,jane austen,0.506928,"gläser-zikudamichaela, haschertina",austenjane,-0.05,,fischer,-0.1,,,0.591463,"lernprozesse dokumentieren, reflektieren und b...",emma,-0.1,,,0.555556,304,414
104947,0,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.05,8.0,,0.5,2011aaaa,2008uuuu,1.0,20000,20000,1.0,bk,bk,0.0,"[978-3-642-16480-4, 3-642-16480-3, 978-3-642-1...",[978-2-07-042871-7],-0.1,,,-0.1,,,-0.05,,3870,-0.05,,voltaire,0.393939,erland erdmann (hrsg.),voltaire,-0.05,erdmannerland,,0.490741,springer,gallimard,-0.1,,,0.557423,"klinische kardiologie, krankheiten des herzens...",traité sur la tolérance à l'occasion de la mor...,-0.1,,,0.0,607,143


In [13]:
import results_saving_funcs as rsf

idx = {}
idx['true_predicted_uniques'], idx['true_predicted_duplicates'], idx['false_predicted_uniques'], idx['false_predicted_duplicates'] = raf.get_confusion_matrix_indices(y_te, y_pred_sv)

wrong_prediction_groups = ['false_predicted_uniques', 'false_predicted_duplicates']

for i in wrong_prediction_groups :
    rsf.add_wrong_predictions(path_goldstandard, 
                              sv_best, i, df_feature_base_full_te.loc[idx[i]])

## Support Vector Classifier with Cross-Validation

### Train/Test Split for Support Vector CV

In [14]:
X_tr, _, X_te, y_tr, _, y_te, idx_tr, _, idx_te = cff.split_feature_target(
    df_labelled_feature_matrix, 'train_test')

X_tr[:5], y_tr[:5], idx_tr[:5]

(array([[-0.1       , -0.1       , -0.05      , -0.1       , -0.1       ,
          0.25      ,  0.        ,  0.42857143,  0.        , -0.1       ,
          0.16666667, -0.1       , -0.05      , -0.05      ,  0.53888889,
          0.47991021, -0.1       ,  0.59978811, -0.1       ,  0.78333333],
        [-0.1       , -0.1       , -0.1       , -0.1       , -0.1       ,
          0.4375    ,  0.        ,  0.11111111,  1.        , -0.1       ,
         -0.05      , -0.1       ,  1.        ,  0.57605284,  0.59184563,
          0.41919192, -0.1       ,  0.7332472 , -0.1       ,  0.        ],
        [-0.1       , -0.1       ,  0.05      , -0.1       , -0.1       ,
          0.25      ,  1.        ,  1.        ,  1.        , -0.1       ,
         -0.1       , -0.1       , -0.05      ,  0.52608873,  0.61453149,
          0.41568627, -0.1       ,  0.51855227, -0.1       ,  0.        ],
        [-0.1       , -0.1       , -0.1       , -0.1       , -0.1       ,
          0.5       ,  1.        , 

In [15]:
print(X_tr.shape, y_tr.shape, X_te.shape, y_te.shape)

(207542, 20) (207542,) (51886, 20) (51886,)


### Model Training for Support Vector CV

In [16]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Create cross-validation object with DecisionTreeClassifer
grid_cv = GridSearchCV(SVC(random_state=0),
                       param_grid = parameter_dictionary, cv=5
                       , verbose=1
                      )

# Fit estimator
grid_cv.fit(X_tr, y_tr)

# Get the results with 'cv_results_', get parameters with their scores
params = pd.DataFrame(grid_cv.cv_results_['params'])
scores = pd.DataFrame(grid_cv.cv_results_['mean_test_score'], columns=['accuracy_val'])
log_scores = pd.DataFrame(np.log(1-grid_cv.cv_results_['mean_test_score']), columns=['log_accuracy_val'])
scores_std = pd.DataFrame(grid_cv.cv_results_['std_test_score'], columns=['std_accuracy_val'])

# Create a DataFrame of (parameters, score, std) pairs
df_test_scores_svcv = params.merge(scores, how='inner', left_index=True, right_index=True)
df_test_scores_svcv = df_test_scores_svcv.merge(
    scores_std, how='inner', left_index=True, right_index=True).sort_values(
    'accuracy_val', ascending=False)
df_test_scores_svcv = df_test_scores_svcv.merge(
    log_scores, how='inner', left_index=True, right_index=True)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed: 100.5min finished


In [17]:
df_test_scores_svcv

Unnamed: 0,C,class_weight,degree,gamma,kernel,accuracy_val,std_accuracy_val,log_accuracy_val
81,0.9,,3,1.5,poly,0.999075,0.000179,-6.985594
105,1.0,,3,1.5,poly,0.999070,0.000177,-6.980399
13,0.5,,3,2.5,poly,0.999070,0.000185,-6.980399
107,1.0,,3,2,poly,0.999070,0.000185,-6.980399
57,0.8,,3,1.5,poly,0.999065,0.000163,-6.975231
...,...,...,...,...,...,...,...,...
119,1.0,,4,auto,poly,0.996796,0.000173,-5.743302
95,0.9,,4,auto,poly,0.996699,0.000153,-5.713670
71,0.8,,4,auto,poly,0.996632,0.000192,-5.693439
47,0.7,,4,auto,poly,0.996526,0.000201,-5.662450


In [18]:
svcv_best = grid_cv.best_estimator_
y_pred_svcv = svcv_best.predict(X_te)

### Performance Measurement of Support Vector Classifier CV

In [19]:
confusion_matrix(y_te, y_pred_svcv)

array([[51559,    32],
       [   24,   271]])

In [20]:
print('Score {:.1f}%'.format(100*svcv_best.score(X_te, y_te)))
print('Area under the curve {:.1f}% - accuracy {:.1f}% - precision {:.1f}% - recall {:.1f}%'.format(
    100*roc_auc_score(y_te, y_pred_svcv),
                100*accuracy_score(y_te, y_pred_svcv),
                100*precision_score(y_te, y_pred_svcv),
                100*recall_score(y_te, y_pred_svcv)
               ))

Score 99.9%
Area under the curve 95.9% - accuracy 99.9% - precision 89.4% - recall 91.9%


In [21]:
idx = {}
idx['true_predicted_uniques'], idx['true_predicted_duplicates'], idx['false_predicted_uniques'], idx['false_predicted_duplicates'] = raf.get_confusion_matrix_indices(y_te, y_pred_svcv)

wrong_prediction_groups = ['false_predicted_uniques', 'false_predicted_duplicates']

for i in wrong_prediction_groups :
    rsf.add_wrong_predictions(path_goldstandard, 
                              svcv_best, i, df_feature_base_full_te.loc[idx[i]], '_CV')

## Results Handover

In [22]:
rsf.add_result_to_results(path_goldstandard,
                          df_test_scores_sv, sv_best, X_te, y_te, y_pred_sv)
rsf.add_result_to_results(path_goldstandard, 
                          df_test_scores_svcv, svcv_best, X_te, y_te, y_pred_svcv, '_CV')