# Support Vector Classifier Model

In [1]:
execution_mode = 'restricted'

## Table of Contents

- [Data Takeover](#Data-Takeover)
- [Support Vector Classifier](#Support-Vector-Classifier)
    - [Train/Test Split for Support Vector Classifier](#Train/Test-Split-for-Support-Vector-Classifier)
    - [Model Training for Support Vector Classifier](#Model-Training-for-Support-Vector-Classifier)
    - [Performance Measurement for Support Vector Classifier](#Performance-Measurement-for-Support-Vector-Classifier)
- [Support Vector Classifier with Cross-Validation](#Support-Vector-Classifier-with-Cross-Validation)
    - [Train/Test Split for Support Vector Classifier CV](#Train/Test-Split-for-Support-Vector-Classifier-CV)
    - [Model Training for Support Vector Classifier CV](#Model-Training-for-Support-Vector-Classifier-CV)
    - [Performance Measurement of Support Vector Classifier CV](#Performance-Measurement-of-Support-Vector-Classifier-CV)
- [Results Handover](#Results-Handover)

## Data Takeover

Read in DataFrame from chapter [Feature Matrix Generation](./3_FeatureMatrixGeneration.ipynb) as input for processing in this chapter.

In [2]:
import os
import pandas as pd

path_goldstandard = './daten_goldstandard'

# Restore results so far
df_labelled_feature_matrix = pd.read_pickle(os.path.join(path_goldstandard,
                                                         'labelled_feature_matrix.pkl'),
                                 compression=None)

df_attribute_with_sim_feature = pd.read_pickle(os.path.join(
    path_goldstandard, 'labelled_feature_matrix_full.pkl'), compression=None
                                              )

df_labelled_feature_matrix.head()

Unnamed: 0,duplicates,coordinate_E_delta,coordinate_N_delta,corporate_full_delta,doi_delta,edition_delta,exactDate_delta,format_postfix_delta,format_prefix_delta,isbn_delta,...,musicid_delta,part_delta,person_100_delta,person_245c_delta,person_700_delta,pubinit_delta,scale_delta,ttlfull_245_delta,ttlfull_246_delta,volumes_delta
0,1,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,...,-1.0,1.0,1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0
1,1,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,...,-1.0,1.0,1.0,0.818905,-0.5,0.848485,-1.0,0.787879,-1.0,1.0
2,1,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,...,-1.0,1.0,1.0,0.69774,-0.5,0.848485,-1.0,1.0,-1.0,1.0
3,1,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,...,-1.0,1.0,1.0,0.818905,-0.5,0.848485,-1.0,0.787879,-1.0,1.0
4,1,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,...,-1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,1.0


In [3]:
print('Part of duplicates (1) on uniques (2) in units of [%]')
print(df_labelled_feature_matrix.duplicates.value_counts(normalize=True)*100)

Part of duplicates (1) on uniques (2) in units of [%]
0    99.435054
1     0.564946
Name: duplicates, dtype: float64


## Support Vector Classifier

A Support Vector Machine allows the use of regularization.

### Train/Test Split for Support Vector Classifier

The train/test split will be implemented here as a general function to be called in the models chapters.

In [4]:
import classifier_fitting_funcs as cff

X_tr, X_val, X_te, y_tr, y_val, y_te, idx_tr, idx_val, idx_te = cff.split_feature_target(
    df_labelled_feature_matrix, 'train_validation_test')

X_tr[:5], y_tr[:5], idx_tr[:5]

(array([[-1.        , -1.        , -0.5       , -1.        , -1.        ,
          0.5       ,  0.11111111,  0.        ,  0.        , -1.        ,
         -0.5       , -1.        , -0.5       ,  0.50833333,  0.52603359,
          0.51222697, -1.        ,  0.43181818, -1.        ,  0.        ],
        [-1.        , -1.        , -1.        , -1.        , -1.        ,
          0.75      ,  1.        ,  1.        ,  1.        , -1.        ,
         -1.        , -1.        ,  0.53721279,  0.61813782,  0.54661654,
         -0.5       , -1.        ,  0.59449891, -1.        ,  0.        ],
        [-1.        , -1.        , -0.5       , -0.5       , -1.        ,
          0.875     ,  0.42857143,  1.        ,  1.        , -1.        ,
         -1.        , -0.5       , -0.5       , -0.5       , -1.        ,
         -1.        , -1.        ,  0.53099238, -0.5       , -0.5       ],
        [-1.        , -1.        , -1.        , -1.        , -1.        ,
          0.25      ,  0.42857143, 

In [5]:
print(X_tr.shape, y_tr.shape, X_val.shape, y_val.shape, X_te.shape, y_te.shape)

(166868, 20) (166868,) (41718, 20) (41718,) (52147, 20) (52147,)


### Model Training for Support Vector Classifier

In [1]:
if execution_mode == 'full' :
    # Kernel 'rbf' has long calculation times, but does not generate
    #  the best accuracy : Ommit in grid search.
    parameter_dictionary = {
        'kernel' : ['linear', 'poly'],
        'degree' : [2, 3, 4],
        'gamma' : [1.5, 2, 2.5, 'auto'],
        'C' : [0.5, 0.7, 0.8, 0.9, 1.0],
        'class_weight' : [None]
    }
elif execution_mode == 'restricted' :
    parameter_dictionary = {
        'kernel' : ['poly'],
        'degree' : [3],
        'gamma' : [2.0],
        'C' : [0.5],
        'class_weight' : [None]
    }

# Grid of values
grid = cff.generate_parameter_grid(parameter_dictionary)

NameError: name 'execution_mode' is not defined

In [7]:
from sklearn.svm import SVC

sv = SVC(random_state=0)

# Save accuracy on test set
test_scores = []
for params_dict in grid :
    test_scores.append(cff.fit_model_measure_scores(sv, params_dict, X_tr, y_tr, X_val, y_val))

# Save measured accuracies
df_test_scores_sv = pd.DataFrame(test_scores).sort_values('accuracy_val', ascending=False)

Fitting with parameters {'C': 0.1, 'class_weight': None, 'degree': 3, 'gamma': 0.1, 'kernel': 'poly'}
 => validation score 99.851%


In [8]:
best_params = cff.get_best_parameters(test_scores, parameter_dictionary)

# Create a decision tree
sv_best = SVC(gamma=best_params['gamma'], kernel=best_params['kernel'],
              C=best_params['C'], class_weight=best_params['class_weight'],
              degree=best_params['degree'], random_state=0
             )

# Fit estimator
sv_best.fit(X_tr, y_tr)
y_pred_sv = sv_best.predict(X_te)

The parameters for the best model are ...
kernel = poly
degree = 3
gamma = 0.1
C = 0.1
class_weight = None


### Performance Measurement for Support Vector Classifier

In [9]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_te, y_pred_sv)

array([[51829,    23],
       [   64,   231]])

In [10]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

print('Score {:.1f}%'.format(100*sv_best.score(X_te, y_te)))
print('Area under the curve {:.1f}% - accuracy {:.1f}% - precision {:.1f}% - recall {:.1f}%'.format(100*roc_auc_score(y_te, y_pred_sv),
                100*accuracy_score(y_te, y_pred_sv),
                100*precision_score(y_te, y_pred_sv),
                100*recall_score(y_te, y_pred_sv)
               ))

Score 99.8%
Area under the curve 89.1% - accuracy 99.8% - precision 90.9% - recall 78.3%


In [11]:
import results_analysis_funcs as raf

In [12]:
df_feature_base_full_te = df_attribute_with_sim_feature.iloc[idx_te]
df_feature_base_full_tr = df_attribute_with_sim_feature.iloc[idx_tr]

# Extend display to number of columns of DataFrame
pd.options.display.max_columns = len(df_feature_base_full_te.columns)

df_feature_base_full_te.sort_index().sample(n=20)

Unnamed: 0,duplicates,coordinate_E_delta,coordinate_E_x,coordinate_E_y,coordinate_N_delta,coordinate_N_x,coordinate_N_y,corporate_full_delta,corporate_full_x,corporate_full_y,doi_delta,doi_x,doi_y,edition_delta,edition_x,edition_y,exactDate_delta,exactDate_x,exactDate_y,format_postfix_delta,format_postfix_x,format_postfix_y,format_prefix_delta,format_prefix_x,format_prefix_y,isbn_delta,isbn_x,isbn_y,ismn_delta,ismn_x,ismn_y,musicid_delta,musicid_x,musicid_y,part_delta,part_x,part_y,person_100_delta,person_100_x,person_100_y,person_245c_delta,person_245c_x,person_245c_y,person_700_delta,person_700_x,person_700_y,pubinit_delta,pubinit_x,pubinit_y,scale_delta,scale_x,scale_y,ttlfull_245_delta,ttlfull_245_x,ttlfull_245_y,ttlfull_246_delta,ttlfull_246_x,ttlfull_246_y,volumes_delta,volumes_x,volumes_y
54781,0,-1.0,,,-1.0,,,-1.0,,,-1.0,,,-1.0,,,0.5,2003uuuu,1994uuuu,0.111111,10100,20000,0.0,mu,bk,1.0,[],[],-1.0,,,-1.0,,,-1.0,,,-0.5,mozartwolfgang amadeus,,0.561912,wolfgang amadeus mozart ; arr.: carlo balmelli,[sigrid kessler ... et al.] ; [hrsg.: interkan...,-0.5,"balmellicarlo, mozartwolfgang amadeus, mozartw...",,0.480375,power music,[staatlicher lehrmittelverlag],-1.0,,,0.652146,"die zauberflöte, the magic flute : overture : ...","bonne chance!, cours de langue française, 1, e...",-1.0,,,0.583333,1 36,12
260050,0,-1.0,,,-1.0,,,-1.0,,,-1.0,,,-1.0,,,0.75,2005uuuu,2017uuuu,0.111111,10300,20053,0.0,vm,bk,0.0,[],"[978-1-118-62114-1 (hardback), 978-1-118-62110...",-1.0,,,-1.0,,,-1.0,,,-0.5,,richterdavid h.,0.486111,ein film von luc jacquet,david h. richter,-0.5,jacquetluc,,-1.0,,,-1.0,,,0.630669,die reise der pinguine,reading the eighteenth-century novel,-1.0,,,0.75,1 82,1
7614,0,-1.0,,,-1.0,,,-0.5,"bonne pioche (paris), buena vista internationa...",,-1.0,,,-1.0,,,0.875,2006uuuu,2003uuuu,0.111111,10300,20000,0.0,vm,bk,0.0,[],[1-904633-00-5],-1.0,,,-0.5,501326.0,,-1.0,,,-0.5,,austenjane,0.42096,ein film von luc jacquet,jane austen,-0.5,"jacquetluc, bohringerromane, berlingcharles, s...",,0.45679,kinowelt home entertainment,crw,-1.0,,,0.431818,die reise der pinguine,emma,-1.0,,,0.0,1 82,590
124622,0,-1.0,,,-1.0,,,-0.5,interkantonale lehrmittelzentrale (luzern),,-0.5,,10.5169/seals-377160,-1.0,,,0.25,19829999,2001uuuu,0.111111,30000,10053,0.0,vm,bk,1.0,[],[],-1.0,,,-1.0,,,-0.5,,280(2001),-0.5,,bührerwalter,0.52735,sigrid kessler... [et al.] ; [éd.:] interkanto...,[walter bührer],-0.5,kesslersigrid,,-0.5,staatlicher lehrmittelverlag,,-1.0,,,0.493284,"bonne chance!, cours de langue française, deux...",blick in die welt,-1.0,,,-1.0,,
32665,0,-1.0,,,-1.0,,,-1.0,,,-1.0,,,-1.0,,,0.0,20071990,19942008,1.0,10300,30100,1.0,vm,vm,0.0,[],"[3-906721-51-5 (Livre de l'élève), 978-3-90672...",-1.0,,,-1.0,,,-0.5,,1,-1.0,,,0.589224,ein volker schlöndorff film ; nach dem gleichn...,sigrid kessler ... [et al.] ; [éditeurs: inter...,0.567269,"schlöndorffvolker, frischmax, shepardsam, delp...",kesslersigrid,-0.5,,staatlicher lehrmittelverlag,-1.0,,,0.566981,homo faber,"bonne chance!, cours de langue française, étap...",-1.0,,,0.0,2 109,10
53100,0,-1.0,,,-1.0,,,-1.0,,,-0.5,,10.5169/seals-377079,-1.0,,,1.0,1999uuuu,1999uuuu,0.111111,20000,10053,1.0,bk,bk,0.0,[3-495-47879-5],[],-1.0,,,-1.0,,,0.403704,"bd. 57, 57",278(1999),0.505556,fluryandreas,bührerwalter,0.564957,andreas flury,[walter bührer],-1.0,,,-0.5,k. alber,,-1.0,,,0.520215,"der moralische status der tiere, henry salt, p...",blick in die welt,-1.0,,,-0.5,316,
129056,0,-1.0,,,-1.0,,,-1.0,,,-1.0,,,-1.0,,,0.5,1999uuuu,2011uuuu,1.0,20000,20000,1.0,bk,bk,0.0,[3-495-47879-5],[978-3-936438-35-2],-1.0,,,-1.0,,,0.733333,"bd. 57, 57",bd. 4,-0.5,fluryandreas,,0.641026,andreas flury,"andreas kagermeier, tobias reeh (hrsg.)",-1.0,,,-0.5,k. alber,,-1.0,,,0.656778,"der moralische status der tiere, henry salt, p...","trends, herausforderungen und perspektiven für...",-1.0,,,0.0,316,280
32346,0,-1.0,,,-1.0,,,-0.5,,arts florissants,-1.0,,,-1.0,,,0.5,2005uuuu,1996uuuu,0.428571,20000,40000,0.0,bk,mu,0.0,"[978-0-521-82437-8, 0-521-82437-0]",[],-1.0,,,-0.5,,630.0,-1.0,,,0.581818,austenjane,mozartwolfgang amadeus,0.409722,jane austen ; ed. by richard cronin ... [et al.],mozart,0.574876,croninrichard,"christiewilliam, dessaynatalie, mannionrosa, b...",-0.5,,erato,-1.0,,,0.528986,emma,"die zauberflöte, kv 620",-1.0,,,0.0,600,2 1
73836,0,-1.0,,,-1.0,,,-1.0,,,-1.0,,,-1.0,,,0.875,1999uuuu,1990uuuu,1.0,20000,20000,1.0,bk,bk,0.0,[3-495-47879-5],[],-1.0,,,-1.0,,,0.0,bd. 57,7,-0.5,fluryandreas,,0.488462,andreas flury,sigrid kessler... [et al.] ; [éd.:] interkanto...,-1.0,,,-0.5,,[staatlicher lehrmittelverlag],-1.0,,,0.704393,"der moralische status der tiere, henry salt, p...","bonne chance!, cours de langue française, troi...",-1.0,,,0.0,316,1
36150,0,-1.0,,,-1.0,,,-1.0,,,-1.0,,,-1.0,,,0.5,uuuuuuuu,18701880,0.111111,40100,20000,0.0,mu,bk,1.0,[],[],-1.0,,,-1.0,,,-1.0,,,1.0,mozartwolfgang amadeus,mozartwolfgang amadeus,0.57217,wolfgang amadeus mozart,oper in zwei akten von emanuel schikaneder ; m...,0.577576,"mathisedith, karajanherbert von",schikanederemanuel,-1.0,,,-1.0,,,0.559867,zauberflöte,"die zauberflöte, text der gesänge : officielle...",-1.0,,,0.833333,3,30


In [13]:
import results_saving_funcs as rsf

idx = {}
idx['true_predicted_uniques'], idx['true_predicted_duplicates'], idx['false_predicted_uniques'], idx['false_predicted_duplicates'] = raf.get_confusion_matrix_indices(y_te, y_pred_sv)

wrong_prediction_groups = ['false_predicted_uniques', 'false_predicted_duplicates']

for i in wrong_prediction_groups :
    rsf.add_wrong_predictions(path_goldstandard, 
                              sv_best, i, df_feature_base_full_te.loc[idx[i]])

## Support Vector Classifier with Cross-Validation

### Train/Test Split for Support Vector CV

In [14]:
X_tr, _, X_te, y_tr, _, y_te, idx_tr, _, idx_te = cff.split_feature_target(
    df_labelled_feature_matrix, 'train_test')

X_tr[:5], y_tr[:5], idx_tr[:5]

(array([[-1.        , -1.        , -1.        , -1.        , -1.        ,
          0.5       ,  0.42857143,  0.        ,  1.        , -1.        ,
         -0.5       , -0.5       , -0.5       ,  0.50165426, -0.5       ,
          0.48593074, -1.        ,  0.60439973, -1.        ,  0.        ],
        [-1.        , -1.        , -0.5       , -1.        , -1.        ,
          0.25      ,  0.42857143,  0.        ,  0.        , -1.        ,
         -1.        , -1.        , -0.5       ,  0.54435379, -1.        ,
         -0.5       , -1.        ,  0.54177001, -1.        , -0.5       ],
        [-1.        , -1.        , -0.5       , -1.        , -1.        ,
          0.25      ,  1.        ,  1.        ,  0.        , -1.        ,
         -1.        , -0.5       , -0.5       ,  0.6020276 ,  0.53663004,
          0.49448622, -1.        ,  0.57046955, -1.        , -0.5       ],
        [-1.        , -1.        , -1.        , -1.        , -1.        ,
          0.75      ,  1.        , 

In [15]:
print(X_tr.shape, y_tr.shape, X_te.shape, y_te.shape)

(208586, 20) (208586,) (52147, 20) (52147,)


### Model Training for Support Vector CV

In [16]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Create cross-validation object with DecisionTreeClassifer
grid_cv = GridSearchCV(SVC(random_state=0),
                       param_grid = parameter_dictionary, cv=5
                       , verbose=1
                      )

# Fit estimator
grid_cv.fit(X_tr, y_tr)

# Get the results with 'cv_results_', get parameters with their scores
params = pd.DataFrame(grid_cv.cv_results_['params'])
scores = pd.DataFrame(grid_cv.cv_results_['mean_test_score'], columns=['accuracy_val'])
log_scores = pd.DataFrame(np.log(1-grid_cv.cv_results_['mean_test_score']), columns=['log_accuracy_val'])
scores_std = pd.DataFrame(grid_cv.cv_results_['std_test_score'], columns=['std_accuracy_val'])

# Create a DataFrame of (parameters, score, std) pairs
df_test_scores_svcv = params.merge(scores, how='inner', left_index=True, right_index=True)
df_test_scores_svcv = df_test_scores_svcv.merge(
    scores_std, how='inner', left_index=True, right_index=True).sort_values(
    'accuracy_val', ascending=False)
df_test_scores_svcv = df_test_scores_svcv.merge(
    log_scores, how='inner', left_index=True, right_index=True)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   33.9s finished


In [17]:
df_test_scores_svcv

Unnamed: 0,C,class_weight,degree,gamma,kernel,accuracy_val,std_accuracy_val,log_accuracy_val
0,0.1,,3,0.1,poly,0.998351,0.000143,-6.407464


In [18]:
svcv_best = grid_cv.best_estimator_
y_pred_svcv = svcv_best.predict(X_te)

### Performance Measurement of Support Vector Classifier CV

In [19]:
confusion_matrix(y_te, y_pred_svcv)

array([[51829,    23],
       [   60,   235]])

In [20]:
print('Score {:.1f}%'.format(100*svcv_best.score(X_te, y_te)))
print('Area under the curve {:.1f}% - accuracy {:.1f}% - precision {:.1f}% - recall {:.1f}%'.format(
    100*roc_auc_score(y_te, y_pred_svcv),
                100*accuracy_score(y_te, y_pred_svcv),
                100*precision_score(y_te, y_pred_svcv),
                100*recall_score(y_te, y_pred_svcv)
               ))

Score 99.8%
Area under the curve 89.8% - accuracy 99.8% - precision 91.1% - recall 79.7%


In [21]:
idx = {}
idx['true_predicted_uniques'], idx['true_predicted_duplicates'], idx['false_predicted_uniques'], idx['false_predicted_duplicates'] = raf.get_confusion_matrix_indices(y_te, y_pred_svcv)

wrong_prediction_groups = ['false_predicted_uniques', 'false_predicted_duplicates']

for i in wrong_prediction_groups :
    rsf.add_wrong_predictions(path_goldstandard, 
                              svcv_best, i, df_feature_base_full_te.loc[idx[i]], '_CV')

## Results Handover

In [22]:
rsf.add_result_to_results(path_goldstandard,
                          df_test_scores_sv, sv_best, X_te, y_te, y_pred_sv)
rsf.add_result_to_results(path_goldstandard, 
                          df_test_scores_svcv, svcv_best, X_te, y_te, y_pred_svcv, '_CV')