In [1]:
# Notebook run parameters.
# NOTE(review): execution_mode is not read in the cells visible in this chapter excerpt -
# presumably it switches between a reduced and a full run elsewhere; confirm with its consumers.
execution_mode = 'restricted'
# Scaling factor handed to dpf.mark_missing(); per the narrative of this chapter, pairs with
# both values missing are marked with -1*factor and pairs with one value missing with -0.5*factor.
factor = 0.1
# NOTE(review): exactDate_mode is not read in the cells visible in this chapter excerpt -
# presumably it selects the comparison variant for attribute exactDate; confirm.
exactDate_mode = 'xor'

# Feature Matrix Generation

In chapter [Goldstandard and Data Preparation](./2_GoldstandardDataPreparation.ipynb), Swissbib's goldstandard data has been processed to form records of pairs of duplicate and pairs of unique records. These records are the starting point for the final feature matrix generation and that is the reason, why the DataFrame was called feature base. As described in [[JudACaps](./A_References.ipynb#judacaps)], the next step will be an attribute-wise comparison of each attribute pair of each record in the original feature base. This comparison will generate similarity values for each attribute pair. It will halve the number of attributes replacing each attribute pair with one value expressing their degree of similarity. The goal of this chapter is a DataFrame with the full and final feature attributes. The values of these feature attributes will be used for training and performance testing of the machine learning models in the chapters to follow.

This chapter introduces similarity metrics for string comparisons. The metrics to be used for calculating its similarity will be decided for each attribute pair of the DataFrame built in the previous chapters.

## Table of Contents

- [Data Takeover](#Data-Takeover)
- [Object Distance and Similarity](#Object-Distance-and-Similarity)
    - [Mathematical Definitions](#Mathematical-Definitions)
    - [Library TextDistance](#Library-TextDistance)
- [Similarity Metrics on Attribute Level](#Similarity-Metrics-on-Attribute-Level)
    - [Table of Contents of Attribute Similarities](#Table-of-Contents-of-Attribute-Similarities)
- [DataFrame with Attributes and Similarity Features](#DataFrame-with-Attributes-and-Similarity-Features)
- [Summary](#Summary)
    - [Full Feature Matrix with Target Vector Handover](#Full-Feature-Matrix-with-Target-Vector-Handover)

## Data Takeover

Swissbib's raw data of the goldstandard has been processed in chapter [Goldstandard and Data Preparation](./2_GoldstandardDataPreparation.ipynb). As the first step of this chapter, this data is loaded for further processing to the feature matrix and target vector for the subsequent machine learning model chapters.

In [2]:
import bz2
import os
import pickle as pk
import _pickle as cPickle

import numpy as np
import pandas as pd

path_goldstandard = './daten_goldstandard'

# Reload the column metadata collected so far.
# NOTE: unpickling can execute arbitrary code - only load files written by this project.
metadata_path = os.path.join(path_goldstandard, 'columns_metadata.pkl')
with open(metadata_path, 'rb') as handle:
    columns_metadata_dict = pk.load(handle)

# Reload the feature base DataFrame from its bz2-compressed pickle file
feature_base_path = os.path.join(path_goldstandard, 'feature_base_df.pkl')
with bz2.BZ2File(feature_base_path, 'rb') as file:
    df_feature_base = cPickle.load(file)

# Let pandas display as many columns as the DataFrame has
pd.set_option('display.max_columns', len(df_feature_base.columns))

df_feature_base.head()

Unnamed: 0,035liste_x,035liste_y,century_x,century_y,coordinate_E_x,coordinate_E_y,coordinate_N_x,coordinate_N_y,coordinate_x,coordinate_y,corporate_110_x,corporate_110_y,corporate_710_x,corporate_710_y,corporate_full_x,corporate_full_y,decade_x,decade_y,docid_x,docid_y,doi_x,doi_y,duplicates,edition_x,edition_y,exactDate_x,exactDate_y,format_postfix_x,format_postfix_y,format_prefix_x,format_prefix_y,isbn_x,isbn_y,ismn_x,ismn_y,masters_docid,musicid_x,musicid_y,pages_x,pages_y,part_x,part_y,person_100_x,person_100_y,person_245c_x,person_245c_y,person_700_x,person_700_y,pubinit_x,pubinit_y,pubword_x,pubword_y,pubyear_x,pubyear_y,scale_x,scale_y,ttlfull_245_x,ttlfull_245_y,ttlfull_246_x,ttlfull_246_y,ttlpart_x,ttlpart_y,volumes_x,volumes_y
0,"[(OCoLC)731635279, (ABN)000539983]","[(OCoLC)731635279, (ABN)000539983]",2009,2009,,,,,[],[],,,,,,,2009,2009,311049,311049,,,1,,,2009uuuu,2009uuuu,20000,20000,bk,bk,[978-3-15-020008-7],[978-3-15-020008-7],,,504389793,,,[600 S.],[600 S.],20008,20008,austenjane,austenjane,jane austen ; aus dem englischen übersetzt von...,jane austen ; aus dem englischen übersetzt von...,"grawechristian, graweursula","grawechristian, graweursula",reclam jun.,reclam jun.,[Reclam jun.],[Reclam jun.],2009,2009,,,"emma, roman","emma, roman",,,"{'245': ['Emma', 'Roman']}","{'245': ['Emma', 'Roman']}",600,600
1,"[(OCoLC)731635279, (ABN)000539983]","[(OCoLC)731635279, (NEBIS)009587153]",2009,2009,,,,,[],[],,,,,,,2009,2009,311049,196506476,,,1,,,2009uuuu,2009uuuu,20000,20000,bk,bk,[978-3-15-020008-7],[978-3-15-020008-7],,,504389793,,,[600 S.],[600 S.],20008,20008,austenjane,austenjane,jane austen ; aus dem englischen übersetzt von...,jane austen ; aus dem engl. übers. von ursula ...,"grawechristian, graweursula",,reclam jun.,reclam,[Reclam jun.],[Reclam],2009,2009,,,"emma, roman",emma,,,"{'245': ['Emma', 'Roman']}",{'245': ['Emma']},600,600
2,"[(OCoLC)731635279, (ABN)000539983]","[(OCoLC)731635279, (LIBIB)000315536]",2009,2009,,,,,[],[],,,,,,,2009,2009,311049,323173349,,,1,,,2009uuuu,2009uuuu,20000,20000,bk,bk,[978-3-15-020008-7],[978-3-15-020008-7],,,504389793,,,[600 S.],[600 S.],20008,20008,austenjane,austenjane,jane austen ; aus dem englischen übersetzt von...,jane austen,"grawechristian, graweursula",,reclam jun.,reclam,[Reclam jun.],[Reclam],2009,2009,,,"emma, roman","emma, roman",,,"{'245': ['Emma', 'Roman']}","{'245': ['Emma', 'Roman']}",600,600
3,"[(OCoLC)731635279, (NEBIS)009587153]","[(OCoLC)731635279, (ABN)000539983]",2009,2009,,,,,[],[],,,,,,,2009,2009,196506476,311049,,,1,,,2009uuuu,2009uuuu,20000,20000,bk,bk,[978-3-15-020008-7],[978-3-15-020008-7],,,504389793,,,[600 S.],[600 S.],20008,20008,austenjane,austenjane,jane austen ; aus dem engl. übers. von ursula ...,jane austen ; aus dem englischen übersetzt von...,,"grawechristian, graweursula",reclam,reclam jun.,[Reclam],[Reclam jun.],2009,2009,,,emma,"emma, roman",,,{'245': ['Emma']},"{'245': ['Emma', 'Roman']}",600,600
4,"[(OCoLC)731635279, (NEBIS)009587153]","[(OCoLC)731635279, (NEBIS)009587153]",2009,2009,,,,,[],[],,,,,,,2009,2009,196506476,196506476,,,1,,,2009uuuu,2009uuuu,20000,20000,bk,bk,[978-3-15-020008-7],[978-3-15-020008-7],,,504389793,,,[600 S.],[600 S.],20008,20008,austenjane,austenjane,jane austen ; aus dem engl. übers. von ursula ...,jane austen ; aus dem engl. übers. von ursula ...,,,reclam,reclam,[Reclam],[Reclam],2009,2009,,,emma,emma,,,{'245': ['Emma']},{'245': ['Emma']},600,600


Chapter [Overview and Summary](./0_OverviewSummary.ipynb) assesses the predictions of the various models. One measure to analyse the results will be the resulting confusion matrix for a model prediction. The confusion matrix will reveal cases in the testing data that the model will predict opposite to the target values of the testing data. These predictions are called false positives and false negatives. To be able to analyse the original attribute values of the affected records in detail, columns $\texttt{035liste}\_\texttt{x}$, $\texttt{035liste}\_\texttt{y}$, $\texttt{docid}\_\texttt{x}$, and $\texttt{docid}\_\texttt{y}$ need to be restored. These four attributes will now be saved to a separate DataFrame to be reloaded in later chapters.

In [3]:
# Set the identifier columns aside so that row pairs showing up in model results
# can be traced back to their original records quickly
docid_columns = ['035liste_x', '035liste_y', 'docid_x', 'docid_y']
df_index_docids = df_feature_base.loc[:, docid_columns]

# Continue with the pairs DataFrame reduced to the columns selected for use
# (these include the target information)
columns_to_use = columns_metadata_dict['columns_to_use']
df_feature_base = df_feature_base[columns_to_use]

df_feature_base.sample(n=5)

Unnamed: 0,duplicates,coordinate_E_x,coordinate_E_y,coordinate_N_x,coordinate_N_y,corporate_full_x,corporate_full_y,doi_x,doi_y,edition_x,edition_y,exactDate_x,exactDate_y,format_prefix_x,format_prefix_y,format_postfix_x,format_postfix_y,isbn_x,isbn_y,ismn_x,ismn_y,musicid_x,musicid_y,part_x,part_y,person_100_x,person_100_y,person_700_x,person_700_y,person_245c_x,person_245c_y,pubinit_x,pubinit_y,scale_x,scale_y,ttlfull_245_x,ttlfull_245_y,ttlfull_246_x,ttlfull_246_y,volumes_x,volumes_y
258677,0,,e0074147,,n0460833,,eidgenössische landestopographie,,,,,2005uuuu,1943uuuu,vm,mp,10300,10300,[],[],,,,,,23 23 1943,,,jacquetluc,"dufourguillaume-henri, müllhauptheinrich",ein film von luc jacquet,g. h. dufour direxit ; h. müllhaupt sculpsit,,[eidg. landestopographie],,100000.0,die reise der pinguine,"domo d'ossola, arona",,"[domodossola, arona]",1 82,2
283238,1,,,,,,,,,,,2003uuuu,2003uuuu,bk,bk,20000,20000,[3-15-002620-2],[3-15-002620-2],,,,,2620 2620,2620 2620,mozartwolfgang amadeus,mozartwolfgang amadeus,"kochhans-albrecht, schikanederemanuel","kochhans-albrecht, schikanederemanuel",wolfgang amadeus mozart ; libretto von emanuel...,wolfgang amadeus mozart ; libretto von emanuel...,p. reclam,p. reulam,,,"die zauberflöte, kv 620 : eine grosse oper in ...","die zauberflöte, kv 620 : eine grosse oper i n...",,,90,90
314277,1,,,,,,,,,,,19001950,19001950,mu,mu,10200,10200,[],[],,,245.0,245.0,,,mozartwolfgang amadeus,mozartwolfgang amadeus,"kienzlwilhelm, mozartwolfgang amadeus","kienzlwilhelm, mozartwlofgang amadus",von w.a. mozart ; klavierauszug neu rev. von w...,von w.a. mozart ; klavierauszug neu rev. von w...,universal-edition,universa-ledltion,,,"die zauberflöte, oper in zwei akten = il flaut...","die zauberflöte, oper inz wei akten = il flaut...","die zauberflöte, ausgabe für gesang und klavier","die zauberflöte, ausgabe für gesang und klaiver",1 167,1 167
140784,0,,,,,,,,,,,2005uuuu,1989uuuu,bk,bk,20000,20053,[978-3-8067-5097-3],[978-3-598-31490-2 (print)],,,,,,10 10,jacquetluc,mortzfeldpeter,,raabepaul,luc jacquet ; übers. aus dem franz. von cornel...,"mortzfeld, peter; raabe, paul",gerstenberg,de gruyter saur,,,die reise der pinguine,katalog der graphischen porträts in der herzog...,,,64,1 406
211539,0,,,,,schweizerische normen-vereinigung,,,,,,2013uuuu,2010uuuu,bk,mu,20053,30100,[],[978-1-85584-230-4],,,,,,,,steinerrudolf,,bridgmontpeter,,rudolf steiner,,,,,informatique de santé - communication entre di...,"how do i find the christ?, a lecture",medizinische informatik - kommunikation von ge...,,1,1


In [4]:
# Class balance and overall size of the feature base
n_rows, n_columns = df_feature_base.shape

print('Number of rows labelled as duplicates {:,d}'.format(
    int((df_feature_base.duplicates == 1).sum())))
print('Number of rows labelled as uniques {:,d}'.format(
    int((df_feature_base.duplicates == 0).sum())))
# Every value handed to str.format() needs its own placeholder; surplus positional
# arguments are dropped silently, so the column count gets an explicit field here.
print('Total number of rows in DataFrame {:,d}, number of columns {:,d}'.format(
    n_rows, n_columns))

Number of rows labelled as duplicates 67,158
Number of rows labelled as uniques 257,955
Total number of rows in DataFrame 325,113


In [5]:
# Share of each target label in percent; label 1 marks duplicates, label 0 marks uniques
print('Part of duplicates (1) on uniques (0) in units of [%]')
print(round(100*df_feature_base.duplicates.value_counts(normalize=True), 1))

Part of duplicates (1) on uniques (2) in units of [%]
0    79.3
1    20.7
Name: duplicates, dtype: float64


DataFrame feature base is the starting point used for the further processing in this chapter.

## Object Distance and Similarity

A mathematical idea of distance and similarity is needed for understanding object pair comparison. This section starts with a motivation for calculating similarities and afterwards gives a very basic definition of the two central terms, distance and similarity. The text of this section is a summary of [[Chri2012](./A_References.ipynb#chri2012)].

### Mathematical Definitions

The attributes to be used for pair comparison may contain values of poor quality. The quality originates in the way the data has been entered at the very source. Manual data entry may suffer from mistyping, automatically scanned data may suffer from insufficiencies of the scanned base material or the recognition algorithm in the optical character recognition (OCR) processing. The basic step of a deduplication process is to identify the probability of two strings of a pair to be a pair of duplicates. This is done by calculating a similarity value between the two strings compared, rather than using an exact comparison function. Based on this common similarity value for an attribute pair, their being duplicates can be decided.

The term similarity is strongly coupled to the term of distance of two values of an attribute. Mathematically, a distance can be explained with the help of a distance function. A _distance function_ or _distance metric_ $dist(o_i, o_j)$ between two points or data objects $o_i$ and $o_j$ must fulfill four requirements.

1. $dist(o_i, o_i)=0$, the distance from an object to itself is zero.
2. $dist(o_i, o_j)\ge 0$, the distance between two objects is a non-negative number.
3. $dist(o_i, o_j)=dist(o_j, o_i)$, the distance between two objects is symmetric.
4. $dist(o_i, o_j)\le dist(o_i, o_k)+dist(o_k, o_j)$, the triangular inequality must hold. It states that the direct distance between two objects is never larger than the combined distance when going through a third object.

A distance value expresses the dissimilarity $d$ of two objects [[HanK2012](./A_References.ipynb#hank2012)] and can therefore be converted into a similarity value $s$, calculating $s = \frac{1}{d}$, assuming $d\gt 0$. Alternatively, assuming the distance value is normalised $0\le d\le 1$, the similarity value can be calculated to $s = 1-d$. A _similarity function_ $sim(a_i, a_j)$ between two attributes which can be strings, numbers, dates, geographic locations, text, XML documents, etc. fulfills the following general requirements.

1. $sim(a_i, a_i)=1$, the result of comparing a value with itself is an exact similarity.
2. $sim(a_i, a_j)=0$, the similarity of values that are completely different from each other is 0. What accounts for 'completely different' depends upon the type of data that are compared.
3. $0\lt sim(a_i, a_j)\lt 1$, an approximate similarity between exact similarity and total dissimilarity is calculated if two attribute values are somewhat similar to each other. What accounts for 'somewhat similar' depends upon the type of data that are compared.

The dissimilarity between two objects $o_i$ and $o_j$ can be computed based on the ratio of mismatches,
$$
d(o_i, o_j) = \frac{p-m}{p},
$$
where $m$ is the number of matching attributes and $p$ is the total number of attributes describing the objects [[HanK2012](./A_References.ipynb#hank2012)]. Thus the similarity between two objects can be computed as
$$
sim(o_i, o_j) = 1 - d(o_i, o_j) = \frac{m}{p}.
$$

For data deduplication, a comparison function needs to be tailored to the type of underlying data. Although there is a correspondence between a similarity function and the mathematical concept of a distance function, not all known and implemented similarity comparison functions used for string pair comparison fulfill the requirements of a distance function. Some similarity functions are not symmetric, others do not fulfill the triangular inequality. The decision on the best similarity function for a string pair will be based on the effect the similarity function has for the purpose needed. In the case of this capstone project, this purpose is its capability to contribute to the prediction whether a pair of records is a pair of duplicates or a pair of uniques.

### Library TextDistance

An internet research on string distance calculation with Python has revealed libraries [[StSi](./A_References.ipynb#stsi)], [[TeDi](./A_References.ipynb#tedi)] and separate code snippets for distinct algorithms. After trying the referenced libraries and a downloaded code snippet for a Smith Waterman similarity [[SmWa](./A_References.ipynb#smwa)], the text distance library [[TeDi](./A_References.ipynb#tedi)] has been chosen as the best option for this capstone project. The decision is based on the GitHub statistics of stars and the date of the latest pull requests, indicating the popularity and maintenance activity of the library. A look at the API of the library reveals the Python library to be a complete implementation (compared to suggestions of similarity metrics in [[Chri2012](./A_References.ipynb#chri2012)]) and easy to use.

In [6]:
# Install textdistance Python library - if not done, yet.
! pip install textdistance



For using the library, see documentation in [[TeDi](./A_References.ipynb#tedi)]. For the purposes of this chapter, function $\texttt{.normalized}\_\texttt{similarity()}$ of an instantiated textdistance object will be used.

In [7]:
import textdistance as tedi

With the code line above, the library is imported for application in this chapter. In appendix [Comparison of Similarity Metrics](./B_CompareSimilarities.ipynb) the effects of the similarity metrics of the library are compared for better understanding of their specific behaviour. This comparison for each attribute is the basis of deciding the best similarity metric available for an attribute pair.

## Similarity Metrics on Attribute Level

This section implements the decision for calculating the similarity metric for each attribute of the raw data based on appendix [Comparison of Similarity Metrics](./B_CompareSimilarities.ipynb). The implementation is applied on a pair of attributes of two records, resulting in a new attribute, the similarity value, of the final feature matrix. A general function $\texttt{build_delta_feature}$ is provided by the code file [data_preparation_funcs.py](./data_preparation_funcs.py) for transforming two attributes into their feature attribute holding their similarity value.

In [8]:
import data_preparation_funcs as dpf

### Table of Contents of Attribute Similarities

- [coordinate](#coordinate)
- [corporate](#corporate)
- [doi](#doi)
- [edition](#edition)
- [exactDate](#exactDate)
- [format](#format)
- [isbn](#isbn)
- [ismn](#ismn)
- [musicid](#musicid)
- [part](#part)
- [person](#person)
- [pubinit](#pubinit)
- [scale](#scale)
- [ttlfull](#ttlfull)
- [volumes](#volumes)

In [9]:
# Start with empty containers for the similarity metrics and the feature names;
# these dictionary attributes will be filled in function dpf.build_delta_feature()
columns_metadata_dict.update({'similarity_metrics': {}, 'features': []})

### coordinate

As discussed in chapter [Data Analysis](./1_DataAnalysis.ipynb), attribute $\texttt{coordinate}$ holds coordinates of maps. To decide whether a map covers the same geographical range, a metric will be chosen that compares the coordinate number digits from left to right. The more digits are found to be equal, the higher the similarity value is calculated. The comparison stops with the first digit pair that differs. This requirement is satisfied by the LCS (Longest Common Substring) comparison algorithm, which generates the wanted result, see appendix [Comparison of Similarity Metrics](./B_CompareSimilarities.ipynb).

In [10]:
attribute = 'coordinate'
ne_values = ['_E', '_N']

# Register one separate LCSStr metric instance per coordinate part
for ne in ne_values:
    columns_metadata_dict['similarity_metrics'][attribute + ne] = tedi.LCSStr()

# Build the similarity feature of each coordinate part with its registered metric
for ne in ne_values:
    df_feature_base, columns_metadata_dict = dpf.build_delta_feature(
        df_feature_base,
        attribute + ne,
        columns_metadata_dict['similarity_metrics'][attribute + ne],
        columns_metadata_dict,
    )

The length of attribute $\texttt{coordinate}$ is exactly eight characters. The distinct similarity values that may occur form a discrete set of values with a distance of $\frac{1}{8}$ between adjacent values.

In [11]:
# Per feature name: the two results returned by dpf.determine_similarity_values()
uniques = {}
uniques_len = {}

for ne in ne_values:
    key = attribute + ne
    uniques[key], uniques_len[key] = dpf.determine_similarity_values(df_feature_base, key)

coordinate_E values range [0.    0.125 0.25  0.375 0.5   0.625 0.875 1.   ]
coordinate_N values range [0.    0.375 0.5   0.625 0.75  0.875 1.   ]


Looking at some samples of the feature matrix reveals a good match to the expectations.

In [12]:
position = 3

# Sample rows between two adjacent similarity values, counted from the end of the value list
for ne in ne_values:
    column = attribute + ne
    lower_index = uniques_len[column] - position
    dpf.show_samples_interval(
        df_feature_base, column,
        uniques[column][lower_index],
        uniques[column][lower_index + 1],
    )

Unnamed: 0,duplicates,coordinate_E_delta,coordinate_E_x,coordinate_E_y
196658,0,0.625,e0055700,e0055009
854,1,0.875,e0080855,e0080851
182806,0,0.875,e0080855,e0080851
96166,0,0.625,e0080851,e0080900
863,1,0.875,e0080851,e0080855


0.625 <= coordinate_E_delta <= 0.875


Unnamed: 0,duplicates,coordinate_N_delta,coordinate_N_x,coordinate_N_y
267550,1,0.75,n0460833,0n460833
54937,0,0.75,n0460826,n0460833
182777,0,0.75,n0460826,n0460833
318040,1,0.75,n0474800,0n474800
270949,1,0.75,n0460833,0n460833


0.75 <= coordinate_N_delta <= 0.875


The samples above show the wanted similarity behaviour for value ranges greater than 0. The metric has the weakness, though, that empty coordinate values, e.g. for bibliographical units other than maps, have each been calculated to a similarity of 0. Some samples for duplicates in the training data are shown below.

In [13]:
# Restrict the view to rows labelled as duplicates and look at the interval
# spanned by the first two entries of the similarity value list
duplicates_only = df_feature_base[df_feature_base.duplicates == 1]
first_value = uniques[attribute + '_E'][0]
second_value = uniques[attribute + '_E'][1]
dpf.show_samples_interval(duplicates_only, attribute + '_E', first_value, second_value, 10)

Unnamed: 0,duplicates,coordinate_E_delta,coordinate_E_x,coordinate_E_y
272180,1,0.0,,
320755,1,0.0,,
284253,1,0.0,,
314881,1,0.0,,
299514,1,0.0,,
308352,1,0.0,,
262745,1,0.0,,
312262,1,0.0,,
296907,1,0.125,e0074147,n0460833
304992,1,0.0,,


0.0 <= coordinate_E_delta <= 0.125


This downside shall be avoided by marking pairs of missing coordinate values on both sides with a special negative value, which will point out the special case of missing information in a row to the models to be trained. The implementation of this logic is done in function $\texttt{.mark}\_\texttt{missing}()$. The absolute value of this negative number is controlled by a factor which is passed to the function as a parameter. The function explicitly handles two cases. The first one is missing information in both attributes (resulting in $-1*\texttt{factor}$) and the second one is missing information in only one of the two attributes (resulting in $-0.5*\texttt{factor}$).

In [14]:
# Apply dpf.mark_missing() to both coordinate features; per the description preceding
# this cell, pairs with missing values receive a negative marker scaled by `factor`.
# The returned frame replaces df_feature_base on every iteration.
for ne in ne_values :
    df_feature_base = dpf.mark_missing(df_feature_base, attribute+ne, factor)

### corporate

Attribute $\texttt{corporate}$ is a collection of corporate names. The Monge-Elkan metric compares string tokens pairwise [[Chri2012](./A_References.ipynb#chri2012)] while the LCS metric searches for the longest common substring. Assessing the differences of these two metrics with the help of their values distribution in chapter [Features Discussion and Dummy Classifier Baseline](./5_FeatureDiscussionDummyBaseline.ipynb), reveals a better distribution behaviour for LCS. Therefore, the LCS metric will be chosen for this attribute.

In [15]:
attribute = 'corporate_full'

# LCSStr is the metric in use; alternatives noted by the author are
# tedi.StrCmp95() and tedi.MongeElkan()
corporate_metric = tedi.LCSStr()
columns_metadata_dict['similarity_metrics'][attribute] = corporate_metric

df_feature_base, columns_metadata_dict = dpf.build_delta_feature(
    df_feature_base, attribute, corporate_metric, columns_metadata_dict)

In [16]:
# Keep both results of dpf.determine_similarity_values() under the attribute name;
# they are indexed by the sample views in the following cells.
uniques[attribute], uniques_len[attribute] = dpf.determine_similarity_values(
    df_feature_base, attribute)

corporate_full values range [0.         0.01333333 0.01428571 0.01639344 0.01666667 0.01694915
 0.01818182 0.01960784 0.02       0.02040816 0.0212766  0.02173913
 0.02272727 0.02380952 0.025      0.02531646 0.02542373 0.02666667
 0.02702703 0.02727273 0.02857143 0.03       0.03030303 0.03125
 0.03225806 0.03278689 0.03333333 0.03389831 0.03448276 0.03571429
 0.03636364 0.03773585 0.03797468 0.03846154 0.03921569 0.04
 0.04081633 0.04166667 0.04237288 0.04255319 0.04285714 0.04347826
 0.04444444 0.04545455 0.046875   0.04761905 0.04878049 0.04918033
 0.05       0.05263158 0.05333333 0.05405405 0.05454545 0.05555556
 0.05660377 0.05714286 0.05882353 0.05932203 0.06       0.06060606
 0.06122449 0.0625     0.06329114 0.06363636 0.06382979 0.06451613
 0.06521739 0.06557377 0.06666667 0.06779661 0.06818182 0.06896552
 0.07       0.07142857 0.07272727 0.07317073 0.075      0.0754717
 0.07594937 0.07627119 0.07692308 0.078125   0.07843137 0.07894737
 0.08       0.08108108 0.08163265 0.08181818

Its $110$ part is sparsely filled and even its $710$ part comes along with a little more than $10\%$ of filling, only. The LCS metric generates a similarity of 1 for the cases where both strings of a pair are empty. Missing values on both sides may be an indicator for a pair of duplicates but due to the sparsely available information, it is a weak indicator. Therefore, the pairs with missing data on both sides of the pair, will be marked with the negative value.

In [17]:
# Mark pairs with missing corporate data via dpf.mark_missing(), scaled by `factor`;
# the returned frame replaces df_feature_base.
df_feature_base = dpf.mark_missing(df_feature_base, attribute, factor)

Some sample cases are shown below for both $\texttt{corporate}$ features.

In [18]:
# Samples of duplicate pairs over the similarity interval from 0.0 to 1.0
duplicates_only = df_feature_base[df_feature_base.duplicates == 1]
dpf.show_samples_interval(duplicates_only, attribute, 0.0, 1.0, 20)

Unnamed: 0,duplicates,corporate_full_delta,corporate_full_x,corporate_full_y
312672,1,1.0,"interkantonale lehrmittelzentrale (rapperswil,...","interkantonale lehrmittelzentrale (rapperswil,..."
266618,1,1.0,interkantonale lehrmittelzentrale,interkantonale lehrmittelzentrale
312733,1,0.606557,"interkantonale lehrmittelzentrale (rapperswil,...",interkantonale lehrmittelzentrale (raupperswli...
297161,1,1.0,schweizeidgenössisches topographisches bureau,schweizeidgenössisches topographisches bureau
306006,1,1.0,schweizerische normen-vereinigung,schweizerische normen-vereinigung
315010,1,1.0,les arts florissants,les arts florissants
297354,1,0.666667,eidgenössische landestopographie,eidgevnösshsche landestopographie
259590,1,0.666667,wiener philharmoniker,wienerlphilharmoniker
273398,1,0.866667,bühnen kölnoper,bühnen kölnopre
284267,1,0.5,"interkantonale lehrmittelzentrale (rapperswil,...","interkantonale lehrmittelzentoale (rapperswil,..."


0.0 <= corporate_full_delta <= 1.0


In [19]:
# Let's have a look in the middle range of the similarities.
position = uniques_len[attribute] // 2
start_index = uniques_len[attribute] - position

dpf.show_samples_interval(
    df_feature_base, attribute,
    uniques[attribute][start_index],
    uniques[attribute][start_index + 2],
    20)

Unnamed: 0,duplicates,corporate_full_delta,corporate_full_x,corporate_full_y
284262,1,0.616667,"interkantonale lehrmittelzentrale (rapperswil,...","interkantonale lehrmitkelzentrale (rapperswil,..."
270914,1,0.615385,eidgenössisches topographisches bureau,eidgenössisches topograpahisches bureau
284444,1,0.616667,"interkantonale lehrmittelzentrale (rapperswil,...","interkantonalel ehrmittelzentrale (rapperswil,..."
302246,1,0.615385,"opernhaus (zürich)orchester, opernhaus (zürich...","opernhaus (zürich)orchester, opevnhaus (zürich..."
281782,1,0.616667,"interkantonale lehrmittelzentrale (rapperswil,...",interkantonale mlehrmittelzentrale (rapperswil...
287815,1,0.616667,"interkantonale lehrmittelzentrale (rapperswil,...","interkantonae lehrmittmelzentrale (rapperswil,..."
301638,1,0.616667,"interkantonale lehrmittelzentrale (rapperswil,...","interkantonale lehrmitnelzentrale (rapperswil,..."
281680,1,0.616667,"interkantonale lehrmittelzentrale (rapperswil,...","interkantonale lehrmittelzentrale (raiperswil,..."
276489,1,0.614286,"interkantonale lehrmittelzentrale, staatlicher...","interkantonale lehrmittelzentrale, staatlicehr..."
315222,1,0.615385,"berliner philharmoniker, deutsche oper (berlin...","bverliner philharmoziker, deutsche oper (berli..."


0.6142857142857143 <= corporate_full_delta <= 0.6166666666666667


### doi

Swissbib uses an explicit $\texttt{doi}$ attribute for its deduplication implementation. In chapter [Goldstandard and Data Preparation](./2_GoldstandardDataPreparation.ipynb), the real doi identifier has been isolated with the help of a preprocessing function $\texttt{.reduce}\_\texttt{to}\_\texttt{doi}\_\texttt{element()}$, see [Data Analysis](./1_DataAnalysis.ipynb). Attribute $\texttt{doi}$ contains a single string value. The Identity metric will be used for comparing the string values of a pair in a row, calculating a similarity value of 1.0 or 0.0 for each pair. If one list is empty a value of 0 is returned.

In [20]:
attribute = 'doi'

# Register the Identity metric for doi and build the feature with that same instance
doi_metric = tedi.Identity()
columns_metadata_dict['similarity_metrics'][attribute] = doi_metric

df_feature_base, columns_metadata_dict = dpf.build_delta_feature(
    df_feature_base, attribute, doi_metric, columns_metadata_dict)

df_feature_base['doi_delta'].unique()

array([1., 0.])

Some sample cases are shown below for each category of $\texttt{doi}\_\texttt{delta}$.

In [21]:
uniques[attribute], uniques_len[attribute] = dpf.determine_similarity_values(
    df_feature_base, attribute)

# Show at most ten sample pairs for every distinct doi_delta value
for doi_delta_value in df_feature_base['doi_delta'].unique():
    rows_with_value = df_feature_base[df_feature_base['doi_delta'] == doi_delta_value]
    number_of_max_samples = min(10, len(rows_with_value))

    dpf.show_samples_distinct(df_feature_base, 'doi', doi_delta_value, number_of_max_samples)
    print(f'doi_delta = {doi_delta_value}')

doi values range [0. 1.]


Unnamed: 0,duplicates,doi_delta,doi_x,doi_y
46677,0,1.0,,
158078,0,1.0,,
28712,0,1.0,,
30232,0,1.0,,
143720,0,1.0,,
44436,0,1.0,,
41721,0,1.0,,
245413,0,1.0,,
30424,0,1.0,,
146825,0,1.0,,


doi_delta = 1.0


Unnamed: 0,duplicates,doi_delta,doi_x,doi_y
233119,0,0.0,,10.5169/seals-377251
14280,0,0.0,,10.5169/seals-376437
122307,0,0.0,,10.3931/e-rara-50903
21968,0,0.0,,10.5167/uzh-67756
120434,0,0.0,,10.5169/seals-376689
241450,0,0.0,,10.5169/seals-515356
88997,0,0.0,,10.5169/seals-376396
190222,0,0.0,10.1055/b-002-26639,
202876,0,0.0,,10.5169/seals-377251
255352,0,0.0,10.1055/b-005-143650,


doi_delta = 0.0


In [22]:
# Let's have a look at some non-empty doi elements
df_doi_with_element = df_feature_base[df_feature_base.doi_x.apply(lambda x : len(x) > 0)]

for doi_delta_value in df_feature_base['doi_delta'].unique():
    number_of_max_samples = min(
        10,
        len(df_feature_base[df_feature_base['doi_delta']==doi_delta_value])
    )

    dpf.show_samples_distinct(df_doi_with_element, 'doi', doi_delta_value, number_of_max_samples)
    print(f'doi_delta = {doi_delta_value}')

Unnamed: 0,duplicates,doi_delta,doi_x,doi_y
290453,1,1.0,10.5169/seals-376510,10.5169/seals-376510
319283,1,1.0,10.1007/978-3-642-41698-9,10.1007/978-3-642-41698-9
292577,1,1.0,10.5169/seals-377392,10.5169/seals-377392
291419,1,1.0,10.5169/seals-376925,10.5169/seals-376925
290640,1,1.0,10.5169/seals-376572,10.5169/seals-376572
292705,1,1.0,10.5169/seals-377422,10.5169/seals-377422
292046,1,1.0,10.5169/seals-377188,10.5169/seals-377188
291245,1,1.0,10.5169/seals-376850,10.5169/seals-376850
291491,1,1.0,10.5169/seals-376961,10.5169/seals-376961
290524,1,1.0,10.5169/seals-376539,10.5169/seals-376539


doi_delta = 1.0


Unnamed: 0,duplicates,doi_delta,doi_x,doi_y
168034,0,0.0,10.1093/cid/ciu795,
200480,0,0.0,10.1007/978-3-642-41698-9,10.5169/seals-376359
193508,0,0.0,10.5451/unibas-006499413,
207370,0,0.0,10.1093/cid/ciu795,
194206,0,0.0,10.5451/unibas-006503313,
154814,0,0.0,10.5167/uzh-53042,
194685,0,0.0,10.5451/unibas-006503313,
162765,0,0.0,10.1007/978-3-642-41698-9,
253151,0,0.0,10.1055/b-005-143650,
255482,0,0.0,10.1055/b-005-143650,


doi_delta = 0.0


As can be seen above, a value of 1.0 is returned if both strings of a pair are empty. As the attribute filling of $\texttt{doi}$ is sparse, see chapter [Data Analysis](./1_DataAnalysis.ipynb), the $\texttt{doi}\_\texttt{delta}$ strongly indicates a pair of duplicates for most rows. To avoid such misleading identity indication, function $\texttt{.mark}\_\texttt{missing()}$ will be applied to the attribute.

In [23]:
# Mark pairs with missing doi values via dpf.mark_missing(), scaled by `factor`, so that
# two empty strings no longer count as identical; the returned frame replaces df_feature_base.
df_feature_base = dpf.mark_missing(df_feature_base, attribute, factor)

### edition

In its original form in Swissbib's raw data, the edition statement is a string value which may have several words. The modelling on this attribute has been tried with and without stripping letter characters from the string. The final decision for the best processing will be documented in chapter [Overview and Summary](./0_OverviewSummary.ipynb). A Jaccard similarity is tried for this attribute.

In [24]:
attribute = 'edition'

# Register the Jaccard metric for edition and build the feature with that same instance
edition_metric = tedi.Jaccard()
columns_metadata_dict['similarity_metrics'][attribute] = edition_metric

df_feature_base, columns_metadata_dict = dpf.build_delta_feature(
    df_feature_base, attribute, edition_metric, columns_metadata_dict)

In [25]:
uniques[attribute], uniques_len[attribute] = dpf.determine_similarity_values(
    df_feature_base, attribute)

# Sorted distinct similarity values of the edition feature and their count;
# both are used for indexing by the sample views in the next cell.
# numpy (np) is imported in the notebook's import cell.
edition_delta_uniques = np.sort(df_feature_base['edition_delta'].unique())
edition_delta_uniques_len = len(edition_delta_uniques)
print('edition values range', edition_delta_uniques[:30])

edition values range [0.         0.125      0.14285714 0.16666667 0.2        0.25
 0.28571429 0.33333333 0.4        0.5        0.6        1.        ]
edition values range [0.         0.125      0.14285714 0.16666667 0.2        0.25
 0.28571429 0.33333333 0.4        0.5        0.6        1.        ]


The comparison results in a wide range of distinct similarity values for the goldstandard data set. Below, some examples are shown.

In [26]:
# `anchor` is the array index that `position` steps back to from the end.
position = edition_delta_uniques_len
anchor = edition_delta_uniques_len - position

# Here anchor is 0: (anchor-2, anchor-1) are negative indices and address the
# two highest values, (anchor, anchor+2) spans the three lowest values.
dpf.show_samples_interval(
    df_feature_base, 'edition',
    edition_delta_uniques[anchor - 2], edition_delta_uniques[anchor - 1], 10)
dpf.show_samples_interval(
    df_feature_base, 'edition',
    edition_delta_uniques[anchor], edition_delta_uniques[anchor + 2], 10)

# Two neighbouring values from the middle of the sorted array.
position = edition_delta_uniques_len//2
anchor = edition_delta_uniques_len - position

dpf.show_samples_interval(
    df_feature_base, 'edition',
    edition_delta_uniques[anchor - 2], edition_delta_uniques[anchor - 1], 10)

Unnamed: 0,duplicates,edition_delta,edition_x,edition_y
306961,1,1.0,,
7603,0,1.0,,
169190,0,1.0,,
88926,0,1.0,,
90888,0,1.0,,
103558,0,1.0,,
21850,0,1.0,,
85809,0,1.0,,
250349,0,1.0,,
320569,1,1.0,,


0.6 <= edition_delta <= 1.0


Unnamed: 0,duplicates,edition_delta,edition_x,edition_y
141418,0,0.0,,3.0
177066,0,0.0,,4.0
73421,0,0.0,,2.0
166175,0,0.0,,6.0
126915,0,0.0,,1994.0
181863,0,0.0,6.0,
135117,0,0.0,,3.0
246635,0,0.0,,2.0
90681,0,0.0,,2.0
234935,0,0.0,,1889.0


0.0 <= edition_delta <= 0.1428571428571428


Unnamed: 0,duplicates,edition_delta,edition_x,edition_y
55265,0,0.25,1,1943
41807,0,0.2,1791,13
41919,0,0.2,1791,10
181843,0,0.25,6,1926
248220,0,0.2,10425,2
104748,0,0.25,8,1899
184459,0,0.2,1863,11
123136,0,0.25,2,1926
248419,0,0.2,10425,2
105112,0,0.25,8,1899


0.19999999999999996 <= edition_delta <= 0.25


Again, for $\texttt{edition}\_\texttt{delta} = 1$, many empty values of the $\texttt{edition}$ attribute can be observed. These will be marked with the special negative value in the data with the goal to distinguish them from the matching attribute pairs.

In [27]:
# Give pairs with empty edition values the special marker value so that they are
# distinguishable from genuinely matching pairs (details live in dpf.mark_missing).
df_feature_base = dpf.mark_missing(df_feature_base, 'edition', factor)

In [28]:
# With position equal to the array length, the two index expressions below
# evaluate to -2 and -1, i.e. the two highest similarity values.
# Note: edition_delta_uniques was computed before mark_missing was applied.
position = edition_delta_uniques_len

dpf.show_samples_interval(
    df_feature_base, 'edition',
    edition_delta_uniques[edition_delta_uniques_len-position-2],
    edition_delta_uniques[edition_delta_uniques_len-position-1], 10)

Unnamed: 0,duplicates,edition_delta,edition_x,edition_y
317821,1,1.0,1863,1863
294230,1,1.0,5,5
321238,1,1.0,8,8
264619,1,1.0,2,2
267405,1,1.0,1885,1885
289203,1,1.0,11,11
983,1,1.0,8,8
282850,1,1.0,1,1
317167,1,1.0,2,2
297227,1,1.0,1928,1928


0.6 <= edition_delta <= 1.0


### exactDate

As discussed in chapter [Data Analysis](./1_DataAnalysis.ipynb), attribute $\texttt{exactDate}$ holds a year number stored in the first four digits. Letter 'u' is used as a placeholder for an unknown digit. The attribute may hold some month and day or a second year information in its second four digits, additionally.

The attribute will be kept as a string and will not be transformed to an integer. The feature attribute of the record pair to be compared will be calculated with a modified Hamming algorithm, see appendix [Comparison of Similarity Metrics](./B_CompareSimilarities.ipynb). The resulting similarity will be stored in a new attribute $\texttt{exactDate}\_\texttt{delta}$ which will be taken for the model calculation.

As can be seen in chapter [Decision Tree Model](./6_DecisionTreeModel.ipynb), this attribute is important for prediction. Different ways of increasing the weight of the unknown status of a digit have been tried. The different ways can be seen in the implementations below. The algorithm decided for the final simulation will be documented in chapter [Overview and Summary](./0_OverviewSummary.ipynb).

In [29]:
import string

def no_xor(x_side, y_side):
    """Count positions where a letter meets a different character.

    A position is counted when at least one of the two characters is a
    lowercase ASCII letter and the two characters differ. Despite the name
    this is not a strict exclusive-or: two different letters at the same
    position (e.g. 'a' against 'u') are counted as well.

    Parameters
    ----------
    x_side, y_side : str
        Strings that are compared position by position. If their lengths
        differ, only the common leading part is compared.

    Returns
    -------
    int
        Number of counted positions.
    """
    letters = string.ascii_lowercase
    # zip() stops at the shorter string, so unequal lengths cannot raise.
    return sum(
        1
        for x_char, y_char in zip(x_side, y_side)
        if x_char != y_char and (x_char in letters or y_char in letters)
    )

print('Example comparison results in a value of', no_xor('202a0aaa', '1920uuuu'))

Example comparison results in a value of 5


In [30]:
attribute = 'exactDate'
unknown_share = 16

# Rename the unknown-digit placeholder on the left-hand side only ('u' -> 'a').
#  A placeholder on one side can then never equal the character at the same
#  position on the other side, so it adds nothing to the Hamming similarity.
df_feature_base[attribute+'_x'] = df_feature_base[attribute+'_x'].str.replace('u', 'a')

# Hamming similarity of the two date strings.
columns_metadata_dict['similarity_metrics'][attribute] = tedi.Hamming()

df_feature_base, columns_metadata_dict = dpf.build_delta_feature(
    df_feature_base,
    attribute,
    columns_metadata_dict['similarity_metrics'][attribute],
    columns_metadata_dict,
)

# Per mode: number of letter positions credited back to the similarity,
#  each of them being worth 1/unknown_share.
#  'added_u': the larger of the two per-string letter counts.
#  'xor'    : the position-wise count returned by no_xor().
unknown_count_by_mode = {
    'added_u': lambda pair: max(pair[attribute+'_x'].count('a'),
                                pair[attribute+'_y'].count('u')),
    'xor': lambda pair: no_xor(pair[attribute+'_x'], pair[attribute+'_y']),
}

# Any other mode leaves the Hamming similarity untouched.
if exactDate_mode in unknown_count_by_mode:
    count_unknown = unknown_count_by_mode[exactDate_mode]
    pair_columns = [attribute+'_x', attribute+'_y', attribute+'_delta']
    df_feature_base[attribute+'_delta'] = df_feature_base[pair_columns].apply(
        lambda pair: pair[attribute+'_delta'] + count_unknown(pair)/unknown_share,
        axis=1
    )

In [31]:
# Random spot check of ten pairs with their computed similarity.
# NOTE(review): the sample is unseeded, so the displayed rows change on every
# run; pass random_state if a reproducible output is wanted.
df_feature_base[['exactDate_x', 'exactDate_y', 'exactDate_delta']].sample(n=10)

Unnamed: 0,exactDate_x,exactDate_y,exactDate_delta
265732,16001700,16001790,0.875
117616,1959aaaa,2017uuuu,0.25
178196,2005aaaa,19969999,0.25
193868,2016aaaa,1763uuuu,0.25
128808,1999aaaa,1870uuuu,0.375
75963,1960aaaa,1993uuuu,0.5
296559,2007aaaa,2007uuuu,0.75
48527,1979aaaa,uuuuuuuu,0.5
177,19791999,19791999,1.0
235262,1989aaaa,19952006,0.5


All resulting values of equal strings are equal to 1.

In [32]:
# Pairs with identical date strings must all reach the maximum similarity.
# Sort ascending so that head() shows the LOWEST deltas among them: if even
# these are 1.0, every identical pair is. (A descending sort would only show
# the largest values and could hide identical pairs with a smaller delta.)
identical_dates = df_feature_base.exactDate_x == df_feature_base.exactDate_y
df_feature_base.loc[
    identical_dates, ['exactDate_x', 'exactDate_y', 'exactDate_delta']
].sort_values('exactDate_delta', ascending=True).head()

Unnamed: 0,exactDate_x,exactDate_y,exactDate_delta
159,20022000,20022000,1.0
303973,20172016,20172016,1.0
303990,20172016,20172016,1.0
303989,20172016,20172016,1.0
303988,20172016,20172016,1.0


A discrete set of different similarity values can be found in the attribute deltas. Some sample records are shown below.

In [33]:
# Every distinct similarity value of the attribute, in ascending order.
exactDate_deltas = df_feature_base['exactDate_delta'].unique()
exactDate_deltas = np.sort(exactDate_deltas)
exactDate_deltas

array([0.    , 0.125 , 0.25  , 0.3125, 0.375 , 0.4375, 0.5   , 0.5625,
       0.625 , 0.6875, 0.75  , 0.8125, 0.875 , 1.    ])

In [34]:
sample_size = 5

# One sample table per distinct similarity value, followed by a line naming the value.
for delta_value in exactDate_deltas:
    dpf.show_samples_distinct(
        df_feature_base, 'exactDate', delta_value, sample_size)
    print(f'exactDate_delta = {delta_value}')

Unnamed: 0,duplicates,exactDate_delta,exactDate_x,exactDate_y
79314,0,0.0,20062005,19679999
26268,0,0.0,19791999,20002005
27743,0,0.0,20092005,19989999
43907,0,0.0,20091990,19282011
17425,0,0.0,19911990,20130627


exactDate_delta = 0.0


Unnamed: 0,duplicates,exactDate_delta,exactDate_x,exactDate_y
186560,0,0.125,20151475,16001700
103601,0,0.125,20091991,18631869
121754,0,0.125,19911794,20121970
51356,0,0.125,19969999,16001700
154005,0,0.125,18501875,19669999


exactDate_delta = 0.125


Unnamed: 0,duplicates,exactDate_delta,exactDate_x,exactDate_y
85525,0,0.25,2007aaaa,1889uuuu
172460,0,0.25,2015aaaa,1980uuuu
30794,0,0.25,20071990,1994uuuu
78767,0,0.25,2005aaaa,19972008
159001,0,0.25,2013aaaa,1994uuuu


exactDate_delta = 0.25


Unnamed: 0,duplicates,exactDate_delta,exactDate_x,exactDate_y
12901,0,0.3125,2005aaaa,181uuuuu
143871,0,0.3125,1982aaaa,200uuuuu
166661,0,0.3125,2013aaaa,192uuuuu
251282,0,0.3125,183aaaaa,20022010
133278,0,0.3125,2000aaaa,193uuuuu


exactDate_delta = 0.3125


Unnamed: 0,duplicates,exactDate_delta,exactDate_x,exactDate_y
135643,0,0.375,1764aaaa,1996uuuu
78608,0,0.375,2005aaaa,1995uuuu
155117,0,0.375,1970aaaa,1796uuuu
135616,0,0.375,1764aaaa,1979uuuu
234581,0,0.375,2009aaaa,1979uuuu


exactDate_delta = 0.375


Unnamed: 0,duplicates,exactDate_delta,exactDate_x,exactDate_y
90054,0,0.4375,170aaaaa,1992uuuu
242256,0,0.4375,1763aaaa,189uuuuu
119048,0,0.4375,1987aaaa,181uuuuu
5056,0,0.4375,1988aaaa,189uuuuu
89921,0,0.4375,170aaaaa,1885uuuu


exactDate_delta = 0.4375


Unnamed: 0,duplicates,exactDate_delta,exactDate_x,exactDate_y
69158,0,0.5,1963aaaa,1955uuuu
75488,0,0.5,1970aaaa,1992uuuu
144582,0,0.5,1982aaaa,1995uuuu
170740,0,0.5,1981aaaa,1999uuuu
237912,0,0.5,1999aaaa,1900uuuu


exactDate_delta = 0.5


Unnamed: 0,duplicates,exactDate_delta,exactDate_x,exactDate_y
217066,0,0.5625,1970aaaa,192uuuuu
181230,0,0.5625,2014aaaa,200uuuuu
33061,0,0.5625,1999aaaa,193uuuuu
30096,0,0.5625,1999aaaa,193uuuuu
230530,0,0.5625,1987aaaa,193uuuuu


exactDate_delta = 0.5625


Unnamed: 0,duplicates,exactDate_delta,exactDate_x,exactDate_y
144806,0,0.625,1982aaaa,1902uuuu
107312,0,0.625,2005aaaa,2004uuuu
23223,0,0.625,2010aaaa,2012uuuu
246731,0,0.625,2017aaaa,2013uuuu
108848,0,0.625,1982aaaa,1983uuuu


exactDate_delta = 0.625


Unnamed: 0,duplicates,exactDate_delta,exactDate_x,exactDate_y
264759,1,0.6875,1980aaaa,198u0uuu
266244,1,0.6875,200aaaaa,200uuuuu
283129,1,0.6875,2003aaaa,200u3uuu
274166,1,0.6875,1970aaaa,197u0uuu
293037,1,0.6875,2014aaaa,201u4uuu


exactDate_delta = 0.6875


Unnamed: 0,duplicates,exactDate_delta,exactDate_x,exactDate_y
297782,1,0.75,2015aaaa,2015uuuu
284059,1,0.75,1993aaaa,1993uuuu
265571,1,0.75,2010aaaa,2010uuuu
297824,1,0.75,2015aaaa,2015uuuu
282112,1,0.75,1998aaaa,1998uuuu


exactDate_delta = 0.75


Unnamed: 0,duplicates,exactDate_delta,exactDate_x,exactDate_y
263006,1,0.8125,19aa9999,1u9u9999
262996,1,0.8125,19aa9999,19u9u999
263032,1,0.8125,19aa9999,19u9u999
263018,1,0.8125,19aa9999,1u9u9999
263007,1,0.8125,19aa9999,1u9u9999


exactDate_delta = 0.8125


Unnamed: 0,duplicates,exactDate_delta,exactDate_x,exactDate_y
312718,1,0.875,19972002,19972006
311686,1,0.875,201310aa,201310uu
311631,1,0.875,201310aa,201310uu
263061,1,0.875,19aa9999,19uu9999
272636,1,0.875,200919aa,200919uu


exactDate_delta = 0.875


Unnamed: 0,duplicates,exactDate_delta,exactDate_x,exactDate_y
262315,1,1.0,19811995,19811995
271022,1,1.0,19809999,19809999
262091,1,1.0,19841992,19841992
262044,1,1.0,19841992,19841992
321446,1,1.0,19739999,19739999


exactDate_delta = 1.0


### format

Following the discussion in chapter [Data Analysis](./1_DataAnalysis.ipynb), attribute $\texttt{format}$ has been split up into two new attributes $\texttt{format}\_\texttt{prefix}$ and $\texttt{format}\_\texttt{postfix}$, which will be compared with different similarity metrics.

- As the quality of $\texttt{format}\_\texttt{prefix}$ is expected to be high, an identity comparison should be sufficient.
- Due to the observed structure of $\texttt{format}\_\texttt{postfix}$, a q-gram based comparison will be chosen.

In [35]:
attribute = 'format'
pfix_values = ['_prefix', '_postfix']

# Identity for the prefix, Jaccard on q-grams of length 2 for the postfix.
#  Both metrics are registered before any delta is built.
pfix_metrics = [tedi.Identity(), tedi.Jaccard(qval=2)]
for pf, metric in zip(pfix_values, pfix_metrics):
    columns_metadata_dict['similarity_metrics'][attribute+pf] = metric

for pf in pfix_values:
    df_feature_base, columns_metadata_dict = dpf.build_delta_feature(
        df_feature_base,
        attribute+pf,
        columns_metadata_dict['similarity_metrics'][attribute+pf],
        columns_metadata_dict,
    )

In [36]:
# Similarity values that occur among pairs whose prefixes differ.
prefix_differs = df_feature_base.format_prefix_x != df_feature_base.format_prefix_y

for prefix_delta in df_feature_base.format_prefix_delta[prefix_differs].unique():
    dpf.show_samples_distinct(df_feature_base, 'format_prefix', prefix_delta)
    print(f'format_prefix_delta = {prefix_delta}')

Unnamed: 0,duplicates,format_prefix_delta,format_prefix_x,format_prefix_y
93958,0,0.0,mu,bk
158163,0,0.0,vm,mu
172879,0,0.0,bk,mu
194377,0,0.0,bk,vm
67137,0,0.0,bk,mp


format_prefix_delta = 0.0


In [37]:
# Similarity values that occur among pairs whose postfixes differ.
# NOTE(review): the stored output lists a delta of 1.0 with values that look
# identical; the helper receives the full frame, and the two columns may differ
# in type or whitespace -- worth checking.
postfix_differs = df_feature_base.format_postfix_x != df_feature_base.format_postfix_y

for postfix_delta in df_feature_base.format_postfix_delta[postfix_differs].unique():
    dpf.show_samples_distinct(df_feature_base, 'format_postfix', postfix_delta)
    print(f'format_postfix_delta = {postfix_delta}')

Unnamed: 0,duplicates,format_postfix_delta,format_postfix_x,format_postfix_y
102040,0,0.428571,10300,10200
193022,0,0.428571,20300,20000
157994,0,0.428571,20000,20053
192969,0,0.428571,20300,20053
246961,0,0.428571,20000,20053


format_postfix_delta = 0.4285714285714286


Unnamed: 0,duplicates,format_postfix_delta,format_postfix_x,format_postfix_y
33539,0,0.111111,30300,40100
97625,0,0.111111,10300,20000
213176,0,0.111111,20053,10000
5880,0,0.111111,10300,20000
149425,0,0.111111,40100,20000


format_postfix_delta = 0.11111111111111116


Unnamed: 0,duplicates,format_postfix_delta,format_postfix_x,format_postfix_y
248615,0,0.25,10200,20353
181224,0,0.25,20000,20353
76041,0,0.25,10200,20347
40089,0,0.25,20000,20353
92644,0,0.25,20000,20353


format_postfix_delta = 0.25


Unnamed: 0,duplicates,format_postfix_delta,format_postfix_x,format_postfix_y
17390,0,0.0,40100,30653
173908,0,0.0,20000,30653
209111,0,0.0,20000,10347
247267,0,0.0,30653,20000
247492,0,0.0,30653,10000


format_postfix_delta = 0.0


Unnamed: 0,duplicates,format_postfix_delta,format_postfix_x,format_postfix_y
313893,1,1.0,10200,10200
313189,1,1.0,10100,10100
272578,1,1.0,10100,10100
288926,1,1.0,10000,10000
322938,1,1.0,20300,20300


format_postfix_delta = 1.0


Unnamed: 0,duplicates,format_postfix_delta,format_postfix_x,format_postfix_y
316551,1,0.666667,20000,200000
267710,1,0.666667,10300,100300
291464,1,0.666667,10053,100053
282442,1,0.666667,20000,200000
292399,1,0.666667,10053,100053


format_postfix_delta = 0.6666666666666666


### isbn

Swissbib uses each string element of the $\texttt{isbn}$ list separately for comparing with each string element of its comparison $\texttt{isbn}$ list. If two bibliographic units hold at least one element in common, this is interpreted as a strong indicator for duplicates [[WiCo2001](./A_References.ipynb#wico2001)].

This hard logic is used in a modified way in the context of this capstone project. A special comparison function $\texttt{.build}\_\texttt{delta}\_\texttt{isbn()}$ has been implemented that compares each list element of the left-hand side with each list element of the right-hand side of a pair. According to Swissbib's implementation, the Identity metric is used for string comparison, calculating a similarity value of 1.0 or 0.0 for each list element pair. For normalisation reasons, the sum of similarity values is divided by the number of elements of the smaller list. If both lists are empty a value of 1.0 is returned. If only one list is empty a value of 0.0 is returned.

In [38]:
attribute = 'isbn'

# Identity comparison of the isbn values; the metric is registered in the
# metadata and the registered entry is passed on to the delta builder.
metric_registry = columns_metadata_dict['similarity_metrics']
metric_registry[attribute] = tedi.Identity()

df_feature_base, columns_metadata_dict = dpf.build_delta_feature(
    df_feature_base,
    attribute,
    metric_registry[attribute],
    columns_metadata_dict,
)

# Distinct similarity values that were produced.
isbn_delta_column = df_feature_base[attribute+'_delta']
isbn_delta_column.unique()

array([1. , 0. , 0.5])

Some sample cases are shown below for each category of $\texttt{isbn}\_\texttt{delta}$.

In [39]:
# Up to ten samples per distinct similarity value; never ask for more rows
# than carry that value.
for isbn_delta_value in df_feature_base['isbn_delta'].unique():
    rows_with_value = int((df_feature_base['isbn_delta'] == isbn_delta_value).sum())
    number_of_max_samples = min(10, rows_with_value)

    dpf.show_samples_distinct(
        df_feature_base, 'isbn', isbn_delta_value, number_of_max_samples)
    print(f'isbn_delta = {isbn_delta_value}')

Unnamed: 0,duplicates,isbn_delta,isbn_x,isbn_y
17541,0,1.0,[],[]
293071,1,1.0,[978-3-944063-13-3],[978-3-944063-13-3]
300854,1,1.0,[],[]
300774,1,1.0,[],[]
309336,1,1.0,"[978-3-598-31501-5 (print), 978-3-11-097003-6]","[978-3-598-31501-5 (print), 978-3-11-097003-6]"
1414,1,1.0,[],[]
69451,0,1.0,[],[]
21980,0,1.0,[],[]
321747,1,1.0,"[978-3-495-48796-9, 3-495-48796-4]","[978-3-495-48796-9, 3-495-48796-4]"
10887,0,1.0,[],[]


isbn_delta = 1.0


Unnamed: 0,duplicates,isbn_delta,isbn_x,isbn_y
157440,0,0.0,"[978-3-13-127286-7, 978-3-13-150826-3 (PDF)]",[978-0-307-38684-7]
216339,0,0.0,[2-08-070552-0],"[3-906721-51-5 (Livre de l'élève), 978-3-90672..."
60917,0,0.0,[],[978-1-85584-230-4]
55747,0,0.0,[978-3-86772-104-2],[978-3-906721-38-5]
222140,0,0.0,[978-2-07-046833-1],[]
159126,0,0.0,[978-3-643-12370-1],[978-0-7294-1151-6]
21483,0,0.0,[3-499-17476-6],"[978-3-13-128546-1, 3-13-128546-X, 978-3-13-20..."
41101,0,0.0,[],[978-2-01-322734-6]
212576,0,0.0,[],"[978-0-19-953552-1, 0-19-953552-3]"
165511,0,0.0,[978-3-15-020008-7],[]


isbn_delta = 0.0


Unnamed: 0,duplicates,isbn_delta,isbn_x,isbn_y
1205,1,0.5,"[978-3-13-127286-7, 3-13-127286-4]","[978-3-13-127286-7, 978-3-13-150826-3 (PDF)]"
1199,1,0.5,"[978-3-13-127286-7, 978-3-13-150826-3 (PDF)]","[978-3-13-127286-7, 3-13-127286-4]"
1195,1,0.5,"[978-3-13-127286-7, 3-13-127286-4]","[978-3-13-127286-7, 978-3-13-150826-3 (PDF)]"
1202,1,0.5,"[978-3-13-127286-7, 978-3-13-150826-3 (PDF)]","[978-3-13-127286-7, 3-13-127286-4]"
1201,1,0.5,"[978-3-13-127286-7, 978-3-13-150826-3 (PDF)]","[978-3-13-127286-7, 3-13-127286-4]"
1210,1,0.5,"[978-3-13-127286-7, 3-13-127286-4]","[978-3-13-127286-7, 978-3-13-150826-3 (PDF)]"


isbn_delta = 0.5


For attribute $\texttt{isbn}$, the special marking of missing values is omitted.

### ismn

This attribute will be processed with the identity similarity metric. The reasoning for this decision is the same as for similar attributes above. 

In [40]:
attribute = 'ismn'

# Identity metric. tedi.Jaccard() was left here commented out, presumably as
# an alternative that had been tried.
metric_registry = columns_metadata_dict['similarity_metrics']
metric_registry[attribute] = tedi.Identity()

df_feature_base, columns_metadata_dict = dpf.build_delta_feature(
    df_feature_base,
    attribute,
    metric_registry[attribute],
    columns_metadata_dict,
)

In [41]:
# Keep the helper's two return values per attribute. Judging by the stored
# output, the helper also prints the attribute's value range.
uniques[attribute], uniques_len[attribute] = dpf.determine_similarity_values(
    df_feature_base, attribute)

ismn values range [0. 1.]


In [42]:
# Up to ten samples per distinct similarity value; never ask for more rows
# than carry that value.
delta_column = attribute+'_delta'

for ismn_delta_value in df_feature_base[delta_column].unique():
    rows_with_value = int((df_feature_base[delta_column] == ismn_delta_value).sum())
    number_of_max_samples = min(10, rows_with_value)

    dpf.show_samples_distinct(
        df_feature_base, 'ismn', ismn_delta_value, number_of_max_samples)
    print(f'ismn_delta = {ismn_delta_value}')

Unnamed: 0,duplicates,ismn_delta,ismn_x,ismn_y
319784,1,1.0,,
155951,0,1.0,,
58926,0,1.0,,
97119,0,1.0,,
86112,0,1.0,,
88048,0,1.0,,
310289,1,1.0,,
41264,0,1.0,,
322181,1,1.0,,
39780,0,1.0,,


ismn_delta = 1.0


Unnamed: 0,duplicates,ismn_delta,ismn_x,ismn_y
233676,0,0.0,,m008060205
79903,0,0.0,m006546756,
250981,0,0.0,m006204687,
79921,0,0.0,m006546756,
137162,0,0.0,m006204687,
79883,0,0.0,m006546756,
137258,0,0.0,m006204687,
250104,0,0.0,m200205343,
256384,0,0.0,"m006546756 (kritischer bericht, leinen)",
154156,0,0.0,,m006450510


ismn_delta = 0.0


As can be seen in the previous chapters, attribute $\texttt{ismn}$ is filled sparsely. A lot of missing values calculate to a value of 1.0 in the chosen similarity metrics. To mark these cases specifically, they will be transformed to a negative value.

In [43]:
# Move pairs with missing ismn values to the special marker value so that they
# are not mistaken for matches; `attribute` is still 'ismn' from the cells above.
df_feature_base = dpf.mark_missing(df_feature_base, attribute, factor)

### musicid

Chapter [Data Analysis](./1_DataAnalysis.ipynb) shows that attribute $\texttt{musicid}$ is an identifier for a music record. A Jaccard metric has been tested on this attribute, resulting in a distribution of many high similarity values on uniques. After comparing this result with the LCS metric, the latter has been chosen.

In [44]:
attribute = 'musicid'

# Longest-common-substring metric; tedi.Jaccard() was tested as well and
# discarded (see the text above).
metric_registry = columns_metadata_dict['similarity_metrics']
metric_registry[attribute] = tedi.LCSStr()

df_feature_base, columns_metadata_dict = dpf.build_delta_feature(
    df_feature_base,
    attribute,
    metric_registry[attribute],
    columns_metadata_dict,
)

In [45]:
# Keep the helper's two return values per attribute; the sample cells below
# index into uniques[attribute] and use uniques_len[attribute] as its length.
uniques[attribute], uniques_len[attribute] = dpf.determine_similarity_values(
    df_feature_base, attribute)

musicid values range [0.         0.125      0.14285714 0.16666667 0.2        0.25
 0.28571429 0.33333333 0.375      0.4        0.42857143 0.44444444
 0.5        0.55555556 0.57142857 0.6        0.625      0.66666667
 0.71428571 0.75       0.77777778 0.8        0.83333333 0.85714286
 0.875      1.        ]


In [46]:
# `anchor` is the array index that `position` steps back to from the end.
position = uniques_len[attribute]
anchor = uniques_len[attribute] - position

# Here anchor is 0: (anchor-2, anchor-1) are negative indices and address the
# last two entries, (anchor, anchor+2) spans the first three entries.
dpf.show_samples_interval(
    df_feature_base, attribute,
    uniques[attribute][anchor - 2], uniques[attribute][anchor - 1], 10)
dpf.show_samples_interval(
    df_feature_base, attribute,
    uniques[attribute][anchor], uniques[attribute][anchor + 2], 10)

# Two neighbouring entries from the middle of the array.
position = uniques_len[attribute]//2
anchor = uniques_len[attribute] - position

dpf.show_samples_interval(
    df_feature_base, attribute,
    uniques[attribute][anchor], uniques[attribute][anchor + 1], 10)

Unnamed: 0,duplicates,musicid_delta,musicid_x,musicid_y
1140,1,1.0,601,601
314047,1,1.0,245,245
314399,1,1.0,3714,3714
324431,1,1.0,134,134
287473,1,1.0,5714,5714
314567,1,1.0,4553,4553
323998,1,1.0,502023,502023
288196,1,1.0,502430,502430
314318,1,1.0,245,245
298696,1,1.0,172,172


0.875 <= musicid_delta <= 1.0


Unnamed: 0,duplicates,musicid_delta,musicid_x,musicid_y
213025,0,0.0,,
24553,0,0.0,3714.0,
185888,0,0.0,,
50068,0,0.0,,
21221,0,0.0,,
49461,0,0.0,,
242536,0,0.0,,
10801,0,0.0,,
38831,0,0.0,,
33390,0,0.0,,


0.0 <= musicid_delta <= 0.1428571428571429


Unnamed: 0,duplicates,musicid_delta,musicid_x,musicid_y
271685,1,0.571429,134134,13m4134
271679,1,0.571429,134134,1341d34
260931,1,0.571429,502023,50c2023
322710,1,0.571429,501326,5013c26
268558,1,0.571429,901842,9018w42
321033,1,0.571429,502430,5024i30
304762,1,0.571429,4357412,437412
260961,1,0.571429,502023,50h2023
271692,1,0.571429,134134,1341s34
324067,1,0.571429,502023,50m2023


0.5555555555555556 <= musicid_delta <= 0.5714285714285714


In [47]:
# Only pairs labelled as duplicates, across the whole span from the first to
# the last entry of uniques[attribute].
# NOTE(review): 20 is passed where the other calls pass 10 (presumably the
# sample count), yet the stored output lists 10 rows -- confirm.
dpf.show_samples_interval(
    df_feature_base[df_feature_base.duplicates==1], attribute,
    uniques[attribute][0],
    uniques[attribute][uniques_len[attribute]-1], 20)

Unnamed: 0,duplicates,musicid_delta,musicid_x,musicid_y
305360,1,0.0,,
274883,1,0.0,,
291261,1,0.0,,
262041,1,0.0,,
310499,1,0.0,,
321250,1,0.0,,
269176,1,0.0,4355.0,
283366,1,0.0,,
282259,1,0.0,,
289655,1,0.0,,


0.0 <= musicid_delta <= 1.0


Less than $10\%$ of the values of this attribute are filled. As the samples above show, the chosen metric results in a similarity value of 0.0 for pairs of empty values, so these pairs cannot be told apart from pairs of genuinely different values. This effect can be adjusted with function $\texttt{.mark}\_\texttt{missing}()$ as above. 

In [48]:
# Give pairs with missing musicid values the special marker value so that
# empty pairs are told apart from real comparisons (details in dpf.mark_missing).
df_feature_base = dpf.mark_missing(df_feature_base, 'musicid', factor)

### part

Analogous to attribute $\texttt{edition}$ described above, the string value of this attribute can be stripped to pure number digits. Both ways, with and without letter stripping, have been tried for modelling. The final decision for the best processing will be documented in chapter [Overview and Summary](./0_OverviewSummary.ipynb). Several metrics have been tried for attribute $\texttt{part}$; Jaro, Hamming and LCSStr remain as commented-out alternatives in the code below. The metric used in the code below is StrCmp95.

In [49]:
attribute = 'part'

# StrCmp95 is the metric in use. Alternatives that were left commented out
# here: tedi.Jaro(), tedi.Hamming(), tedi.LCSStr().
metric_registry = columns_metadata_dict['similarity_metrics']
metric_registry[attribute] = tedi.StrCmp95()

df_feature_base, columns_metadata_dict = dpf.build_delta_feature(
    df_feature_base,
    attribute,
    metric_registry[attribute],
    columns_metadata_dict,
)

In [50]:
# Keep the helper's two return values per attribute; the sample cell below
# indexes into uniques[attribute] and uses uniques_len[attribute] as its length.
uniques[attribute], uniques_len[attribute] = dpf.determine_similarity_values(
    df_feature_base, attribute)

part values range [0.         0.27407407 0.28240741 0.29202279 0.30740741 0.31318681
 0.31481481 0.32407407 0.32898551 0.33333333 0.33597884 0.33838384
 0.33921569 0.34444444 0.3452381  0.35128205 0.35185185 0.35897436
 0.36111111 0.36231884 0.36666667 0.37037037 0.37179487 0.37254902
 0.37301587 0.37407407 0.375      0.37777778 0.38333333 0.38461538
 0.38568376 0.38888889 0.38927739 0.39070048 0.39215686 0.39393939
 0.3952381  0.3960114  0.39646465 0.39722222 0.39814815 0.4
 0.4031746  0.4037037  0.40604575 0.40740741 0.40842491 0.40855763
 0.41025641 0.41111111 0.41125541 0.41176471 0.41203704 0.41282051
 0.41388889 0.41449275 0.41452991 0.41507937 0.41666667 0.4178744
 0.41798942 0.41851852 0.41919192 0.42083333 0.42222222 0.42261905
 0.42390289 0.4241453  0.42564103 0.42592593 0.42777778 0.42810458
 0.42857143 0.42948718 0.43030303 0.43055556 0.43115942 0.43174603
 0.43236715 0.43333333 0.43518519 0.43557423 0.43589744 0.43627451
 0.43650794 0.43703704 0.4375     0.43813131 0.43888

In [51]:
# With position equal to the array length, the index expressions of the first
# call evaluate to -2 and -1 (last two entries) and those of the second call
# to 0 and 2 (first three entries).
position = uniques_len[attribute]

dpf.show_samples_interval(
    df_feature_base, attribute,
    uniques[attribute][uniques_len[attribute]-position-2],
    uniques[attribute][uniques_len[attribute]-position-1], 10)
dpf.show_samples_interval(
    df_feature_base, attribute,
    uniques[attribute][uniques_len[attribute]-position],
    uniques[attribute][uniques_len[attribute]-position+2], 10)

# Step back one seventh of the array length from its end and show two
# neighbouring intervals around that index.
position = uniques_len[attribute]//7

dpf.show_samples_interval(
    df_feature_base, attribute,
    uniques[attribute][uniques_len[attribute]-position-2],
    uniques[attribute][uniques_len[attribute]-position-1], 10)
dpf.show_samples_interval(
    df_feature_base, attribute,
    uniques[attribute][uniques_len[attribute]-position],
    uniques[attribute][uniques_len[attribute]-position+2], 10)

Unnamed: 0,duplicates,part_delta,part_x,part_y
201573,0,1.0,,
46142,0,1.0,,
313971,1,1.0,,
121724,0,1.0,,
45662,0,1.0,,
174332,0,1.0,,
110754,0,1.0,,
99577,0,1.0,,
288142,1,1.0,,
303176,1,1.0,,


0.9215686274509803 <= part_delta <= 1.0


Unnamed: 0,duplicates,part_delta,part_x,part_y
135940,0,0.0,,9
136327,0,0.0,,2620 2620
148542,0,0.0,,2
209705,0,0.0,,24 23 1863
138546,0,0.0,,23 23 1869
219673,0,0.0,,20 20
219822,0,0.0,63 63,7
252699,0,0.0,313 2017,
202032,0,0.0,,7
252030,0,0.0,2017 313 2017,


0.0 <= part_delta <= 0.28240740740740744


Unnamed: 0,duplicates,part_delta,part_x,part_y
35449,0,0.714286,552 552,2
257260,0,0.714286,23 1862,2
159635,0,0.714286,5,285 285
214466,0,0.714286,912 912,2
197341,0,0.714286,23 1862,3
121147,0,0.714286,2,241 319
35312,0,0.714286,552 552,2
216314,0,0.714286,552 552,2
123315,0,0.714286,5,552 552
197324,0,0.714286,23 1862,2


0.7142857142857143 <= part_delta <= 0.7146464646464646


Unnamed: 0,duplicates,part_delta,part_x,part_y
197596,0,0.717338,23 1862,23 23 1870 23
257495,0,0.717338,23 1862,23 23 1889 23
166866,0,0.716667,28 10 2013 2421 2431,2 2
197570,0,0.717338,23 1862,23 23 1906 23
197585,0,0.717338,23 1862,23 23 1946 23
183896,0,0.716667,26 2000 26 2000,279 2000
197572,0,0.717338,23 1862,23 23 1870 23
257479,0,0.717338,23 1862,23 23 1988 23
183903,0,0.716667,26 2000 26 2000,286 2007
166702,0,0.716667,28 10 2013 2421 2431,282 2003


0.7159544159544161 <= part_delta <= 0.7173382173382173


In this attribute, too, moving pairs of empty values to negative values will result in a clearer distinction between pairs of uniques and duplicates, as will be seen in the graphical comparison of chapter [Features Discussion and Dummy Classifier Baseline](./5_FeatureDiscussionDummyBaseline.ipynb).

In [52]:
# Move pairs with empty part values to the special marker value
# (details in dpf.mark_missing).
df_feature_base = dpf.mark_missing(df_feature_base, 'part', factor)

### person

As a result of chapter [Data Analysis](./1_DataAnalysis.ipynb), attribute $\texttt{person}$ has been split into three specific attributes. Attributes $\texttt{person}\_{100}$ and $\texttt{person}\_{700}$ hold strongly standardised string values. For comparing pure strings, a Levenshtein metric is recommended [[Chri2012](./A_References.ipynb#chri2012)]. Unfortunately, this metric shows a very long calculation time on the data of the capstone project. After comparing the similarity values of the Levenshtein metric with the similarity values of other metrics in appendix [Comparison of Similarity Metrics](./B_CompareSimilarities.ipynb), similarity metric StrCmp95 has been chosen.

In [53]:
attribute = 'person'
pe_values = ['_100', '_700']

# StrCmp95 for both attributes, one metric instance each. tedi.Levenshtein()
# was left here commented out; the text above reports a very long calculation time for it.
for pe in pe_values:
    columns_metadata_dict['similarity_metrics'][attribute+pe] = tedi.StrCmp95()

for pe in pe_values:
    print('Calculating person'+pe)
    df_feature_base, columns_metadata_dict = dpf.build_delta_feature(
        df_feature_base,
        attribute+pe,
        columns_metadata_dict['similarity_metrics'][attribute+pe],
        columns_metadata_dict,
    )

Calculating person_100


Calculating person_700


In [54]:
pe = '_100'

# Store the distinct similarity values of person_100 and their count; the
# helper also prints the value range shown in the cell output.
# NOTE(review): the indexing in the sample cells presumes the values are
# sorted ascending -- confirm in dpf.determine_similarity_values.
uniques[attribute+pe], uniques_len[attribute+pe] = dpf.determine_similarity_values(
    df_feature_base, attribute+pe)

person_100 values range [0.         0.31944444 0.32777778 ... 0.99090909 0.99130435 1.        ]


In [55]:
# Number of distinct similarity values (the other sample cells set the same name).
position = uniques_len[attribute+pe]

# (lower, upper) index pairs into the distinct similarity values: first the
# two highest values, then the lowest value up to the third-lowest one.
for lower_idx, upper_idx in [(-2, -1), (0, 2)]:
    dpf.show_samples_interval(
        df_feature_base, attribute+pe,
        uniques[attribute+pe][lower_idx],
        uniques[attribute+pe][upper_idx], 10)

Unnamed: 0,duplicates,person_100_delta,person_100_x,person_100_y
245726,0,1.0,,
281769,1,1.0,,
122492,0,1.0,,
301776,1,1.0,,
226638,0,1.0,,
323523,1,1.0,voltaire,voltaire
17288,0,1.0,mozartwolfgang amadeus,mozartwolfgang amadeus
110026,0,1.0,mozartwolfgang amadeus,mozartwolfgang amadeus
295258,1,1.0,,
26073,0,1.0,,


0.9913043478260869 <= person_100_delta <= 1.0


Unnamed: 0,duplicates,person_100_delta,person_100_x,person_100_y
168492,0,0.0,,mozartwolfgang amadeus
163121,0,0.0,,bührerwalter
234199,0,0.0,rosoffmeg,
198452,0,0.0,,schubertfranz
35147,0,0.0,voltaire,
113629,0,0.0,,austenjane
119862,0,0.0,,bührerwalter
35957,0,0.0,mozartwolfgang amadeus,
93199,0,0.0,,bührerwalter
156703,0,0.0,trappehans-joachim,


0.0 <= person_100_delta <= 0.3277777777777777


For comparing person names, like in attribute $\texttt{person}\_{245c}$, a Jaro metric will be tested [[Chri2012](./A_References.ipynb#chri2012)].

In [56]:
pe = '_245c'

# person_245c holds person names as written in the statement of
# responsibility; the Jaro metric is tested for this kind of string.
columns_metadata_dict['similarity_metrics'][attribute+pe] = tedi.Jaro()

# Build the person_245c similarity feature with the metric registered above.
df_feature_base, columns_metadata_dict = dpf.build_delta_feature(
    df_feature_base, attribute+pe,
    columns_metadata_dict['similarity_metrics'][attribute+pe],
    columns_metadata_dict)

In [57]:
# Store the distinct similarity values of person_245c and their count; the
# helper also prints the value range shown in the cell output.
uniques[attribute+pe], uniques_len[attribute+pe] = dpf.determine_similarity_values(
    df_feature_base, attribute+pe)

person_245c values range [0.         0.24632035 0.25462963 ... 0.99881797 0.99882214 1.        ]


In [58]:
# Number of distinct similarity values (the other sample cells set the same name).
position = uniques_len[attribute+pe]

# (lower, upper) index pairs into the distinct similarity values: first the
# two highest values, then the lowest value up to the third-lowest one.
for lower_idx, upper_idx in [(-2, -1), (0, 2)]:
    dpf.show_samples_interval(
        df_feature_base, attribute+pe,
        uniques[attribute+pe][lower_idx],
        uniques[attribute+pe][upper_idx], 10)

Unnamed: 0,duplicates,person_245c_delta,person_245c_x,person_245c_y
292041,1,1.0,[walter bührer],[walter bührer]
49977,0,1.0,,
309031,1,1.0,"mortzfeld, peter; raabe, paul","mortzfeld, peter; raabe, paul"
312926,1,1.0,von w.a. mozart,von w.a. mozart
316623,1,1.0,hrsg. von attila csampai und dietmar holland,hrsg. von attila csampai und dietmar holland
292090,1,1.0,[walter bührer],[walter bührer]
292236,1,1.0,[walter bührer],[walter bührer]
273102,1,1.0,sigrid kessler ... [et al.],sigrid kessler ... [et al.]
210280,0,1.0,,
280525,1,1.0,"voltaire ; introd., notes , bibliogr., chronol...","voltaire ; introd., notes , bibliogr., chronol..."


0.9988221436984688 <= person_245c_delta <= 1.0


Unnamed: 0,duplicates,person_245c_delta,person_245c_x,person_245c_y
212668,0,0.0,,rudolf steiner
233882,0,0.0,,"mortzfeld, peter; raabe, paul"
62684,0,0.0,,g.h. dufour direxit ; h. müllhaupt sculpsit
234574,0,0.0,mozart,
145909,0,0.0,,mozart
210774,0,0.0,,"mortzfeld, peter; raabe, paul"
219825,0,0.0,beatrice käser,
165880,0,0.0,"[a.u. scherrer, b. ledergerber, v. von wyl, j....",
145813,0,0.0,,sigrid kessler... [et al.] ; [hrsg.:] interkan...
114194,0,0.0,wolfgang amadeus mozart ; einf. und komm. von ...,


0.0 <= person_245c_delta <= 0.25462962962962954


The similarities of all three $\texttt{person}$ attributes are affected by empty values. These will be handled the same way as the attributes above.

In [59]:
# All three person attributes contain empty values, so each similarity column
# gets its empty-value pairs re-coded as negative markers scaled by `factor`.
pe_values = ['_100', '_245c', '_700']

for pe in pe_values :
    df_feature_base = dpf.mark_missing(df_feature_base, 'person'+pe, factor)

### pubinit

This attribute holds publisher strings that have a similar representation as attribute $\texttt{person}$. A Jaro metric will be used.

In [60]:
attribute = 'pubinit'

# Publisher strings are represented similarly to the person strings, hence
# the Jaro metric is used here as well.
columns_metadata_dict['similarity_metrics'][attribute] = tedi.Jaro()

# Build the pubinit similarity feature with the metric registered above.
df_feature_base, columns_metadata_dict = dpf.build_delta_feature(
    df_feature_base, attribute,
    columns_metadata_dict['similarity_metrics'][attribute],
    columns_metadata_dict)

In [61]:
# Store the distinct similarity values of pubinit and their count; the helper
# also prints the value range shown in the cell output.
uniques[attribute], uniques_len[attribute] = dpf.determine_similarity_values(
    df_feature_base, attribute)

pubinit values range [0.         0.25132275 0.25303644 ... 0.9957265  0.99578059 1.        ]


In [62]:
# Offset from the top end: one third of the number of distinct values.
position = uniques_len[attribute]//3

# (lower, upper) index pairs into the distinct similarity values: one narrow
# interval a third of the way down from the top, then the five highest values.
for lower_idx, upper_idx in [
        (uniques_len[attribute]-position-1, uniques_len[attribute]-position),
        (uniques_len[attribute]-5, uniques_len[attribute]-1)]:
    dpf.show_samples_interval(
        df_feature_base, attribute,
        uniques[attribute][lower_idx],
        uniques[attribute][upper_idx], 10)

Unnamed: 0,duplicates,pubinit_delta,pubinit_x,pubinit_y
139222,0,0.593651,kinowelt home entertainment arthaus,peters
35564,0,0.593651,flammarion,"w. goldmann, musikverlag b. schott's söhne"
117991,0,0.593651,flammarion,"w. goldmann, musikverlag b. schott's söhne"
139390,0,0.593651,kinowelt home entertainment arthaus,peters
216429,0,0.593651,flammarion,"w. goldmann, musikverlag b. schott's söhne"
139371,0,0.593651,kinowelt home entertainment arthaus,peters
38877,0,0.593561,theater st. gallen,ed. del teatro alla scala
139219,0,0.593651,kinowelt home entertainment arthaus,peters
53702,0,0.593561,theater st. gallen,ed. del teatro alla scala


0.5935612535612536 <= pubinit_delta <= 0.5936507936507937


Unnamed: 0,duplicates,pubinit_delta,pubinit_x,pubinit_y
314588,1,1.0,,
77240,0,1.0,,
233373,0,1.0,,
65038,0,1.0,,
311491,1,1.0,,
196935,0,1.0,,
312934,1,1.0,,
273204,1,1.0,uam,uam
79184,0,1.0,,
283359,1,1.0,staatlicher lehrmittelverlag,staatlicher lehrmittelverlag


0.9954337899543378 <= pubinit_delta <= 1.0


The similarity values of $\texttt{pubinit}$ are affected by empty values. These will be transformed to negative values.

In [63]:
# Re-code the pubinit similarity of pairs with empty values as negative
# markers scaled by `factor`.
df_feature_base = dpf.mark_missing(df_feature_base, attribute, factor)

### scale

Based on the comparison of the similarity metrics for some sample value pairs of attribute $\texttt{scale}$ in appendix [Comparison of Similarity Metrics](./B_CompareSimilarities.ipynb), a Jaccard metric has been identified as showing the best matching behaviour for the purely numerical values stored in the attribute.

In [64]:
attribute = 'scale'

# Jaccard showed the best matching behaviour for the numerical scale values
# in the appendix comparison of similarity metrics.
columns_metadata_dict['similarity_metrics'][attribute] = tedi.Jaccard()

# Build the scale similarity feature with the metric registered above.
df_feature_base, columns_metadata_dict = dpf.build_delta_feature(
    df_feature_base, attribute,
    columns_metadata_dict['similarity_metrics'][attribute],
    columns_metadata_dict)

In [65]:
# Store the distinct similarity values of scale and their count; the helper
# also prints the value range shown in the cell output.
uniques[attribute], uniques_len[attribute] = dpf.determine_similarity_values(
    df_feature_base, attribute)

scale values range [0.         0.17857143 0.21428571 0.57142857 1.        ]


In [66]:
# Number of distinct similarity values; the upper indices count back from it.
position = uniques_len[attribute]

# (lower, upper) index pairs into the distinct similarity values: the lowest
# two values, then two adjacent intervals just below the maximum.
for lower_idx, upper_idx in [
        (0, 1),
        (position-3, position-2),
        (position-4, position-3)]:
    dpf.show_samples_interval(
        df_feature_base, attribute,
        uniques[attribute][lower_idx],
        uniques[attribute][upper_idx], 10)

Unnamed: 0,duplicates,scale_delta,scale_x,scale_y
103286,0,0.0,,100000.0
134727,0,0.0,,100000.0
253919,0,0.0,,100000.0
19354,0,0.0,,100000.0
38696,0,0.0,,100000.0
128189,0,0.0,,100000.0
34185,0,0.0,,100000.0
13430,0,0.0,,100000.0
238519,0,0.0,,100000.0
69380,0,0.0,50000.0,


0.0 <= scale_delta <= 0.1785714285714286


Unnamed: 0,duplicates,scale_delta,scale_x,scale_y
69170,0,0.571429,50000,100000
182434,0,0.571429,50000,100000
227827,0,0.214286,50 000 8 10 8 35 45 55 46 05,100000
54932,0,0.571429,50000,100000
69171,0,0.571429,50000,100000
227503,0,0.214286,50 000 8 10 8 35 45 55 46 05,100000
196999,0,0.571429,100000,50000
182746,0,0.571429,50000,100000
55280,0,0.571429,50000,100000
227838,0,0.214286,50 000 8 10 8 35 45 55 46 05,100000


0.2142857142857143 <= scale_delta <= 0.5714285714285714


Unnamed: 0,duplicates,scale_delta,scale_x,scale_y
856,1,0.178571,50000,50 000 8 10 8 35 45 55 46 05
227817,0,0.214286,50 000 8 10 8 35 45 55 46 05,100000
227842,0,0.214286,50 000 8 10 8 35 45 55 46 05,100000
861,1,0.178571,50000,50 000 8 10 8 35 45 55 46 05
227496,0,0.214286,50 000 8 10 8 35 45 55 46 05,100000
227500,0,0.214286,50 000 8 10 8 35 45 55 46 05,100000
875,1,0.178571,50 000 8 10 8 35 45 55 46 05,50000
227815,0,0.214286,50 000 8 10 8 35 45 55 46 05,100000
227834,0,0.214286,50 000 8 10 8 35 45 55 46 05,100000
227845,0,0.214286,50 000 8 10 8 35 45 55 46 05,100000


0.1785714285714286 <= scale_delta <= 0.2142857142857143


Attribute $\texttt{scale}$ is filled for maps, only. Due to its sparse filling, the similarities of the attribute are affected strongly by empty values. These empty values will be marked with a special negative value.

In [67]:
# scale is only filled for maps, so most pairs have empty values; re-code
# their similarity as negative markers scaled by `factor`.
df_feature_base = dpf.mark_missing(df_feature_base, attribute, factor)

### ttlfull

Following the discussion in chapter [Data Analysis](./1_DataAnalysis.ipynb), attribute $\texttt{ttlfull}$ has been split up into two new attributes $\texttt{ttlfull_245}$ and $\texttt{ttlfull_246}$, which will be compared using the same similarity metric. A visual analysis of the values stored in these attributes reveals strings of words, comparable to the strings in attribute $\texttt{person_245c}$ above. Therefore, the Jaro metric used for $\texttt{person_245c}$ will be applied to both title attributes.

In [68]:
attribute = 'ttlfull'
tf_values = ['_245', '_246']

# Both title attributes are compared with their own Jaro metric instance.
for tf in tf_values:
    columns_metadata_dict['similarity_metrics'][attribute+tf] = tedi.Jaro()

# Build the similarity feature for each title attribute in turn.
for tf in tf_values:
    df_feature_base, columns_metadata_dict = dpf.build_delta_feature(
        df_feature_base, attribute+tf,
        columns_metadata_dict['similarity_metrics'][attribute+tf],
        columns_metadata_dict)

In [69]:
# Store the distinct similarity values and their count for both title
# attributes; the helper prints one value-range line per attribute.
for tf in tf_values :
    uniques[attribute+tf], uniques_len[attribute+tf] = dpf.determine_similarity_values(
        df_feature_base, attribute+tf)

ttlfull_245 values range [0.         0.24175824 0.24358974 ... 0.99947257 0.99947341 1.        ]
ttlfull_246 values range [0.         0.27407407 0.35555556 ... 0.99944072 0.99945085 1.        ]


In [70]:
tf = '_245'
# Number of distinct similarity values; the upper indices count back from it.
position = uniques_len[attribute+tf]

# (lower, upper) index pairs into the distinct similarity values: the lowest
# two values, then two adjacent intervals just below the maximum.
for lower_idx, upper_idx in [
        (0, 1),
        (position-3, position-2),
        (position-4, position-3)]:
    dpf.show_samples_interval(
        df_feature_base, attribute+tf,
        uniques[attribute+tf][lower_idx],
        uniques[attribute+tf][upper_idx], 10)

Unnamed: 0,duplicates,ttlfull_245_delta,ttlfull_245_x,ttlfull_245_y
115815,0,0.0,bildungsforschung und bildungspraxis,emma
107982,0,0.0,emma,blick in die welt
137622,0,0.0,emma,blick in die welt
107986,0,0.0,emma,blick in die welt
33564,0,0.0,arts,blick in die welt
33850,0,0.0,arts,blick in die welt
137640,0,0.0,emma,blick in die welt
4249,0,0.0,emma,blick in die welt
186251,0,0.0,emma,blick in die welt
33852,0,0.0,arts,blick in die welt


0.0 <= ttlfull_245_delta <= 0.2417582417582418


Unnamed: 0,duplicates,ttlfull_245_delta,ttlfull_245_x,ttlfull_245_y
320293,1,0.999473,health informatics - personal health device co...,health informatics - personal health device co...
320326,1,0.999473,health informatics - personal health device co...,health ifnormatics - personal health device co...
320327,1,0.999473,health informatics - personal health device co...,health informatics - personal health device co...
320268,1,0.999473,health informatics - personal health device co...,health informatics - personal health device co...
320321,1,0.999473,health informatics - personal health device co...,health informatics - personal health edvice co...
320284,1,0.999473,health informatics - personal health device co...,health informatics - personal health device co...
320296,1,0.999473,health informatics - personal health device co...,health informatics - personal health device co...
320255,1,0.999473,health informatics - personal health device co...,health informatics - personal health device co...
320313,1,0.999473,health informatics - personal health device co...,health informatics - personal health device co...
320300,1,0.999473,health informatics - personal health device co...,health informatics - personal health device co...


0.9994725738396625 <= ttlfull_245_delta <= 0.9994734070563455


Unnamed: 0,duplicates,ttlfull_245_delta,ttlfull_245_x,ttlfull_245_y
320327,1,0.999473,health informatics - personal health device co...,health informatics - personal health device co...
320255,1,0.999473,health informatics - personal health device co...,health informatics - personal health device co...
320268,1,0.999473,health informatics - personal health device co...,health informatics - personal health device co...
320317,1,0.999473,health informatics - personal health device co...,health informatics - personal health device co...
319795,1,0.999463,health informatics - personal health device co...,health informatics - personal health device co...
319758,1,0.999463,health informatics - personal health device co...,health informatics - personal health device co...
320321,1,0.999473,health informatics - personal health device co...,health informatics - personal health edvice co...
320296,1,0.999473,health informatics - personal health device co...,health informatics - personal health device co...
320326,1,0.999473,health informatics - personal health device co...,health ifnormatics - personal health device co...
320300,1,0.999473,health informatics - personal health device co...,health informatics - personal health device co...


0.9994632313472893 <= ttlfull_245_delta <= 0.9994725738396625


Attribute $\texttt{ttlfull}\_\texttt{245}$ is filled for all data rows of Swissbib's raw data as can be seen in chapter [Data Analysis](./1_DataAnalysis.ipynb). For attribute $\texttt{ttlfull}\_\texttt{246}$, the filling is below $10\%$. The data pairs with missing values will be marked with a negative value as has been done for similar cases above.

In [71]:
# Only ttlfull_246 is re-coded: ttlfull_245 is filled for every record, so it
# has no empty-value pairs to mark.
df_feature_base = dpf.mark_missing(df_feature_base, attribute+'_246', factor)

### volumes

This attribute is described in chapter [Data Analysis](./1_DataAnalysis.ipynb) to hold a kind of contents that resembles the contents of attribute $\texttt{part}$. The same similarity metrics will be used for attribute $\texttt{volumes}$ as for attribute $\texttt{part}$, therefore.

In [72]:
attribute = 'volumes'

# Same metric as for attribute 'part'. Alternative metrics left for reference:
# tedi.Jaro(), tedi.LCSSeq(), tedi.MongeElkan()
columns_metadata_dict['similarity_metrics'][attribute] = tedi.StrCmp95()

# Build the volumes similarity feature with the metric registered above.
df_feature_base, columns_metadata_dict = dpf.build_delta_feature(
    df_feature_base, attribute,
    columns_metadata_dict['similarity_metrics'][attribute],
    columns_metadata_dict)

In [73]:
# Store the distinct similarity values of volumes and their count; the helper
# also prints the value range shown in the cell output.
uniques[attribute], uniques_len[attribute] = dpf.determine_similarity_values(
    df_feature_base, attribute)

volumes values range [0.         0.31318681 0.32222222 0.32905983 0.35128205 0.37301587
 0.37407407 0.38333333 0.38461538 0.3952381  0.40740741 0.41111111
 0.41666667 0.41737892 0.42857143 0.43650794 0.43703704 0.44017094
 0.44166667 0.44200244 0.44230769 0.44761905 0.4537037  0.45555556
 0.45833333 0.46296296 0.46428571 0.46581197 0.46666667 0.47008547
 0.47222222 0.47619048 0.47777778 0.48148148 0.48333333 0.48412698
 0.48611111 0.48888889 0.49007937 0.49145299 0.49206349 0.49365079
 0.4991453  0.5        0.50793651 0.51111111 0.51190476 0.52222222
 0.52380952 0.52564103 0.52777778 0.53333333 0.53653846 0.53703704
 0.53968254 0.54074074 0.54166667 0.54304029 0.54401709 0.54722222
 0.54761905 0.5491453  0.55       0.55128205 0.55555556 0.56031746
 0.56111111 0.56190476 0.56507937 0.56630037 0.56666667 0.56944444
 0.57407407 0.57478632 0.57777778 0.58148148 0.58333333 0.58730159
 0.58862434 0.58888889 0.59444444 0.5952381  0.59722222 0.5982906
 0.6        0.60119048 0.60320513 0.606837

In [74]:
# Number of distinct similarity values; the upper indices count back from it.
position = uniques_len[attribute]

# (lower, upper) index pairs into the distinct similarity values: the lowest
# two values, then two adjacent intervals just below the maximum.
for lower_idx, upper_idx in [
        (0, 1),
        (position-3, position-2),
        (position-4, position-3)]:
    dpf.show_samples_interval(
        df_feature_base, attribute,
        uniques[attribute][lower_idx],
        uniques[attribute][upper_idx], 10)

Unnamed: 0,duplicates,volumes_delta,volumes_x,volumes_y
159710,0,0.0,675,1
88431,0,0.0,3 1,
250336,0,0.0,1 32,600
235795,0,0.0,475,58
6147,0,0.0,2,1 56
61169,0,0.0,1,600
11134,0,0.0,3 64 71,1
167185,0,0.0,1,3 1
238545,0,0.0,135,
161071,0,0.0,1,249


0.0 <= volumes_delta <= 0.3131868131868132


Unnamed: 0,duplicates,volumes_delta,volumes_x,volumes_y
54818,0,0.933333,1 36,1 346
247561,0,0.933333,1 45,1 245
247463,0,0.916667,1 45,145
193148,0,0.933333,1 166,1 16
93524,0,0.933333,1 167,1 16
100431,0,0.916667,1 82,1 2
258762,0,0.916667,1 82,1 2
251050,0,0.933333,1 379,1 39
80266,0,0.933333,1 379,1 39
250334,0,0.916667,1 32,1 23


0.9166666666666666 <= volumes_delta <= 0.9333333333333332


Unnamed: 0,duplicates,volumes_delta,volumes_x,volumes_y
258760,0,0.916667,1 82,1 2
320,1,0.904762,1 169,1 0 169
13,1,0.904762,1 169,1 0 169
247463,0,0.916667,1 45,145
2114,0,0.904762,1 0 169,1 169
165068,0,0.916667,1 82,1 2
10889,0,0.916667,1 82,1 2
107548,0,0.916667,1 82,1 2
23932,0,0.916667,1 82,1 2
86200,0,0.916667,1 82,1 2


0.9047619047619048 <= volumes_delta <= 0.9166666666666666


Attribute $\texttt{volumes}$ holds rows with missing data. The data pairs with missing values will be marked with a special negative value.

In [75]:
# Re-code the volumes similarity of pairs with empty values as negative
# markers scaled by `factor`.
df_feature_base = dpf.mark_missing(df_feature_base, attribute, factor)

## DataFrame with Attributes and Similarity Features

The metric for each attribute of the feature DataFrame has been decided and the similarity features have been calculated. In this last step, the columns of the DataFrame are reordered in order to place the $\_\texttt{delta}$ columns close to their input origins $\_\texttt{x}$ and $\_\texttt{y}$ and some sample records are shown.

In [76]:
# Take _x, _y, and _delta columns together
fb_col_list = df_feature_base.columns.tolist()
fb_col_list.sort()
# Move target column to first place
fb_col_list.insert(0, fb_col_list.pop(fb_col_list.index('duplicates')))
# Reorder DataFrame columns
df_attribute_with_sim_feature = pd.DataFrame(df_feature_base, columns=fb_col_list)

# Extend display to number of columns of DataFrame
pd.options.display.max_columns = len(df_attribute_with_sim_feature.columns)

class_label = ['uniques', 'duplicate']

for i in class_label:
    display(df_attribute_with_sim_feature[df_attribute_with_sim_feature.duplicates==class_label.index(i)].sample(n=10))
    print(i)

Unnamed: 0,duplicates,coordinate_E_delta,coordinate_E_x,coordinate_E_y,coordinate_N_delta,coordinate_N_x,coordinate_N_y,corporate_full_delta,corporate_full_x,corporate_full_y,doi_delta,doi_x,doi_y,edition_delta,edition_x,edition_y,exactDate_delta,exactDate_x,exactDate_y,format_postfix_delta,format_postfix_x,format_postfix_y,format_prefix_delta,format_prefix_x,format_prefix_y,isbn_delta,isbn_x,isbn_y,ismn_delta,ismn_x,ismn_y,musicid_delta,musicid_x,musicid_y,part_delta,part_x,part_y,person_100_delta,person_100_x,person_100_y,person_245c_delta,person_245c_x,person_245c_y,person_700_delta,person_700_x,person_700_y,pubinit_delta,pubinit_x,pubinit_y,scale_delta,scale_x,scale_y,ttlfull_245_delta,ttlfull_245_x,ttlfull_245_y,ttlfull_246_delta,ttlfull_246_x,ttlfull_246_y,volumes_delta,volumes_x,volumes_y
127939,0,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.1,,,0.375,19001950,19819999,0.428571,10200,30100,0.0,mu,vm,1.0,[],[],-0.1,,,-0.05,245.0,,-0.1,,,-0.05,mozartwolfgang amadeus,,0.543241,von w.a. mozart ; klavierauszug neu rev. von w...,sigrid kessler ... [et al.],0.506324,"kienzlwilhelm, mozartwolfgang amadeus",kesslersigrid,-0.05,universal-edition,,-0.1,,,0.585522,"die zauberflöte, oper in zwei akten = il flaut...","bonne chance, cours de langue française, deuxi...",-0.1,,,-0.05,1 167,
202790,0,-0.1,,,-0.1,,,-0.05,schweizerische normen-vereinigung,,-0.1,,,-0.1,,,0.25,2011aaaa,1979uuuu,0.428571,20053,20000,1.0,bk,bk,0.0,[],[3-442-33001-7],-0.1,,,-0.05,,33001.0,-0.05,,33001 33001,-0.05,,mozartwolfgang amadeus,-0.05,,wolfgang amadeus mozart ; dieser opernführer w...,-0.05,,pahlenkurt,0.496337,schweizerische normen-vereinigung (snv),"w. goldmann, musikverlag b. schott's söhne",-0.1,,,0.497999,health informatics - personal health device co...,"die zauberflöte, originalausgabe",-0.05,medizinische informatik - kommunikation von ge...,,-0.05,,252
101968,0,-0.1,,,-0.1,,,-0.05,"metropolitan operaorchestra, metropolitan oper...",,-0.1,,,-0.1,,,0.5,2000aaaa,19001950,0.428571,10300,10100,0.0,vm,mu,1.0,[],[],-0.1,,,0.0,73.0,226.0,-0.1,,,1.0,mozartwolfgang amadeus,mozartwolfgang amadeus,0.593254,w.a. mozart ; libretto emanuel schikaneder,von w. a. mozart,0.513964,"schikanederemanuel, hockneydavid, coxjohn, lev...",mozartwolfgang amadeus,0.489352,deutsche grammophon gesellschaft,breitkopf & härtel,-0.1,,,0.719367,"die zauberflöte, oper in zwei aufzügen : kv 620","die zauberflöte, deutsche oper in 2 acten : kö...",-0.1,,,0.783333,1 169,1 26
251748,0,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.1,,,0.25,2017aaaa,1970uuuu,0.428571,10000,10100,0.0,bk,mu,1.0,[],[],-0.1,,,-0.05,,4553.0,0.525641,2017 313 2017,19,-0.05,,mozartwolfgang amadeus,-0.05,,wolfgang amadeus mozart ; vorgelegt von gernot...,-0.1,,,-0.1,,,-0.1,,,0.506323,"alles wissen dieser welt, warum bibliotheken n...","die zauberflöte, [kv 620]",-0.05,bibliotheken,,0.0,74,379
208253,0,-0.05,,e0074147,-0.05,,n0460833,-0.05,,eidgenössische landestopographie,-0.05,10.1093/cid/cir669,,-0.1,,,0.25,20111201,1926uuuu,0.428571,10053,10300,0.0,bk,mp,1.0,[],[],-0.1,,,-0.1,,,0.526548,53 11 2011 12 01 1143 1152,23 1926 23 1926 23,-0.1,,,0.525329,"[alexandra u. scherrer, bruno ledergerber, vik...",g. h. dufour direxit ; h. müllhaupt sculpsit,0.543723,"scherreralexandra u., ledergerberbruno, von wy...","dufourguillaume-henri, müllhauptheinrich",-0.05,,[eidg. landestopographie],-0.05,,100000.0,0.505947,improved virological outcome in white patients...,"domo d'ossola, arona",-0.05,,"[domodossola, arona]",-0.05,,1
113690,0,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.1,,,0.25,20071990,1835uuuu,0.428571,10300,10200,0.0,vm,mu,1.0,[],[],-0.1,,,0.166667,502023.0,134134.0,-0.1,,,-0.05,,mozartwolfgang amadeus,0.53805,regie: volker schlöndorff ; drehbuch: volker s...,von w.a. mozart ; clavier-auszug mit ital. u. ...,-0.05,"schlöndorffvolker, wurlitzerrudy, frischmax, m...",,-0.05,kinowelt home entertainment,,-0.1,,,0.480952,homo faber,"die zauberflöte, grosse oper in zwei aufzügen ...",-0.1,,,0.6,2 109,1 127
6188,0,-0.1,,,-0.1,,,-0.05,opernhaus orchester zürich,,-0.1,,,-0.1,,,0.625,2002aaaa,2007uuuu,0.111111,10300,20053,0.0,vm,bk,0.0,[],"[978-3-598-31805-4 (print), 978-3-11-096274-1]",-0.1,,,-0.1,,,-0.05,,45 45,0.633285,mozartwolfgang amadeus,mortzfeldpeter,-0.05,,"mortzfeld, peter; raabe, paul",-0.05,,raabepaul,0.642735,tdk recording media europe,de gruyter saur,-0.1,,,0.533119,die zauberflöte,katalog der graphischen porträts in der herzog...,-0.1,,,0.0,2,1 417
60227,0,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.1,,,0.5,1959aaaa,uuuuuuuu,0.428571,20000,10200,0.0,bk,vm,1.0,[],[],-0.1,,,-0.1,,,-0.05,2620,,-0.05,mozartwolfgang amadeus,,0.715058,wolfgang amadeus mozart ; dichtung von emanuel...,"wolfgang amadeus mozart, rens groot",0.591246,"schikanederemanuel, zentnerwilhelm","grootrens, mozartwolfgang amadeus",-0.05,,philips,-0.1,,,0.611643,"die zauberflöte, oper in zwei aufzügen",zauberflöte,-0.1,,,0.0,71,1
30470,0,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.05,,2.0,0.25,1999aaaa,2016uuuu,0.428571,20000,20053,1.0,bk,bk,0.0,[3-495-47879-5],[978-3-495-81796-4],-0.1,,,-0.1,,,1.0,57,57,1.0,fluryandreas,fluryandreas,1.0,andreas flury,andreas flury,-0.1,,,-0.1,,,-0.1,,,1.0,"der moralische status der tiere, henry salt, p...","der moralische status der tiere, henry salt, p...",-0.1,,,0.0,316,1
207489,0,-0.1,,,-0.1,,,-0.1,,,-0.05,10.1093/cid/ciu795,,-0.1,,,0.25,20150201,1982uuuu,0.111111,10053,20000,1.0,bk,bk,1.0,[],[],-0.1,,,-0.1,,,-0.05,60 3 2015 02 01 432 437,,-0.1,,,0.59056,"[virginie rozot, amelio patrizia, selena vigan...",heidy binder... [et al.] ; [éd.:] interkantona...,-0.05,"rozotvirginie, patriziaamelio, viganoselena, m...",,-0.05,,staatlicher lehrmittelverl.,-0.1,,,0.587057,combined use of mycobacterium tuberculosis-spe...,"bonne chance!, cours de langue française, prem...",-0.1,,,-0.05,,72


uniques


Unnamed: 0,duplicates,coordinate_E_delta,coordinate_E_x,coordinate_E_y,coordinate_N_delta,coordinate_N_x,coordinate_N_y,corporate_full_delta,corporate_full_x,corporate_full_y,doi_delta,doi_x,doi_y,edition_delta,edition_x,edition_y,exactDate_delta,exactDate_x,exactDate_y,format_postfix_delta,format_postfix_x,format_postfix_y,format_prefix_delta,format_prefix_x,format_prefix_y,isbn_delta,isbn_x,isbn_y,ismn_delta,ismn_x,ismn_y,musicid_delta,musicid_x,musicid_y,part_delta,part_x,part_y,person_100_delta,person_100_x,person_100_y,person_245c_delta,person_245c_x,person_245c_y,person_700_delta,person_700_x,person_700_y,pubinit_delta,pubinit_x,pubinit_y,scale_delta,scale_x,scale_y,ttlfull_245_delta,ttlfull_245_x,ttlfull_245_y,ttlfull_246_delta,ttlfull_246_x,ttlfull_246_y,volumes_delta,volumes_x,volumes_y
281752,1,-0.1,,,-0.1,,,1.0,"interkantonale lehrmittelzentrale (rapperswil,...","interkantonale lehrmittelzentrale (rapperswil,...",-0.1,,,1.0,2.0,2.0,0.75,1999aaaa,1999uuuu,1.0,20000,20000,1.0,bk,bk,1.0,[3-906721-48-5],[3-906721-48-5],-0.1,,,-0.1,,,-0.1,,,-0.1,,,0.994326,sigrid kessler... [et al.] ; [hrsg.:] interkan...,sigrid kessler..j. [et al.] ;[ hrsg.:] interka...,0.852293,"kesslersigrid, hubercharles","rkesslersigrid, hubercharles",0.97619,staatlicher lehrmittelverlag,staatlichear lehrmittelerlag,-0.1,,,0.974155,"bonne chance!, cours de langue française, 3, c...","bonn echance!, cours de langul française, 3, c...",-0.1,,,1.0,181,181
263688,1,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.1,,,0.75,2003aaaa,2003uuuu,1.0,10300,10300,1.0,vm,vm,1.0,[],[],-0.1,,,-0.1,,,-0.1,,,-0.1,,,1.0,w.a. mozart ; video director brian large,w.a. mozart ; video director brian large,0.988235,"mozartwolfgang amadeus, largebrian","mozartwolfgang amadeus, largebrimn",0.884259,del prado,del rayo,-0.1,,,0.931746,die zauberflöte,dei zauberfyöte,-0.1,,,1.0,1 169,1 169
285703,1,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.1,,,0.75,1998aaaa,1998uuuu,1.0,20000,20000,1.0,bk,bk,1.0,[],[],-0.1,,,-0.1,,,-0.1,,,0.943162,kesslersigrid,keslersigrid,1.0,sigrid kessler [u.a.],sigrid kessler [u.a.],-0.1,,,0.928982,staatlicher lehrmittelverlag,staatliciher lehrmittelverlag,-0.1,,,1.0,"bonne chance!, cours de langue française, nouv...","bonne chance!, cours de langue française, nouv...",-0.1,,,1.0,107,107
259757,1,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.1,,,0.75,2000aaaa,2000uuuu,1.0,20047,20047,1.0,bk,bk,1.0,[3-932992-42-3],[3-932992-42-3],-0.1,,,-0.1,,,-0.1,,,-0.1,,,0.981481,max frisch (u.a.),max frisfch (u.a.),0.977778,frischmax,frishmax,0.888889,terzio,terzao,-0.1,,,0.901122,"homo faber, originaltext, interpretation, biog...","homo faber, originalext, interpretation, biogr...",-0.1,,,1.0,1,1
266249,1,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.1,,,0.6875,200aaaaa,200uuuuu,1.0,20000,20000,1.0,bk,bk,1.0,[978-0-14-062010-8],[978-0-14-062010-8],-0.1,,,-0.1,,,-0.1,,,0.98,austenjane,austenjan,1.0,jane austen,jane austen,-0.1,,,0.948718,penguin books,penguin boofk,-0.1,,,0.933333,emma,embma,-0.1,,,1.0,367,367
282246,1,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.1,,,0.75,1999aaaa,19997uuu,1.0,20000,20000,1.0,bk,bk,1.0,[0-7294-0706-3],[0-7294-0706-3],-0.1,,,-0.1,,,-0.1,,,0.977778,voltaire,voltaifre,1.0,voltaire ; introd. et notes par john renwick,voltaire ; introd. et notes par john renwick,0.945455,renwickjohn,raenwicjohn,1.0,voltaire foundation,voltaire foundation,-0.1,,,1.0,traité sur la tolérance,traité sur la tolérance,-0.1,,,1.0,142,142
267002,1,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.1,,,0.75,2006aaaa,2006uuuu,1.0,20000,20000,1.0,bk,bk,1.0,"[978-1-85584-193-2, 1-85584-193-2]","[978-1-85584-193-2, 1-85584-193-2]",-0.1,,,-0.1,,,-0.1,,,1.0,steinerrudolf,steinerrudolf,0.952381,rudolf steiner,erdolf steiner,-0.1,,,1.0,sophia books,sophia books,-0.1,,,0.986667,how do i find the christ?,how do i findt he christ?,-0.1,,,1.0,58,58
292475,1,-0.1,,,-0.1,,,-0.1,,,1.0,10.5169/seals-377332,10.5169/seals-377332,-0.1,,,0.75,2007aaaa,2007uuuu,1.0,10053,10053,1.0,bk,bk,1.0,[],[],-0.1,,,-0.1,,,1.0,286 2007,286 2007,0.94697,bührerwalter,bührewaltre,0.955556,[walter bührer],mwalter bührer],-0.1,,,-0.1,,,-0.1,,,0.919118,blick in die welt,blick mn die welt,-0.1,,,-0.1,,
287886,1,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.1,,,1.0,19001950,19001950,1.0,10100,10100,1.0,mu,mu,1.0,[],[],-0.1,,,-0.1,,,-0.1,,,0.990909,mozartwolfgang amadeus,mozarwolfgang amadeus,0.998148,von emanuel schikaneder ; musik von w. a. moza...,von emanuel schikaneder ; musik von w. a. mzoa...,1.0,"aberthermann, schikanederemanuel, mozartwolfga...","aberthermann, schikanederemanuel, mozartwolfga...",0.944444,e. eulenburg,e.peulenburg,-0.1,,,0.961345,"die zauberflöte, eine deutsche oper","die zauberflöte, eine deutscheroper",-0.1,,,1.0,1 412,1 412
280654,1,-0.1,,,-0.1,,,-0.1,,,-0.1,,,-0.1,,,0.75,1998aaaa,1998uuuu,1.0,20000,20000,1.0,bk,bk,1.0,[3-15-002620-2],[3-15-002620-2],-0.1,,,-0.1,,,1.0,2620 2620,2620 2620,0.981818,mozartwolfgang amadeus,mozarjwolfgang amadeus,0.996255,wolfgang amadeus mozart ; libretto von emanuel...,wolfgang amadeus mozart ; libretto von emanuel...,0.994595,"kochhans-albrecht, schikanederemanuel","kochhans-albrceht, schikanederemanuel",1.0,p. reclam jun.,p. reclam jun.,-0.1,,,0.942018,"die zauberflöte, kv 620 : eine grosse oper in ...","ide zauberflöte, kv 620g: eine grosse opr in z...",-0.1,,,1.0,90,90


duplicate


## Summary

This chapter covers the central area of feature construction. The features of the feature matrix have been generated for each attribute of Swissbib's raw data, deciding on its similarity metric. With these metric values, the feature base DataFrame has been extended and a new DataFrame with the attribute values of the pairs together with their calculated similarity value has been generated. The similarity values will be the final features for training and performance testing of the models, compare [[JudACaps](./A_References.ipynb#judacaps)].

In [77]:
# Display the similarity metric object registered for each attribute.
columns_metadata_dict['similarity_metrics']

{'coordinate_E': LCSStr({'qval': 1, 'external': True}),
 'coordinate_N': LCSStr({'qval': 1, 'external': True}),
 'corporate_full': LCSStr({'qval': 1, 'external': True}),
 'doi': Identity({'qval': 1, 'external': True}),
 'edition': Jaccard({'qval': 1, 'as_set': False, 'external': True}),
 'exactDate': Hamming({'qval': 1, 'test_func': <function Base._ident at 0x10b7c2950>, 'truncate': False, 'external': True}),
 'format_prefix': Identity({'qval': 1, 'external': True}),
 'format_postfix': Jaccard({'qval': 2, 'as_set': False, 'external': True}),
 'isbn': Identity({'qval': 1, 'external': True}),
 'ismn': Identity({'qval': 1, 'external': True}),
 'musicid': LCSStr({'qval': 1, 'external': True}),
 'part': StrCmp95({'long_strings': False, 'external': True}),
 'person_100': StrCmp95({'long_strings': False, 'external': True}),
 'person_700': StrCmp95({'long_strings': False, 'external': True}),
 'person_245c': Jaro({'qval': 1, 'long_tolerance': False, 'winklerize': False, 'external': True}),
 'pu

The similarity metric decided for each attribute has been added as an additional kind of information to the columns metadata dictionary. The following table gives this summary in a structured form and lists the metric used for each attribute. For better orientation, attributes with the same font color hold similar types of values (see the description column).

| attribute     | subtype | description | similarity metric |
| ------------- |:--------|:------------|:------------------|
|<font color='red'>[coordinate](#coordinate)</font>|<font color='red'>\_E</font>|<font color='red'>Code(9)</font>|<font color='red'>LCSStr</font>|
|               |<font color='red'>\_N</font>|<font color='red'>Code(9)</font>|<font color='red'>LCSStr</font>|
|<font color='blue'>[corporate](#corporate)</font>|<font color='blue'>\_full</font>|<font color='blue'>Name</font>|<font color='blue'>LCSStr</font>|
|<font color='green'>[doi](#doi)</font>|         |<font color='green'>Identifier</font>|<font color='green'>Identity</font>|
|<font color='orange'>[edition](#edition)</font>|         |<font color='orange'>Number</font>|<font color='orange'>Jaccard</font>|
|<font color='black'>[exactDate](#exactDate)</font>|         |<font color='black'>Date</font>|<font color='black'>Hamming</font>|
|<font color='red'>[format](#format)</font>|<font color='red'>\_prefix</font>|<font color='red'>Code(2)</font>|<font color='red'>Identity</font>|
|               |<font color='red'>\_postfix</font>|<font color='red'>Code(6)</font>|<font color='red'>Jaccard (qval=2)</font>|
|<font color='green'>[isbn](#isbn)</font>|         |<font color='green'>Identifier</font>|<font color='green'>Identity</font>|
|<font color='green'>[ismn](#ismn)</font>|         |<font color='green'>Identifier</font>|<font color='green'>Identity</font>|
|<font color='green'>[musicid](#musicid)</font>|         |<font color='green'>Identifier</font>|<font color='green'>LCSStr</font>|
|<font color='orange'>[part](#part)</font>|         |<font color='orange'>Number</font>|<font color='orange'>StrCmp95</font>|
|<font color='blue'>[person](#person)</font>|<font color='blue'>\_100</font>|<font color='blue'>Name</font>|<font color='blue'>StrCmp95</font>|
|               |<font color='blue'>\_700</font>|<font color='blue'>Name</font>|<font color='blue'>StrCmp95</font>|
|               |<font color='blue'>\_245c</font>|<font color='blue'>Name</font>|<font color='blue'>Jaro</font>|
|<font color='blue'>[pubinit](#pubinit)</font>|         |<font color='blue'>Name</font>|<font color='blue'>Jaro</font>|
|<font color='orange'>[scale](#scale)</font>|         |<font color='orange'>Number</font>|<font color='orange'>Jaccard</font>|
|<font color='blue'>[ttlfull](#ttlfull)</font>|<font color='blue'>\_245</font>|<font color='blue'>String</font>|<font color='blue'>Jaro</font>|
|               |<font color='blue'>\_246</font>|<font color='blue'>String</font>|<font color='blue'>Jaro</font>|
|<font color='orange'>[volumes](#volumes)</font>|         |<font color='orange'>Number</font>|<font color='orange'>StrCmp95</font>|

### Full Feature Matrix with Target Vector Handover

To hand over the resulting DataFrame of this chapter, the DataFrame is saved into a pickle file that will be read as input in chapter [Features Discussion and Dummy Classifier Baseline](./5_FeatureDiscussionDummyBaseline.ipynb).

In [78]:
# Persist the labelled feature matrix as a bz2-compressed pickle
feature_matrix_path = os.path.join(path_goldstandard, 'labelled_feature_matrix_full.pkl')
with bz2.BZ2File(feature_matrix_path, 'w') as feature_matrix_file:
    pk.dump(df_attribute_with_sim_feature, feature_matrix_file)

# Persist the docid index DataFrame as a plain binary pickle
docids_path = os.path.join(path_goldstandard, 'index_docids_df.pkl')
with open(docids_path, 'wb') as docids_file:
    pk.dump(df_index_docids, docids_file)

The full metadata dictionary is to be persisted for handover to subsequent chapters.

In [79]:
# The target is still needed for the feature matrix.
# Guard the append so that re-running this cell cannot add the target
# a second time to the features list.
if 'duplicates' not in columns_metadata_dict['features']:
    columns_metadata_dict['features'].append('duplicates')

# Print each metadata entry: the key on one line, its value on the next
for key, value in columns_metadata_dict.items():
    print(key, '\n', value, '\n')

data_analysis_columns 
 ['coordinate_E', 'coordinate_N', 'corporate_full', 'doi', 'edition', 'exactDate', 'format_prefix', 'format_postfix', 'isbn', 'ismn', 'musicid', 'part', 'person_100', 'person_700', 'person_245c', 'pubinit', 'scale', 'ttlfull_245', 'ttlfull_246', 'volumes'] 

columns_to_use 
 ['duplicates', 'coordinate_E_x', 'coordinate_E_y', 'coordinate_N_x', 'coordinate_N_y', 'corporate_full_x', 'corporate_full_y', 'doi_x', 'doi_y', 'edition_x', 'edition_y', 'exactDate_x', 'exactDate_y', 'format_prefix_x', 'format_prefix_y', 'format_postfix_x', 'format_postfix_y', 'isbn_x', 'isbn_y', 'ismn_x', 'ismn_y', 'musicid_x', 'musicid_y', 'part_x', 'part_y', 'person_100_x', 'person_100_y', 'person_700_x', 'person_700_y', 'person_245c_x', 'person_245c_y', 'pubinit_x', 'pubinit_y', 'scale_x', 'scale_y', 'ttlfull_245_x', 'ttlfull_245_y', 'ttlfull_246_x', 'ttlfull_246_y', 'volumes_x', 'volumes_y'] 

similarity_metrics 
 {'coordinate_E': LCSStr({'qval': 1, 'external': True}), 'coordinate_N': L

In [80]:
# Pickle the columns metadata dictionary into a binary intermediary file
metadata_path = os.path.join(path_goldstandard, 'columns_metadata.pkl')
with open(metadata_path, 'wb') as metadata_file:
    pk.dump(columns_metadata_dict, metadata_file)