# Data Preparation

In [1]:
import os
import json
import pandas as pd
import general_transformations as gt

In [2]:
path_goldstandard = './daten_goldstandard'


def _read_json_lines(directory, file_name):
    """Return the parsed JSON documents of a JSON-lines file.

    Parameters
    ----------
    directory : str
        Folder that contains the file.
    file_name : str
        Name of a file holding one JSON document per line.

    Returns
    -------
    list
        One parsed object per non-blank line, in file order.
    """
    file_path = os.path.join(directory, file_name)
    # The context manager closes the handle even when a line fails to parse;
    # JSON text is read as UTF-8 instead of the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Blank lines (e.g. a trailing newline) carry no record and are skipped.
        return [json.loads(line) for line in handle if line.strip()]


records_master = _read_json_lines(path_goldstandard, 'master.json')
records_slave = _read_json_lines(path_goldstandard, 'slave.json')
records_unique = _read_json_lines(path_goldstandard, 'unique.json')

len(records_master), len(records_slave), len(records_unique)

(159, 435, 596)

In [3]:
# One DataFrame per record list: masters, slaves and unique records.
df_m, df_s, df_u = (
    pd.DataFrame(records)
    for records in (records_master, records_slave, records_unique)
)

# Number of columns of each frame.
tuple(frame.shape[1] for frame in (df_m, df_s, df_u))

(23, 23, 23)

In [4]:
# Let pandas render every column of the master frame instead of eliding some.
pd.set_option('display.max_columns', len(df_m.columns))

df_m.head()

Unnamed: 0,035liste,century,coordinate,corporate,decade,docid,doi,edition,exactDate,format,isbn,ismn,musicid,pages,part,person,pubinit,pubword,pubyear,scale,ttlfull,ttlpart,volumes
0,"[(IDSBB)002447452, (ALEX)9912923344101791]",2000,[],{},2000,264853032,[],,2000,"[BK020000, BK020800]",[0-87834-101-3],[],,[319 S.],"[Vol. 26, 26, 2000]","{'100': [], '700': ['ChuC.Y. Cyrus', 'LeeRonal...",[],[],2000,,{'245': ['Population and economic change in Ea...,{'245': ['Population and economic change in Ea...,[319 S.]
1,"[(IDSBB)000369647, (NEBIS)005609528]",1978,[],{},1978,264853288,[],,1978,[BK020000],[],[],,[252 S.],[33001/680],{'100': ['MozartWolfgang Amadeus1756-1791(DE-5...,[],[],1978,,{'245': ['Die Zauberflöte']},{'245': ['Die Zauberflöte']},[252 S.]
2,"[(IDSLU)001293605, (IDSBB)006725532, (NEBIS)01...",2017,[],{},2017,264853539,[],,2017,[BK010000],[],[],,[74 Seiten],[Nr. 313 (August 2017)],"{'100': [], '700': ['SchneiderReto U.1963-(DE-...",[],[],2017,,"{'245': ['Alles Wissen dieser Welt', 'warum Bi...","{'245': ['Alles Wissen dieser Welt', 'warum Bi...",[74 Seiten]
3,"[(NATIONALLICENCE)oxford-10.1093/cid/ciu795, (...",2015,[],{},2015,264853784,[10.1093/cid/ciu795],,20150201,[BK010053],[],[10.1093/cid/ciu795],,[],"[60/3(2015-02-01), 432-437]","{'100': [], '700': ['RozotVirginieDivision of ...",[],[],20150201,,{'245': ['Combined Use of Mycobacterium tuberc...,{'245': ['Combined Use of Mycobacterium tuberc...,[]
4,"[(NATIONALLICENCE)oxford-10.1093/ndt/gft319, (...",2013,[],{},2013,264854039,[10.1093/ndt/gft319],,201310,[BK010053],[],[10.1093/ndt/gft319],,[],"[28/10(2013-10), 2421-2431]","{'100': [], '700': ['BonnyOlivierDepartment of...",[],[],201310,,{'245': ['Molecular bases of circadian rhythmi...,{'245': ['Molecular bases of circadian rhythmi...,[]


## Build DataFrames for Transformation into Feature Matrix

In [5]:
# Columns kept from the paired frames: the compared attributes of the left
# (_x) and right (_y) record, followed by the target label.
columns_to_use = [
    attribute + '_' + side
    for side in ('x', 'y')
    for attribute in ('century', 'volumes')
] + ['duplicates']

In [6]:
# Apply the project helper to the 'volumes' column of each frame,
# in the order slave, unique, master.
df_s, df_u, df_m = (
    gt.transform_list_to_string(frame, 'volumes')
    for frame in (df_s, df_u, df_m)
)

## Determine Target Vector

In [7]:
# Identifier list of the first master record.
df_m.at[0, '035liste']

['(IDSBB)002447452', '(ALEX)9912923344101791']

In [8]:
# First five slave docids.
df_s['docid'].head(5)

0    000311049
1    00130724X
2    001817272
3    00236865X
4    00351031X
Name: docid, dtype: object

In [9]:
# First five docids of the unique records.
df_u['docid'].head(5)

0    000143235
1    00044801X
2    000996009
3    00239538X
4    002410559
Name: docid, dtype: object

In [10]:
# docid of the first master record.
df_m.at[0, 'docid']

'264853032'

In [11]:
# Master rows whose docid equals the docid of the first unique record.
df_m.loc[df_m['docid'] == df_u.at[0, 'docid']]

Unnamed: 0,035liste,century,coordinate,corporate,decade,docid,doi,edition,exactDate,format,isbn,ismn,musicid,pages,part,person,pubinit,pubword,pubyear,scale,ttlfull,ttlpart,volumes


In [12]:
# Full master record with row label 3.
df_m.loc[3, :]

035liste      [(NATIONALLICENCE)oxford-10.1093/cid/ciu795, (...
century                                                    2015
coordinate                                                   []
corporate                                                    {}
decade                                                     2015
docid                                                 264853784
doi                                        [10.1093/cid/ciu795]
edition                                                        
exactDate                                              20150201
format                                               [BK010053]
isbn                                                         []
ismn                                       [10.1093/cid/ciu795]
musicid                                                        
pages                                                        []
part                                [60/3(2015-02-01), 432-437]
person        {'100': [], '700': ['Rozot

In [13]:
# Full slave record with row label 347.
df_s.loc[347, :]

035liste           [(NATIONALLICENCE)oxford-10.1093/cid/ciu795]
century                                                    2015
coordinate                                                   []
corporate                                                    {}
decade                                                     2015
docid                                                 395019044
doi                                        [10.1093/cid/ciu795]
edition                                                        
exactDate                                              20150201
format                                               [BK010053]
isbn                                                         []
ismn                                       [10.1093/cid/ciu795]
musicid                                                        
pages                                                        []
part                                [60/3(2015-02-01), 432-437]
person        {'100': [], '700': ['Rozot

In [14]:
# Identifier lists of the unique records.
df_u.loc[:, '035liste']

0                     [(OCoLC)362722306, (ABN)000551177]
1                     [(OCoLC)886929897, (ABN)000223034]
2                     [(OCoLC)778386601, (ABN)000433604]
3                     [(OCoLC)778561839, (ABN)000238844]
4                     [(OCoLC)777853583, (ABN)000243260]
5                     [(OCoLC)887199526, (ABN)000323695]
6                     [(OCoLC)887344137, (ABN)000567256]
7                     [(OCoLC)315725021, (ABN)000666789]
8                     [(OCoLC)887396789, (ABN)000628911]
9                     [(OCoLC)778331282, (ABN)000078381]
10                    [(OCoLC)778307776, (ABN)000284349]
11                    [(OCoLC)775818296, (ABN)000064232]
12                    [(OCoLC)246238140, (BGR)000025536]
13                    [(OCoLC)759250730, (BGR)000090407]
14                    [(OCoLC)808144668, (BGR)000340863]
15                    [(OCoLC)807987366, (BGR)000078321]
16                     [(OCoLC)76214484, (BGR)000281870]
17                    [(OCoLC)8

In [15]:
# Identifier lists of the slave records.
df_s.loc[:, '035liste']

0                     [(OCoLC)731635279, (ABN)000539983]
1                     [(OCoLC)808324878, (ABN)000155059]
2                     [(OCoLC)231772550, (ABN)000096920]
3                     [(OCoLC)887157168, (ABN)000223912]
4                     [(OCoLC)887324690, (ABN)000548154]
5                     [(OCoLC)180154028, (ABN)000308006]
6                     [(OCoLC)605622457, (ABN)000234417]
7                     [(OCoLC)887393628, (ABN)000218626]
8                     [(OCoLC)778329562, (ABN)000048149]
9                     [(OCoLC)780137741, (ABN)000669941]
10                    [(OCoLC)634380788, (ABN)000295774]
11                    [(OCoLC)887478782, (ABN)000327579]
12                    [(OCoLC)808324878, (ABN)000250841]
13                    [(OCoLC)254941323, (BGR)000336877]
14                    [(OCoLC)887618219, (BGR)000347882]
15                    [(OCoLC)887620487, (BGR)000384903]
16                    [(OCoLC)808016835, (BGR)000107522]
17                    [(OCoLC)6

In [16]:
# Identifier lists of the master records.
df_m.loc[:, '035liste']

0             [(IDSBB)002447452, (ALEX)9912923344101791]
1                   [(IDSBB)000369647, (NEBIS)005609528]
2      [(IDSLU)001293605, (IDSBB)006725532, (NEBIS)01...
3      [(NATIONALLICENCE)oxford-10.1093/cid/ciu795, (...
4      [(NATIONALLICENCE)oxford-10.1093/ndt/gft319, (...
5      [(IDSBB)006349742, (SNL)vtls001511583, (OCoLC)...
6      [(NEBIS)009544280, (SNL)vtls000622833, (Sz)000...
7      [(NEBIS)002605093, (SBT)000026281, (RERO)07639...
8      [(SBT)000762391, (VAUD)991001881179702852, (RN...
9      [(RERO)R008339403, (VAUD)991004649259702852, (...
10     [(RERO)1083290, (VAUD)991008451129702852, (RNV...
11     [(RERO)R003786003, (VAUD)991008917219702852, (...
12                  [(IDSLU)000568353, (RERO)R006199229]
13     [(BGR)000347882, (NEBIS)005038859, (ABN)000218...
14                     [(SGBN)000443345, (ABN)000250841]
15     [(NEBIS)001013464, (SNL)vtls000511776, (Sz)000...
16     [(RERO)R005694081, (VAUD)991003937499702852, (...
17                  [(IDSBB)004

In [17]:
# Hand-picked slave rows, all columns.
df_s.loc[[0, 187, 276, 424, 428, 429], :]

Unnamed: 0,035liste,century,coordinate,corporate,decade,docid,doi,edition,exactDate,format,isbn,ismn,musicid,pages,part,person,pubinit,pubword,pubyear,scale,ttlfull,ttlpart,volumes
0,"[(OCoLC)731635279, (ABN)000539983]",2009,[],{},2009,311049,[],,2009,[BK020000],[978-3-15-020008-7],[],,[600 S.],[20008],{'100': ['AustenJane1775-1817(DE-588)118505173...,[Reclam jun.],[Reclam jun.],2009,,"{'245': ['Emma', 'Roman']}","{'245': ['Emma', 'Roman']}",600 S.
187,"[(OCoLC)731635279, (NEBIS)009587153]",2009,[],{},2009,196506476,[],,2009,[BK020000],[978-3-15-020008-7],[],,[600 S.],[20008],{'100': ['AustenJane1775-1817(DE-588)118505173...,[Reclam],[Reclam],2009,,{'245': ['Emma']},{'245': ['Emma']},600 S.
276,"[(OCoLC)731635279, (LIBIB)000315536]",2009,[],{},2009,323173349,[],,2009,[BK020000],[978-3-15-020008-7],[],,[600 S.],[20008],"{'100': ['AustenJane'], '245c': ['Jane Austen']}",[Reclam],[Reclam],2009,,"{'245': ['Emma', 'Roman']}","{'245': ['Emma', 'Roman']}",600 S.
424,"[(OCoLC)1002177443, (IDSBB)006726594]",2017,[],{},2017,491629737,[10.1055/b-005-143650],"7., überarbeitete und erweiterte Auflage",2017,[BK020053],[978-3-13-240808-1],[10.1055/b-005-143650],,[1 Online-Ressource],[],{'100': ['TrappeHans-Joachim1954-(DE-588)12494...,[],[],2017,,{'245': ['EKG-Kurs für Isabel']},{'245': ['EKG-Kurs für Isabel']},1 Online-Ressource
428,"[(OCoLC)1002177443, (NEBIS)011045420, (OCoLC)1...",2017,[],{},2017,495381160,[10.1055/b-005-143650],"7., überarbeitete und erweiterte Auflage",2017,[BK020053],[978-3-13-240808-1],[10.1055/b-005-143650],,[1 Online-Ressource],[],{'100': ['TrappeHans-Joachim1954-(DE-588)12494...,[],[],2017,,{'245': ['EKG-Kurs für Isabel']},{'245': ['EKG-Kurs für Isabel']},1 Online-Ressource
429,"[(VAUD)991010321879702852, (RNV)000202321-41bc...",1970,[],{},1970,501860959,[],,19702006,[MU010100],[],[],BA 4553,[1 partition (379 p.)],"[Werkgruppe 5, Bd. 19]","{'100': ['MozartWolfgang Amadeus'], '700': ['G...",[Bärenreiter],[Bärenreiter],19702006,,"{'245': ['Neue Ausgabe sämtlicher Werke', 'Die...","{'245': ['Neue Ausgabe sämtlicher Werke', 'Die...",1 partition (379 p.)


In [18]:
# Identifier list of the master record with row label 85.
df_m.at[85, '035liste']

['(NEBIS)009587153', '(LIBIB)000315536', '(ABN)000539983']

In [19]:
# Full master record with row label 85.
df_m.loc[85, :]

035liste      [(NEBIS)009587153, (LIBIB)000315536, (ABN)0005...
century                                                    2009
coordinate                                                   []
corporate                                                    {}
decade                                                     2009
docid                                                 504389793
doi                                                          []
edition                                                        
exactDate                                              2009    
format                                               [BK020000]
isbn                                        [978-3-15-020008-7]
ismn                                                         []
musicid                                                        
pages                                                  [600 S.]
part                                                    [20008]
person        {'100': ['AustenJane1775-1

In [20]:
def add_master_docid_to_slave (df_s, df_m):
    """Determine docid of master and store on slave.

    For every identifier in a slave's ``035liste`` that also occurs in the
    ``035liste`` of some master, the docid of the first such master (in
    ``df_m`` row order) is collected. The collected docids are stored, in
    identifier order and without de-duplication, as a list in the new column
    ``masters_docid``.

    Parameters
    ----------
    df_s : pandas.DataFrame
        Slave records with a ``035liste`` column holding lists of identifier
        strings. Modified in place: the column ``masters_docid`` is added or
        overwritten. Any row index is accepted.
    df_m : pandas.DataFrame
        Master records with the columns ``035liste`` (lists of identifier
        strings) and ``docid``.

    Returns
    -------
    pandas.DataFrame
        The same ``df_s`` object, now carrying ``masters_docid``.

    Notes
    -----
    Cells of ``035liste`` that are not a list or tuple are treated as
    carrying no identifiers.
    """
    # Index the masters once: identifier -> docid of the first master that
    # lists it. This replaces a full scan of df_m per slave identifier.
    master_docid_by_identifier = {}
    for identifiers, master_docid in zip(df_m['035liste'], df_m['docid']):
        if not isinstance(identifiers, (list, tuple)):
            continue
        for identifier in identifiers:
            master_docid_by_identifier.setdefault(identifier, master_docid)

    # Resolve every slave identifier that is known to a master.
    masters_per_slave = []
    for identifiers in df_s['035liste']:
        if not isinstance(identifiers, (list, tuple)):
            identifiers = ()
        masters_per_slave.append([
            master_docid_by_identifier[identifier]
            for identifier in identifiers
            if identifier in master_docid_by_identifier
        ])

    # Assign the whole column at once, aligned on the slave's own index,
    # instead of chained per-row `.loc` writes.
    df_s['masters_docid'] = pd.Series(
        masters_per_slave, index=df_s.index, dtype=object
    )

    return df_s

In [21]:
# Attach the list of matching master docids to every slave record.
df_s = add_master_docid_to_slave(df_s, df_m)

df_s['masters_docid'].head()

0    [504389793]
1    [504390597]
2    [50439018X]
3    [504389513]
4    [504389823]
Name: masters_docid, dtype: object

In [22]:
# Verify that every slave resolves to exactly one master docid, then reduce
# the list to that single value.
# Scalars are wrapped in a list so that re-running this cell does not split
# an already reduced docid string into its characters; sorting makes the
# de-duplicated order deterministic.
masters_per_slave = df_s['masters_docid'].apply(
    lambda ids: sorted(set(ids)) if isinstance(ids, (list, tuple, set)) else [ids]
)

# Stop with an explicit error instead of printing and then continuing with
# a missing or arbitrarily chosen master.
ambiguous = masters_per_slave[masters_per_slave.apply(len) != 1]
if not ambiguous.empty:
    raise ValueError(
        'Slave rows without exactly one master docid: '
        + repr(ambiguous.to_dict())
    )

df_s['masters_docid'] = masters_per_slave.apply(lambda ids: ids[0])
df_s.head()

Unnamed: 0,035liste,century,coordinate,corporate,decade,docid,doi,edition,exactDate,format,isbn,...,pages,part,person,pubinit,pubword,pubyear,scale,ttlfull,ttlpart,volumes,masters_docid
0,"[(OCoLC)731635279, (ABN)000539983]",2009,[],{},2009,000311049,[],,2009,[BK020000],[978-3-15-020008-7],...,[600 S.],[20008],{'100': ['AustenJane1775-1817(DE-588)118505173...,[Reclam jun.],[Reclam jun.],2009,,"{'245': ['Emma', 'Roman']}","{'245': ['Emma', 'Roman']}",600 S.,504389793
1,"[(OCoLC)808324878, (ABN)000155059]",2000,[],"{'710': ['Metropolitan Opera Orchestra', 'Metr...",2000,00130724X,[],,2000,[VM010300],[],...,"[1 DVD-Video, DVD Region 0, 169 Min., farb.]",[],"{'100': ['LevineJamesDir.'], '700': ['MozartWo...",[Deutsche Grammophon],[Deutsche Grammophon],2000,,"{'245': ['Die Zauberflöte', 'Oper in zwei Aufz...","{'245': ['Die Zauberflöte', 'Oper in zwei Aufz...","1 DVD-Video, DVD Region 0, 169 Min., farb.",504390597
2,"[(OCoLC)231772550, (ABN)000096920]",1999,[],{},1999,001817272,[],,1999,[BK020000],[3-495-47879-5],...,[316 S.],[],"{'100': ['FluryAndreas'], '245c': ['Andreas Fl...",[Alber],[Alber],1999,,"{'245': ['Der moralische Status der Tiere', 'H...","{'245': ['Der moralische Status der Tiere', 'H...",316 S.,50439018X
3,"[(OCoLC)887157168, (ABN)000223912]",uuuu,[],{},uuuu,00236865X,[],,uuuuuuuu,[BK020000],[],...,[412 S.],[],"{'100': ['MozartWolfgang Amadeus'], '245c': ['']}",[Ernst Eulenburg],[Ernst Eulenburg],uuuuuuuu,,{'245': ['Die Zauberflöte']},{'245': ['Die Zauberflöte']},412 S.,504389513
4,"[(OCoLC)887324690, (ABN)000548154]",2008,[],{},2008,00351031X,[],,2008,[BK020000],[978-1-4058-8214-9],...,[64 S.],[],{'100': ['AustenJane1775-1817(DE-588)118505173...,[Pearson Education],[Pearson Education],2008,,{'245': ['Emma']},{'245': ['Emma']},64 S.,504389823


In [23]:
# Join every slave (suffix _x) with its master (suffix _y) via the docid
# stored in 'masters_docid'.
result = pd.merge(left=df_s, right=df_m, how='inner', left_on='masters_docid', right_on='docid')

# Extend display to number of columns of DataFrame
# (len(result) would be the row count, not the column count).
pd.options.display.max_columns = len(result.columns)

result.head()

Unnamed: 0,035liste_x,century_x,coordinate_x,corporate_x,decade_x,docid_x,doi_x,edition_x,exactDate_x,format_x,isbn_x,ismn_x,musicid_x,pages_x,part_x,person_x,pubinit_x,pubword_x,pubyear_x,scale_x,ttlfull_x,ttlpart_x,volumes_x,masters_docid,035liste_y,century_y,coordinate_y,corporate_y,decade_y,docid_y,doi_y,edition_y,exactDate_y,format_y,isbn_y,ismn_y,musicid_y,pages_y,part_y,person_y,pubinit_y,pubword_y,pubyear_y,scale_y,ttlfull_y,ttlpart_y,volumes_y
0,"[(OCoLC)731635279, (ABN)000539983]",2009,[],{},2009,000311049,[],,2009,[BK020000],[978-3-15-020008-7],[],,[600 S.],[20008],{'100': ['AustenJane1775-1817(DE-588)118505173...,[Reclam jun.],[Reclam jun.],2009,,"{'245': ['Emma', 'Roman']}","{'245': ['Emma', 'Roman']}",600 S.,504389793,"[(NEBIS)009587153, (LIBIB)000315536, (ABN)0005...",2009,[],{},2009,504389793,[],,2009,[BK020000],[978-3-15-020008-7],[],,[600 S.],[20008],{'100': ['AustenJane1775-1817(DE-588)118505173...,[Reclam],[Reclam],2009,,{'245': ['Emma']},{'245': ['Emma']},600 S.
1,"[(OCoLC)731635279, (NEBIS)009587153]",2009,[],{},2009,196506476,[],,2009,[BK020000],[978-3-15-020008-7],[],,[600 S.],[20008],{'100': ['AustenJane1775-1817(DE-588)118505173...,[Reclam],[Reclam],2009,,{'245': ['Emma']},{'245': ['Emma']},600 S.,504389793,"[(NEBIS)009587153, (LIBIB)000315536, (ABN)0005...",2009,[],{},2009,504389793,[],,2009,[BK020000],[978-3-15-020008-7],[],,[600 S.],[20008],{'100': ['AustenJane1775-1817(DE-588)118505173...,[Reclam],[Reclam],2009,,{'245': ['Emma']},{'245': ['Emma']},600 S.
2,"[(OCoLC)731635279, (LIBIB)000315536]",2009,[],{},2009,323173349,[],,2009,[BK020000],[978-3-15-020008-7],[],,[600 S.],[20008],"{'100': ['AustenJane'], '245c': ['Jane Austen']}",[Reclam],[Reclam],2009,,"{'245': ['Emma', 'Roman']}","{'245': ['Emma', 'Roman']}",600 S.,504389793,"[(NEBIS)009587153, (LIBIB)000315536, (ABN)0005...",2009,[],{},2009,504389793,[],,2009,[BK020000],[978-3-15-020008-7],[],,[600 S.],[20008],{'100': ['AustenJane1775-1817(DE-588)118505173...,[Reclam],[Reclam],2009,,{'245': ['Emma']},{'245': ['Emma']},600 S.
3,"[(OCoLC)808324878, (ABN)000155059]",2000,[],"{'710': ['Metropolitan Opera Orchestra', 'Metr...",2000,00130724X,[],,2000,[VM010300],[],[],,"[1 DVD-Video, DVD Region 0, 169 Min., farb.]",[],"{'100': ['LevineJamesDir.'], '700': ['MozartWo...",[Deutsche Grammophon],[Deutsche Grammophon],2000,,"{'245': ['Die Zauberflöte', 'Oper in zwei Aufz...","{'245': ['Die Zauberflöte', 'Oper in zwei Aufz...","1 DVD-Video, DVD Region 0, 169 Min., farb.",504390597,"[(IDSBB)003690925, (NEBIS)005645758, (RERO)R00...",2000,[],"{'710': ['Metropolitan Opera', 'Metropolitan O...",2000,504390597,[],,2000,[VM010300],[],[],073 003-9,[1 DVD-Video (169 Min.)],[],{'100': ['MozartWolfgang Amadeus1756-1791(DE-5...,[],[],2000,,"{'245': ['Die Zauberflöte', 'Oper in zwei Aufz...","{'245': ['Die Zauberflöte', 'Oper in zwei Aufz...",1 DVD-Video (169 Min.)
4,"[(OCoLC)884447694, (IDSBB)003690925]",2000,[],"{'710': ['Metropolitan Opera', 'Metropolitan O...",2000,116188030,[],,2000,[VM010300],[],[],073 003-9,[1 DVD-Video (169 Min.)],[],{'100': ['MozartWolfgang Amadeus1756-1791(DE-5...,[],[],2000,,"{'245': ['Die Zauberflöte', 'Oper in zwei Aufz...","{'245': ['Die Zauberflöte', 'Oper in zwei Aufz...",1 DVD-Video (169 Min.),504390597,"[(IDSBB)003690925, (NEBIS)005645758, (RERO)R00...",2000,[],"{'710': ['Metropolitan Opera', 'Metropolitan O...",2000,504390597,[],,2000,[VM010300],[],[],073 003-9,[1 DVD-Video (169 Min.)],[],{'100': ['MozartWolfgang Amadeus1756-1791(DE-5...,[],[],2000,,"{'245': ['Die Zauberflöte', 'Oper in zwei Aufz...","{'245': ['Die Zauberflöte', 'Oper in zwei Aufz...",1 DVD-Video (169 Min.)


In [24]:
# Number of slave/master pairs.
result.shape[0]

435

In [25]:
# Full slave/master pair with row label 408.
result.loc[408, :]

035liste_x                     [(SNL)vtls001860448, (Sz)001860448]
century_x                                                     2013
coordinate_x                                                    []
corporate_x         {'710': ['Schweizerische Normen-Vereinigung']}
decade_x                                                      2013
docid_x                                                  404220762
doi_x                                                           []
edition_x                                                         
exactDate_x                                               2013    
format_x                                                [BK020053]
isbn_x                                                          []
ismn_x                                                          []
musicid_x                                                         
pages_x                                     [1 ressource en ligne]
part_x                                                        

In [26]:
def build_duplicate_pairs (df):
    """Pair each row of ``df`` with every row sharing its ``masters_docid``.

    The frame is inner-joined with itself on ``masters_docid``, so every row
    is also paired with itself and each pair appears in both orders. Columns
    other than the join key receive the default ``_x`` / ``_y`` suffixes.
    """
    join_key = 'masters_docid'
    pairs = df.merge(df, how='inner', left_on=join_key, right_on=join_key)
    return pairs

In [27]:
# All slave pairs that share a master, labelled as duplicates (target = 1).
duplicates = build_duplicate_pairs(df_s).assign(duplicates=1)

duplicates.shape[0]

1473

In [28]:
# Full duplicate pair with row label 1000.
duplicates.loc[1000, :]

035liste_x                    [(OCoLC)248381623, (NEBIS)008641750]
century_x                                                     2001
coordinate_x                                                    []
corporate_x                                                     {}
decade_x                                                      2001
docid_x                                                  190326522
doi_x                                                           []
edition_x                                            3., erw. Aufl
exactDate_x                                               2001    
format_x                                                [BK020000]
isbn_x                                             [3-13-127283-X]
ismn_x                                                          []
musicid_x                                                         
pages_x                                                   [323 S.]
part_x                                                        

In [29]:
# Label copies of the slave and unique frames with target 0. `assign`
# returns new frames, so df_s and df_u themselves are left untouched
# (a plain `df_s_1 = df_s` would only alias the original).
df_s_1 = df_s.assign(duplicates=0)
df_u_1 = df_u.assign(duplicates=0)

# Joining on the constant 'duplicates' column pairs every slave with every
# unique record (a cross join) and keeps a single 'duplicates' column.
non_duplicates = pd.merge(df_s_1, df_u_1, on='duplicates')

len(non_duplicates)

259260

In [30]:
# Duplicate pairs as a percentage of non-duplicate pairs.
print(duplicates.shape[0] / non_duplicates.shape[0] * 100)

0.5681555195556585


In [31]:
# First non-duplicate pair, all columns.
non_duplicates.loc[0, :]

035liste_x                      [(OCoLC)731635279, (ABN)000539983]
century_x                                                     2009
coordinate_x                                                    []
corporate_x                                                     {}
decade_x                                                      2009
docid_x                                                  000311049
doi_x                                                           []
edition_x                                                         
exactDate_x                                               2009    
format_x                                                [BK020000]
isbn_x                                         [978-3-15-020008-7]
ismn_x                                                          []
musicid_x                                                         
pages_x                                                   [600 S.]
part_x                                                     [20

### Feature DataFrame

In [32]:
# Restrict both pair frames to the columns listed in columns_to_use.
dupes = duplicates.loc[:, columns_to_use]
non_dupes = non_duplicates.loc[:, columns_to_use]

In [33]:
frames = [dupes, non_dupes]

# Stack duplicate and non-duplicate pairs. Both inputs carry their own
# 0..n-1 labels, so a fresh running index is requested; otherwise each
# low label would occur twice and label-based access would be ambiguous.
df_feature_base = pd.concat(frames, ignore_index=True)
df_feature_base.head()

Unnamed: 0,century_x,volumes_x,century_y,volumes_y,duplicates
0,2009,600 S.,2009,600 S.,1
1,2009,600 S.,2009,600 S.,1
2,2009,600 S.,2009,600 S.,1
3,2009,600 S.,2009,600 S.,1
4,2009,600 S.,2009,600 S.,1


In [34]:
# Total number of pairs, then the number labelled 0 and the number labelled 1.
(
    df_feature_base.shape[0],
    int((df_feature_base['duplicates'] == 0).sum()),
    int((df_feature_base['duplicates'] == 1).sum()),
)

(260733, 259260, 1473)

In [35]:
# Relative frequency of each target label.
df_feature_base['duplicates'].value_counts(normalize=True)

0    0.994351
1    0.005649
Name: duplicates, dtype: float64

In [36]:
# Replace the raw attribute pairs by equality indicators
# (1 = both sides hold the same value, 0 = they differ).
df_feature_base = (
    df_feature_base
    .assign(
        century_delta=lambda pairs: (pairs['century_x'] == pairs['century_y']).astype('int32'),
        volumes_delta=lambda pairs: (pairs['volumes_x'] == pairs['volumes_y']).astype('int32'),
    )
    .drop(columns=['century_x', 'century_y', 'volumes_x', 'volumes_y'])
)

### Train/Test Split

In [37]:
# Feature matrix: every column except the label. Target vector: the label.
X = df_feature_base.loc[:, df_feature_base.columns != 'duplicates'].values
y = df_feature_base['duplicates'].values

In [38]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed, stratified on the label.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

## The models

### DecisionTree

In [39]:
# First five training samples and their labels.
(X_tr[0:5], y_tr[0:5])

(array([[0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0]], dtype=int32), array([0, 0, 0, 0, 0]))

In [40]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with a fixed seed, predict the test set and report
# the value of score() on it.
dt = DecisionTreeClassifier(random_state=0).fit(X_tr, y_tr)
y_pred = dt.predict(X_te)
dt.score(X_te, y_te)

0.9971618693309299

### Performance Measurement

In [41]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current predictions against the test labels.
confusion_matrix(y_true=y_te, y_pred=y_pred)

array([[51827,    25],
       [  123,   172]])

### SVC

In [42]:
from sklearn.svm import SVC

# Fit an RBF-kernel SVC with a fixed seed, predict the test set and report
# the value of score() on it. Note that y_pred is overwritten here.
sv = SVC(kernel='rbf', gamma='auto', random_state=0).fit(X_tr, y_tr)
y_pred = sv.predict(X_te)
sv.score(X_te, y_te)

0.9971618693309299

In [43]:
# Confusion matrix of the current predictions against the test labels.
confusion_matrix(y_true=y_te, y_pred=y_pred)

array([[51827,    25],
       [  123,   172]])