In [151]:
import pandas as pd
import datacompy
benchmark_file = 'input/benchmark-nomsATC.tsv'
rero_files = ['input/RERO-aligned-1.txt', 'input/RERO-aligned-2.txt']
rero_concord_file = 'input/20220228_concordances_id_rnv_id_rero_4_headings.csv'

Import all data into dataframes. Specifying the `dtype` appears necessary because otherwise pandas doesn't know what to do with columns containing both numbers and text.

In [157]:
ouali_data = pd.read_csv(benchmark_file, sep='\t', dtype = str)
rero_data = pd.concat((pd.read_csv(f, sep='\t', encoding = "ISO-8859-1", dtype = str) for f in rero_files))
rero_data.columns =[column.replace(" ", "_") for column in rero_data.columns]
rero_concord = pd.read_csv(rero_concord_file, dtype = str)

In [27]:
rero_data.query('author_heading == "A, Cheng"')

Unnamed: 0,rero_id,idref_id,author_heading,subject_heading


Not sure why the dataframe query doesn't seem to work with strings. Maybe because of extra spaces? Using loc and contains works better. `na=False` is required because that column contains `NaN` data, which we need to convert to false in order to use loc lookup.

In [137]:
display(rero_data.loc[rero_data['author_heading'].str.contains("Christen-Gueissaz", na=False)])
display(rero_concord.loc[rero_concord['main_form'].str.contains("Christen-Gueissaz", na=False)])

Unnamed: 0,rero_id,idref_id,author_heading,subject_heading
6474,A000036058,33273901,"Christen-Gueissaz, Eliane",


Unnamed: 0,id,id_rnv,id_rero,id_rero_a,main_form,repo_id
3,981023303674702851,(RNV_A)0000149055,(RERO)vtls000036058,,"Christen-Gueissaz, Eliane",rnv-nz-auth-atc


In [138]:
display(rero_data.loc[rero_data['author_heading'].str.contains("Desvallières, Georges", na=False)])
display(rero_concord.loc[rero_concord['main_form'].str.contains("Desvallières, Georges", na=False)])

Unnamed: 0,rero_id,idref_id,author_heading,subject_heading
839975,A023957387,50433180,"Desvallières, Georges, 1861-1950",


Unnamed: 0,id,id_rnv,id_rero,id_rero_a,main_form,repo_id
968721,1010073363,,,A010073363,"Desvallières, Georges",rnv
1056650,981023302855002851,(RNV_A)0000113805,,A023957387,"Desvallières, Georges, 1861-1950",rnv-nz-auth-atc


The number in `rero_id` in `rero_data` corresponds to the last part of either `id_rero` or `id_rero_a` in `rero_concord`. All need to be trimmed of their control characters before they can be used as matchpoints.

In [158]:
rero_data['id_rero_join'] = rero_data.rero_id.str.extract('(\d+)')
rero_concord['id_rero_join'] = rero_concord['id_rero'].fillna(rero_concord['id_rero_a'])
rero_concord['id_rero_join'] = rero_concord.id_rero_join.str.extract('(\d+)')
display(rero_data.query('id_rero_join == "000036058"'))
display(rero_data.query('id_rero_join == "023957387"'))
display(rero_concord.query('id_rero_join == "000036058"'))
display(rero_concord.query('id_rero_join == "023957387"'))

Unnamed: 0,rero_id,idref_id,author_heading,subject_heading,id_rero_join
6474,A000036058,33273901,"Christen-Gueissaz, Eliane",,36058


Unnamed: 0,rero_id,idref_id,author_heading,subject_heading,id_rero_join
839975,A023957387,50433180,"Desvallières, Georges, 1861-1950",,23957387


Unnamed: 0,id,id_rnv,id_rero,id_rero_a,main_form,repo_id,id_rero_join
3,981023303674702851,(RNV_A)0000149055,(RERO)vtls000036058,,"Christen-Gueissaz, Eliane",rnv-nz-auth-atc,36058


Unnamed: 0,id,id_rnv,id_rero,id_rero_a,main_form,repo_id,id_rero_join
1056650,981023302855002851,(RNV_A)0000113805,,A023957387,"Desvallières, Georges, 1861-1950",rnv-nz-auth-atc,23957387


Now we can try merging the two tables

In [159]:
rero_aligns = pd.merge(rero_data, rero_concord, on='id_rero_join', how="inner")
rero_aligns['source'] = rero_aligns['id']
rero_aligns['cible'] = rero_aligns['idref_id']
display(rero_aligns.query('id_rero_join == "000036058"'))
display(rero_aligns.query('id_rero_join == "023957387"'))
print('Nombre de concordiances RERO-IdRef validées: ' + str(len(rero_aligns)))

Unnamed: 0,rero_id,idref_id,author_heading,subject_heading,id_rero_join,id,id_rnv,id_rero,id_rero_a,main_form,repo_id,source,cible
6474,A000036058,33273901,"Christen-Gueissaz, Eliane",,36058,981023303674702851,(RNV_A)0000149055,(RERO)vtls000036058,,"Christen-Gueissaz, Eliane",rnv-nz-auth-atc,981023303674702851,33273901


Unnamed: 0,rero_id,idref_id,author_heading,subject_heading,id_rero_join,id,id_rnv,id_rero,id_rero_a,main_form,repo_id,source,cible
547307,A023957387,50433180,"Desvallières, Georges, 1861-1950",,23957387,981023302855002851,(RNV_A)0000113805,,A023957387,"Desvallières, Georges, 1861-1950",rnv-nz-auth-atc,981023302855002851,50433180


Nombre de concordiances RERO-IdRef validées: 595757


Now we have a reference file to benchmark against.

Let's look at Ouali's output next:

In [160]:
ouali_data['source'] = ouali_data['id source']
ouali_data['cible'] = ouali_data['id cible']
# .copy() is necessary to avoid SettingWithCopyWarning when using datacompy later
ouali_align = ouali_data.query('`nombre de candidats` != 0 & `décision d\'alignement` == "auto"').copy()
ouali_no_align = ouali_data.query('`nombre de candidats` == 0 & `décision d\'alignement` == "auto"').copy()

print('Alignements: ' + str(len(ouali_align)))
print('Non-alignements: ' + str(len(ouali_no_align)))

Alignements: 993694
Non-alignements: 0


In [161]:
display(ouali_align)
display(rero_aligns)

Unnamed: 0,réservoir source,id source,forme principale source,arbitre,date d'arbitrage,niveau de confiance,commentaire,décision d'alignement,nombre de candidats,score max,...,type de cible 2,réservoir cible 2,id cible 2,forme principale cible 2,type de cible 3,réservoir cible 3,id cible 3,forme principale cible 3,source,cible
0,rnv-nz-auth-atc,981023336220602851,"Melossi, Dario",,,,,auto,1,0.60,...,,,,,,,,,981023336220602851,050194798
1,rnv-nz-auth-atc,981023354214602851,"Ugelstad, Endre",,,,,auto,0,,...,,,,,,,,,981023354214602851,
2,rnv-nz-auth-atc,981023335918202851,"Meinicke, Michael",,,,,auto,1,0.61,...,,,,,,,,,981023335918202851,100939848
3,rnv-nz-auth-atc,981023292394002851,"Hrozny, Bedřich",,,,,auto,1,0.60,...,,,,,,,,,981023292394002851,079549322
4,rnv-nz-auth-atc,981023401035502851,"Lauzon, Jean",,,,,auto,2,0.56,...,,,,,,,,,981023401035502851,153084596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993689,rnv-nz-auth-atc,981023307069702851,"Basedow, Johann-Bernhard",,,,,auto,1,0.60,...,,,,,,,,,981023307069702851,06117422X
993690,rnv-nz-auth-atc,981023306039602851,"Ingleby, David",,,,,auto,1,0.65,...,,,,,,,,,981023306039602851,056761503
993691,rnv-nz-auth-atc,981023349559402851,"Touchard-Lafosse, Georges",,,,,auto,1,0.62,...,,,,,,,,,981023349559402851,027166333
993692,rnv-nz-auth-atc,981023400255402851,"Desarzens, Victor",,,,,auto,1,0.61,...,,,,,,,,,981023400255402851,031402569


Unnamed: 0,rero_id,idref_id,author_heading,subject_heading,id_rero_join,id,id_rnv,id_rero,id_rero_a,main_form,repo_id,source,cible
0,A000000276,027262782,"Lyon (1450-1800, lieu d'édition ou d'impressio...",,000000276,981023297288102851,(RNV_A)0000775749,,A000000276,"Lyon (1450-1800, lieu d'édition ou d'impressio...",rnv-nz-auth-atc,981023297288102851,027262782
1,A000000496,086950673,2 Unlimited,,000000496,981023283256402851,(RNV_A)0000893383,,A000000496,2 Unlimited,rnv-nz-auth-atc,981023283256402851,086950673
2,A000000519,104516852,"Repington, Charles à Court",,000000519,981023296087902851,(RNV_A)0000714252,,A000000519,"Repington, Charles à Court",rnv-nz-auth-atc,981023296087902851,104516852
3,A000000531,02917516X,"A, Cheng",,000000531,981023293328902851,(RNV_A)0000957889,,A000000531,"A, Cheng",rnv-nz-auth-atc,981023293328902851,02917516X
4,A000000555,029274702,A.G. Leventis Foundation (Nicosia),,000000555,981023298664702851,(RNV_A)0000842390,,A000000555,A.G. Leventis Foundation (Nicosia),rnv-nz-auth-atc,981023298664702851,029274702
...,...,...,...,...,...,...,...,...,...,...,...,...,...
595752,A026544323,238137511,"????????, ?. ?. (???? ??????????)",,026544323,981023290466102851,(RNV_A)0000851050,,A026544323,"Старилов, Ю. Н. (Юрий Николаевич)",rnv-nz-auth-atc,981023290466102851,238137511
595753,A026545232,189034556,"Jaeger, Thomas, 1977-",,026545232,981023299732602851,(RNV_A)0000755755,,A026545232,"Jaeger, Thomas, 1977-",rnv-nz-auth-atc,981023299732602851,189034556
595754,A026553056,25523001X,,"Hänggi, Yves",026553056,1026553056,,,A026553056,"Hänggi, Yves",rnv,1026553056,25523001X
595755,A026553270,172802598,"Hecht, Susanna B.",,026553270,981023400205802851,(RNV_A)0001434674,(RERO)vtls026553270,,"Hecht, Susanna Bettina",rnv-nz-auth-atc,981023400205802851,172802598


In [162]:
compare = datacompy.Compare(rero_aligns, ouali_align, join_columns=['source', 'cible'], 
                            df1_name='RERO Benchmark', df2_name='Ouali')
print(compare.report())

DataComPy Comparison
--------------------

DataFrame Summary
-----------------

        DataFrame  Columns    Rows
0  RERO Benchmark       13  595757
1           Ouali       24  993694

Column Summary
--------------

Number of columns in common: 2
Number of columns in RERO Benchmark but not in Ouali: 11
Number of columns in Ouali but not in RERO Benchmark: 22

Row Summary
-----------

Matched on: source, cible
Any duplicates on match values: Yes
Absolute Tolerance: 0
Relative Tolerance: 0
Number of rows in common: 406,918
Number of rows in RERO Benchmark but not in Ouali: 188,839
Number of rows in Ouali but not in RERO Benchmark: 586,776

Number of rows with some compared columns unequal: 0
Number of rows with all compared columns equal: 406,918

Column Comparison
-----------------

Number of columns compared with some values unequal: 0
Number of columns compared with all values equal: 2
Total number of values which compare unequal: 0

Sample Rows Only in RERO Benchmark (First 10 Colum

In [6]:
common_align = pd.merge(bench_align, ouali_align, on=['id source', 'id cible'], how='inner')

In [14]:
common_align

Unnamed: 0,réservoir source_x,id source,forme principale source_x,arbitre_x,date d'arbitrage_x,niveau de confiance_x,commentaire_x,décision d'alignement_x,nombre de candidats_x,score max_x,...,réservoir cible_y,forme principale cible_y,type de cible 2_y,réservoir cible 2_y,id cible 2_y,forme principale cible 2_y,type de cible 3_y,réservoir cible 3_y,id cible 3_y,forme principale cible 3_y
0,rnv-nz-auth-atc,981023336220602851,"Melossi, Dario",,,,,auto,1,0.60,...,idref,"Melossi, Dario 1948-....",,,,,,,,
1,rnv-nz-auth-atc,981023335918202851,"Meinicke, Michael",,,,,auto,1,0.61,...,idref,"Meinicke, Michael 19..-....",,,,,,,,
2,rnv-nz-auth-atc,981023401035502851,"Lauzon, Jean",,,,,auto,2,0.56,...,idref,"lauzon, Jean 1953-....",,,,,,,,
3,rnv-nz-auth-atc,981023355887602851,"Dowbiggin, Ian Robert, 1952-",,,,,auto,1,0.60,...,idref,"Dowbiggin, Ian Robert",,,,,,,,
4,rnv-nz-auth-atc,981023336067602851,"Maccoy, Seth",,,,,auto,1,0.55,...,idref,"McCoy, Seth 1928-....",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564237,rnv-nz-auth-atc,981023307069702851,"Basedow, Johann-Bernhard",,,,,auto,1,0.60,...,idref,"Basedow, Johann Bernhard 1724-1790",,,,,,,,
564238,rnv-nz-auth-atc,981023306039602851,"Ingleby, David",,,,,auto,1,0.65,...,idref,"Ingleby, David",,,,,,,,
564239,rnv-nz-auth-atc,981023349559402851,"Touchard-Lafosse, Georges",,,,,auto,1,0.62,...,idref,"Touchard-Lafosse, Georges 1780-1847",,,,,,,,
564240,rnv-nz-auth-atc,981023400255402851,"Desarzens, Victor",,,,,auto,1,0.61,...,idref,"Desarzens, Victor 1908-1986",,,,,,,,
