In [1]:
import pandas as pd

# Locations of the two similarity result files for the 16C data set.
INPUT_FILE_1 = '../../results/16C/similarities_lsh_pagewise_16C.csv'
INPUT_FILE_2 = '../../results/16C/similarities_ngram_16C.csv'

# Both files are read as plain comma-separated tables. `delim_whitespace` is
# intentionally not passed: False was already its default, and the keyword is
# deprecated in recent pandas releases (passing it at all triggers a warning).
df_page_level = pd.read_csv(INPUT_FILE_1, sep=',')
df_book_level = pd.read_csv(INPUT_FILE_2, sep=',')

# Last expression of the cell: a one-line summary of how many rows were loaded.
f'Got {df_page_level.shape[0]} pairs for page-level, {df_book_level.shape[0]} pairs for book-level'

'Got 1505 pairs for page-level, 2145 paris for book-level'

In [2]:
# Turn each frame into a plain list of row-lists ordered from highest to lowest
# value of the ranking column (presumably the similarity score -- column 3 for
# the page-level rows, column 2 for the book-level rows). Negating the key makes
# the ascending sort come out descending while keeping ties in file order.
page_level = sorted(df_page_level.values.tolist(), key=lambda row: -row[3])
book_level = sorted(df_book_level.values.tolist(), key=lambda row: -row[2])

## Comparison

We'd expect the highest-scoring n-gram matches on the book level to be consistent with the highest-scoring
pairs resulting from the page-level matching.

Sample the top 300 pairs from each ranking.

In [3]:
def to_barcode(filename):
  """Extract the barcode from a file path.

  Drops everything up to and including the last '/', then drops the final four
  characters (presumably a dot plus a three-letter extension such as '.txt').
  A path without any '/' is treated as a bare file name instead of raising
  ValueError.
  """
  basename = filename.rsplit('/', 1)[-1]
  return basename[:-4]

# Number of highest-ranked pairs sampled from each list.
# NOTE: the `top_100_*` names are historical -- both lists hold TOP_N entries.
TOP_N = 300

# Each sampled entry is normalised to [ id_a, id_b, score ]. Page-level rows are
# used as-is (columns 0, 1 and 3); book-level rows hold file paths in columns
# 0 and 1, which are reduced to barcodes so the two lists can be compared.
top_100_page = [ [ t[0], t[1], t[3] ] for t in page_level[: TOP_N] ]
top_100_book = [ [ to_barcode(t[0]), to_barcode(t[1]), t[2] ] for t in book_level[: TOP_N] ]

def equals(a, b):
  """Tell whether two entries refer to the same pair, ignoring order.

  Only the first two elements of each entry are compared; anything after them
  (e.g. a score) is ignored.
  """
  # Same orientation: (x, y) vs (x, y).
  if a[0] == b[0] and a[1] == b[1]:
    return True
  # Swapped orientation: (x, y) vs (y, x).
  return a[1] == b[0] and a[0] == b[1]

# Keep every sampled book-level entry whose pair also occurs (in either order)
# among the sampled page-level entries. The result keeps the book-level ranking
# order and carries the book-level score.
intersection = [
  top_book for top_book in top_100_book
  if any(equals(top_page, top_book) for top_page in top_100_page)
]

f'Found {len(intersection)} of the top 300 book-level matches in the top 300 page-level matches'

'Found 89 of the top 300 book-level matches in the top 300 page-level matches'

In [4]:
def to_url(barcode):
  """Build the data.onb.ac.at link for a barcode by appending it to the fixed '/ABO/+' prefix."""
  base = 'http://data.onb.ac.at/ABO/+'
  return f'{base}{barcode}'

# Replace both barcodes of every matched pair with their links; the third
# element (the book-level score) is carried over unchanged.
column_names = [ 'url_a', 'url_b', 'score' ]
with_url = [ [ to_url(pair[0]), to_url(pair[1]), pair[2] ] for pair in intersection ]

# Tabulate the linked pairs and show the frame as the cell's output.
df = pd.DataFrame(with_url, columns = column_names)
df

Unnamed: 0,url_a,url_b,score
0,http://data.onb.ac.at/ABO/+Z180628606,http://data.onb.ac.at/ABO/+Z253367704,0.141273
1,http://data.onb.ac.at/ABO/+Z158160208,http://data.onb.ac.at/ABO/+Z176246604,0.121940
2,http://data.onb.ac.at/ABO/+Z185157909,http://data.onb.ac.at/ABO/+Z185158008,0.117661
3,http://data.onb.ac.at/ABO/+Z182822001,http://data.onb.ac.at/ABO/+Z18514390X,0.108850
4,http://data.onb.ac.at/ABO/+Z180628606,http://data.onb.ac.at/ABO/+Z158160208,0.105781
5,http://data.onb.ac.at/ABO/+Z18514390X,http://data.onb.ac.at/ABO/+Z185143704,0.099772
6,http://data.onb.ac.at/ABO/+Z185142207,http://data.onb.ac.at/ABO/+Z18514390X,0.099744
7,http://data.onb.ac.at/ABO/+Z178362103,http://data.onb.ac.at/ABO/+Z18514390X,0.097520
8,http://data.onb.ac.at/ABO/+Z185157107,http://data.onb.ac.at/ABO/+Z18514390X,0.096435
9,http://data.onb.ac.at/ABO/+Z180628606,http://data.onb.ac.at/ABO/+Z185912505,0.095872
