I made this notebook as part of the UX kickoff on location and access information for items and holdings.
It contains some functions to help me find interesting/useful examples.

In [4]:
import collections

import tqdm

from utils import get_works, get_source_identifier_str


# Maps each distinct access-condition 'terms' string to the set of
# (source_name, source_id, work_id) tuples for the works that carry it.
access_terms = collections.defaultdict(set)


for work in tqdm.tqdm(get_works('works-2021-04-28_holdings,id,identifiers,items.json.gz')):
    # Split on the first "/" only, so an identifier value that itself
    # contains a slash does not break the two-name unpacking.
    # NOTE(review): assumes the helper returns "<source name>/<value>" -- confirm in utils.
    source_name, source_id = get_source_identifier_str(work).split("/", 1)
    for it in work['items']:
        for loc in it['locations']:
            for ac in loc['accessConditions']:
                # Access conditions that have no 'terms' key are skipped.
                if 'terms' in ac:
                    access_terms[ac['terms']].add((source_name, source_id, work['id']))

1213006it [00:17, 68762.80it/s]


In [6]:
import csv
import random


def _pick_example(entries, wanted_source, rng):
    """Return one work ID from ``entries`` whose source name is ``wanted_source``.

    ``entries`` is an iterable of (source_name, source_id, work_id) tuples and
    ``rng`` is a ``random.Random`` instance.  Returns "" when no entry matches.
    """
    # Sort the candidates first: set iteration order for strings can differ
    # between interpreter runs, which would defeat the seeded generator.
    candidates = sorted(
        work_id
        for (source_name, _, work_id) in entries
        if source_name == wanted_source
    )
    return rng.choice(candidates) if candidates else ""


# A private, seeded generator so that re-running the cell picks the same examples.
example_rng = random.Random(0)

# newline='' is what the csv module asks for when opening its output file;
# without it, embedded newlines and Windows line endings are written incorrectly.
# The encoding is explicit because 'terms' is free text.
with open('access_terms.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=['tally', 'sierra_work_id', 'calm_work_id', 'terms'])
    writer.writeheader()
    # Most widely used terms first.
    for terms in sorted(access_terms.keys(), key=lambda ac: len(access_terms[ac]), reverse=True):
        sierra_work_id = _pick_example(access_terms[terms], 'sierra-system-number', example_rng)
        calm_work_id = _pick_example(access_terms[terms], 'calm-record-id', example_rng)

        writer.writerow({
            'tally': len(access_terms[terms]),
            'terms': terms,
            'sierra_work_id': sierra_work_id,
            'calm_work_id': calm_work_id,
        })

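This cell finds the shortest and longest access terms: it prints the 20 shortest strings, each with the works that use it, followed by the single longest string: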

In [12]:
from utils import get_source_identifier_from_id

# Every distinct terms string, ordered from shortest to longest.
by_length = sorted(access_terms, key=len)

# The 20 shortest terms strings, each followed by the works that use it.
for terms in by_length[:20]:
    print(repr(terms))
    for entry in access_terms[terms]:
        work_id = entry[2]
        print(work_id, get_source_identifier_from_id(work_id))
    print("")

# Finish with the single longest terms string.
print(by_length[-1])

'.'
uggqzpcw calm-record-id/aac051a2-89f3-49d3-9d2e-8c29b24bae98

'<p>'
sn8gf4g9 calm-record-id/c6310d90-cddb-4ea4-b80c-cd4d8e04cf31
w72vehd6 calm-record-id/e9a458e8-2812-4a80-ac2f-74af8ce089a6
r3tamfk8 calm-record-id/b576573f-a190-486d-8c4c-ef5bcac856d9
q6jns2ep calm-record-id/56ad5b7d-d6bb-4953-ae4b-f88376efad18
xfu8cqza calm-record-id/30a8e5d9-540c-422c-b63f-add9a87b40c3
uv4jd6en calm-record-id/b74da2c6-385a-4bc0-8298-e2bc5db0abbe
t2fznryx calm-record-id/9d2f4c9c-bab8-45d9-a872-cf4df0b521e9
qekxt8h5 calm-record-id/6bf5de48-758e-45ee-b2f4-23736bc80964
ehpqmdn6 calm-record-id/6cdd4611-3fb5-48ce-b14e-a4c39a4dc1ea

'Open'
a4fckekr sierra-system-number/b16556744
awed3n2r sierra-system-number/b1656229x
u9jxzq8c sierra-system-number/b16556732

'Closed'
paybk2yj calm-record-id/deb1cdd1-4e1f-4457-9dc4-1bdf25a12c37

'<p></p>'
w4yg5syw calm-record-id/7970e873-06b9-43b5-91ac-4ef2c89bba8e
r46w6wmq calm-record-id/c58aca99-6882-4542-948b-6ed777a8b82b
t6thffk3 calm-record-id/0d143f86-d373-445b-8fd9