<a href="https://colab.research.google.com/github/ysugiyama3/google_colab/blob/master/cataloging_copy_lookup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Cataloging copy lookup program - WorldCat Search API V1**

Last updated: 3/13/2024.

Please contact yukari.sugiyama@yale.edu if you have any issues, questions, or suggestions.

---
#### **About the program**
Using Python 3 and OCLC WorldCat Search API ***v1***, this program automates the work of cataloging copy search. It automatically searches WorldCat for good cataloging copy by LCCN, ISBN, and OCLC number. The process considers OCLC records with English as the language of cataloging, an encoding level of blank, 1, 4, I, or M, and LC Subject Headings (except for fiction) to be good cataloging copy.

#### **What you need**


*   Your own OCLC <a href='https://www.oclc.org/developer/develop/authentication/how-to-request-a-wskey.en.html'>WorldCat Search API key.</a>
*   Excel spreadsheet in which **the 1st, 2nd, and 3rd columns must contain the LCCN, ISBN, and OCLC number, respectively.** The spreadsheet can have as many columns as necessary and must have column headers.

#### **How to run the program**
* Execute each step by simply clicking the play button (watch a video demo <a href='https://youtu.be/qAaI75ZTPoA'>here</a>)
* To start over, please go to the menu, go to "Runtime" and then select "Disconnect and delete runtime."
* To clear output, please go to "Edit" and then select "Clear all outputs."
---

In [None]:
#@title 1. Enter your OCLC WorldCat Search API key

from google.colab import files
import pandas as pd
import requests
import xml.etree.ElementTree as ET
from IPython.display import HTML, display
import time
import re
from getpass import getpass

# Set wskey: prompt without echoing the key. Surrounding whitespace (easy to
# pick up when pasting) is stripped because the key is appended verbatim to
# every request URL.
wskey = getpass('Enter your OCLC WorldCat Search API key: ').strip()


In [None]:
#@title 2. Upload an input Excel file (.xls, .xlsx, .xlsm, .xlsb, .odf)

# Upload an input Excel file
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded; fail with a clear message instead of an IndexError.
    raise ValueError('No file was uploaded. Please run this step again and choose an Excel file.')
input_name = str(next(iter(uploaded)))

# Read an input Excel file into a pandas DataFrame
input_df = pd.read_excel(input_name)

# Name the output Excel file after the input file
output_name = input_name.rsplit(".", 1)[0] + "_output.xlsx"

# Create an output DataFrame: a copy of the input plus empty result columns.
# The columns are created with object dtype so the string results written
# later fit without a dtype change. A column that already exists in the input
# is left as it is.
output_df = input_df.copy()
for result_column in ['MATCH_OCLC_NO', 'SUBJECT', 'LC_CALL_NUMBER', 'ELVL']:
    if result_column not in output_df.columns:
        output_df[result_column] = pd.Series(index=output_df.index, dtype='object')

# number of rows
total = len(input_df.index)

In [None]:
#@title 3. Run the program

def progress(value, max=50000):
    """Build an HTML progress bar with a "value/max" counter underneath.

    Args:
        value: Current position of the bar.
        max: Value at which the bar is full. The parameter name shadows the
            builtin but is kept so existing keyword callers keep working.

    Returns:
        An ``IPython.display.HTML`` object wrapping the markup.
    """
    # The markup is kept well-formed: attributes are separated by whitespace
    # only (no comma), and the line break after the counter is a plain <br>
    # rather than an invalid </br> end tag.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 40%'
        >
            {value}
        </progress>
        <br>{value}/{max}<br>
    """.format(value=value, max=max))

def eng_lang(record):
    """Report whether the record's language of cataloging is English.

    Looks at the first MARC 040 subfield $b found in *record* (a MARCXML
    ``record`` element).

    Returns:
        True when that subfield exists and its text is exactly 'eng';
        False otherwise.
    """
    language_node = record.find('{http://www.loc.gov/MARC21/slim}datafield[@tag="040"]/{http://www.loc.gov/MARC21/slim}subfield[@code="b"]')
    return language_node is not None and language_node.text == 'eng'

def encoding_level(record):
    """Return the record's encoding level if it is one of the accepted values.

    The encoding level is the character at leader position 17.

    Args:
        record: A MARCXML ``record`` element.

    Returns:
        The one-character encoding level when it is blank, '1', '4', 'I' or
        'M'; otherwise the empty string. A record with no leader, or with an
        empty or too-short leader, also yields the empty string instead of
        raising, so one malformed record does not abort the caller's scan.
    """
    leader = record.find('{http://www.loc.gov/MARC21/slim}leader')
    if leader is None or leader.text is None:
        return ''
    elvl = leader.text[17:18]
    return elvl if elvl in (' ', '1', '4', 'I', 'M') else ''

def oclc_no(record):
    """Return the text of the record's MARC 001 control field.

    Raises:
        AttributeError: if *record* has no 001 control field.
    """
    control_field = record.find('{http://www.loc.gov/MARC21/slim}controlfield[@tag="001"]')
    return control_field.text

def call_no(record):
    """Assemble the LC call number from MARC field 050.

    Takes the first 050 $a and the first 050 $b found in *record*. A missing
    subfield contributes nothing; a present $b is preceded by one space.

    Returns:
        '<$a> <$b>', just '<$a>', ' <$b>' when only $b exists, or '' when
        neither subfield is present.
    """
    class_node = record.find('{http://www.loc.gov/MARC21/slim}datafield[@tag="050"]/{http://www.loc.gov/MARC21/slim}subfield[@code="a"]')
    item_node = record.find('{http://www.loc.gov/MARC21/slim}datafield[@tag="050"]/{http://www.loc.gov/MARC21/slim}subfield[@code="b"]')
    class_part = '' if class_node is None else class_node.text
    item_part = '' if item_node is None else ' ' + item_node.text
    return class_part + item_part

def fiction(record):
    """Report whether the record is treated as fiction.

    Reads the character at position 33 of the MARC 008 control field. Only
    the value '0' counts as "not fiction"; any other value, including a
    missing position in a short 008, counts as fiction.

    Raises:
        AttributeError: if *record* has no 008 control field.
    """
    fixed_field = record.find('{http://www.loc.gov/MARC21/slim}controlfield[@tag="008"]').text
    return fixed_field[33:34] != '0'

def subject(record):
    """Summarize whether the record satisfies the subject-heading requirement.

    Fiction (as decided by ``fiction``) passes without further checks. For
    everything else the record needs at least one 600, 610, 611, 630, 650 or
    651 field with second indicator '0' that carries a subfield $a.

    Returns:
        'Yes (Fiction)', 'Yes' or 'No'.
    """
    if fiction(record):
        return 'Yes (Fiction)'

    marc = '{http://www.loc.gov/MARC21/slim}'
    subject_tags = ('600', '610', '611', '630', '650', '651')
    has_heading = any(
        record.find(
            marc + 'datafield[@ind2="0"][@tag="' + tag + '"]/'
            + marc + 'subfield[@code="a"]'
        ) is not None
        for tag in subject_tags
    )
    return 'Yes' if has_heading else 'No'

def check_results(url):
    """Fetch *url* and record the first acceptable cataloging copy it returns.

    A record is acceptable when ``eng_lang`` is true, ``encoding_level``
    returns a non-empty value and ``subject`` returns a string starting with
    'Yes'. For the first such record the global ``hascopy`` is incremented and
    the OCLC number, subject status, LC call number and encoding level are
    written to ``output_df`` at row ``index``.

    Relies on module-level names: ``output_df``, ``hascopy`` and ``index``
    (the loop variable of the row loop that drives the searches).

    Args:
        url: Complete request URL, including the API key.

    Returns:
        True if a matching record was written to ``output_df``; False
        otherwise, including when the request or the XML parsing fails.
    """
    global output_df, hascopy
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    try:
        time.sleep(0.02)
        # Without a timeout a stalled connection would block the run forever.
        r = requests.get(url, headers=headers, timeout=30)
        root = ET.fromstring(r.text)

        if root.tag == '{http://www.loc.gov/MARC21/slim}record':
            # The response is a bare MARCXML record rather than an SRU
            # envelope. NOTE(review): this looks like what the
            # /catalog/content/<oclc> request in search_by_oclc returns -
            # confirm against the OCLC API documentation.
            records = [root]
        else:
            result = root.find('{http://www.loc.gov/zing/srw/}numberOfRecords')
            if result is None or int(result.text) <= 0:
                return False
            records = root.findall('{http://www.loc.gov/zing/srw/}records/{http://www.loc.gov/zing/srw/}record/{http://www.loc.gov/zing/srw/}recordData/{http://www.loc.gov/MARC21/slim}record')

        for record in records:
            try:
                oclc = oclc_no(record)
                subj = subject(record)
                call = call_no(record)
                elvl = encoding_level(record)
            except (AttributeError, TypeError):
                # A record missing an expected field is skipped so that the
                # remaining records in the response are still examined.
                continue

            if eng_lang(record) and len(elvl) > 0 and subj.startswith('Yes'):
                hascopy += 1
                output_df.loc[index, 'MATCH_OCLC_NO'] = oclc
                output_df.loc[index, 'SUBJECT'] = subj
                output_df.loc[index, 'LC_CALL_NUMBER'] = call
                output_df.loc[index, 'ELVL'] = elvl
                return True
    except Exception:
        # Best effort: network or parse failures count as "no copy found".
        # Catching Exception (not a bare except) lets KeyboardInterrupt
        # through, so the run can still be stopped by the user.
        return False
    return False

def search_by_lccn(lccn):
    """Search WorldCat by LCCN (SRU index ``srw.dn``) and record a match.

    Args:
        lccn: LCCN to search for; it is converted with ``str``.

    Returns:
        The result of ``check_results``: True when a good record was found
        and recorded, False otherwise. The row loop depends on this value to
        stop searching once a match exists.
    """
    url = 'http://www.worldcat.org/webservices/catalog/search/sru?query=srw.dn=%22' + str(lccn) + '%22+not+srw.mt=%22com%22&servicelevel=full&frbrGrouping=off&maximumRecords=10&wskey=' + wskey
    return check_results(url)

def search_dlc_by_isbn(isbn):
    """Search WorldCat by ISBN (``srw.bn``) limited to ``srw.pc="dlc"``.

    Args:
        isbn: ISBN to search for; it is converted with ``str``.

    Returns:
        The result of ``check_results``: True when a good record was found
        and recorded, False otherwise. The row loop depends on this value to
        stop searching once a match exists.
    """
    url = 'http://www.worldcat.org/webservices/catalog/search/sru?query=srw.bn=%22' + str(isbn) + '%22+and+srw.pc=%22dlc%22+not+srw.mt=%22com%22&servicelevel=full&frbrGrouping=off&maximumRecords=10&wskey=' + wskey
    return check_results(url)

def search_by_isbn(isbn):
    """Search WorldCat by ISBN (SRU index ``srw.bn``) and record a match.

    Args:
        isbn: ISBN to search for; it is converted with ``str``.

    Returns:
        The result of ``check_results``: True when a good record was found
        and recorded, False otherwise. The row loop depends on this value to
        stop searching once a match exists.
    """
    url = 'http://www.worldcat.org/webservices/catalog/search/sru?query=srw.bn=%22' + str(isbn) + '%22+not+srw.mt=%22com%22&servicelevel=full&frbrGrouping=off&maximumRecords=10&wskey=' + wskey
    return check_results(url)

def search_by_oclc(oclc):
    """Request a single record by OCLC number and record it if it qualifies.

    Args:
        oclc: OCLC number (digits only); it is converted with ``str``.

    Returns:
        The result of ``check_results``: True when the record qualified and
        was recorded, False otherwise.
    """
    url = 'http://www.worldcat.org/webservices/catalog/content/' + str(oclc) + '?&servicelevel=full&frbrGrouping=off&wskey=' + wskey
    return check_results(url)

def clean_lccn(lccn):
    """Normalize a spreadsheet cell into an LCCN search term.

    The value is converted to text, everything from the first '.' onward is
    dropped (this removes the '.0' of numbers that were read as floats), and
    surrounding whitespace is stripped.

    Args:
        lccn: Raw cell value (string, number, None or NaN).

    Returns:
        The cleaned LCCN string, or None when the cell is missing or nothing
        is left after cleaning.
    """
    if lccn is None or pd.isnull(lccn):
        return None
    cleaned = re.sub(r'\..*', '', str(lccn)).strip()
    # An empty result must not be sent as a search term.
    return cleaned or None

def clean_isbn(isbn):
    """Normalize a spreadsheet cell into an ISBN search term.

    The value is converted to text, everything from the first '(', '|', ':'
    or '.' onward is dropped (qualifiers such as '(pbk.)' and the '.0' of
    numbers read as floats), and every character other than digits and
    'X'/'x' is removed.

    Args:
        isbn: Raw cell value (string, number, None or NaN).

    Returns:
        The cleaned ISBN string, or None when the cell is missing or no ISBN
        characters remain after cleaning.
    """
    if isbn is None or pd.isnull(isbn):
        return None
    without_qualifier = re.sub(r'[\(|\:|\.].*', '', str(isbn))
    cleaned = re.sub(r'[^0-9Xx]', '', without_qualifier)
    # An empty result must not be sent as a search term.
    return cleaned or None

def clean_oclc(oclc):
    """Extract the digits of an OCLC number from a prefixed cell value.

    Only values that start with '(OCoLC)', 'ocn', 'ocm' or 'on' (after
    surrounding whitespace is removed) are accepted; all non-digit characters
    are then stripped.

    Args:
        oclc: Raw cell value (string, number, None or NaN).

    Returns:
        The digits of the OCLC number, or None when the cell is missing, has
        none of the accepted prefixes, or contains no digits.
    """
    if oclc is None or pd.isnull(oclc):
        return None
    text = str(oclc).strip()
    if not text.startswith(('(OCoLC)', 'ocn', 'ocm', 'on')):
        return None
    digits = re.sub(r'[^0-9]', '', text)
    # A bare prefix with no number must not be sent as a search term.
    return digits or None

# count
count = 0
hascopy = 0

out = display(progress(0, total), display_id=True)

# The loop variable must stay named `index`: check_results() reads it as a
# global to know which row of output_df to write to.
for index, row in output_df.iterrows():
    count += 1
    time.sleep(0.02)
    out.update(progress(count, total))

    # The first three columns are read by position (.iloc), whatever their
    # header labels are.
    lccn = clean_lccn(row.iloc[0])
    isbn = clean_isbn(row.iloc[1])
    oclc = clean_oclc(row.iloc[2])

    # Try the searches in order of preference; stop at the first one that
    # reports a match.
    if lccn is not None and search_by_lccn(lccn):
        continue
    elif isbn is not None and search_dlc_by_isbn(isbn):
        continue
    elif isbn is not None and search_by_isbn(isbn):
        continue
    elif oclc is not None:
        search_by_oclc(oclc)

# Derive the final tally from the results actually stored, so a row that was
# matched by more than one search is counted once.
hascopy = int(output_df['MATCH_OCLC_NO'].notna().sum())



In [None]:
#@title 4. Preview results

import matplotlib.pyplot as plt

# Pie chart of rows with and without good cataloging copy.
labels = 'Has copy', 'No copy'
sizes = [hascopy, total-hascopy]
colors = ['c', 'y']
explode = (0.1, 0.0)

if total > 0 and min(sizes) >= 0:
    # autopct receives each wedge's percentage; convert it back to a row count
    # so the chart shows numbers of rows rather than percentages.
    plt.pie(sizes, explode=explode, labels=labels, colors=colors,
            autopct=lambda s: '{:.0f}'.format(s * total / 100), shadow=True, startangle=90)

    plt.axis('equal')
    plt.title(input_name + ' Results')
    plt.show()
else:
    # A pie chart cannot be drawn from an empty spreadsheet or from counts
    # that do not add up; report the numbers instead of failing.
    print('Nothing to plot: {} of {} rows have copy.'.format(hascopy, total))

In [None]:
#@title 5. Export an output file to Excel

# Write the results as Excel; if that fails for any reason, fall back to CSV
# under the same base name and say why.
try:
    output_df.to_excel(output_name, index=False)
except Exception as err:
    output_name = output_name.rsplit( ".", 1 )[0] + '.csv'
    output_df.to_csv(output_name, index=False)
    print('Could not write the Excel file ({}); saved as {} instead.'.format(err, output_name))
print('Done!\U0001f44D')