### Creating and Organizing Dataframe

In [1]:
import pandas as pd
import numpy as np
import requests
import xml.etree.ElementTree as ET

df = pd.read_csv("beacon.csv")
df = df[['filename', 'number', 'mods.physicalDescriptionextent', 'pageTitle', 'mods.titleInfo.0title']]	

# page title to string
df['pageTitle'] = df['pageTitle'].astype('str') 
df['pageTitle'] = df['pageTitle'].str.replace('.0', '')

# num of pages to int
df['mods.physicalDescriptionextent'] = df['mods.physicalDescriptionextent'].str.replace(' pages', '')
df['mods.physicalDescriptionextent'] = pd.to_numeric(df['mods.physicalDescriptionextent'], errors='coerce')

# sort by pid number
df = df.sort_values('number', ascending = True)

df.head(1000)

  df = pd.read_csv("beacon.csv")


Unnamed: 0,filename,number,mods.physicalDescriptionextent,pageTitle,mods.titleInfo.0title
16619,beacon:1,1,6.0,,"The daily beacon, October 9, 1968"
22897,beacon:2,2,,6,
20310,beacon:3,3,,5,
10674,beacon:4,4,,4,
7945,beacon:5,5,,3,
...,...,...,...,...,...
21242,beacon:1010,1010,,10,
22078,beacon:1011,1011,,9,
15920,beacon:1012,1012,,8,
14892,beacon:1013,1013,,7,


### Find Missing PIDs

In [9]:
pids = df['number']
missing_pids = np.setdiff1d(np.arange(pids.min(), pids.max()+1), pids)

print(np.count_nonzero(missing_pids))

11519


### Add Missing Rows

In [None]:
def get_title_from_pid(pid):
    url = f"https://digital.lib.utk.edu/collections/islandora/object/beacon%3A{pid}/datastream/MODS/view"
    
    try:
        # Fetch the XML data from the URL
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful

        # Parse the XML data
        root = ET.fromstring(response.content)

        # Define the namespace
        namespaces = {
            'mods': 'http://www.loc.gov/mods/v3'
        }

        # Find the title within the XML structure
        title_element = root.find('.//mods:titleInfo/mods:title', namespaces)

        # Extract and return the title
        if title_element is not None:
            return title_element.text
        else:
            return None
    except requests.exceptions.RequestException as e:
        return None
        
for pid in missing_pids:
    title = get_title_from_pid(pid)
    if title is not None:
        data.append({pid, }, ignore_index = True)
        print(f"PID: {pid} -> Title: {title}")     