# Harvest License Information from ETD Metadata Records with OAI-PMH

In [None]:
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
import pandas as pd

To run the `ListSets` operation and retrieve the set identifiers from the DSpace repository, you can use the `pyoai` library. This will allow you to identify the set corresponding to the community you're interested in harvesting.

In [None]:
# Initialize the OAI-PMH client against the VTechWorks endpoint, with a
# reader registered for the Dublin Core ('oai_dc') metadata format.
endpoint = "https://vtechworks.lib.vt.edu/oai/request"
registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(endpoint, registry)

# Fetch the repository's sets. The result is materialized into a list so that
# cells iterating over `sets` can be re-run; a lazily evaluated result would
# be exhausted after the first pass and silently yield nothing afterwards.
sets = list(client.listSets())

The next cell prints the `set_spec` and `set_name` of each set whose `set_spec` contains `10919_5534` — the handle `10919/5534` of the community of interest, written with an underscore as it appears in the set specs. You can then use this `set_spec` for harvesting records from that specific community.

In [None]:
# Handle of the ETD community in VTechWorks, written with an underscore
# (rather than "/") because that is the form matched against the set specs.
target_handle = "10919_5534"

# Report only the sets whose spec mentions the target handle.
for set_spec, set_name, _ in sets:
    if target_handle not in set_spec:
        continue
    print(f"Set Spec: {set_spec}, Set Name: {set_name}")

Running the next cell lists the metadata formats supported by the repository, printing the prefix, schema, and namespace of each one. You can then choose the most suitable metadata prefix for your harvesting task.

In [None]:
# Ask the repository which metadata formats it can disseminate.
metadata_formats = client.listMetadataFormats()

# Each entry unpacks into (prefix, schema, namespace); print one line per format.
for prefix, schema, namespace in metadata_formats:
    print(
        f"Prefix: {prefix}, Schema: {schema}, Namespace: {namespace}"
    )

In [None]:
from oaipmh.metadata import MetadataReader
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry

from lxml import etree  # Make sure to import the etree module

# Define the custom DIMReader
class DIMReader(MetadataReader):
    """Metadata reader for the DSpace 'dim' metadata format.

    Each entry in ``fields`` maps an output key to a ``(mdtype, xpath)`` pair.
    Calling the reader on a metadata element evaluates every XPath and returns
    a plain ``dict`` of the results (see ``__call__``).
    """

    # Namespace prefixes available to the XPath expressions below.
    namespaces = {
        'dim': 'http://www.dspace.org/xmlns/dspace/dim',
        'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
    }

    # Output key -> (mdtype, XPath selecting the text of matching dim:field
    # elements). Expressions without a @qualifier test match that element
    # with any qualifier (e.g. 'title', 'subject_terms', 'type', 'publisher').
    fields = {
        'identifier_uri': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="identifier" and @qualifier="uri"]/text()'),
        'title': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="title"]/text()'),
        'author': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="contributor" and @qualifier="author"]/text()'),
        'department': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="contributor" and @qualifier="department"]/text()'),
        'committeechair': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="contributor" and @qualifier="committeechair"]/text()'),
        'committeemembers': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="contributor" and @qualifier="committeemember"]/text()'),
        'abstract': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="description" and @qualifier="abstract"]/text()'),
        'abstract_general': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="description" and @qualifier="abstractgeneral"]/text()'),
        'subject_terms': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="subject"]/text()'),
        'degree_name': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="description" and @qualifier="degree"]/text()'),
        'degree_level': ('textList', './dim:dim/dim:field[@mdschema="thesis" and @element="degree" and @qualifier="level"]/text()'),
        'degree_grantor': ('textList', './dim:dim/dim:field[@mdschema="thesis" and @element="degree" and @qualifier="grantor"]/text()'),
        'degree_discipline': ('textList', './dim:dim/dim:field[@mdschema="thesis" and @element="degree" and @qualifier="discipline"]/text()'),
        'type': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="type"]/text()'),
        'format': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="format" and @qualifier="medium"]/text()'),
        'publisher': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="publisher"]/text()'),
        # NOTE(review): "[1]" picks the first dc.rights field regardless of its
        # qualifier, so this could return the rights URI if that field happens
        # to come first in the record -- confirm against real records.
        'license': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="rights"][1]/text()'),
        'license_url': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="rights" and @qualifier="uri"]/text()'),
        'date_accessioned': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="date" and @qualifier="accessioned"]/text()'),
        'date_available': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="date" and @qualifier="available"]/text()'),
        'date_issued': ('textList', './dim:dim/dim:field[@mdschema="dc" and @element="date" and @qualifier="issued"]/text()')
    }

    def __init__(self):
        """Configure the base reader with this class's fields and namespaces."""
        super().__init__(fields=self.fields, namespaces=self.namespaces)

    def __call__(self, element):
        """Extract the configured fields from one metadata element.

        Args:
            element: XML element exposing ``xpath(expr, namespaces=...)``.

        Returns:
            dict: one entry per configured field. For 'textList' fields the
            matched values are joined into a single '; '-separated string
            (empty string when nothing matches). For 'text' fields the raw
            XPath result list is stored. Fields of any other mdtype are
            left out of the result.
        """
        # Named `values` rather than `map` to avoid shadowing the builtin.
        values = {}
        for field, (mdtype, xpath) in self._fields.items():
            matches = element.xpath(xpath, namespaces=self._namespaces)
            if mdtype == 'text':
                values[field] = matches
            elif mdtype == 'textList':
                # Flatten repeated fields into one cell-friendly string.
                values[field] = '; '.join(matches)
        return values


# Smoke-test the custom DIMReader against a single known record.

# Build a client whose registry understands the 'dim' metadata prefix.
registry = MetadataRegistry()
registry.registerReader('dim', DIMReader())
client = Client('https://vtechworks.lib.vt.edu/oai/request', registry)

# Fetch one record by its OAI identifier; the result unpacks into
# (header, metadata, about).
header, metadata, about = client.getRecord(
    identifier='oai:vtechworks.lib.vt.edu:10919/103424',
    metadataPrefix='dim',
)

# Show what the reader extracted.
print(metadata)

This code defines `fetch_metadata`, which harvests metadata in the `dim` format from a DSpace set using the OAI-PMH protocol, and then calls it for the set `com_10919_5534`. Inside the function, a `for` loop iterates through the records returned by the `listRecords` method; each record's metadata dictionary is appended to `metadata_list`, and records that have no metadata are skipped (the record datestamp is not included). If an error occurs, the harvest is retried, up to three attempts in total, waiting 60 seconds between attempts. On success the list is returned as a Pandas DataFrame; if every attempt fails the function returns `None`. Finally, the resulting DataFrame, `df_batch`, is written to `data/vt_etd_metadata_all.csv`.

In [None]:
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry
import pandas as pd
import time

def fetch_metadata(target_set, max_retries=3, retry_delay=60):
    """Harvest every 'dim' record of an OAI-PMH set into a DataFrame.

    Args:
        target_set: set spec passed to ``listRecords`` (e.g. "com_10919_5534").
        max_retries: maximum number of harvest attempts before giving up.
        retry_delay: seconds to wait between a failed attempt and the next one.

    Returns:
        pandas.DataFrame with one row per harvested record (records whose
        metadata is ``None`` are skipped), or ``None`` if every attempt failed.
    """
    # Initialize OAI-PMH client with the custom 'dim' reader
    registry = MetadataRegistry()
    registry.registerReader('dim', DIMReader())
    client = Client('https://vtechworks.lib.vt.edu/oai/request', registry)

    for attempt in range(1, max_retries + 1):
        # Every attempt restarts listRecords from the beginning, so the
        # collected rows must start empty too; otherwise records gathered
        # before a mid-harvest failure would be duplicated by the retry.
        metadata_list = []
        try:
            records = client.listRecords(metadataPrefix='dim', set=target_set)
            for _header, meta_dict, _about in records:
                # Records without metadata carry nothing to tabulate.
                if meta_dict is None:
                    continue
                metadata_list.append(meta_dict)
        except Exception as e:
            # Deliberately broad: any failure during the (network-bound)
            # harvest is reported and triggers a fresh attempt.
            print(f"An error occurred: {e}")
            if attempt < max_retries:
                # Only announce and wait when another attempt will follow.
                print(f"Retrying in {retry_delay} seconds... (Attempt {attempt} of {max_retries})")
                time.sleep(retry_delay)
        else:
            return pd.DataFrame(metadata_list)

    print("Max retries reached. Exiting.")
    return None



from pathlib import Path

# Set spec of the community to harvest, and where the result is written.
target_set = "com_10919_5534"
output_path = Path('data/vt_etd_metadata_all.csv')

df_batch = fetch_metadata(target_set)
if df_batch is None:
    # fetch_metadata returns None when all of its attempts failed; stop with
    # a clear error instead of an AttributeError from calling .to_csv on None.
    raise RuntimeError(f"Harvest of set {target_set!r} failed; no CSV was written.")

# Make sure the output directory exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_batch.to_csv(output_path, index=False)