In [2]:
!pip install sickle pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting sickle
  Downloading Sickle-0.7.0-py3-none-any.whl.metadata (4.5 kB)
Collecting lxml>=3.2.3 (from sickle)
  Downloading lxml-5.3.0-cp39-cp39-macosx_10_9_universal2.whl.metadata (3.8 kB)
Downloading Sickle-0.7.0-py3-none-any.whl (12 kB)
Downloading lxml-5.3.0-cp39-cp39-macosx_10_9_universal2.whl (8.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: lxml, sickle
Successfully installed lxml-5.3.0 sickle-0.7.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


In [1]:
# Step 1: Import necessary libraries
from sickle import Sickle  # For OAI-PMH harvesting
import pandas as pd  # For data manipulation and analysis
import time  # For implementing delays
from requests.exceptions import RequestException  # For error handling
from lxml import etree  # For XML parsing

# Step 2: Define a function to parse each record
def parse_record(record):
    """
    Parse an OAI-PMH record and extract relevant metadata fields.
    
    :param record: An OAI-PMH record object
    :return: A dictionary containing parsed metadata
    """
    # Parse the XML content of the record
    root = etree.fromstring(record.raw)
    
    # Define namespace
    ns = {'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
          'dc': 'http://purl.org/dc/elements/1.1/'}
    
    # Extract metadata
    get_text = lambda tag: '; '.join(e.text for e in root.findall(f'.//dc:{tag}', ns) if e.text)
    
    return {
        'identifier': record.header.identifier,  # Unique identifier for the record
        'datestamp': record.header.datestamp,  # Last modification date of the record
        'title': metadata.get('title', [None])[0],  # Title of the work
        'creator': '; '.join(metadata.get('creator', [])),  # Author(s) of the work
        'date': '; '.join(metadata.get('date', [])),  # Relevant dates (e.g., publication, submission)
        'description': '; '.join(metadata.get('description', [])),  # Abstract or other descriptions
        'subject': '; '.join(metadata.get('subject', [])),  # Subject terms or keywords
        'publisher': metadata.get('publisher', [None])[0],  # Publisher information
        'type': '; '.join(metadata.get('type', [])),  # Type of the work (e.g., thesis, dissertation)
        'language': metadata.get('language', [None])[0],  # Language of the work
        'relation': '; '.join(metadata.get('relation', [])),  # Related information (e.g., report numbers)
        'identifier_url': metadata.get('identifier', [None])[0],  # URL or DOI of the work
    }

# Step 3: Set up the OAI-PMH client
base_url = "https://www.ideals.illinois.edu/oai-pmh"
sickle = Sickle(base_url)

# Step 4: Define harvesting parameters
metadata_prefix = "oai_dc"  # We're using the Dublin Core metadata format
set_spec = "com_2142_5130"  # Graduate Dissertations and Theses at Illinois



In [2]:
# Step 5: Harvest the records
print("Starting the harvesting process...")
records = []
retries = 3 # Number of retry attempts for each record
for record in sickle.ListRecords(metadataPrefix=metadata_prefix, set=set_spec):
    for attempt in range(retries):
        try:
            parsed_record = parse_record(record)
            records.append(parsed_record)
            if len(records) % 1000 == 0:
                print(f"Harvested {len(records)} records...")
                time.sleep(1) # Sleep for 1 second every 1000 records to avoid overwhelming the server
            break # Exit the retry loop if successful
        except RequestException as e:
            if attempt < retries - 1:
                print(f"Error occurred: {e}. Retrying in 5 seconds...")
                time.sleep(5)
            else:
                print(f"Failed to harvest record after {retries} attempts.")

print(f"Total records harvested: {len(records)}")

Starting the harvesting process...
Harvested 1000 records...
Harvested 2000 records...
Harvested 3000 records...
Harvested 4000 records...
Harvested 5000 records...
Harvested 6000 records...
Harvested 7000 records...
Harvested 8000 records...
Harvested 9000 records...
Harvested 10000 records...
Harvested 11000 records...
Harvested 12000 records...
Harvested 13000 records...
Harvested 14000 records...
Harvested 15000 records...
Harvested 16000 records...
Harvested 17000 records...
Harvested 18000 records...
Harvested 19000 records...
Harvested 20000 records...
Harvested 21000 records...
Harvested 22000 records...
Harvested 23000 records...
Harvested 24000 records...
Harvested 25000 records...
Harvested 26000 records...
Harvested 27000 records...
Harvested 28000 records...
Harvested 29000 records...
Harvested 30000 records...
Harvested 31000 records...
Harvested 32000 records...
Harvested 33000 records...
Harvested 34000 records...
Harvested 35000 records...
Harvested 36000 records...
Ha

In [3]:
# Step 6: Convert the harvested records to a pandas DataFrame
df = pd.DataFrame(records)

# Step 7: Save the data to a CSV file
df.to_csv('uiuc_etd_metadata.csv', index=False)
print("Data saved to uiuc_etd_metadata.csv") 


Data saved to uiuc_etd_metadata.csv


In [4]:
# Step 8: Perform basic analysis on the harvested data
print("\nBasic analysis of the harvested data:")
print(f"Total number of records: {len(df)}")

# Calculate the date range, assuming the first date in the list is the most relevant
print(f"Date range: from {df['date'].str.split(';').str[0].min()} to {df['date'].str.split(';').str[0].max()}")



Basic analysis of the harvested data:
Total number of records: 53471
Date range: from  to 2024-09-16T10:20:27-05:00


In [12]:
import pandas as pd
from datetime import datetime

# Load the CSV file
df = pd.read_csv('uiuc_etd_metadata.csv')

# Function to extract the earliest and latest dates
def extract_dates(date_string):
    if pd.isna(date_string):
        return pd.NaT, pd.NaT
    dates = date_string.split(';')
    valid_dates = []
    for date in dates:
        try:
            parsed_date = pd.to_datetime(date.strip(), errors='coerce')
            if not pd.isna(parsed_date):
                # Convert to UTC and then remove timezone info
                if parsed_date.tzinfo is not None:
                    parsed_date = parsed_date.tz_convert('UTC').tz_localize(None)
                valid_dates.append(parsed_date)
        except:
            continue
    if valid_dates:
        return min(valid_dates), max(valid_dates)
    else:
        return pd.NaT, pd.NaT

# Apply the function to create new 'earliest_date' and 'update_date' columns
df['earliest_date'], df['update_date'] = zip(*df['date'].apply(extract_dates))

# Set the last_modified_date
last_modified_date = pd.to_datetime("2023-07-10")
print(f"Last modified date: {last_modified_date}")

# Identify records where 'update_date' is after last_modified_date
future_records = df[df['update_date'] > last_modified_date]
print(f"Number of records with update dates after {last_modified_date}: {len(future_records)}")

if not future_records.empty:
    print("\nSample of records with future update dates:")
    print(future_records[['identifier', 'title', 'earliest_date', 'update_date']].head())
    
    # Count records by year for future update dates
    future_years = future_records['update_date'].dt.year.value_counts().sort_index()
    print("\nDistribution of records with future update dates by year:")
    print(future_years)
else:
    print("No records found with update dates after 2023-07-10.")

# Additional analysis: Check for records with update_date equal to last_modified_date
records_on_last_date = df[df['update_date'].dt.date == last_modified_date.date()]
print(f"\nNumber of records with update date equal to last modified date: {len(records_on_last_date)}")

if not records_on_last_date.empty:
    print("\nSample of records with update date equal to last modified date:")
    print(records_on_last_date[['identifier', 'title', 'earliest_date', 'update_date']].head())

# Distribution of earliest dates
print("\nDistribution of earliest dates by year:")
earliest_years = df['earliest_date'].dt.year.value_counts().sort_index()
print(earliest_years) 

Last modified date: 2023-07-10 00:00:00
Number of records with update dates after 2023-07-10 00:00:00: 3338

Sample of records with future update dates:
                                 identifier  \
317  oai:www.ideals.illinois.edu:2142/97548   
351  oai:www.ideals.illinois.edu:2142/97582   
973  oai:www.ideals.illinois.edu:2142/98611   
978  oai:www.ideals.illinois.edu:2142/98616   
983  oai:www.ideals.illinois.edu:2142/98621   

                                                 title earliest_date  \
317  Global poverty, women’s empowerment, and highe...    2017-04-14   
351  States of discretion: Black migrating bodies a...    2017-04-18   
973  A study of the effect of rest on the fatigue s...    1919-01-01   
978  Highway location in the foothills of the Rocky...    1936-01-01   
983  The viability of the nodule bacteria of legume...    1921-01-01   

            update_date  
317 2025-04-23 05:00:00  
351 2023-11-03 05:00:00  
973 2040-01-01 06:00:00  
978 2032-01-01 06:00:00  
9

In [7]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('uiuc_etd_metadata.csv')

# Define the search criteria
title = "Three papers in urban and regional economic and development"
creator = "Yu, Chenxi"
identifier = "http://hdl.handle.net/2142/97273"

# Search for the paper
result = df[(df['title'] == title) & 
            (df['creator'] == creator) & 
            (df['identifier'] == identifier)]

# Print the matched title record
if not result_by_title.empty:
    print("Record matching the title:")
    print(result_by_title.to_string(index=False))
else:
    print("No record found matching the title.")

# Check if the paper was found
if not result.empty:
    print("\nThe paper 100% match was found in the CSV file.")
    print(result)
else:
    print("\nThe paper 100% match was not found in the CSV file.")

# If you want to search by just one or two criteria, you can use:
result_by_title = df[df['title'] == title]
result_by_creator = df[df['creator'] == creator]
result_by_identifier = df[df['identifier'] == identifier]

print(f"\nMatches by title: {len(result_by_title)}")
print(f"\nMatches by creator: {len(result_by_creator)}")
print(f"Matches by identifier: {len(result_by_identifier)}")

Record matching the title:
                            identifier  datestamp                                                       title    creator                                                            date                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     