In [2]:
!pip install sickle pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting sickle
  Downloading Sickle-0.7.0-py3-none-any.whl.metadata (4.5 kB)
Collecting lxml>=3.2.3 (from sickle)
  Downloading lxml-5.3.0-cp39-cp39-macosx_10_9_universal2.whl.metadata (3.8 kB)
Downloading Sickle-0.7.0-py3-none-any.whl (12 kB)
Downloading lxml-5.3.0-cp39-cp39-macosx_10_9_universal2.whl (8.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: lxml, sickle
Successfully installed lxml-5.3.0 sickle-0.7.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


In [1]:
# Step 1: Import necessary libraries
from sickle import Sickle  # For OAI-PMH harvesting
import pandas as pd  # For data manipulation and analysis
import time  # For implementing delays
from requests.exceptions import RequestException  # For error handling
from lxml import etree  # For XML parsing

# Step 2: Define a function to parse each record
def parse_record(record):
    """
    Parse an OAI-PMH record and extract relevant metadata fields.
    
    :param record: An OAI-PMH record object
    :return: A dictionary containing parsed metadata
    """
    # Parse the XML content of the record
    root = etree.fromstring(record.raw)
    
    # Define namespace
    ns = {'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
          'dc': 'http://purl.org/dc/elements/1.1/'}
    
    # Extract metadata
    get_text = lambda tag: '; '.join(e.text for e in root.findall(f'.//dc:{tag}', ns) if e.text)
    
    return {
        'identifier': record.header.identifier,  # Unique identifier for the record
        'datestamp': record.header.datestamp,  # Last modification date of the record
        'title': metadata.get('title', [None])[0],  # Title of the work
        'creator': '; '.join(metadata.get('creator', [])),  # Author(s) of the work
        'date': '; '.join(metadata.get('date', [])),  # Relevant dates (e.g., publication, submission)
        'description': '; '.join(metadata.get('description', [])),  # Abstract or other descriptions
        'subject': '; '.join(metadata.get('subject', [])),  # Subject terms or keywords
        'publisher': metadata.get('publisher', [None])[0],  # Publisher information
        'type': '; '.join(metadata.get('type', [])),  # Type of the work (e.g., thesis, dissertation)
        'language': metadata.get('language', [None])[0],  # Language of the work
        'relation': '; '.join(metadata.get('relation', [])),  # Related information (e.g., report numbers)
        'identifier_url': metadata.get('identifier', [None])[0],  # URL or DOI of the work
    }

# Step 3: Set up the OAI-PMH client
base_url = "https://www.ideals.illinois.edu/oai-pmh"
sickle = Sickle(base_url)

# Step 4: Define harvesting parameters
metadata_prefix = "oai_dc"  # We're using the Dublin Core metadata format
set_spec = "com_2142_5130"  # Graduate Dissertations and Theses at Illinois



In [2]:
# Step 5: Harvest the records
print("Starting the harvesting process...")
records = []
retries = 3 # Number of retry attempts for each record
for record in sickle.ListRecords(metadataPrefix=metadata_prefix, set=set_spec):
    for attempt in range(retries):
        try:
            parsed_record = parse_record(record)
            records.append(parsed_record)
            if len(records) % 1000 == 0:
                print(f"Harvested {len(records)} records...")
                time.sleep(1) # Sleep for 1 second every 1000 records to avoid overwhelming the server
            break # Exit the retry loop if successful
        except RequestException as e:
            if attempt < retries - 1:
                print(f"Error occurred: {e}. Retrying in 5 seconds...")
                time.sleep(5)
            else:
                print(f"Failed to harvest record after {retries} attempts.")

print(f"Total records harvested: {len(records)}")

Starting the harvesting process...
Harvested 1000 records...
Harvested 2000 records...
Harvested 3000 records...
Harvested 4000 records...
Harvested 5000 records...
Harvested 6000 records...
Harvested 7000 records...
Harvested 8000 records...
Harvested 9000 records...
Harvested 10000 records...
Harvested 11000 records...
Harvested 12000 records...
Harvested 13000 records...
Harvested 14000 records...
Harvested 15000 records...
Harvested 16000 records...
Harvested 17000 records...
Harvested 18000 records...
Harvested 19000 records...
Harvested 20000 records...
Harvested 21000 records...
Harvested 22000 records...
Harvested 23000 records...
Harvested 24000 records...
Harvested 25000 records...
Harvested 26000 records...
Harvested 27000 records...
Harvested 28000 records...
Harvested 29000 records...
Harvested 30000 records...
Harvested 31000 records...
Harvested 32000 records...
Harvested 33000 records...
Harvested 34000 records...
Harvested 35000 records...
Harvested 36000 records...
Ha

In [3]:
# Step 6: Convert the harvested records to a pandas DataFrame
df = pd.DataFrame(records)

# Step 7: Save the data to a CSV file
df.to_csv('uiuc_etd_metadata.csv', index=False)
print("Data saved to uiuc_etd_metadata.csv") # Save locally 


Data saved to uiuc_etd_metadata.csv


In [4]:
# Step 8: Perform basic analysis on the harvested data
print("\nBasic analysis of the harvested data:")
print(f"Total number of records: {len(df)}")

# Calculate the date range, assuming the first date in the list is the most relevant
print(f"Date range: from {df['date'].str.split(';').str[0].min()} to {df['date'].str.split(';').str[0].max()}")



Basic analysis of the harvested data:
Total number of records: 53471
Date range: from  to 2024-09-16T10:20:27-05:00


In [6]:
# # Handle date analysis more robustly
# df['first_date'] = df['date'].str.split(';').str[0]
# df['first_date'] = pd.to_datetime(df['first_date'], errors='coerce')

# # Count records with valid dates
# valid_dates = df['first_date'].notna()
# print(f"Records with valid dates: {valid_dates.sum()} ({valid_dates.mean():.2%})")

# if valid_dates.any():
#     print(f"Date range: from {df['first_date'].min()} to {df['first_date'].max()}")
# else:
#     print("No valid dates found in the dataset.")

# # Show records without dates
# if not valid_dates.all():
#     print("\nSample of records without valid dates:")
#     print(df[~valid_dates][['identifier', 'title', 'date']].head())

# Handle date analysis more robustly
df['first_date'] = df['date'].str.split(';').str[0]
df['first_date'] = pd.to_datetime(df['first_date'], errors='coerce') 

# Count records with valid dates
valid_dates = df['first_date'].notna()
print(f"Records with valid dates: {valid_dates.sum()} ({valid_dates.mean():.2%})")

if valid_dates.any():
    min_date = df['first_date'].dropna().min()
    max_date = df['first_date'].dropna().max()
    print(f"Date range: from {min_date} to {max_date}")
else:
    print("No valid dates found in the dataset.")

# Show records without dates
if not valid_dates.all():
    print("\nSample of records without valid dates:")
    print(df[~valid_dates][['identifier', 'title', 'date']].head()) 


Records with valid dates: 48604 (90.90%)
Date range: from 2007-12-11 01:07:40+00:00 to 2024-09-16 10:20:27-05:00

Sample of records without valid dates:
                                 identifier  \
132  oai:www.ideals.illinois.edu:2142/97363   
151  oai:www.ideals.illinois.edu:2142/97382   
322  oai:www.ideals.illinois.edu:2142/97553   
585  oai:www.ideals.illinois.edu:2142/98092   
647  oai:www.ideals.illinois.edu:2142/98214   

                                                 title  \
132  Stability thresholds for signed Laplacians on ...   
151  Unbridled: the horses of Géricault’s English S...   
322  The α-helical conformation of polypeptides: de...   
585                                                      
647           Geometry and topological phase of matter   

                                                  date  
132  2017-04-19; 2017-05; 2017-08-10T19:15:07Z; 201...  
151  2017-04-24; 2017-08-10T19:15:17Z; 2017-08-10T1...  
322  2017-04-13; 2017-05; 2017-08-10T19:51:4

  df['first_date'] = pd.to_datetime(df['first_date'], errors='coerce')


In [7]:
# Analyze the top 10 subjects
print("\nTop 10 subjects:")
print(df['subject'].str.split(';').explode().str.strip().value_counts().head(10))

# Analyze the document types
print("\nDocument types:")
print(df['type'].str.split(';').explode().str.strip().value_counts())


Top 10 subjects:
subject
Engineering, Electronics and Electrical    1782
Computer Science                           1192
Mathematics                                 994
Engineering, Civil                          964
Chemistry, Biochemistry                     854
Physics, Condensed Matter                   726
Chemistry, Organic                          723
Engineering, Mechanical                     691
Engineering, Materials Science              661
Education, Educational Psychology           615
Name: count, dtype: int64

Document types:
type
text                        56966
Thesis                       5238
                             3054
Dissertation / Thesis        2503
dissertation/thesis            83
Text                           12
image                           8
Technical Report                8
ger                             4
Score                           4
other                           3
Other                           2
Journal (whole)                 2
Bibl