In [None]:
import requests
import json
import time
from datetime import datetime

def fetch_ufl_scholars(per_page=100, delay_seconds=10):
    """Fetch all UF scholars data with pagination and save it to a JSON file.

    Repeatedly POSTs a text search with an empty query to
    ``https://scholars.ufl.edu/api/users``, advancing ``startFrom`` by
    ``per_page`` after every page, and accumulates each response's
    ``resource`` list. Paging stops when a page is empty, when the response
    has no ``resource`` field, when the offset reaches the ``total`` reported
    by the first response, or when a request/parse error occurs.

    Whatever has been collected is always written to
    ``ufl_scholars_data_<YYYYmmdd_HHMMSS>.json`` in the current directory,
    so a failure part-way through still leaves the partial data on disk.

    Args:
        per_page: Number of records requested per page.
        delay_seconds: Pause between consecutive page requests.

    Returns:
        The list of all resource records fetched (possibly partial or empty).
    """

    cookies = {
        'cookieConsent': '',
    }

    headers = {
        'Accept-Language': 'en-US,en;q=0.6',
        'Connection': 'keep-alive',
        'Origin': 'https://scholars.ufl.edu',
        'Referer': 'https://scholars.ufl.edu/search?back&by=text&type=user&v=',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-GPC': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
        'accept': 'application/json',
        'content-type': 'application/json',
        'sec-ch-ua': '"Brave";v="143", "Chromium";v="143", "Not A(Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
    }

    all_resources = []
    start_from = 0
    total_records = None

    print(f"Starting data extraction at {datetime.now()}")
    print(f"Records per page: {per_page}")
    print("-" * 60)

    while True:
        json_data = {
            'params': {
                'by': 'text',
                'category': 'user',
                'text': '',
            },
            'pagination': {
                'startFrom': start_from,
                'perPage': per_page,
            },
            'sort': 'lastNameAsc',
            'filters': [
                {
                    'name': 'tags',
                    'matchDocsWithMissingValues': True,
                    'useValuesToFilter': False,
                },
                {
                    'name': 'department',
                    'matchDocsWithMissingValues': True,
                    'useValuesToFilter': False,
                },
            ],
        }

        # Only the network call and JSON decoding are guarded. ValueError is
        # caught as well because a malformed body is not guaranteed to surface
        # as a RequestException on every requests version; without it the
        # already-collected records would be lost instead of saved below.
        try:
            print(f"Fetching records {start_from} to {start_from + per_page}...")
            response = requests.post(
                'https://scholars.ufl.edu/api/users',
                cookies=cookies,
                headers=headers,
                json=json_data,
                timeout=30
            )
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"✗ Error fetching data at startFrom={start_from}: {e}")
            print("Saving partial data...")
            break

        # Extract resources from this page
        if not isinstance(data, dict) or 'resource' not in data:
            print("⚠ No 'resource' field in response")
            break

        resources = data['resource']

        # If no resources returned, we've reached the end
        if not resources:
            print("✓ No more records to fetch")
            break

        all_resources.extend(resources)
        print(f"✓ Fetched {len(resources)} records. Total so far: {len(all_resources)}")

        # Get total from first response (tolerate a missing/null pagination object)
        pagination = data.get('pagination')
        if total_records is None and isinstance(pagination, dict):
            total_records = pagination.get('total', 0)
            print(f"ℹ Total records available: {total_records}")
            print(f"ℹ Estimated pages: {(total_records + per_page - 1) // per_page}")

        # Move to next page
        start_from += per_page

        # Stop (without waiting) once the reported total has been covered
        if total_records and start_from >= total_records:
            print("✓ All records fetched")
            break

        print(f"Waiting {delay_seconds} seconds before next request...")
        time.sleep(delay_seconds)

    # Save to JSON file (just the resources array)
    filename = f"ufl_scholars_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(all_resources, f, indent=2, ensure_ascii=False)

    print("-" * 60)
    print(f"✓ Data extraction complete!")
    print(f"Total records fetched: {len(all_resources)}")
    print(f"Data saved to: {filename}")

    return all_resources

# Entry point: download every scholar record (and write the JSON dump) only
# when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    fetch_ufl_scholars()

In [None]:
import csv
import requests
import time
import sys

# Function based on get_email.py logic
def get_email(discovery_url_id):
    """Return the email address of one scholar profile, or '' if unavailable.

    Sends a GET to ``https://scholars.ufl.edu/api/users/<discovery_url_id>``
    and reads ``emailAddress.address`` from the JSON response.

    Args:
        discovery_url_id: Profile identifier used in the scholars URL.

    Returns:
        The address string, or ``''`` when the request fails, the status is
        not 200, the body is not the expected JSON shape, or the address is
        missing/null. This function never raises; failures are printed.
    """
    cookies = {
        'cookieConsent': '',
    }

    # No 'If-None-Match' header is sent: a fixed ETag copied from a browser
    # session makes the request conditional, and a matching profile would be
    # answered with 304 Not Modified and no body, i.e. reported as a failure.
    headers = {
        'Accept-Language': 'en-US,en;q=0.6',
        'Connection': 'keep-alive',
        # Update referer dynamically
        'Referer': f'https://scholars.ufl.edu/{discovery_url_id}/publications',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-GPC': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
        'accept': 'application/json',
        'content-type': 'application/json',
        'sec-ch-ua': '"Brave";v="143", "Chromium";v="143", "Not A(Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
    }

    url = f'https://scholars.ufl.edu/api/users/{discovery_url_id}'

    # Deliberately broad catch: this is a best-effort lookup inside a long
    # batch run, so one bad profile must not abort the whole job.
    try:
        response = requests.get(url, cookies=cookies, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch {discovery_url_id}: Status {response.status_code}")
            return ''
        data = response.json()
    except Exception as e:
        print(f"Error fetching {discovery_url_id}: {e}")
        return ''

    # Extract email from the emailAddress.address JSON path
    email_data = data.get('emailAddress') if isinstance(data, dict) else None
    if isinstance(email_data, dict):
        # 'or' maps a JSON null address to '' so callers always get a string
        return email_data.get('address') or ''
    return ''

def process_emails(input_csv, output_csv):
    """Copy a scholars CSV to a new file, adding an ``email`` column.

    Every input row is written to ``output_csv`` in order. For rows with a
    non-empty ``discoveryUrlId`` the email is looked up with ``get_email``;
    rows without one get an empty email. An ``email`` column is appended to
    the header if the input does not already have it. An empty input file
    produces an output containing only the ``email`` header.

    Args:
        input_csv: Path of the UTF-8 CSV to read.
        output_csv: Path of the UTF-8 CSV to write (overwritten).
    """
    print(f"Reading from {input_csv}...")

    # Load everything first so the input file is closed before the slow
    # network phase. newline='' lets the csv module handle line endings.
    with open(input_csv, 'r', encoding='utf-8', newline='') as infile:
        reader = csv.DictReader(infile)
        # fieldnames is None for an empty file; copy so the reader's own
        # list is not mutated.
        fieldnames = list(reader.fieldnames or [])
        rows = list(reader)

    # Add 'email' to fieldnames if not present
    if 'email' not in fieldnames:
        fieldnames.append('email')

    total = len(rows)
    print(f"Found {total} entries. Starting extraction...")

    with open(output_csv, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for position, row in enumerate(rows, start=1):
            discovery_id = row.get('discoveryUrlId')

            if not discovery_id:
                # No request is made, so no rate-limit pause is needed either
                print(f"[{position}/{total}] Skipping row with missing discoveryUrlId")
                row['email'] = ''
                writer.writerow(row)
                continue

            email = get_email(discovery_id)
            row['email'] = email
            if email:
                print(f"[{position}/{total}] Found email for {discovery_id}: {email}")
            else:
                print(f"[{position}/{total}] No email found for {discovery_id}")

            writer.writerow(row)

            # Add delay to avoid rate limiting
            time.sleep(1)

    print(f"Done! Results saved to {output_csv}")

# Entry point: enrich the base scholars CSV with an 'email' column when this
# code is executed directly.
if __name__ == "__main__":
    # Paths are relative to the current working directory.
    input_file = "ufl_scholars_data.csv"
    output_file = "ufl_scholars_data_with_emails.csv"

    process_emails(input_file, output_file)


In [None]:
import csv
import requests
import json
import time
import os

def get_publications(discovery_url_id):
    """Fetch the publications linked to one scholar as raw JSON text.

    POSTs to the ``/api/publications/linkedTo`` endpoint asking for the first
    100 entries (newest first) linked to ``discovery_url_id``.

    Returns:
        The unparsed response body when the status is 200; otherwise the
        string ``'{}'`` (non-200 status or any exception, both of which are
        printed rather than raised).
    """
    endpoint = 'https://scholars.ufl.edu/api/publications/linkedTo'
    request_cookies = {'cookieConsent': ''}
    profile_page = f'https://scholars.ufl.edu/{discovery_url_id}/publications'

    request_headers = {
        'Accept-Language': 'en-US,en;q=0.6',
        'Connection': 'keep-alive',
        'Origin': 'https://scholars.ufl.edu',
        'Referer': profile_page,
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-GPC': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
        'accept': 'application/json',
        'content-type': 'application/json',
        'sec-ch-ua': '"Brave";v="143", "Chromium";v="143", "Not A(Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
    }

    # Single page only: the first 100 items starting at offset 0
    first_page = {'perPage': 100, 'startFrom': 0}
    payload = {
        'objectId': discovery_url_id,
        'category': 'user',
        'pagination': first_page,
        'sort': 'dateDesc',
        'favouritesFirst': True,
    }

    try:
        reply = requests.post(
            endpoint,
            cookies=request_cookies,
            headers=request_headers,
            json=payload,
            timeout=10,
        )
        if reply.status_code != 200:
            print(f"Failed to fetch publications for {discovery_url_id}: Status {reply.status_code}")
            return '{}'
        # Hand back the body verbatim; callers store the whole JSON text
        return reply.text
    except Exception as err:
        print(f"Error fetching publications for {discovery_url_id}: {err}")
        return '{}'

def get_grants(discovery_url_id):
    """Fetch the grants linked to one scholar as raw JSON text.

    POSTs to the ``/api/grants/linkedTo`` endpoint asking for the first 100
    entries (newest first) linked to ``discovery_url_id``.

    Returns:
        The unparsed response body when the status is 200; otherwise the
        string ``'{}'`` (non-200 status or any exception, both of which are
        printed rather than raised).
    """
    fallback = '{}'
    endpoint = 'https://scholars.ufl.edu/api/grants/linkedTo'
    request_cookies = {'cookieConsent': 'WzAsZmFsc2Vd'}

    request_headers = {
        'Accept-Language': 'en-US,en;q=0.6',
        'Connection': 'keep-alive',
        'Origin': 'https://scholars.ufl.edu',
        'Referer': f'https://scholars.ufl.edu/{discovery_url_id}/grants',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-GPC': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
        'accept': 'application/json',
        'content-type': 'application/json',
        'sec-ch-ua': '"Brave";v="143", "Chromium";v="143", "Not A(Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
    }

    # Single page only: the first 100 items starting at offset 0
    payload = {
        'objectId': discovery_url_id,
        'category': 'user',
        'pagination': {'perPage': 100, 'startFrom': 0},
        'sort': 'dateDesc',
        'favouritesFirst': True,
    }

    try:
        reply = requests.post(
            endpoint,
            cookies=request_cookies,
            headers=request_headers,
            json=payload,
            timeout=10,
        )
        succeeded = reply.status_code == 200
        if succeeded:
            return reply.text
        print(f"Failed to fetch grants for {discovery_url_id}: Status {reply.status_code}")
        return fallback
    except Exception as err:
        print(f"Error fetching grants for {discovery_url_id}: {err}")
        return fallback

def process_research_data(input_csv, output_csv):
    """Copy a scholars CSV, adding raw publications and grants JSON columns.

    Every input row is written to ``output_csv`` in order with two extra
    columns, ``publications_json`` and ``grants_json`` (appended to the
    header if missing). Rows with a non-empty ``discoveryUrlId`` are filled
    via ``get_publications`` and ``get_grants``; other rows get ``'{}'`` in
    both columns. If ``input_csv`` does not exist, an error is printed and
    nothing is written. An empty input file produces an output containing
    only the two new header columns.

    Args:
        input_csv: Path of the UTF-8 CSV to read.
        output_csv: Path of the UTF-8 CSV to write (overwritten).
    """
    print(f"Reading from {input_csv}...")

    if not os.path.exists(input_csv):
        print(f"Error: Input file '{input_csv}' not found.")
        return

    # Load everything first so the input file is closed before the slow
    # network phase. newline='' lets the csv module handle line endings.
    with open(input_csv, 'r', encoding='utf-8', newline='') as infile:
        reader = csv.DictReader(infile)
        # fieldnames is None for an empty file; copy so the reader's own
        # list is not mutated.
        fieldnames = list(reader.fieldnames or [])
        rows = list(reader)

    # Add new columns
    for column in ('publications_json', 'grants_json'):
        if column not in fieldnames:
            fieldnames.append(column)

    total = len(rows)
    print(f"Found {total} entries. Starting extraction...")

    with open(output_csv, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for position, row in enumerate(rows, start=1):
            discovery_id = row.get('discoveryUrlId')

            if discovery_id:
                print(f"[{position}/{total}] Processing {discovery_id}...")

                # Fetch Publications
                row['publications_json'] = get_publications(discovery_id)

                # Small delay between calls
                time.sleep(0.7)

                # Fetch Grants
                row['grants_json'] = get_grants(discovery_id)

                # Delay before next user
                time.sleep(0.7)
            else:
                print(f"[{position}/{total}] Skipping row with missing discoveryUrlId")
                row['publications_json'] = '{}'
                row['grants_json'] = '{}'

            writer.writerow(row)

    print(f"Done! Results saved to {output_csv}")

# Entry point: add publications/grants JSON columns to the scholars CSV when
# this code is executed directly.
if __name__ == "__main__":
    # Input file can be the one with emails or the base one
    # Assuming chained execution: base -> emails -> research
    input_file = "ufl_scholars_data_with_emails.csv"

    # Fallback to base if email file doesn't exist (e.g. if skipped)
    if not os.path.exists(input_file):
        print(f"'{input_file}' not found, falling back to 'ufl_scholars_data.csv'")
        input_file = "ufl_scholars_data.csv"

    # Final output of the chain; overwritten on every run.
    output_file = "ufl_scholars_data_complete.csv"

    process_research_data(input_file, output_file)