In [58]:
import requests
import time
import json
from datetime import datetime
import os

# Contract-search API endpoint on secure.in.gov (IDOA contract search).
# The scraping functions in this notebook send JSON search payloads to it via POST.
url = "https://secure.in.gov/apps/idoa/contractsearch/api/contracts/search"

In [60]:
# Helper functions for incremental update
# Fields that together identify a unique contract record. This single
# definition is shared by the loader and the scraper so the two can never
# disagree about what counts as "the same record".
_RECORD_KEY_FIELDS = (
    'id',
    'pdfUrl',
    'actionType',
    'vendorName',
    'amount',
    'startDate',
    'endDate',
)


def get_record_tuple(record):
    """Convert a record dict to a tuple for comparison.

    Args:
        record: A contract record dict.

    Returns:
        A tuple with the values of the key fields (``id``, ``pdfUrl``,
        ``actionType``, ``vendorName``, ``amount``, ``startDate``,
        ``endDate``), in that order. Missing fields are represented by None.
    """
    return tuple(record.get(field) for field in _RECORD_KEY_FIELDS)


def load_existing_contracts(filepath):
    """Load existing contracts and return both data and set of existing records (as tuples).

    Args:
        filepath: Path of the JSON file holding a list of contract records.

    Returns:
        A ``(data, existing_records)`` pair. ``data`` is the list read from
        the file; ``existing_records`` is the set of key tuples (see
        ``get_record_tuple``) of every dict entry in that list.
        ``([], set())`` is returned when the file does not exist, cannot be
        read or parsed, or does not contain a JSON list.
    """
    if not os.path.exists(filepath):
        print(f"File {filepath} does not exist. Will create new file.")
        return [], set()

    try:
        # JSON is UTF-8 by specification; do not depend on the platform locale.
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Anything other than a list would be iterated incorrectly below
        # (e.g. a dict yields its keys) and would break callers that
        # concatenate the result with a list of new records.
        if not isinstance(data, list):
            print(f"Unexpected JSON structure in {filepath}: expected a list of records.")
            print("Returning empty data.")
            return [], set()

        # Non-dict entries stay in `data` but cannot contribute a key tuple.
        existing_records = {
            get_record_tuple(record)
            for record in data
            if isinstance(record, dict)
        }

        print(f"Successfully loaded {len(data)} records from {filepath}")
        print(f"Found {len(existing_records)} unique records")
        return data, existing_records

    except json.JSONDecodeError as e:
        print(f"Error reading JSON file: {e}")
        print("File may be corrupted. Returning empty data.")
        return [], set()
    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to "no data".
        print(f"Unexpected error reading file: {e}")
        return [], set()

def scrape_all_new_records(url, existing_records, contract_type_flags=None, page_size=1000, timeout=30):
    """
    Scrape all pages and collect only new records that don't exist in the dataset.

    Args:
        url: Search endpoint that accepts a JSON POST payload.
        existing_records: Set of key tuples (as produced by
            ``get_record_tuple``) for records that are already stored.
            It is only read, never modified.
        contract_type_flags: Optional value sent as ``contractTypeFlags``;
            omitted from the payload when None.
        page_size: Number of records requested per page.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        List of record dicts whose key tuple is not in ``existing_records``.
        A record whose key tuple repeats within the same scrape is returned
        only once. An empty list is returned if the first request does not
        answer with HTTP 200.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
    """
    new_records = []
    # Keys collected during this run, so a record that shows up on more than
    # one page is not appended twice.
    seen_this_run = set()
    failed_pages = []
    page = 1

    # Get first page to determine total pages
    payload = {
        "pageNumber": page,
        "pageSize": page_size
    }

    if contract_type_flags is not None:
        payload["contractTypeFlags"] = contract_type_flags

    # Make initial request (without a timeout a stalled connection would hang forever)
    resp = requests.post(url, json=payload, timeout=timeout)
    if resp.status_code != 200:
        print(f"Error: {resp.status_code}")
        return new_records

    js_result = resp.json()
    # `or` guards against the keys being present but null.
    pagination = js_result.get('pagination') or {}
    total_pages = pagination.get('totalPages') or 0

    print(f"Total pages to scan: {total_pages}")

    # Process all pages
    while page <= total_pages:
        if page > 1:  # We already have page 1 data
            # Rate limiting: pause before every follow-up request, including
            # the one that comes right after a failed page.
            time.sleep(0.25)
            payload["pageNumber"] = page
            resp = requests.post(url, json=payload, timeout=timeout)
            if resp.status_code != 200:
                print(f"Error on page {page}: {resp.status_code}")
                failed_pages.append(page)
                page += 1
                continue
            js_result = resp.json()

        results = js_result.get('results') or []

        # Check each record
        new_on_page = 0
        for record in results:
            record_tuple = get_record_tuple(record)
            if record_tuple in existing_records or record_tuple in seen_this_run:
                continue
            seen_this_run.add(record_tuple)
            new_records.append(record)
            new_on_page += 1

        print(f"Page {page}/{total_pages}: Found {new_on_page} new records")

        page += 1

    if failed_pages:
        # Make gaps visible: records on these pages were not examined.
        print(f"Warning: {len(failed_pages)} page(s) could not be fetched and were skipped: {failed_pages}")

    print(f"\nTotal new records found: {len(new_records)}")
    return new_records

In [62]:
# Configuration
# Set CLOBBER = True to scrape everything and overwrite the existing file.
# Set CLOBBER = False to scan all pages, keep only the records that are not
# already in the existing file, and save them combined with the existing data
# (a full scrape is performed instead when no existing data is found).
CLOBBER = False

In [64]:
def scrape_all_contracts(url, contract_type_flags=None, start_date='2000-01-01T00:00:00.0000000', page_size=1000, timeout=30):
    """Scrape all contracts from the start date.

    Args:
        url: Search endpoint that accepts a JSON POST payload.
        contract_type_flags: Optional value sent as ``contractTypeFlags``;
            omitted from the payload when None.
        start_date: Value sent as ``startDate`` in the search payload.
        page_size: Number of records requested per page.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list with the ``results`` of every page, in page order.

    Raises:
        requests.exceptions.RequestException: on network failure, timeout,
            or an HTTP error status. A full scrape is meant to be complete,
            so a failed page aborts instead of yielding a partial data set.
        KeyError: if a response lacks the expected ``pagination`` /
            ``results`` keys.
    """
    # Build initial request
    payload = {
        "pageNumber": 1,
        "pageSize": page_size,
        "startDate": start_date
    }

    if contract_type_flags is not None:
        payload["contractTypeFlags"] = contract_type_flags

    # Get first page
    resp = requests.post(url, json=payload, timeout=timeout)
    resp.raise_for_status()
    js_result = resp.json()
    total_pages = js_result['pagination']['totalPages']
    # Copy so that extending below does not mutate the parsed response.
    all_records = list(js_result['results'])

    print(f"Total pages to scrape: {total_pages}")

    # Get remaining pages
    for p in range(2, total_pages + 1):
        print(f"Processing page {p}/{total_pages}")
        # Rate limiting: pause before each follow-up request.
        time.sleep(0.25)
        payload["pageNumber"] = p
        resp = requests.post(url, json=payload, timeout=timeout)
        resp.raise_for_status()
        all_records.extend(resp.json()['results'])

    return all_records

In [66]:

# Load the saved contracts file. load_existing_contracts returns a pair:
# the list of records and a set of record-key tuples. Note that despite its
# name, `existing_ids` holds those key tuples, not bare contract IDs.
filepath = '../../data/raw/indiana_contracts.json'
data, existing_ids = load_existing_contracts(filepath)


Successfully loaded 181000 records from ../../data/raw/indiana_contracts.json
Found 180445 unique records


In [68]:
# Example: How to use load_existing_contracts function correctly
# The function returns a tuple: (data, existing_records), where the second
# element is a set of record-key tuples. The name `existing_urls` is kept for
# compatibility, but it does not hold URLs.

# Correct usage:
filepath = '../../data/raw/indiana_contracts.json'
data, existing_urls = load_existing_contracts(filepath)

# If you want to inspect the data structure:
if data:
    first_record = data[0]
    print("\nFirst contract structure:")
    print(f"Keys: {list(first_record.keys())}")
    print("\nExample record:")
    print(f"ID: {first_record.get('id')}")
    print(f"PDF URL: {first_record.get('pdfUrl')}")
    print(f"Vendor: {first_record.get('vendorName')}")
    # Compare against None so that a genuine amount of 0 is shown as $0.00
    # instead of being reported as missing.
    amount = first_record.get('amount')
    print(f"Amount: ${amount:,.2f}" if amount is not None else "Amount: N/A")

Successfully loaded 181000 records from ../../data/raw/indiana_contracts.json
Found 180445 unique records

First contract structure:
Keys: ['actionType', 'agencyName', 'amendment', 'amount', 'businessUnit', 'contractTypeFlags', 'id', 'endDate', 'pdfUrl', 'startDate', 'vendorName', 'zipCode', 'approvals']

Example record:
ID: A70-5-2020
PDF URL: https://contracts.idoa.in.gov/idoacontractsweb/PUBLIC/10778-001.pdf
Vendor: GENESIS SYSTEMS INC
Amount: N/A


In [70]:
# Update all contracts
all_contracts_file = '../../data/raw/indiana_contracts.json'

# Each branch decides WHAT to save and which message to report; the file is
# then written once, at the end of the cell.
if CLOBBER:
    print("CLOBBER mode: Scraping all contracts and overwriting existing file...")
    all_data = scrape_all_contracts(url)
    print(f"Scraped {len(all_data)} total contracts")

    contracts_to_save = all_data
    done_message = f"Saved {len(all_data)} contracts (overwrote existing file)"

else:
    print("Incremental mode: Scanning for new contracts across all pages...")

    # Load existing data
    existing_data, existing_records = load_existing_contracts(all_contracts_file)

    # Scrape all pages looking for new records
    if existing_records:
        print("Scanning all pages for new contracts...")
        new_records = scrape_all_new_records(url, existing_records)
        print(f"Found {len(new_records)} new contracts")

        # Combine new and existing data (new records first)
        updated_data = new_records + existing_data

        contracts_to_save = updated_data
        done_message = f"Total contracts after update: {len(updated_data)}"

    else:
        print("No existing data found. Performing full scrape...")
        all_data = scrape_all_contracts(url)

        contracts_to_save = all_data
        done_message = f"Saved {len(all_data)} contracts"

# Write to a temporary sibling file and swap it into place, so an interrupted
# or failed dump cannot leave the only copy of the data truncated.
tmp_file = all_contracts_file + '.tmp'
with open(tmp_file, 'w') as write_file:
    json.dump(contracts_to_save, write_file)
os.replace(tmp_file, all_contracts_file)

print(done_message)

Incremental mode: Scanning for new contracts across all pages...
Successfully loaded 181000 records from ../../data/raw/indiana_contracts.json
Found 180445 unique records
Scanning all pages for new contracts...
Total pages to scan: 194
Page 1/194: Found 5 new records
Page 2/194: Found 29 new records
Page 3/194: Found 69 new records
Page 4/194: Found 200 new records
Page 5/194: Found 3 new records
Page 6/194: Found 1 new records
Page 7/194: Found 3 new records
Page 8/194: Found 110 new records
Page 9/194: Found 8 new records
Page 10/194: Found 19 new records
Page 11/194: Found 7 new records
Page 12/194: Found 31 new records
Page 13/194: Found 232 new records
Page 14/194: Found 435 new records
Page 15/194: Found 142 new records
Page 16/194: Found 70 new records
Page 17/194: Found 19 new records
Page 18/194: Found 28 new records
Page 19/194: Found 21 new records
Page 20/194: Found 20 new records
Page 21/194: Found 46 new records
Page 22/194: Found 88 new records
Page 23/194: Found 70 new 

In [72]:
# Update professional services contracts
prof_services_file = '../../data/raw/indiana_prof_services_contracts.json'

# Contract type flags
# 128 = professional services
contract_type_flags = 128

# Each branch decides WHAT to save and which message to report; the file is
# then written once, at the end of the cell.
if CLOBBER:
    print("CLOBBER mode: Scraping all professional services contracts and overwriting existing file...")
    all_data = scrape_all_contracts(url, contract_type_flags=contract_type_flags)
    print(f"Scraped {len(all_data)} total professional services contracts")

    contracts_to_save = all_data
    done_message = f"Saved {len(all_data)} professional services contracts (overwrote existing file)"

else:
    print("Incremental mode: Scanning for new professional services contracts across all pages...")

    # Load existing data
    existing_data, existing_records = load_existing_contracts(prof_services_file)

    # Scrape all pages looking for new records
    if existing_records:
        print("Scanning all pages for new professional services contracts...")
        new_records = scrape_all_new_records(url, existing_records, contract_type_flags=contract_type_flags)
        print(f"Found {len(new_records)} new professional services contracts")

        # Combine new and existing data (new records first)
        updated_data = new_records + existing_data

        contracts_to_save = updated_data
        done_message = f"Total professional services contracts after update: {len(updated_data)}"

    else:
        print("No existing data found. Performing full scrape...")
        all_data = scrape_all_contracts(url, contract_type_flags=contract_type_flags)

        contracts_to_save = all_data
        done_message = f"Saved {len(all_data)} professional services contracts"

# Write to a temporary sibling file and swap it into place, so an interrupted
# or failed dump cannot leave the only copy of the data truncated.
tmp_file = prof_services_file + '.tmp'
with open(tmp_file, 'w') as write_file:
    json.dump(contracts_to_save, write_file)
os.replace(tmp_file, prof_services_file)

print(done_message)

# Other contract type flags for reference:
# 0 = all
# 1 = attorney
# 4 = grant
# 8 = lease
# 64 = MOU
# 128 = professional services

Incremental mode: Scanning for new professional services contracts across all pages...
Successfully loaded 30000 records from ../../data/raw/indiana_prof_services_contracts.json
Found 29998 unique records
Scanning all pages for new professional services contracts...
Total pages to scan: 35
Page 1/35: Found 313 new records
Page 2/35: Found 88 new records
Page 3/35: Found 125 new records
Page 4/35: Found 24 new records
Page 5/35: Found 18 new records
Page 6/35: Found 594 new records
Page 7/35: Found 164 new records
Page 8/35: Found 408 new records
Page 9/35: Found 130 new records
Page 10/35: Found 5 new records
Page 11/35: Found 194 new records
Page 12/35: Found 30 new records
Page 13/35: Found 82 new records
Page 14/35: Found 54 new records
Page 15/35: Found 122 new records
Page 16/35: Found 137 new records
Page 17/35: Found 72 new records
Page 18/35: Found 166 new records
Page 19/35: Found 115 new records
Page 20/35: Found 46 new records
Page 21/35: Found 0 new records
Page 22/35: Foun