In [16]:
# Import what's available
from fbref_match_downloader import quick_download, FBRefMatchDownloader
import os

# Your match IDs
match_ids = [
    'f8177893',
    '7ac1c0c8',
    '83edc9ff',
    'd0426a07',
    'b9cf7980'
]

# Check what's already downloaded (if any)
downloaded = []
if os.path.exists('match_html_files'):
    downloaded = [f.replace('match_', '').replace('.html', '') 
                  for f in os.listdir('match_html_files') 
                  if f.startswith('match_') and f.endswith('.html')]
    print(f"Already downloaded: {downloaded}")

# Filter to only download missing matches
remaining_matches = [m for m in match_ids if m not in downloaded]
print(f"\nTotal matches: {len(match_ids)}")
print(f"Already downloaded: {len(downloaded)}")
print(f"Remaining to download: {len(remaining_matches)}")
print(f"Match IDs to download: {remaining_matches}")

# Download with anti-blocking measures
if remaining_matches:
    print("\nStarting download with anti-blocking measures...")
    print("(6-12 second delays, break every 2 downloads)")
    
    results = quick_download(
        remaining_matches,
        min_delay=6.0,      # 6-12 seconds between requests
        max_delay=12.0,
        batch_size=2        # Break every 2 downloads
    )
    
    # Show results
    print("\n" + "="*50)
    successful = sum(1 for r in results if r['status'] == 'success')
    failed = sum(1 for r in results if r['status'] != 'success')
    print(f"Download complete!")
    print(f"✓ Successful: {successful}")
    print(f"✗ Failed: {failed}")
    
    # Show individual results
    print("\nDetailed results:")
    for r in results:
        status_icon = "✓" if r['status'] == 'success' else "✗"
        print(f"  {status_icon} {r['match_id']}: {r['status']}")
else:
    print("\nAll matches already downloaded! Nothing to do.")

Already downloaded: ['7284c984', '6a57c82e', 'd0426a07', '8883ea79', '81481f61', 'fb569f13', '83edc9ff', '5c187984', 'eb172ca3', 'd5615e5b', '064fab50']

Total matches: 5
Already downloaded: 11
Remaining to download: 3
Match IDs to download: ['f8177893', '7ac1c0c8', 'b9cf7980']

Starting download with anti-blocking measures...
(6-12 second delays, break every 2 downloads)


TypeError: quick_download() got an unexpected keyword argument 'min_delay'

In [17]:
# Import and check the function signature
from fbref_match_downloader import quick_download, FBRefMatchDownloader
import os
import inspect

# Check what parameters quick_download accepts
print("quick_download function signature:")
print(inspect.signature(quick_download))
print("\n" + "="*50 + "\n")

# Your match IDs
match_ids = [
    'f8177893',
    '7ac1c0c8',
    '83edc9ff',
    'd0426a07',
    'b9cf7980'
]

# Check what's already downloaded
downloaded = []
if os.path.exists('match_html_files'):
    downloaded = [f.replace('match_', '').replace('.html', '') 
                  for f in os.listdir('match_html_files') 
                  if f.startswith('match_') and f.endswith('.html')]

# Filter remaining matches
remaining_matches = [m for m in match_ids if m not in downloaded]
print(f"Remaining to download: {remaining_matches}")

# Option 1: Try quick_download with just the match_ids
if remaining_matches:
    try:
        print("\nAttempting download with quick_download...")
        results = quick_download(remaining_matches)
    except Exception as e:
        print(f"Error with quick_download: {e}")
        
        # Option 2: Use FBRefMatchDownloader directly
        print("\nUsing FBRefMatchDownloader directly...")
        downloader = FBRefMatchDownloader(
            min_delay=6.0,
            max_delay=12.0,
            batch_size=2
        )
        results = downloader.download_multiple_matches(remaining_matches)
    
    # Show results
    print("\n" + "="*50)
    successful = sum(1 for r in results if r['status'] == 'success')
    failed = sum(1 for r in results if r['status'] != 'success')
    print(f"✓ Successful: {successful}")
    print(f"✗ Failed: {failed}")
    
    for r in results:
        status_icon = "✓" if r['status'] == 'success' else "✗"
        print(f"  {status_icon} {r['match_id']}: {r['status']}")
else:
    print("\nAll matches already downloaded!")

quick_download function signature:
(match_ids: List[str], use_selenium: bool = False, extract_tables: bool = False)


Remaining to download: ['f8177893', '7ac1c0c8', 'b9cf7980']

Attempting download with quick_download...
Starting download of 3 matches...
Files will be saved to: /Users/thomasmcmillan/projects/nwsl_data/notebooks/match_html_files/
Download method: BeautifulSoup with Selenium fallback
--------------------------------------------------

[1/3] Downloading match f8177893...
Detected dynamic content, switching to Selenium...
Using Selenium for dynamic content...
✓ Successfully saved: match_f8177893.html (method: selenium_dynamic)

[2/3] Downloading match 7ac1c0c8...
Detected dynamic content, switching to Selenium...
Using Selenium for dynamic content...
Selenium download error: HTTPConnectionPool(host='localhost', port=52448): Read timed out. (read timeout=120)

[3/3] Downloading match b9cf7980...
Detected dynamic content, switching to Selenium...
Using Selenium for dynamic 

In [20]:
from fbref_match_downloader import quick_download
import os

# Your match IDs
match_ids = [
    '1a01081c',
    'b9cf7980',
    '0e4932ff',
    '12c17fb7',
    '8640ac6f',
    '2d03d5bd'
]


# Check what's already downloaded
downloaded = []
if os.path.exists('match_html_files'):
    downloaded = [f.replace('match_', '').replace('.html', '') 
                  for f in os.listdir('match_html_files') 
                  if f.startswith('match_') and f.endswith('.html')]
    print(f"Already downloaded: {len(downloaded)} matches")
    print(f"Downloaded: {downloaded}")

# Filter remaining matches
remaining_matches = [m for m in match_ids if m not in downloaded]
print(f"\nRemaining to download: {remaining_matches}")
print(f"Number of matches to download: {len(remaining_matches)}")

# Download with Selenium and table extraction
if remaining_matches:
    print("\n" + "="*50)
    print("Starting download with:")
    print("- Selenium (FORCED)")
    print("- Table extraction (ENABLED)")
    print("="*50 + "\n")
    
    # Use quick_download with Selenium and table extraction
    results = quick_download(
        remaining_matches,
        use_selenium=True,      # Force Selenium
        extract_tables=True     # Extract tables
    )
    
    # Show results
    print("\n" + "="*50)
    print("Download complete!")
    
    # Count successes
    if results:
        successful = sum(1 for r in results if r.get('status') == 'success')
        failed = len(results) - successful
        print(f"✓ Successful: {successful}")
        print(f"✗ Failed: {failed}")
        
        # Show details
        print("\nDetailed results:")
        for r in results:
            status = r.get('status', 'unknown')
            match_id = r.get('match_id', 'unknown')
            status_icon = "✓" if status == 'success' else "✗"
            print(f"  {status_icon} {match_id}: {status}")
            if status == 'success' and 'tables_extracted' in r:
                print(f"     Tables extracted: {len(r['tables_extracted'])}")
else:
    print("\nAll matches already downloaded!")

print("\n" + "="*50)
print("✓ HTML files saved in: match_html_files/")
print("✓ CSV tables saved in: match_html_files/tables/[match_id]/")

Already downloaded: 14 matches
Downloaded: ['7284c984', 'b9cf7980', '6a57c82e', 'f8177893', 'd0426a07', '8883ea79', '81481f61', 'fb569f13', '7ac1c0c8', '83edc9ff', '5c187984', 'eb172ca3', 'd5615e5b', '064fab50']

Remaining to download: ['1a01081c', '0e4932ff', '12c17fb7', '8640ac6f', '2d03d5bd']
Number of matches to download: 5

Starting download with:
- Selenium (FORCED)
- Table extraction (ENABLED)

Starting download of 5 matches...
Files will be saved to: /Users/thomasmcmillan/projects/nwsl_data/notebooks/match_html_files/
Download method: Selenium (forced)
--------------------------------------------------

[1/5] Downloading match 1a01081c...
Using Selenium for dynamic content...
✓ Successfully saved: match_1a01081c.html (method: selenium)
  Extracted table: stats_8e306dc6_summary
  Extracted table: keeper_stats_8e306dc6
  Extracted table: stats_257fad2b_summary
  Extracted table: keeper_stats_257fad2b

[2/5] Downloading match 0e4932ff...
Using Selenium for dynamic content...
Selen

In [21]:
from fbref_match_downloader import FBRefMatchDownloader
import os
import time

# Match IDs that failed
failed_matches = ['0e4932ff', '12c17fb7', '8640ac6f', '2d03d5bd']

print(f"Retrying {len(failed_matches)} failed matches...")
print("Using fresh Selenium instance for each match\n")

# Download each match individually with its own driver
results = []
for i, match_id in enumerate(failed_matches, 1):
    print(f"[{i}/{len(failed_matches)}] Downloading {match_id}...")
    
    try:
        # Create a fresh downloader for each match
        downloader = FBRefMatchDownloader(use_selenium=True)
        
        # Download single match
        result = downloader.download_match_html(
            match_id,
            output_dir='match_html_files',
            extract_tables=True
        )
        
        results.append(result)
        
        # Clean up driver immediately
        if hasattr(downloader, 'driver') and downloader.driver:
            downloader.driver.quit()
        
        if result['status'] == 'success':
            print(f"✓ Success! Tables extracted: {len(result.get('tables_extracted', []))}")
        else:
            print(f"✗ Failed: {result.get('error', 'Unknown error')}")
        
        # Wait between downloads
        if i < len(failed_matches):
            wait_time = 10
            print(f"Waiting {wait_time} seconds before next download...\n")
            time.sleep(wait_time)
            
    except Exception as e:
        print(f"✗ Exception: {e}")
        results.append({'match_id': match_id, 'status': 'error', 'error': str(e)})

# Summary
print("\n" + "="*50)
successful = sum(1 for r in results if r.get('status') == 'success')
print(f"Retry complete!")
print(f"✓ Successful: {successful}/{len(failed_matches)}")
print(f"✗ Failed: {len(failed_matches) - successful}/{len(failed_matches)}")

Retrying 4 failed matches...
Using fresh Selenium instance for each match

[1/4] Downloading 0e4932ff...
Downloading match 0e4932ff...
Using Selenium for dynamic content...
✓ Successfully saved: match_0e4932ff.html (method: selenium)
  Extracted table: stats_5f911568_summary
  Extracted table: keeper_stats_5f911568
  Extracted table: stats_64c362f2_summary
  Extracted table: keeper_stats_64c362f2
✓ Success! Tables extracted: 4
Waiting 10 seconds before next download...

[2/4] Downloading 12c17fb7...
Downloading match 12c17fb7...
Using Selenium for dynamic content...
✓ Successfully saved: match_12c17fb7.html (method: selenium)
  Extracted table: stats_d976a235_summary
  Extracted table: keeper_stats_d976a235
  Extracted table: stats_df9a10a1_summary
  Extracted table: keeper_stats_df9a10a1
✓ Success! Tables extracted: 4
Waiting 10 seconds before next download...

[3/4] Downloading 8640ac6f...
Downloading match 8640ac6f...
Using Selenium for dynamic content...
✓ Successfully saved: match

In [None]:
from fbref_match_downloader import FBRefMatchDownloader
import os
import time
from datetime import datetime

def download_match_batch(match_ids, batch_name="Batch"):
    """
    Download a batch of matches with Selenium, skipping already downloaded ones.
    """
    # Check what's already downloaded
    downloaded = []
    if os.path.exists('match_html_files'):
        downloaded = [f.replace('match_', '').replace('.html', '') 
                      for f in os.listdir('match_html_files') 
                      if f.startswith('match_') and f.endswith('.html')]
    
    # Filter to only new matches
    new_matches = [m for m in match_ids if m not in downloaded]
    skipped = [m for m in match_ids if m in downloaded]
    
    print(f"\n{'='*60}")
    print(f"{batch_name} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")
    print(f"Total matches provided: {len(match_ids)}")
    print(f"Already downloaded (skipping): {len(skipped)}")
    print(f"New matches to download: {len(new_matches)}")
    
    if not new_matches:
        print("\n✓ All matches already downloaded! Nothing to do.")
        return []
    
    print(f"\nDownloading: {', '.join(new_matches)}")
    print("Using fresh Selenium instance for each match\n")
    
    # Download each match
    results = []
    start_time = time.time()
    
    for i, match_id in enumerate(new_matches, 1):
        print(f"[{i}/{len(new_matches)}] Downloading {match_id}...")
        
        try:
            downloader = FBRefMatchDownloader(use_selenium=True)
            result = downloader.download_match_html(
                match_id,
                output_dir='match_html_files',
                extract_tables=True
            )
            results.append(result)
            
            if hasattr(downloader, 'driver') and downloader.driver:
                downloader.driver.quit()
            
            if result['status'] == 'success':
                print(f"✓ Success! Tables extracted: {len(result.get('tables_extracted', []))}")
            else:
                print(f"✗ Failed: {result.get('error', 'Unknown error')}")
            
            if i < len(new_matches):
                print(f"Waiting 10 seconds...\n")
                time.sleep(10)
                
        except Exception as e:
            print(f"✗ Exception: {e}")
            results.append({'match_id': match_id, 'status': 'error', 'error': str(e)})
    
    # Summary
    elapsed = time.time() - start_time
    successful = sum(1 for r in results if r.get('status') == 'success')
    
    print(f"\n{'='*60}")
    print(f"Batch complete! Time: {int(elapsed//60)}m {int(elapsed%60)}s")
    print(f"✓ Successful: {successful}/{len(new_matches)}")
    print(f"✗ Failed: {len(new_matches) - successful}/{len(new_matches)}")
    print('='*60)
    
    return results

# YOUR MATCH IDs HERE - Change these to whatever matches you want



# Run the download
if match_ids:
    results = download_match_batch(match_ids, "My Batch")
else:
    print("Please add match IDs to the list above!")


My Batch - 2025-07-27 20:01:16
Total matches provided: 99
Already downloaded (skipping): 0
New matches to download: 99

Downloading: f881e2c1, 60f738e9, c472965f, 5d762423, 478694f8, c6e91034, 6d73cf64, 520134f4, 170d1dda, c78c16a1, 9dcf496b, 418e0e31, 3c2a99b1, e4c6e5a2, e38fdb0f, db1b8928, f7ec5018, e01f2ab2, 85ea774d, 611eb468, 1b276643, 3f955256, 4874cbfb, 60713322, f13fd918, 29994120, 6c10cfdb, 3b6c58de, 0ab262a8, d8f2bb6c, 4ccba30d, b2f4ab2b, 541a43ed, 53d69588, fb3fee9e, d68ba0d9, 070a85c6, 0ac89923, 55840597, 0438ac6f, 5cbe3ea6, 2124d8de, 6c1960fe, ad4385ca, ca6b1e40, 9eae33e6, 0349e876, f4e707e7, bd617704, b3536acf, a8e5bb48, 1a84f24d, bd7ec044, 7f3417a9, caadbdc9, 7439eff5, 55934b05, 6c066abf, 0e7fddc2, 45ea9030, 61cc1e00, 565b1da5, 2d678cba, 889e323c, f5582e60, 502ff272, 5c3b3b7a, 2bda25a1, 5ff5843f, 7aa0733c, dc4da4e7, 88db5cc5, decc5784, 1cf24c9d, bc51c892, 50bd521d, 7239cc9a, 1bb660c2, ade5efdc, 573e3558, 6c0afe0b, fd03e664, 99b29ebc, 6fd000ec, 708f6bbe, 707d1ba5, 252a6a

In [None]:
from fbref_match_downloader import FBRefMatchDownloader
import os
import time
from datetime import datetime

def download_match_batch(match_ids, batch_name="Batch"):
    """
    Download a batch of matches with Selenium, skipping already downloaded ones.
    """
    # Check what's already downloaded
    downloaded = []
    if os.path.exists('match_html_files'):
        downloaded = [f.replace('match_', '').replace('.html', '') 
                      for f in os.listdir('match_html_files') 
                      if f.startswith('match_') and f.endswith('.html')]
    
    # Filter to only new matches
    new_matches = [m for m in match_ids if m not in downloaded]
    skipped = [m for m in match_ids if m in downloaded]
    
    print(f"\n{'='*60}")
    print(f"{batch_name} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")
    print(f"Total matches provided: {len(match_ids)}")
    print(f"Already downloaded (skipping): {len(skipped)}")
    print(f"New matches to download: {len(new_matches)}")
    
    if not new_matches:
        print("\n✓ All matches already downloaded! Nothing to do.")
        return []
    
    print(f"\nDownloading: {', '.join(new_matches)}")
    print("Using fresh Selenium instance for each match\n")
    
    # Download each match
    results = []
    start_time = time.time()
    
    for i, match_id in enumerate(new_matches, 1):
        print(f"[{i}/{len(new_matches)}] Downloading {match_id}...")
        
        try:
            downloader = FBRefMatchDownloader(use_selenium=True)
            result = downloader.download_match_html(
                match_id,
                output_dir='match_html_files',
                extract_tables=True
            )
            results.append(result)
            
            if hasattr(downloader, 'driver') and downloader.driver:
                downloader.driver.quit()
            
            if result['status'] == 'success':
                print(f"✓ Success! Tables extracted: {len(result.get('tables_extracted', []))}")
            else:
                print(f"✗ Failed: {result.get('error', 'Unknown error')}")
            
            if i < len(new_matches):
                print(f"Waiting 10 seconds...\n")
                time.sleep(10)
                
        except Exception as e:
            print(f"✗ Exception: {e}")
            results.append({'match_id': match_id, 'status': 'error', 'error': str(e)})
    
    # Summary
    elapsed = time.time() - start_time
    successful = sum(1 for r in results if r.get('status') == 'success')
    
    print(f"\n{'='*60}")
    print(f"Batch complete! Time: {int(elapsed//60)}m {int(elapsed%60)}s")
    print(f"✓ Successful: {successful}/{len(new_matches)}")
    print(f"✗ Failed: {len(new_matches) - successful}/{len(new_matches)}")
    print('='*60)
    
    return results

# YOUR MATCH IDs HERE - Change these to whatever matches you want






# Run the download
if match_ids:
    results = download_match_batch(match_ids, "My Batch")
else:
    print("Please add match IDs to the list above!")


My Batch - 2025-07-27 21:16:35
Total matches provided: 200
Already downloaded (skipping): 0
New matches to download: 200

Downloading: cfce4a7e, 60c0fef2, eb7f25b6, e0a6d860, 64ae00a8, bde14f5d, ab51dc2a, a0785409, b14ef42f, aae9acb9, 5ecc20d9, 657ab5b5, 2bbf11bc, 258598f2, 9f6d6df5, 0d86f760, 2d9e0ef0, 11e0e304, 9fa14fb5, 40d29f66, 2f8861a3, beaa3276, ba8ffe87, 72856ea5, 169816e4, aae24672, 6ff06b8e, e1e5105f, 625276de, cb36f524, b8918216, 554b360b, 6245629c, 18fdb801, 83e9f6cd, 391bfb7d, 49e5362d, 34a9f7eb, e493bc1b, 78894a6f, eaf0b116, ba381725, acf83533, 5ada410d, 21aac0ca, 02765e6d, bc4d18e2, 14d766d0, d1f5bba9, 563b9572, c919bc72, 94916b48, d6daa6a9, 8aa2a6b5, 20f76f42, 9f3f16bd, 440a6ec7, 66a2ae77, 127db480, 8b6d410c, 0e9d2a77, 0df8900f, 56cb2952, 1193750f, 08f400cf, b583fc1c, 48a4cf5c, 55034c4e, 3166599e, df22842d, 2682e8f7, aacb507b, a72ceab9, 3534d09f, fb592d13, 13ecb249, 83cce59c, b9ef73c0, 04a87c26, edd08e31, 37d5d942, 4f0323c2, 2ac3cfd3, c7123f09, eb456c25, e265026c, ef2c

In [39]:
from fbref_match_downloader import FBRefMatchDownloader
import os
import time
from datetime import datetime

def download_match_batch(match_ids, batch_name="Batch"):
    """
    Download a batch of matches with Selenium, skipping already downloaded ones.
    """
    # Check what's already downloaded
    downloaded = []
    if os.path.exists('match_html_files'):
        downloaded = [f.replace('match_', '').replace('.html', '') 
                      for f in os.listdir('match_html_files') 
                      if f.startswith('match_') and f.endswith('.html')]
    
    # Filter to only new matches
    new_matches = [m for m in match_ids if m not in downloaded]
    skipped = [m for m in match_ids if m in downloaded]
    
    print(f"\n{'='*60}")
    print(f"{batch_name} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")
    print(f"Total matches provided: {len(match_ids)}")
    print(f"Already downloaded (skipping): {len(skipped)}")
    print(f"New matches to download: {len(new_matches)}")
    
    if not new_matches:
        print("\n✓ All matches already downloaded! Nothing to do.")
        return []
    
    print(f"\nDownloading: {', '.join(new_matches)}")
    print("Using fresh Selenium instance for each match\n")
    
    # Download each match
    results = []
    start_time = time.time()
    
    for i, match_id in enumerate(new_matches, 1):
        print(f"[{i}/{len(new_matches)}] Downloading {match_id}...")
        
        try:
            downloader = FBRefMatchDownloader(use_selenium=True)
            result = downloader.download_match_html(
                match_id,
                output_dir='match_html_files',
                extract_tables=True
            )
            results.append(result)
            
            if hasattr(downloader, 'driver') and downloader.driver:
                downloader.driver.quit()
            
            if result['status'] == 'success':
                print(f"✓ Success! Tables extracted: {len(result.get('tables_extracted', []))}")
            else:
                print(f"✗ Failed: {result.get('error', 'Unknown error')}")
            
            if i < len(new_matches):
                print(f"Waiting 10 seconds...\n")
                time.sleep(10)
                
        except Exception as e:
            print(f"✗ Exception: {e}")
            results.append({'match_id': match_id, 'status': 'error', 'error': str(e)})
    
    # Summary
    elapsed = time.time() - start_time
    successful = sum(1 for r in results if r.get('status') == 'success')
    
    print(f"\n{'='*60}")
    print(f"Batch complete! Time: {int(elapsed//60)}m {int(elapsed%60)}s")
    print(f"✓ Successful: {successful}/{len(new_matches)}")
    print(f"✗ Failed: {len(new_matches) - successful}/{len(new_matches)}")
    print('='*60)
    
    return results

# YOUR MATCH IDs HERE - Change these to whatever matches you want
match_ids = [
    '84d6dca9', 'f36366a9', 'f542d0eb', '21526888', '55471e47', 'f7ee4334', '09657304',
    '5f19ba81', 'e1e516b9', '465459ea', '6e0ca93c', '50f8bec2', '58d26f0b', '30406f36',
    'adcabd51', 'ec411397', 'c0b6a639', '605fa6ac', 'd33ff0d8', '15f03025', '5238c761',
    '39dbe62f', 'f484f6f6', '5a7b7125', 'db9ea114', '092fe7ea', '783a65d6', 'e6091451',
    'e6d49a20', '16744a84', 'e17daaeb', '2c32f3d3', '22f04f44', 'afcd583a', '67a57f59',
    'c9ec6863', '25004f7e', '9bf95ec5', '70b9c1b6', '87083ab3', 'b66598e4', 'ff7f188e',
    '8780d6a5', 'b87d86b7', 'ae49ad75', '85ba8579', 'efddeda8', 'af3f157d', '5394bc1b',
    '36c46e0d', 'd53451cf', '86b6cc0a', '9a2b26a1', '23114faa', '37be7787', 'd26ed7f0',
    '7f398f88', 'cc8ebaaa', '0f1cb3d1', 'c79ee9c4', '4120af97', '622f898e', 'f8c8aea4',
    '6420bec8', '610a4c17', '1cef5979', '888d23a0', 'bde3da3d', 'f7b69a29', '6f3dc675',
    'ec9ceb9f', '1e61252e', 'eb81709b', '7ee309e3', 'b30b11e9', '05482155', '49094e3a',
    '745bedf3', '1520b6f5', 'b2b9405a', '0262bb35', '51020dea', '8f3fbf96', '78fac894',
    '2b75137d', 'efcbf7b7', '4f5b874c', 'dbbdb47c', '6c56c1c8', 'e133d584', '38048580',
    '125df7bb', 'f7ea6cf4', '01cdf2c9', '3e2273da', 'a0c570ff', '3903be81', '453b20ed',
    '3690a734', '073975b1', '6d4a68e6', 'e4dad184', '202faad2', '67076783', '4aa7a9c5',
    '6536d5aa', 'a6063bfa', 'e554a812', '976b8d77', 'a0d14941', 'ef3f22f7', '3b62060e',
    '6f44cb0a', '3e58ee5e', '2f8d4701', '6e16e67b', 'dd37453e', 'aa5085c0', 'b31185f6',
    '4d435ae3', 'fdd56674', '508e1cf0', '8b7900bd', '1173feeb', 'd6124086', '03f02a2d',
    'fba4e358', '1b8fd283', '2c25dcc1', '96746e28', '3653dfaf', 'f33364ee', '4a9bc623',
    'f2a8492d', 'd7245076', '731bfd8e', 'a6299d40', 'b3dca21a', '840ceaac', '3ccbf5a1',
    '9e58d38e', '6b7e06cd', '51f18293', '9f7344bb', '760a26e0', '477d8522', 'da5a0e99',
    '93804b58', '722a085f', 'b4d1565f', '26684b63', '610f69dd', '67a5da46', '74ed987d',
    'bfbad8e1', '3266a287', 'b0f251bd', '6c66219b', '705aeb44', '338f179a', '1ff4035b',
    '5edbe8f5', 'b4606770', '8ecc4297', '2dc93287', 'a172e221', 'cdde1e7a', '0550dc14',
    '2330e071', '9ea41f98', '27bba0f5', 'f7ab07b4', '81c16cad', '362bd167', '341cb0c8',
    '9a9a656b', 'afbff619', '47752024', 'eb4022c0', '640d698b', 'e7d9f27c', 'dc7cc573',
    '7f16970d', '794f4ccf', 'bffd3e4c', '24ddd8b3', 'f0a6559b', '051a8bb7', '06d86a41',
    '8d61a870', '7a7e6622', '8f77608d', '1d2f9fad', 'cd30a848', '7f8b4663', 'f6d763ba',
    'b67be4fe', 'e97c6f39', '3f04a185', '73ff619b', '57b30c8a', '2a0b287c', '7837616f', 
    'e0ef83c8', '888fb121', '119eeed3', '92ccc792', 'bf8c7152',
    '072e422f', '2e23bf3e', '14b12758', '78660c17', '76f87d7e', '6ba30741', 'c1330700', 'b34794ed',
    'b23322de', '9733a93c', 'a5d7e67a', '28529cef', 'be723b18', '2184074e', 'adffaaa7', '0ec1b5bb',
    'fb5d227f', 'b161f09c', '5a27b398', 'c19e2edc', '55fd0afd', '7d903802', '1011f557', '8acfa339',
    '74666aca', 'b818bb1d', '671d9115', '5648dd0c', '0f623827', '9f0e821c', '5dceef91', 'f29a7ef1',
    '481dcee5', '2cd4ec75', '10f108c3', '40d971e5', 'db88df87', '3548ce5a', '4ce881eb', 'ab0a3b8e',
    'f8ee8767', '9b3974b4', 'fa5f28b6', '08a226bf', '753a54ab', '151a8c6c', '138422ad', '986e95dc',
    '2a4f95fc', 'c3692424', '29064988', 'c17893bb', 'b2fdb217', '9a7e9c60', '80ff5d48', '57127901',
    'f832fee9', '4d241e61', 'e45b356c', '5c64fc50', 'a81c893d', 'b4cacaf2', '2b4a0943', 'fe50bdd7',
    'd11a6afc', '242d98cb', '800bc864', '2a7aad78', '4ebf1b38', 'b8c01225', '81e7d577', '8dca4022',
    'be46a00f', '714ba654', '2f020390', 'aff251d2', '3ea31ead', '16e23f3c', '8d282de4', '8eb3b0dd',
    '0bfe8e38', '7fc53fb8', '770ec49c', 'fea90630', 'd6191f62', '509b89aa', '165448bc', '37d40689',
    '0bbd1959', '84de42bc', 'd887c826', '94bcf409', '331b8375', 'b2c4fa2c', '343ae440', 'a6d4d95a',
    'b599c7e0', '6b5d492c', '5d320b9e', '74f63303', 'e8cf14bf', '3f95b1ba', 'fc2a3172', '5f985550',
    'dc4570b2', 'b7041740', '2b50579e', 'e62b6c8a', 'c364a795', 'b5eddd77', '9d5e9ab5', '1e851026',
    '500515e8', '14675a38', 'cc66a2f1', '176d858d', '94d3e20f', 'b9817679', 'bd3531f7', '725b6b2b',
    '441fc804', '9211e53e', 'd7eebcbd', '78069ae6', '17f37cd6', 'e94ca5b6', '61e4bfa2', 'b1027ba5',
    '08777f6d', 'cfbf0a81', 'b59b53ed', '30b89b79', 'b13fe91a', '44d9659f', 'f1b15f77', '75882c0d',
    'eca58f62', '74c8dfe3', 'fb385410', '49ea5dd8', '4fae253c', 'daa14142', '195b92b0', '256b881c',
    '20b32e92', 'c97a3548', '79ca1db6', '3024087b', '913891bb', 'cadc8b84', '4dd9f586', '290323ab',
    '4ba0e2d8', '72276317', '73ee3add', '96c9139c', '661be992', '1bc47be5', '91d6f674', 'a9c03493',
    '7ae48fd2', '76069ab3', '4d71b963', 'f7ef2eb5', 'c71e9b2c', '48487630', 'f1d3f350', '041bd193',
    'bdf986b5', '2ae40c48', '4b446ebc', 'cbf55073', '6b40342d', 'c21023d3', '8cbb2226', '738c8155',
    '3263eda1', 'fa7dd928', 'f86f4464', '78dfd534', '6d2bfcb4', 'd8723f81', 'aa2740ad', 'd03a38f0',
    'a5c17409', '29fe6e81', '30a56302', 'ecf1eace', 'c4cf4c77', 'd82ff5bb', 'f811c8da', '6d4482cd',
    'abcac375', '07c4cb97', '57a8dbcd', '760ee362', '49c13c3e', '00c0613d', '551050e9', '2f262298',
    '7033f01f', '7ba32c5e', '9dd1cfff', 'a3f62efc', 'c19cffe0', '2e0f3783', 'a7979881', '905f3edd',
    '7f334b6a', 'b3da318b', 'd307f386', 'e15b2aa1', 'bd8e3769', 'de8dd876', '23b85cf2', '1890f4e1',
    'a3875ee5', 'd2ace308', '5ac75852', '92eadf30', '196ef6a6', '9cd9f193', '71ea0b36', '050beb70',
    '405bcfe5', '1cdd8ceb', '0a49c452', '9a19607b', 'd81140c7', '3aaa5674', '493fdd35', '865cc949',
    '00d409f0', 'fa678a5b', 'fa5fc923', '3134af50', '741b0176', '90d46a63', '5bae48a8', '2a3907da',
    '9739f8b2', '954dc444', '01c06596', '092880c6', '86ff8eb0', 'ba56fd29', '9f485801', 'a11b4b90',
    '82e19107', 'fd132aad', 'a6f87cc7', '13f3f51b', '2b781d4b', '22e27112', '06ab0c2f', '3b78b4e3',
    'f7b7339e', '6d1ac525', '65e106ba', 'b3167d42', '75f0fb1c', 'a91bbbb0', 'aad3923c', '42244271',
    'd2a41760', '3e4549a4', '7d44ebcb', '0b2deba6', '9d3550a1', '1f7f7194', '2716932b', 'c73c9a6b',
    'f4bf2d71', '0d9dc005', '804f5ac8', '1fe8fd04', 'a41e3360', '18ca8cc7', '5a35c06d', 'b3e380c7',
    'f6dedb09', 'aff50b31', '2bbe892a', 'b4372c91', 'e9143a40', 'de185b12', 'e967af56', '90ce10b5',
    'dc2606b9', '01fa4544', '7b014660', '0000be22', '7652f3ff', 'b85f2b59', 'd10405b3', '063be4e2',
    '254dd7b3', '4ca190d3', '64aa1dab', '4daf6a3e', '119e0735', '6bb1dcb0', 'e6cd7493', '570d927f',
    '77024d19', '444a7265', '36a183f3', '1b3fed3f', '76a963e6', '1a60c64d', '038d5f72', 'dfcc80f1',
    'af9e6aa0', '973b8702', 'ecdb0756', 'f048e50d', '58f01846', 'f862299e', 'f6c8d2fc', '5c41b080',
    '2c708d10', '42af74d7', '1c170723', '0b33d8bf', '7b1d9e98', '7cdb0bb5', '12ddcc67', 'b5199fe1',
    '1a157196', '1ac67a32', '82ad937c', '62a51875', '2e307145', '76a4eaa9', '824e6399', 'e9749264',
    '27c148ec', '233953db', 'a3f1fd57', '7a39bead', '6fab8a8e', 'f200e21d', '4de26c64', '32e46224',
    '1e61e6d0', 'c5929ccf', '355edc62', '6afadaae', '9a3194b6', 'b7d732c1', '35872cab', '8137d4d4',
    'c00c0e92', '8d3c0bbb', 'cc101301', '7247e49a', '290df075', '85f65da4', '8c81b41c', '9bb6829b',
    'dcce831d', '2b8626c8', '809a38d9', 'dd811060', 'e229e7d5', 'fa838384', '4fc7c81b', '26ad3f02',
    'b785dc25', '75cae5a2', 'efd663a7', '799c6310', 'ac1441ed', '84320d5c', 'aa5de61c', '2a629ec8',
    'bb5b4d67', '9243d508', '9cd54811', '7c21b4c6', '4a41b9cc', '70633eda', '0a18ef58', 'faedb1cf',
    '800d40e3', '382cc9a5', '19671f44', '6bf69bec', 'cb2cf7c8', '231fb934', 'e094b273', 'cf8ab658',
    '87ec8077', '14d99e42', '23954a43', '873ded94', '42354974', '51c088f0', 'd2bd367f', '557b86ec',
    'ef2ef72c', '8f0b38be', '6ef26dfb', '2a94c73f', '9f9f9f44', '78a1130b', 'da63ca52', '907eb79c',
    '4d47a98b', '7aa8dbb7', '7b8a2deb', '2bd69279', 'c0e04bc8', '09f3121e', '6f5bbb39', 'caf64d89',
    '276ca70d', '9f2f3d48', 'a9fb7f83', 'ac9e20a5', '3b03fa77', '32fb4d07', '7a097c88', 'f16c843b',
    'd4d39b88', '1da7faf9', '8dea19d0', '4366d18b', '86457d5c', 'ce23a8c3', 'bd44bb43', '6042fdb4',
    '0359e2a8', '0fc70274', 'b9858345', 'cc8f4050', '02cc2db8', '32b74c42', '8777bb7e', 'f80ba5a9',
    'a37d3cfd', '11764aec', 'eca1e88a', '2efb993b', 'bcb0e7fb', '2b8f9b59', '22ee2a8e', '3b0958a3',
    'a1dc77b2', 'bb90432b', '1c672cf2', 'f88cc7e3', '22b9918f', '9d816b1c', '9467b1af', '0c66c1c4',
    '42534159', '076238c6', 'e0c9fd2f', '6546a002', '807c9e51', '63f04e4d', 'ffe3a6b6', '162e58e3',
    '5b076473', '9d04e156', 'a4476197', '24e15195', '6dc65c5d', 'f9e04c6f', '2de27dd8', 'f2f758a1',
    'd6b5ef91', '5cc2c4e6', '9d2567b5', '5a808fa8', '9ad58931', '580abedf', 'cab0661f', 'd41fc789',
    '96e0dd2c', '3075b5c0', '1670d17a', 'a7d01063', '0a54da3b', '737678f8', '65d57a30', '528bbf25',
    'd91aea7c', 'a7daf861', '15580ec7', '648bb908', '8268db6d', '233e11eb', '9f02aad1', '50db38c8',
    'dd1430c9', 'bbd9f51a', 'ffef1d9a', '38101d6b', '554d9958', '4d4843ca', '0ca01c78', '2d5641de',
    'f337712c', 'd5c6cdab', '43c5ccd9', '694bd5f6', '3be42001', 'a384fbda', 'a36e071c', 'dfb90a73',
    '65e6a4bd', 'ab5877a7', '2cd5aa58', '034925ae', '50e71049', '75606d80', 'ef84043d', '7d118693',
    'e3f72dbe', '65ed77f0', '6556c637', 'd2575048', 'b792c346', '22dc6222', '3bb46cff', '98086391',
    '28e6c7a2', '98440caf', '877d362d', '1b8a084e', 'ad773c94', 'c1cac8e6', '11c32fb2', '307d7dd2',
    'da18f85c', 'a263cad4', '5363a84e', 'cff46fe3', 'b2023bde', '8e441f7c', '7b7e3b0c', 'c52cb315',
    'f0b371c6', '1bfbde52', '5c48f9e2', '44143f63', 'd2234462', 'b7d2f49c', '213998de', '1046f207',
    'd0ef16cc', '969d88d7', '131887e6', '3a943218', 'f9e94321', '645bd1f9', 'ce065b4e', '1ea954ae',
    '116736c1', '26aaa42e', 'e191975f', '5c78295c', 'a1fcd4e5', '30ade4e0', 'b81af036', 'e08d6c4f',
    '621bc391', '402a27f7', '271bb187', '2aef3feb', 'a2773d43', '5bb56387', '58602728', 'af0d4c1d',
    '8459babf', 'b8fbf719', 'db34abb9', '376754a4', '39382fed', 'c1ec23ab', '5cdb5294', 'd823665c',
    '99f75387', '8882ca01', 'cfe0e5e2', '347fc889', 'ddb3f581', 'ab4a9a85', '6122aad9', 'ffc88eef',
    'fa4654dc', 'fada5342', '097dd81e', 'accfd699', 'e0cba9a0', '17572b77', '0bbc1d3a', 'ce4d64c6',
    '0be54aa1', '123f2632', 'bf1bed18', 'a3655313', '39ed7c23', '28b5fbd4', '0fc05a78', '3d9812c7',
    'abd75002', '8798b399', 'bcf89d77', '71d44dd8', 'e4e02187', '72b85d57', '264e1580', 'c26dbae0',
    '183a6c5b', '9ceee191', 'f55c85b3', '0c5c2895', '8863fbca', '4eb8c7b7', '7ef32f41', '6e2ba6e7',
    '96b60abd', '1cf79406', '3d0b5a1d', 'a6984338', 'a7005232', '18d35e73', '771c3b61', '1e837a99',
    '96b3955e', 'e8f52f26', '35bd2a13', '3f2e2d3f', 'cd89bcfb', 'a51234f5', '557ce1c6', '9c97389e',
    'cd6d5b4f', 'd6f7c3b8', '0e889e55', 'bfd1ee7c', '9eace85e', 'c08ebbce', '664107de', 'cb653ae5',
    '2164c947', '03f56ad5', '15d59878', '7d3be8a8', 'da8e116e', '0603e751', '73672411', '5a2a2730',
    'f87a6d18', 'd437189b', '5fef737e', '66eef21d', '70f6ed43', 'e96c892d', '87c96d20', 'd711b57e',
    'f168f5b1', '4158fb00', 'f61689bf', '7239a666', '5f6913d7', '008e301f', '45ad1c98', '0194ccbe',
    '7958f078', 'a4bd2b1e', '26889552', '0a35cfd4', 'a19ff7be', '2f0f155c', 'b7b3a204', '96890718',
    'a38568cb', '0877e051', '74349bed', '475a847a', '9ae009b7', '9e867b88', 'fb58cf7f', 'c8df78e2',
    'dd55cdd7', 'd0b68db4', '733e8be0', '3d114ae2', '031916dc', '71e1c7c8', '12b89923', 'f09b4cef',
    'ecbd2ae5', 'ba2ade47', 'd7e4972c', '0fbbc32f', '414d2972', '903440ae', '20423258', '592098e0',
    '7eff59ff', '64bc17e9', 'dd365af8', '1c087799', '8fc9e8b7', '0cf23a87', 'c0cb6fdc', '13e27d90',
    '56095b7f', 'f513c5f3', 'd19e8e40', '07c68416', '2f764197', '3329a1cc', '03442e78', '6a832d3c',
    '92952d3a', '383bbe06', '70ffc3bc', '6c8777ca', 'd2545219', '42890885', 'edb68a59', 'b2aea0ed',
    '5d73d536', 'eb55dce7', '79cbcac5', 'd174fd4f', 'b6095527', '30adae5d', '11e533b1', '39bffb5e',
    '60b53e0c', '4013be4a', '810cc4f5', '726a91eb', '230c5ecb', '60a0c5a1', '2e27ac56', 'ac197aa2',
    '11ec0554', 'fc4833e0', '30d6a0e0', 'f06ed457', '692f1ab8', '91373610', '548d019f', '612a6c4c',
    '6be469ca', '14bc0f9a', 'd40c64aa', 'c56678cf'
]

# Run the download
if match_ids:
    results = download_match_batch(match_ids, "My Batch")
else:
    print("Please add match IDs to the list above!")


My Batch - 2025-07-27 23:44:35
Total matches provided: 964
Already downloaded (skipping): 0
New matches to download: 964

Downloading: 84d6dca9, f36366a9, f542d0eb, 21526888, 55471e47, f7ee4334, 09657304, 5f19ba81, e1e516b9, 465459ea, 6e0ca93c, 50f8bec2, 58d26f0b, 30406f36, adcabd51, ec411397, c0b6a639, 605fa6ac, d33ff0d8, 15f03025, 5238c761, 39dbe62f, f484f6f6, 5a7b7125, db9ea114, 092fe7ea, 783a65d6, e6091451, e6d49a20, 16744a84, e17daaeb, 2c32f3d3, 22f04f44, afcd583a, 67a57f59, c9ec6863, 25004f7e, 9bf95ec5, 70b9c1b6, 87083ab3, b66598e4, ff7f188e, 8780d6a5, b87d86b7, ae49ad75, 85ba8579, efddeda8, af3f157d, 5394bc1b, 36c46e0d, d53451cf, 86b6cc0a, 9a2b26a1, 23114faa, 37be7787, d26ed7f0, 7f398f88, cc8ebaaa, 0f1cb3d1, c79ee9c4, 4120af97, 622f898e, f8c8aea4, 6420bec8, 610a4c17, 1cef5979, 888d23a0, bde3da3d, f7b69a29, 6f3dc675, ec9ceb9f, 1e61252e, eb81709b, 7ee309e3, b30b11e9, 05482155, 49094e3a, 745bedf3, 1520b6f5, b2b9405a, 0262bb35, 51020dea, 8f3fbf96, 78fac894, 2b75137d, efcbf7b7, 4f5b

In [2]:

import sys
sys.path.append('../scripts')
from jupyter_extract_player_stats import extract_player_stats_by_matches

# Paste your match IDs here (as many as you want)
match_ids = [
    'f9e04c6f',
    '2de27dd8',
    '9d2567b5',
    # Keep adding match IDs - paste hundreds if needed
]

# Run the extraction with production features
extract_player_stats_by_matches(
    match_ids=match_ids,
    batch_name="My Match Extraction",
    log_filename="my_matches.log"
)

2025-07-28 11:56:15,831 - INFO - Starting extraction for My Match Extraction
2025-07-28 11:56:15,832 - INFO - Logging to: /Users/thomasmcmillan/projects/nwsl_data/my_matches.log
2025-07-28 11:56:15,832 - INFO - Processing 3 match IDs: ['f9e04c6f', '2de27dd8', '9d2567b5']
2025-07-28 11:56:15,833 - INFO - Found 3 valid matches in database
2025-07-28 11:56:15,835 - INFO - Found 0 matches with existing player stats
2025-07-28 11:56:15,835 - INFO - Need to process 3 matches
2025-07-28 11:56:15,836 - INFO - My Match Extraction - 2025-07-28 11:56:15
2025-07-28 11:56:15,836 - INFO - Total matches: 3
2025-07-28 11:56:15,837 - INFO - Already extracted (skipping): 0
2025-07-28 11:56:15,837 - INFO - New matches to extract: 3
2025-07-28 11:56:15,837 - INFO - Extracting player stats from 3 matches...
2025-07-28 11:56:15,838 - INFO - Using comprehensive stats extraction with 3-12 second variable delays
2025-07-28 11:56:15,838 - INFO - Anti-blocking measures: randomized delays, extended pauses every 1

In [3]:
# Import the production-ready extraction function
import sys
sys.path.append('../scripts')
from jupyter_extract_player_stats import extract_player_stats_by_matches

# Paste your match IDs here (as many as you want)
match_ids = [
    '5b076473',
    '9d04e156',
    'a4476197',
    '24e15195',
    '6dc65c5d'
]


# Run the extraction with clean print statements and production features
extract_player_stats_by_matches(
    match_ids=match_ids,
    batch_name="My Match Extraction",
    log_filename="my_matches.log"
)

2025-07-28 11:59:55,161 - INFO - Starting extraction for My Match Extraction
2025-07-28 11:59:55,162 - INFO - Logging to: /Users/thomasmcmillan/projects/nwsl_data/my_matches.log
2025-07-28 11:59:55,162 - INFO - Processing 5 match IDs: ['5b076473', '9d04e156', 'a4476197', '24e15195', '6dc65c5d']
2025-07-28 11:59:55,164 - INFO - Found 5 valid matches in database
2025-07-28 11:59:55,165 - INFO - Found 0 matches with existing player stats
2025-07-28 11:59:55,166 - INFO - Need to process 5 matches
2025-07-28 11:59:55,166 - INFO - My Match Extraction - 2025-07-28 11:59:55
2025-07-28 11:59:55,167 - INFO - Total matches: 5
2025-07-28 11:59:55,167 - INFO - Already extracted (skipping): 0
2025-07-28 11:59:55,167 - INFO - New matches to extract: 5
2025-07-28 11:59:55,168 - INFO - Extracting player stats from 5 matches...
2025-07-28 11:59:55,168 - INFO - Using comprehensive stats extraction with 3-12 second variable delays
2025-07-28 11:59:55,168 - INFO - Anti-blocking measures: randomized delays,

In [4]:
# Import the production-ready extraction function
import sys
sys.path.append('../scripts')
from jupyter_extract_player_stats import extract_player_stats_by_matches

# Paste your match IDs here
match_ids = [
    '807c9e51',
    '6546a002',
    '63f04e4d',
    '162e58e3',
    'ffe3a6b6'
]


# Run the extraction with clean output
extract_player_stats_by_matches(
    match_ids=match_ids,
    batch_name="My Match Extraction",
    log_filename="my_matches.log"
)

2025-07-28 12:03:40,608 - INFO - Starting extraction for My Match Extraction
2025-07-28 12:03:40,610 - INFO - Logging to: /Users/thomasmcmillan/projects/nwsl_data/my_matches.log
2025-07-28 12:03:40,610 - INFO - Processing 5 match IDs: ['807c9e51', '6546a002', '63f04e4d', '162e58e3', 'ffe3a6b6']
2025-07-28 12:03:40,612 - INFO - Found 5 valid matches in database
2025-07-28 12:03:40,615 - INFO - Found 0 matches with existing player stats
2025-07-28 12:03:40,616 - INFO - Need to process 5 matches
2025-07-28 12:03:40,616 - INFO - My Match Extraction - 2025-07-28 12:03:40
2025-07-28 12:03:40,617 - INFO - Total matches: 5
2025-07-28 12:03:40,617 - INFO - Already extracted (skipping): 0
2025-07-28 12:03:40,618 - INFO - New matches to extract: 5
2025-07-28 12:03:40,618 - INFO - Extracting player stats from 5 matches...
2025-07-28 12:03:40,618 - INFO - Using comprehensive stats extraction with 3-12 second variable delays
2025-07-28 12:03:40,619 - INFO - Anti-blocking measures: randomized delays,

In [8]:
# Import the production-ready extraction function
import sys
sys.path.append('../scripts')
from jupyter_extract_player_stats import extract_player_stats_by_matches

# Paste your match IDs here
match_ids = [
    '22b9918f',
    '0c66c1c4',
    '9d816b1c',
    '9467b1af',
    '42534159',
    '076238c6',
    'e0c9fd2f'
]


# Extract player stats for these matches
extract_player_stats_by_matches(
    match_ids, 
    batch_name="Player Stats Extraction",
    log_filename="player_stats_extraction.log"
)

2025-07-28 12:15:57,111 - INFO - Starting extraction for Player Stats Extraction
2025-07-28 12:15:57,112 - INFO - Logging to: /Users/thomasmcmillan/projects/nwsl_data/player_stats_extraction.log
2025-07-28 12:15:57,113 - INFO - Processing 7 match IDs: ['22b9918f', '0c66c1c4', '9d816b1c', '9467b1af', '42534159', '076238c6', 'e0c9fd2f']
2025-07-28 12:15:57,114 - INFO - Found 7 valid matches in database
2025-07-28 12:15:57,118 - INFO - Found 0 matches with existing player stats
2025-07-28 12:15:57,119 - INFO - Need to process 7 matches
2025-07-28 12:15:57,120 - INFO - Player Stats Extraction - 2025-07-28 12:15:57
2025-07-28 12:15:57,120 - INFO - Total matches: 7
2025-07-28 12:15:57,120 - INFO - Already extracted (skipping): 0
2025-07-28 12:15:57,121 - INFO - New matches to extract: 7
2025-07-28 12:15:57,121 - INFO - Extracting player stats from 7 matches...
2025-07-28 12:15:57,121 - INFO - Using comprehensive stats extraction with 3-12 second variable delays
2025-07-28 12:15:57,122 - INFO

In [9]:
# Import the extraction function from the same directory
from jupyter_extract_player_stats import extract_player_stats_by_matches

# Paste your match IDs here
match_ids = [
    '5b076473',
    '9d04e156',
    'a4476197',
    '24e15195',
    '6dc65c5d'
]

# Extract player stats for these matches
extract_player_stats_by_matches(
    match_ids, 
    batch_name="Player Stats Extraction",
    log_filename="player_stats_extraction.log"
)

2025-07-28 12:19:30,313 - INFO - Starting extraction for Player Stats Extraction
2025-07-28 12:19:30,314 - INFO - Logging to: /Users/thomasmcmillan/projects/nwsl_data/player_stats_extraction.log
2025-07-28 12:19:30,315 - INFO - Processing 5 match IDs: ['5b076473', '9d04e156', 'a4476197', '24e15195', '6dc65c5d']
2025-07-28 12:19:30,317 - INFO - Found 5 valid matches in database
2025-07-28 12:19:30,320 - INFO - Found 5 matches with existing player stats
2025-07-28 12:19:30,321 - INFO - Need to process 0 matches
2025-07-28 12:19:30,322 - INFO - Player Stats Extraction - 2025-07-28 12:19:30
2025-07-28 12:19:30,322 - INFO - Total matches: 5
2025-07-28 12:19:30,323 - INFO - Already extracted (skipping): 5
2025-07-28 12:19:30,323 - INFO - New matches to extract: 0
2025-07-28 12:19:30,323 - INFO - ✓ All matches already extracted! Nothing to do.


In [10]:
# Suppress console logging
import logging
logging.getLogger().handlers = []

# Import the extraction function
from jupyter_extract_player_stats import extract_player_stats_by_matches

# Paste your match IDs here
match_ids = [
    '5b076473',
    '9d04e156',
    'a4476197',
    '24e15195',
    '6dc65c5d'
]

# Extract player stats for these matches
extract_player_stats_by_matches(
    match_ids, 
    batch_name="Player Stats Extraction",
    log_filename="player_stats_extraction.log"
)

2025-07-28 12:20:10,027 - INFO - Starting extraction for Player Stats Extraction
2025-07-28 12:20:10,029 - INFO - Logging to: /Users/thomasmcmillan/projects/nwsl_data/player_stats_extraction.log
2025-07-28 12:20:10,030 - INFO - Processing 5 match IDs: ['5b076473', '9d04e156', 'a4476197', '24e15195', '6dc65c5d']
2025-07-28 12:20:10,032 - INFO - Found 5 valid matches in database
2025-07-28 12:20:10,036 - INFO - Found 5 matches with existing player stats
2025-07-28 12:20:10,037 - INFO - Need to process 0 matches
2025-07-28 12:20:10,038 - INFO - Player Stats Extraction - 2025-07-28 12:20:10
2025-07-28 12:20:10,039 - INFO - Total matches: 5
2025-07-28 12:20:10,039 - INFO - Already extracted (skipping): 5
2025-07-28 12:20:10,040 - INFO - New matches to extract: 0
2025-07-28 12:20:10,041 - INFO - ✓ All matches already extracted! Nothing to do.


In [11]:
# Import the extraction function
from jupyter_extract_player_stats import extract_player_stats_by_matches

# Paste your match IDs here
match_ids = [
    '5b076473',
    '9d04e156',
    'a4476197',
    '24e15195',
    '6dc65c5d'
]

# Temporarily disable all logging to console
import logging
import sys

# Save current logging configuration
original_handlers = logging.root.handlers[:]
original_level = logging.root.level

# Clear all handlers and set level to only show CRITICAL messages
logging.root.handlers = []
logging.root.setLevel(logging.CRITICAL)

try:
    # Extract player stats for these matches
    extract_player_stats_by_matches(
        match_ids, 
        batch_name="Player Stats Extraction",
        log_filename="player_stats_extraction.log"
    )
finally:
    # Restore original logging configuration
    logging.root.handlers = original_handlers
    logging.root.setLevel(original_level)

2025-07-28 12:21:22,289 - INFO - Starting extraction for Player Stats Extraction
2025-07-28 12:21:22,291 - INFO - Logging to: /Users/thomasmcmillan/projects/nwsl_data/player_stats_extraction.log
2025-07-28 12:21:22,291 - INFO - Processing 5 match IDs: ['5b076473', '9d04e156', 'a4476197', '24e15195', '6dc65c5d']
2025-07-28 12:21:22,293 - INFO - Found 5 valid matches in database
2025-07-28 12:21:22,295 - INFO - Found 5 matches with existing player stats
2025-07-28 12:21:22,296 - INFO - Need to process 0 matches
2025-07-28 12:21:22,297 - INFO - Player Stats Extraction - 2025-07-28 12:21:22
2025-07-28 12:21:22,298 - INFO - Total matches: 5
2025-07-28 12:21:22,298 - INFO - Already extracted (skipping): 5
2025-07-28 12:21:22,299 - INFO - New matches to extract: 0
2025-07-28 12:21:22,300 - INFO - ✓ All matches already extracted! Nothing to do.


In [14]:
# Import the extraction function
from jupyter_extract_player_stats import extract_player_stats_by_matches

# Your match IDs
match_ids = [
    'cc8f4050',
    '0359e2a8',
    '02cc2db8',
    '32b74c42',
    '8777bb7e'
]


# Extract player stats
extract_player_stats_by_matches(
    match_ids, 
    batch_name="Player Stats Extraction",
    log_filename="player_stats_extraction.log"
)

2025-07-28 12:30:22,209 - INFO - Starting extraction for Player Stats Extraction
2025-07-28 12:30:22,211 - INFO - Logging to: /Users/thomasmcmillan/projects/nwsl_data/player_stats_extraction.log
2025-07-28 12:30:22,211 - INFO - Processing 5 match IDs: ['cc8f4050', '0359e2a8', '02cc2db8', '32b74c42', '8777bb7e']
2025-07-28 12:30:22,214 - INFO - Found 5 valid matches in database
2025-07-28 12:30:22,218 - INFO - Found 0 matches with existing player stats
2025-07-28 12:30:22,219 - INFO - Need to process 5 matches
2025-07-28 12:30:22,220 - INFO - Player Stats Extraction - 2025-07-28 12:30:22
2025-07-28 12:30:22,220 - INFO - Total matches: 5
2025-07-28 12:30:22,221 - INFO - Already extracted (skipping): 0
2025-07-28 12:30:22,221 - INFO - New matches to extract: 5
2025-07-28 12:30:22,222 - INFO - Extracting player stats from 5 matches...
2025-07-28 12:30:22,222 - INFO - Using comprehensive stats extraction with 3-12 second variable delays
2025-07-28 12:30:22,223 - INFO - Anti-blocking measure

In [15]:
!pip install requests



In [16]:
import requests
import json
import time
import csv
from typing import List, Dict, Tuple
import pandas as pd  # For better display in Jupyter

class WolframCityDataFetcher:
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://api.wolframalpha.com/v2/query"
        
    def fetch_city_data(self, city: str, state: str) -> Dict:
        """Fetch comprehensive data for a single city"""
        
        params = {
            'appid': self.api_key,
            'input': f"{city}, {state}",
            'format': 'plaintext',
            'output': 'JSON',
            'includepodid': 'Location:CityData,Population:CityData,Demographics:USCityData,EconomicProperties:CityData,GeographicProperties:CityData'
        }
        
        try:
            response = requests.get(self.base_url, params=params)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data for {city}, {state}: {e}")
            return None
    
    def parse_city_data(self, data: Dict) -> Dict:
        """Extract relevant information from Wolfram Alpha response"""
        
        if not data or not data.get('queryresult', {}).get('success'):
            return {"error": "Query failed"}
        
        parsed_data = {}
        pods = data['queryresult'].get('pods', [])
        
        for pod in pods:
            pod_id = pod.get('id', '')
            pod_title = pod.get('title', '')
            
            # Extract plaintext from subpods
            subpods = pod.get('subpods', [])
            if subpods and subpods[0].get('plaintext'):
                content = subpods[0]['plaintext']
                
                # Parse based on pod type
                if 'Population' in pod_id:
                    parsed_data['population'] = self._parse_population(content)
                elif 'Demographics' in pod_id:
                    parsed_data['demographics'] = self._parse_demographics(content)
                elif 'EconomicProperties' in pod_id:
                    parsed_data['economic'] = self._parse_economic(content)
                elif 'GeographicProperties' in pod_id:
                    parsed_data['geographic'] = self._parse_geographic(content)
                elif 'Location' in pod_id:
                    parsed_data['location'] = content
        
        return parsed_data
    
    def _parse_population(self, content: str) -> Dict:
        """Parse population data from plaintext"""
        data = {}
        lines = content.strip().split('\n')
        
        for line in lines:
            if '|' in line:
                parts = line.split('|')
                if len(parts) >= 2:
                    key = parts[0].strip()
                    value = parts[1].strip()
                    data[key] = value
        
        return data
    
    def _parse_demographics(self, content: str) -> Dict:
        """Parse demographic data from plaintext"""
        return {"raw": content}
    
    def _parse_economic(self, content: str) -> Dict:
        """Parse economic data from plaintext"""
        return {"raw": content}
    
    def _parse_geographic(self, content: str) -> Dict:
        """Parse geographic data from plaintext"""
        return {"raw": content}
    
    def fetch_multiple_cities(self, cities: List[Tuple[str, str]], delay: float = 1.0) -> List[Dict]:
        """Fetch data for multiple cities with rate limiting"""
        
        results = []
        
        for i, (city, state) in enumerate(cities):
            print(f"Fetching data for {city}, {state} ({i+1}/{len(cities)})...")
            
            # Fetch raw data
            raw_data = self.fetch_city_data(city, state)
            
            # Parse the data
            if raw_data:
                parsed = self.parse_city_data(raw_data)
                parsed['city'] = city
                parsed['state'] = state
                results.append(parsed)
            else:
                results.append({
                    'city': city,
                    'state': state,
                    'error': 'Failed to fetch data'
                })
            
            # Rate limiting (except for last city)
            if i < len(cities) - 1:
                time.sleep(delay)
        
        return results
    
    def save_to_csv(self, data: List[Dict], filename: str = 'city_data.csv'):
        """Save parsed data to CSV file"""
        
        if not data:
            print("No data to save")
            return
        
        # Flatten nested dictionaries for CSV
        flattened_data = []
        
        for city_data in data:
            flat_row = {
                'city': city_data.get('city'),
                'state': city_data.get('state'),
                'location': city_data.get('location', 'N/A')
            }
            
            # Add population data
            if 'population' in city_data:
                for key, value in city_data['population'].items():
                    flat_row[f'population_{key}'] = value
            
            flattened_data.append(flat_row)
        
        # Write to CSV
        if flattened_data:
            keys = set()
            for row in flattened_data:
                keys.update(row.keys())
            
            with open(filename, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=sorted(keys))
                writer.writeheader()
                writer.writerows(flattened_data)
            
            print(f"Data saved to {filename}")
    
    def save_to_json(self, data: List[Dict], filename: str = 'city_data.json'):
        """Save parsed data to JSON file"""
        
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        
        print(f"Data saved to {filename}")

In [20]:
API_KEY = "8APYYV624R"
    
# List of cities to fetch
cities = [
    ("Cary", "NC"),
]

In [21]:
# Initialize fetcher
fetcher = WolframCityDataFetcher(API_KEY)

# Fetch data for all cities
results = fetcher.fetch_multiple_cities(cities, 1.5)

print(f"\n✅ Fetched data for {len(results)} cities")
successful = sum(1 for r in results if 'error' not in r)
print(f"Successful: {successful}")
print(f"Failed: {len(results) - successful}")

Fetching data for Cary, NC (1/1)...

✅ Fetched data for 1 cities
Successful: 1
Failed: 0


In [22]:
# Display first city's data as example
if results:
    print("Sample data for first city:")
    print(json.dumps(results[0], indent=2))

Sample data for first city:
{
  "city": "Cary",
  "state": "NC"
}


In [23]:
# Create a simplified DataFrame for viewing
def flatten_for_df(results):
    rows = []
    for r in results:
        row = {
            'City': r.get('city'),
            'State': r.get('state'),
            'Location': r.get('location', 'N/A')
        }
        
        # Add population data if available
        if 'population' in r and isinstance(r['population'], dict):
            for k, v in r['population'].items():
                row[f'Pop: {k}'] = v
        
        rows.append(row)
    
    return pd.DataFrame(rows)

df = flatten_for_df(results)
display(df)  # or just 'df' in Jupyter

Unnamed: 0,City,State,Location
0,Cary,NC,


In [24]:
# Debug: Check the raw response for one city
test_city = "Cary"
test_state = "NC"

params = {
    'appid': API_KEY,
    'input': f"{test_city}, {test_state}",
    'format': 'plaintext',
    'output': 'JSON'
}

response = requests.get("https://api.wolframalpha.com/v2/query", params=params)
raw_data = response.json()

# Check if query was successful
print(f"Success: {raw_data['queryresult'].get('success')}")
print(f"Number of pods: {len(raw_data['queryresult'].get('pods', []))}")

# List all available pods
print("\nAvailable pods:")
for pod in raw_data['queryresult'].get('pods', []):
    print(f"  - {pod['title']} (ID: {pod['id']})")

Success: True
Number of pods: 21

Available pods:
  - Input interpretation (ID: Input)
  - Populations (ID: Population:CityData)
  - Location (ID: Location:CityData)
  - Local map (ID: Map:CityData)
  - Administrative regions (ID: AdministrativeRegions:CityData)
  - Current local time (ID: CurrentTime:CityData)
  - Current weather (ID: WeatherPod:CityData)
  - Demographics (ID: ACSPercentageEntrainments:ACSData)
  - Educational attainment (ID: ACSEducationEntrainments:ACSData)
  - Income statistics (ID: ACSIncomeEntrainments:ACSData)
  - Economic properties (ID: EconomicProperties:CityData)
  - Other indicators (ID: QualityOfLife:CityData)
  - Nearby cities (ID: CityHierarchyInfo:CityData)
  - Nearby airport (ID: AirportHierarchyInfo:CityData)
  - Nearby hospital (ID: HospitalHierarchyInfo:CityData)
  - Notable company headquarters (ID: CompaniesInCity:CityData)
  - Geographic properties (ID: GeographicProperties:CityData)
  - Nearby features (ID: FeaturesHierarchyInfo:CityData)
  - Ne

In [31]:
# Run this in a Jupyter cell to install required packages
!pip install requests pandas openpyxl



In [53]:
# Import the module
from wolfram_city_fetcher import fetch_city_data

# ===== CONFIGURATION =====
# Your Wolfram Alpha API key
API_KEY = "8APYYV624R"

# Enter your cities here (City, State abbreviation)
CITIES = [
    ("Washington", "D.C.")
]

# Optional: Change output file prefix (default is "city_data")
OUTPUT_PREFIX = "city_data"

# ===== RUN THE FETCHER =====
results, excel_file, json_file, csv_file = fetch_city_data(
    api_key=API_KEY,
    cities=CITIES,
    output_prefix=OUTPUT_PREFIX
)

# ===== VIEW RESULTS =====
# Display summary of results
print("\n📊 Quick Summary:")
for result in results[:5]:  # Show first 5 cities
    if 'error' not in result:
        city_name = f"{result['city']}, {result['state']}"
        pop = result.get('populations', {}).get('city population', 'N/A') if isinstance(result.get('populations'), dict) else 'N/A'
        print(f"  {city_name}: Population = {pop}")

# Optional: Load and display the summary DataFrame
import pandas as pd
df = pd.read_csv(csv_file)
print("\n📋 Summary Table:")
display(df.head())

🏙️  WOLFRAM ALPHA CITY DATA FETCHER

🚀 Starting to fetch data for 1 cities...
📋 Fields to fetch: populations, demographics, education_attainment, income_statistics, economic_properties, nearby_cities, geographic_properties

📍 Fetching data for Washington, D.C. (1/1)...
   ✅ Success! Found 7 fields: populations, demographics, education_attainment, income_statistics, economic_properties, nearby_cities, geographic_properties

📊 SUMMARY:
✅ Successfully fetched: 1 cities
❌ Failed: 0 cities

💾 Saving data...
✅ Excel report saved to: city_data.xlsx
✅ Complete JSON data saved to: city_data.json
✅ Summary CSV saved to: city_data_summary.csv

🎉 All done! Your data has been saved:
   📊 city_data.xlsx - Excel with multiple sheets
   📄 city_data.json - Complete raw data
   📋 city_data_summary.csv - Summary metrics

📊 Quick Summary:
  Washington, D.C.: Population = 689545 people (country rank: 20th) (2020 estimate)

📋 Summary Table:


Unnamed: 0,City,State,Data_Status,City_Population,Metro_Population,Urban_Population,Demo_race,Demo_Hispanic origin,Demo_US citizens,College_Degree_Plus,...,Median_Household_Income,Per_Capita_Income,Poverty_Rate,Median_Home_Price,Unemployment_Rate,Sales_Tax_Rate,Area,Elevation,Population_Density,Nearby_Cities_Count
0,Washington,D.C.,Success,689545 people (country rank: 20th) (2020 estim...,5.704 million people (Washington (DC) metro ar...,3.934 million people (Washington (DC) urban ar...,black/African American: 46.3% | white: 41.3% |...,11%,92.3%,61.5% (1.51 × national average),...,$86420.00 per year (US dollars per year) (1.38...,$56147 per year per person (US dollars per yea...,16.2% (1.21 × national average),$637400.00 (Washington (DC) metro area) (annua...,6.2% (June 2025),6% (2025 estimate),61.048 mi^2,23 ft,11295 people per square mile,3
