In [51]:
# write code to scrape the land area data from a website
import random
import re
import time
from typing import List

import requests
from bs4 import BeautifulSoup

def get_land_area(zip_code: str, timeout: float = 10.0) -> dict:
    """Scrape the land area and population of a New York ZIP code from usa.com.

    Args:
        zip_code: ZIP code to look up; it is interpolated into the
            ``http://www.usa.com/{zip_code}-ny.htm`` URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        dict: ``{"land_area": float or None, "population": int or None}``.
        ``land_area`` is the number that precedes "sq mi" in the cell next to
        the "Land Area:" label. ``population`` is the comma-grouped count that
        precedes the opening parenthesis in the cell next to the "Population"
        label. A field is ``None`` when it cannot be located or parsed; both
        are ``None`` when the HTTP request itself fails. Every failure is
        reported with ``print`` rather than raised.
    """
    url = f"http://www.usa.com/{zip_code}-ny.htm"
    try:
        # Without a timeout a single stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Stay best-effort like the parsing below: report and return empty
        # fields instead of aborting a whole batch of lookups.
        print(f"Error requesting page for ZIP code {zip_code}: {e}")
        return {"land_area": None, "population": None}
    soup = BeautifulSoup(response.text, "html.parser")

    try:
        # `string=` is the current name of the keyword formerly called `text=`.
        land_area_text = soup.find("td", string=re.compile(r"Land Area:")).find_next_sibling("td").text
        land_area = float(re.search(r"([\d.]+) sq mi", land_area_text).group(1))
    except Exception as e:
        # A missing label cell or non-matching text lands here (find/search
        # returned None), as does a number that float() rejects.
        print(f"Error fetching land area for ZIP code {zip_code}: {e}")
        land_area = None

    # Also grab the population
    # It looks like: <tr><td><b>Population</b></td><td><a href="/11235-ny-population-and-races.htm">75,622 (2010-2014)</a>, rank <a href="/rank/new-york-state--total-population--zip-code-rank.htm?hl=11235&amp;hlst=NY&amp;yr=9000">#32</a></td></tr>
    try:
        population_text = soup.find("td", string=re.compile(r"Population")).find_next_sibling("td").text
        # Strip the thousands separators before converting, e.g. "75,622" -> 75622.
        population = int(re.search(r"([\d,]+) \(", population_text).group(1).replace(",", ""))
    except Exception as e:
        print(f"Error fetching population for ZIP code {zip_code}: {e}")
        population = None

    # return both, land area and population, in a dictionary with labels
    return {"land_area": land_area, "population": population}

# Look up every ZIP code in a list, pausing for a short random interval before
# each request so the website is not hit in rapid succession.
def get_land_areas(zip_codes: List[str], min_delay: float = 0.4, max_delay: float = 0.6) -> dict:
    """Scrape land area and population for each ZIP code in ``zip_codes``.

    Args:
        zip_codes: ZIP codes to look up, processed in the order given.
        min_delay: Lower bound, in seconds, of the random pause before each request.
        max_delay: Upper bound, in seconds, of the random pause before each request.

    Returns:
        dict: Maps each ZIP code to whatever ``get_land_area`` returned for it
        (a dict with ``"land_area"`` and ``"population"`` keys). ZIP codes whose
        fields could not be scraped are still present; their fields are ``None``.
    """
    results = {}

    for zip_code in zip_codes:
        # Rate limit the requests with a randomized delay.
        time.sleep(random.uniform(min_delay, max_delay))
        # get_land_area always returns a dict, so there is nothing to filter
        # out here; missing data shows up as None inside that dict.
        results[zip_code] = get_land_area(zip_code)

    return results

# Run the scraper over the ZIP codes listed in zip_codes.txt.
# The file holds whitespace-separated ZIP codes, e.g.: 11426 11427 11428 11429
with open("zip_codes.txt", "r") as zip_file:
    # The context manager closes the file handle as soon as it has been read.
    zip_codes = zip_file.read().split()
land_areas = get_land_areas(zip_codes)

Error fetching land area for ZIP code 10048: 'NoneType' object has no attribute 'find_next_sibling'
Error fetching population for ZIP code 10048: 'NoneType' object has no attribute 'find_next_sibling'
Error fetching population for ZIP code 11359: 'NoneType' object has no attribute 'find_next_sibling'


#### Comments on ZIP codes with missing data
ZIP code 10048: World Trade Center grounds (neither land area nor population was found)

ZIP code 11359: Fort Totten Park (population was not found)

In [52]:
# Save the results into a CSV file with the columns "ZIP Code", "Land Area (sq mi)", and "Population"
import csv

# Dump the scraped values to disk: one header row, then one row per ZIP code.
with open("land_areas.csv", "w", newline="") as out_file:
    csv_writer = csv.DictWriter(out_file, fieldnames=["ZIP Code", "Land Area (sq mi)", "Population"])
    csv_writer.writeheader()
    # Feed the writer lazily; each scraped entry becomes exactly one row.
    csv_writer.writerows(
        {
            "ZIP Code": code,
            "Land Area (sq mi)": stats["land_area"],
            "Population": stats["population"],
        }
        for code, stats in land_areas.items()
    )

print("Land areas saved to land_areas.csv")

Land areas saved to land_areas.csv
