In [1]:
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

import folium
import matplotlib.colors as colors
import matplotlib.cm as cm
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.cluster import KMeans
import requests
import pandas as pd

%matplotlib inline 

In [2]:
# Download the population-density page and parse it into a BeautifulSoup tree.
# The timeout keeps this cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() stops here on an HTTP error instead of
# silently parsing an error page.
response = requests.get(
    'https://www.governing.com/gov-data/population-density-land-area-cities-map.html',
    timeout=30,
)
response.raise_for_status()
website_url = response.text  # NOTE: despite its name, this holds the page's HTML text
soup = BeautifulSoup(website_url, 'lxml')

In [3]:
# Locate the <table> element that carries the "dataTable" CSS class.
my_table = soup.find("table", class_="dataTable")

In [4]:
def convert_to_list(bs4row):
    """Return the stripped text of every cell found in one table row.

    Parameters
    ----------
    bs4row : bs4.element.Tag
        A row element; all of its ``td`` and ``th`` elements are read.

    Returns
    -------
    list of str
        One whitespace-stripped string per cell, in the order ``find_all``
        yields them. Empty when the row contains no cells.
    """
    # find_all is the current method name; findAll is only the legacy
    # BeautifulSoup 3 spelling of the same call.
    cells = bs4row.find_all(["td", "th"])
    return [cell.get_text().strip() for cell in cells]

In [5]:
# Collect every <tr> of the scraped table. find_all is the current method
# name; findAll is only the legacy BeautifulSoup 3 spelling of the same call.
rows = my_table.find_all("tr")

# The first row holds the column titles ...
header = convert_to_list(rows[0])
# ... and each remaining row becomes one list of cell strings.
my_data = [convert_to_list(row) for row in rows[1:]]

In [6]:
def _to_int(text):
    """Parse an integer written with thousands separators, e.g. '8,537,673'."""
    return int(text.replace(',', ''))


# Build the frame from the scraped rows, then turn every column after 'City'
# from comma-formatted strings into integers.
df = pd.DataFrame(
    my_data,
    columns=['City', 'Population_Density', 'Population', 'Land_Area'],
)
for col in df.columns[1:]:
    df[col] = df[col].map(_to_int)

df.head()

Unnamed: 0,City,Population_Density,Population,Land_Area
0,"New York, New York",28211,8537673,303
1,"Los Angeles, California",8484,3976322,469
2,"Chicago, Illinois",11883,2704958,228
3,"Houston, Texas",3842,2303482,600
4,"Phoenix, Arizona",3126,1615017,517


In [7]:
def get_latitude_longitude(city, max_retries=3, retry_delay=1.0, verbose=True):
    """Geocode a place name with Nominatim and return its coordinates.

    Parameters
    ----------
    city : str
        Free-form query passed to ``Nominatim.geocode``
        (e.g. ``"Chicago, Illinois"``).
    max_retries : int, optional
        Number of additional attempts made after a ``GeocoderTimedOut``.
        Negative values are treated as 0.
    retry_delay : float, optional
        Seconds to wait before each retry; 0 disables the pause.
    verbose : bool, optional
        When true, print the query once it has been resolved.

    Returns
    -------
    tuple of (float, float)
        ``(latitude, longitude)`` of the match. ``(nan, nan)`` when the
        geocoder finds nothing, or when every attempt timed out.
    """
    import time  # local import: only needed for the pause between retries

    geolocator = Nominatim(user_agent="my-application")

    # Bounded retry loop. The previous version retried by calling itself with
    # no limit, so a persistent timeout ended in RecursionError.
    for attempt in range(max(max_retries, 0) + 1):
        try:
            location = geolocator.geocode(city)
        except GeocoderTimedOut:
            if attempt < max_retries and retry_delay > 0:
                time.sleep(retry_delay)
            continue

        if location is None:
            break
        if verbose:
            print(city)
        return location.latitude, location.longitude

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return np.nan, np.nan

In [8]:
# Get the latitude and longitude for each city
df['Latitude'],df['Longitude'] = zip(*df['City'].apply(get_latitude_longitude))

# Remove city without latitude and longitude
df.dropna(subset=['Latitude', 'Longitude'], inplace=True)
df.shape

New York, New York
Los Angeles, California
Chicago, Illinois
Houston, Texas
Phoenix, Arizona
Philadelphia, Pennsylvania
San Antonio, Texas
San Diego, California
Dallas, Texas
San Jose, California
Austin, Texas
Jacksonville, Florida
San Francisco, California
Columbus, Ohio
Indianapolis, Indiana
Fort Worth, Texas
Charlotte, North Carolina
Seattle, Washington
Denver, Colorado
El Paso, Texas
Washington, District of Columbia
Boston, Massachusetts
Detroit, Michigan
Nashville, Tennessee
Memphis, Tennessee
Portland, Oregon
Oklahoma City, Oklahoma
Las Vegas, Nevada
Louisville, Kentucky
Baltimore, Maryland
Milwaukee, Wisconsin
Albuquerque, New Mexico
Tucson, Arizona
Fresno, California
Sacramento, California
Mesa, Arizona
Kansas City, Missouri
Atlanta, Georgia
Long Beach, California
Colorado Springs, Colorado
Raleigh, North Carolina
Miami, Florida
Virginia Beach, Virginia
Omaha, Nebraska
Oakland, California
Minneapolis, Minnesota
Tulsa, Oklahoma
Arlington, Texas
New Orleans, Louisiana
Wichita, Ka

Longview, Texas
Medford, Oregon
Warwick, Rhode Island
Westland, Michigan
Somerville, Massachusetts
Melbourne, Florida
Lakewood, California
Farmington Hills, Michigan
Cranston, Rhode Island
Kennewick, Washington
Mountain View, California
Napa, California
Tustin, California
Lynchburg, Virginia
Lawrence, Massachusetts
Deerfield Beach, Florida
Brooklyn Park, Minnesota
New Rochelle, New York
Parma, Ohio
Alameda, California
Chino Hills, California
Springdale, Arkansas
Bloomington, Illinois
Bellflower, California
Racine, Wisconsin
Milpitas, California
Auburn, Washington
Pharr, Texas
Scranton, Pennsylvania
Folsom, California
Goodyear, Arizona
Plymouth, Minnesota
Fort Myers, Florida
Hammond, Indiana
Manteca, California
Loveland, Colorado
Lake Charles, Louisiana
Upland, California
St. Joseph, Missouri
Baldwin Park, California
Gary, Indiana
Perris, California
Baytown, Texas
Kalamazoo, Michigan
Layton, Utah
San Ramon, California
Boynton Beach, Florida
Wyoming, Michigan
Gastonia, North Carolina
Arl

(752, 6)

In [34]:
# Centre the map on whatever location the geocoder returns for the query 'US'.
US_lat, US_long = get_latitude_longitude('US')
map_US = folium.Map(location=[US_lat, US_long], zoom_start=4)

# Colour scale: population density, normalised between the column's min and
# max, looked up in the 'YlOrRd' colormap.
# plt.get_cmap replaces plt.cm.get_cmap (i.e. matplotlib.cm.get_cmap), which
# was deprecated and then removed in newer matplotlib releases.
cmap = plt.get_cmap('YlOrRd')
norm = colors.Normalize(vmin=df['Population_Density'].min(), vmax=df['Population_Density'].max())

# Never filled in this cell; binding kept in case other cells reference it.
markers_colors = []

# One circle marker per city, coloured by density; the popup shows the city
# name and its density.
for lat, lon, poi, den in zip(df['Latitude'], df['Longitude'], df['City'], df['Population_Density']):
    label = folium.Popup(str(poi) + '\n' + str(den), parse_html=True)
    marker_color = colors.rgb2hex(cmap(norm(den)))  # computed once, used for outline and fill
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill_color=marker_color,
        fill_opacity=0.3).add_to(map_US)

map_US

US


In [23]:
# Scratch check of the colour pipeline: shows the RGBA tuple for `den`.
# `den` is the loop variable left over from the marker loop in the map cell,
# so this cell only works after that cell has been run.
density_fraction = norm(den)
cmap(density_fraction)

(0.403921568627451, 0.0, 0.05098039215686274, 1.0)

In [28]:
# Scratch check: hex colour string for `den`, the loop variable left over from
# the marker loop in the map cell (that cell must have been run first).
rgba = cmap(norm(den))
colors.rgb2hex(rgba)

'#67000d'