In [48]:
import os

import folium
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

%matplotlib inline 

In [2]:
# Download the population-density page. A timeout keeps the cell from hanging
# on a stalled server, and raise_for_status() fails fast on an HTTP error
# instead of silently handing an error page to the parser.
response = requests.get(
    'https://www.governing.com/gov-data/population-density-land-area-cities-map.html',
    timeout=30,
)
response.raise_for_status()

# NOTE: despite its name, `website_url` holds the page's HTML text, not a URL.
website_url = response.text
soup = BeautifulSoup(website_url,'lxml')


In [3]:
# locate the first <table> element carrying the CSS class "dataTable"
my_table = soup.find("table", class_="dataTable")

In [4]:
# convert BeautifulSoup tags to string list
def convert_to_list(bs4row):
    """Return the whitespace-stripped text of every <td>/<th> cell in a row.

    `bs4row` is any object exposing `findAll`; each element it returns must
    provide `get_text()`. The result is a plain list of strings, in the order
    the cells were returned.
    """
    cell_texts = []
    for cell in bs4row.findAll(["td","th"]):
        cell_texts.append(cell.get_text().strip())
    return cell_texts

In [5]:
# collect every <tr> of the table, header row included
rows = my_table.findAll("tr")

# the first <tr> carries the column titles
header = convert_to_list(rows[0])
header

['City',
 'Population Density (Persons/Square Mile)',
 '2016 Population',
 'Land Area (Square Miles)']

In [6]:
# every row after the header becomes one list of cell strings
my_data = list(map(convert_to_list, rows[1:]))

In [7]:
# Build the city table, replacing the verbose scraped headers with short,
# identifier-friendly column names (same column order as the scraped header).
df = pd.DataFrame(
    my_data,
    columns=['City', 'Population_Density', 'Population', 'Land_Area'],
)
df.head()

Unnamed: 0,City,Population_Density,Population,Land_Area
0,"New York, New York",28211,8537673,303
1,"Los Angeles, California",8484,3976322,469
2,"Chicago, Illinois",11883,2704958,228
3,"Houston, Texas",3842,2303482,600
4,"Phoenix, Arizona",3126,1615017,517


In [8]:
# Strip thousands separators and cast every column after 'City' to integers.
# Going through .astype(str) keeps the cell idempotent: re-running it on the
# already-converted integer columns no longer fails, whereas calling
# str.replace on an int raised AttributeError.
for col in df.columns[1:]:
    df[col] = df[col].astype(str).str.replace(',', '', regex=False).astype('int64')

In [9]:
# Keep only cities whose land area exceeds 50 (square miles, per the scraped
# 'Land Area (Square Miles)' header). The explicit .copy() makes the result an
# independent frame, so the columns added to `df` later do not trigger
# SettingWithCopyWarning.
df = df[df['Land_Area'] > 50].copy()

In [10]:
# (rows, columns) remaining after the land-area filter
df.shape

(209, 4)

In [15]:
def get_latitude_longitude(city, max_retries=3):
    """Geocode `city` with Nominatim and return ``(latitude, longitude)``.

    Parameters
    ----------
    city : str
        Free-text query handed to ``Nominatim.geocode``.
    max_retries : int, optional
        Number of additional attempts made after a ``GeocoderTimedOut``.
        The previous implementation retried through unbounded recursion,
        which ends in ``RecursionError`` when the service keeps timing out.

    Returns
    -------
    tuple of float
        ``(latitude, longitude)`` on success; ``(nan, nan)`` when the
        geocoder finds no match or every attempt timed out.
    """
    geolocator = Nominatim(user_agent="my-application")
    for _ in range(max_retries + 1):
        try:
            location = geolocator.geocode(city)
        except GeocoderTimedOut:
            # transient failure: try again until the retry budget is spent
            continue
        if location is None:
            # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0
            return np.nan, np.nan
        # progress log: only successfully geocoded queries are printed
        print(city)
        return location.latitude, location.longitude
    # every attempt timed out
    return np.nan, np.nan

In [17]:
# Geocode every city once, then attach the coordinates and drop the cities the
# geocoder could not resolve. assign()/dropna() return a new frame, so nothing
# is mutated in place (no SettingWithCopyWarning on a filtered frame), and an
# empty `df` no longer breaks on tuple unpacking of `zip(*...)`.
city_coordinates = [get_latitude_longitude(city) for city in df['City']]
df = df.assign(
    Latitude=[lat for lat, _ in city_coordinates],
    Longitude=[lng for _, lng in city_coordinates],
).dropna(subset=['Latitude', 'Longitude'])
df.shape

New York, New York
Los Angeles, California
Chicago, Illinois
Houston, Texas
Phoenix, Arizona
Philadelphia, Pennsylvania
San Antonio, Texas
San Diego, California
Dallas, Texas
San Jose, California
Austin, Texas
Jacksonville, Florida
Columbus, Ohio
Indianapolis, Indiana
Fort Worth, Texas
Charlotte, North Carolina
Seattle, Washington
Denver, Colorado
El Paso, Texas
Washington, District of Columbia
Detroit, Michigan
Nashville, Tennessee
Memphis, Tennessee
Portland, Oregon
Oklahoma City, Oklahoma
Las Vegas, Nevada
Louisville, Kentucky
Baltimore, Maryland
Milwaukee, Wisconsin
Albuquerque, New Mexico
Tucson, Arizona
Fresno, California
Sacramento, California
Mesa, Arizona
Kansas City, Missouri
Atlanta, Georgia
Colorado Springs, Colorado
Raleigh, North Carolina
Virginia Beach, Virginia
Omaha, Nebraska
Oakland, California
Minneapolis, Minnesota
Tulsa, Oklahoma
Arlington, Texas
New Orleans, Louisiana
Wichita, Kansas
Cleveland, Ohio
Tampa, Florida
Bakersfield, California
Aurora, Colorado
Corpus Ch

(208, 6)

In [18]:
# @hidden_cell
# Foursquare credentials are read from the environment so they are never stored
# in (or committed with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# SECURITY: the key pair that used to be hard-coded in this cell has been
# exposed and should be revoked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')

In [19]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000,LIMIT=100):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; they are zipped, so iteration stops at the
        shortest one. Each name is printed as a progress log.
    radius : int, optional
        Forwarded as the ``radius`` query parameter.
    LIMIT : int, optional
        Forwarded as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'City', 'City Latitude',
        'City Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The columns are present even when no venue was
        returned at all (previously that case raised a length-mismatch error).
        'Venue Category' is None for a venue that has no category (previously
        an IndexError).

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['City',
               'City Latitude',
               'City Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    records = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and escape the query string instead of
        # formatting the credentials into the URL by hand
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        # tolerate a payload without groups/items: that city simply adds no rows
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            records.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    return pd.DataFrame(records, columns=columns)

In [20]:
# fetch nearby venues for every geocoded city (default radius and limit)
city_venues = getNearbyVenues(
    names=df['City'],
    latitudes=df['Latitude'],
    longitudes=df['Longitude'],
)

New York, New York
Los Angeles, California
Chicago, Illinois
Houston, Texas
Phoenix, Arizona
Philadelphia, Pennsylvania
San Antonio, Texas
San Diego, California
Dallas, Texas
San Jose, California
Austin, Texas
Jacksonville, Florida
Columbus, Ohio
Indianapolis, Indiana
Fort Worth, Texas
Charlotte, North Carolina
Seattle, Washington
Denver, Colorado
El Paso, Texas
Washington, District of Columbia
Detroit, Michigan
Nashville, Tennessee
Memphis, Tennessee
Portland, Oregon
Oklahoma City, Oklahoma
Las Vegas, Nevada
Louisville, Kentucky
Baltimore, Maryland
Milwaukee, Wisconsin
Albuquerque, New Mexico
Tucson, Arizona
Fresno, California
Sacramento, California
Mesa, Arizona
Kansas City, Missouri
Atlanta, Georgia
Colorado Springs, Colorado
Raleigh, North Carolina
Virginia Beach, Virginia
Omaha, Nebraska
Oakland, California
Minneapolis, Minnesota
Tulsa, Oklahoma
Arlington, Texas
New Orleans, Louisiana
Wichita, Kansas
Cleveland, Ohio
Tampa, Florida
Bakersfield, California
Aurora, Colorado
Corpus Ch

In [21]:
# venues returned per city (each request asked for at most LIMIT=100 venues)
city_venues['City'].value_counts()

Laredo, Texas                    100
Temple, Texas                    100
Rio Rancho, New Mexico           100
Winston-Salem, North Carolina    100
Killeen, Texas                   100
Kansas City, Kansas              100
Henderson, Nevada                100
Bakersfield, California          100
Wichita, Kansas                  100
Virginia Beach, Virginia         100
Baltimore, Maryland              100
Independence, Missouri           100
Little Rock, Arkansas            100
Pueblo, Colorado                 100
Reno, Nevada                     100
Kansas City, Missouri            100
North Little Rock, Arkansas      100
Pittsburgh, Pennsylvania         100
San Angelo, Texas                100
Tampa, Florida                   100
Jacksonville, Florida            100
Aurora, Colorado                 100
Seattle, Washington              100
Madison, Wisconsin               100
Mesa, Arizona                    100
Stockton, California             100
Charlotte, North Carolina        100
W

In [52]:
# One-hot encode the venue category: one indicator column per category, named
# after the category itself (empty prefix and separator).
city_onehot = pd.get_dummies(city_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the city each venue belongs to.
# NOTE(review): if some venue category is literally named 'City', this
# assignment overwrites that indicator column instead of appending a new one,
# and the reordering below would then move an unrelated column - confirm.
city_onehot['City'] = city_venues['City']

# rotate the trailing column (expected to be 'City') to the front
fixed_columns = city_onehot.columns[-1:].tolist() + city_onehot.columns[:-1].tolist()
city_onehot = city_onehot[fixed_columns]

city_onehot.head()

Unnamed: 0,City,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Alternative Healer,American Restaurant,...,Wedding Hall,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"New York, New York",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"New York, New York",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"New York, New York",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"New York, New York",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"New York, New York",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# share of each venue category among a city's venues, indexed by city
city_onehot_mean = city_onehot.groupby(by='City').mean()
city_onehot_mean

Unnamed: 0_level_0,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Alternative Healer,American Restaurant,Amphitheater,...,Wedding Hall,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Abilene, Texas",0.000000,0.00,0.0,0.00,0.0,0.00,0.000000,0.0,0.070000,0.0,...,0.0,0.00,0.01,0.000000,0.00,0.010000,0.00,0.00,0.010000,0.0
"Akron, Ohio",0.000000,0.00,0.0,0.00,0.0,0.00,0.000000,0.0,0.060000,0.0,...,0.0,0.00,0.00,0.000000,0.00,0.000000,0.00,0.00,0.010000,0.0
"Albany, Georgia",0.000000,0.00,0.0,0.00,0.0,0.00,0.000000,0.0,0.026667,0.0,...,0.0,0.00,0.00,0.013333,0.00,0.026667,0.00,0.00,0.013333,0.0
"Albuquerque, New Mexico",0.000000,0.00,0.0,0.00,0.0,0.00,0.000000,0.0,0.050000,0.0,...,0.0,0.00,0.01,0.000000,0.00,0.000000,0.00,0.00,0.010000,0.0
"Amarillo, Texas",0.000000,0.00,0.0,0.00,0.0,0.00,0.000000,0.0,0.070000,0.0,...,0.0,0.00,0.01,0.000000,0.00,0.000000,0.00,0.00,0.000000,0.0
"Anchorage, Alaska",0.000000,0.00,0.0,0.00,0.0,0.00,0.000000,0.0,0.030000,0.0,...,0.0,0.00,0.01,0.000000,0.00,0.010000,0.00,0.00,0.000000,0.0
"Apple Valley, California",0.010000,0.00,0.0,0.00,0.0,0.00,0.000000,0.0,0.030000,0.0,...,0.0,0.00,0.00,0.000000,0.00,0.010000,0.00,0.00,0.000000,0.0
"Arlington, Texas",0.000000,0.01,0.0,0.00,0.0,0.00,0.000000,0.0,0.060000,0.0,...,0.0,0.00,0.00,0.000000,0.00,0.010000,0.00,0.00,0.000000,0.0
"Athens-Clarke County, Georgia",0.000000,0.00,0.0,0.00,0.0,0.00,0.000000,0.0,0.020000,0.0,...,0.0,0.00,0.00,0.000000,0.00,0.000000,0.00,0.00,0.000000,0.0
"Atlanta, Georgia",0.000000,0.00,0.0,0.00,0.0,0.01,0.000000,0.0,0.050000,0.0,...,0.0,0.00,0.01,0.020000,0.00,0.000000,0.00,0.00,0.000000,0.0


In [32]:
# Five most frequent venue categories for every city, most frequent first.
# The table is built in one pass from a list of rows instead of growing a
# DataFrame with pd.concat inside a loop (quadratic, and seeded with an empty
# float frame), and no throwaway globals (x, i, df1row) are left behind.
top_labels = ['top1','top2','top3','top4','top5']
rslt = pd.DataFrame(
    [freqs.nlargest(len(top_labels)).index.tolist()
     for _, freqs in city_onehot_mean.iterrows()],
    index=city_onehot_mean.index,
    columns=top_labels,
)
rslt

Unnamed: 0_level_0,top1,top2,top3,top4,top5
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Abilene, Texas",Mexican Restaurant,American Restaurant,Fast Food Restaurant,Deli / Bodega,Grocery Store
"Akron, Ohio",American Restaurant,Park,Bar,Coffee Shop,Fast Food Restaurant
"Albany, Georgia",Grocery Store,Fast Food Restaurant,Sandwich Place,Chinese Restaurant,Coffee Shop
"Albuquerque, New Mexico",Mexican Restaurant,Brewery,Pizza Place,American Restaurant,Sandwich Place
"Amarillo, Texas",American Restaurant,Mexican Restaurant,Fast Food Restaurant,Burger Joint,Coffee Shop
"Anchorage, Alaska",Coffee Shop,Park,Seafood Restaurant,Mexican Restaurant,Steakhouse
"Apple Valley, California",Fast Food Restaurant,Pizza Place,Coffee Shop,Mexican Restaurant,Sandwich Place
"Arlington, Texas",American Restaurant,Burger Joint,Fried Chicken Joint,Mexican Restaurant,Bakery
"Athens-Clarke County, Georgia",Bar,Pizza Place,Mexican Restaurant,Breakfast Spot,Fast Food Restaurant
"Atlanta, Georgia",Trail,American Restaurant,Park,Brewery,Mexican Restaurant


In [54]:
# per-city category frequencies again, this time with 'City' as a regular
# column (the clustering cell drops it and later cells merge on it)
city_grouped = (
    city_onehot
    .groupby('City')
    .mean()
    .reset_index()
)
city_grouped.head()

Unnamed: 0,City,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Alternative Healer,American Restaurant,...,Wedding Hall,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"Abilene, Texas",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,...,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.01,0.0
1,"Akron, Ohio",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0
2,"Albany, Georgia",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026667,...,0.0,0.0,0.0,0.013333,0.0,0.026667,0.0,0.0,0.013333,0.0
3,"Albuquerque, New Mexico",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.0
4,"Amarillo, Texas",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
# set number of clusters
kclusters = 3

# Feature matrix: everything except the city name. Any 'Cluster Labels' column
# left over from an earlier run of this cell is excluded too, so the cell can
# be re-run. `columns=` replaces the positional axis argument
# (`drop('City', 1)`), which pandas 2.0 no longer accepts.
city_clustering = city_grouped.drop(columns=['City', 'Cluster Labels'], errors='ignore')

# run k-means clustering; n_init is pinned to 10 (the default this notebook
# was written against) so results do not depend on the scikit-learn version
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(city_clustering)

# add clustering labels as the first column, replacing labels from a previous
# run instead of failing because the column already exists
city_grouped = city_grouped.drop(columns='Cluster Labels', errors='ignore')
city_grouped.insert(0, 'Cluster Labels', kmeans.labels_)

In [56]:
# check that 'Cluster Labels' is now the first column
city_grouped.head()

Unnamed: 0,Cluster Labels,City,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Alternative Healer,...,Wedding Hall,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,0,"Abilene, Texas",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.01,0.0
1,1,"Akron, Ohio",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0
2,2,"Albany, Georgia",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.013333,0.0,0.026667,0.0,0.0,0.013333,0.0
3,0,"Albuquerque, New Mexico",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.0
4,0,"Amarillo, Texas",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# Average category frequencies per cluster. The text column 'City' is dropped
# explicitly first: since pandas 2.0, groupby().mean() raises TypeError on
# non-numeric columns instead of silently discarding them.
city_grouped_cluster = (
    city_grouped
    .drop(columns='City')
    .groupby('Cluster Labels')
    .mean()
    .reset_index()
)
city_grouped_cluster.head()

Unnamed: 0,Cluster Labels,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Alternative Healer,American Restaurant,...,Wedding Hall,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,0,0.000164,0.000492,0.0,0.0,0.0,0.0,0.00034,0.000164,0.037985,...,0.0,0.000164,0.002295,0.001311,0.000492,0.004153,0.000164,0.000656,0.001311,0.0
1,1,9.7e-05,0.000777,9.7e-05,0.000194,0.0,0.000583,0.0,0.0,0.042039,...,0.000194,0.001456,0.006117,0.002136,0.000388,0.00165,0.000388,0.00301,0.001359,0.001359
2,2,0.00077,0.000909,0.0,0.0,0.000227,0.0,0.000421,0.0,0.041779,...,0.0,0.000455,0.001591,0.001667,0.000227,0.008141,0.000227,0.001591,0.00053,0.0


In [58]:
# Keep only the category-frequency columns; errors='ignore' lets the cell be
# re-run after 'Cluster Labels' has already been dropped.
city_grouped_cluster = city_grouped_cluster.drop(columns=['Cluster Labels'], errors='ignore')

# Five most frequent venue categories per cluster, most frequent first. Built
# in one pass from a list of rows rather than by concatenating one-row frames
# onto an empty float frame inside a loop.
top_labels = ['top1','top2','top3','top4','top5']
rslt = pd.DataFrame(
    [freqs.nlargest(len(top_labels)).index.tolist()
     for _, freqs in city_grouped_cluster.iterrows()],
    index=city_grouped_cluster.index,
    columns=top_labels,
)
rslt

Unnamed: 0,top1,top2,top3,top4,top5
0,Mexican Restaurant,Coffee Shop,Pizza Place,American Restaurant,Burger Joint
1,Coffee Shop,American Restaurant,Pizza Place,Bar,Brewery
2,Fast Food Restaurant,American Restaurant,Mexican Restaurant,Sandwich Place,Pizza Place


In [59]:
# geocode the query 'US'; the resulting point is used as the centre of the maps below
latitude, longitude = get_latitude_longitude('US')
latitude, longitude

US


(39.7837304, -100.4458825)

In [60]:
# attach population figures and coordinates to each clustered city;
# a left join keeps every row of city_grouped
city_grouped_cord = city_grouped.merge(df, how='left', on='City')
city_grouped_cord.head()

Unnamed: 0,Cluster Labels,City,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Alternative Healer,...,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit,Population_Density,Population,Land_Area,Latitude,Longitude
0,0,"Abilene, Texas",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.01,0.0,1145,122225,107,32.446674,-99.733301
1,1,"Akron, Ohio",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,3186,197633,62,41.083064,-81.518485
2,2,"Albany, Georgia",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.026667,0.0,0.0,0.013333,0.0,1339,73801,55,31.578206,-84.155681
3,0,"Albuquerque, New Mexico",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,2979,559277,188,35.084103,-106.650985
4,0,"Amarillo, Texas",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2006,199582,99,35.207219,-101.833825


In [61]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=4)

# One evenly spaced rainbow colour per cluster. The earlier x/ys arithmetic
# only served to produce a sequence of length kclusters, so it is dropped,
# together with the unused markers_colors list.
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add one circle marker per city, coloured by its cluster
for lat, lon, poi, cluster in zip(city_grouped_cord['Latitude'], city_grouped_cord['Longitude'], city_grouped_cord['City'], city_grouped_cord['Cluster Labels']):
    label = folium.Popup(str(poi) + '\n cluster' + str(cluster), parse_html=True)
    # `cluster - 1` is kept so every cluster keeps the colour it had before;
    # label 0 maps to the last colour through negative indexing
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [62]:
# create map
map_density = folium.Map(location=[latitude, longitude], zoom_start=4)

# Map population density onto the YlOrRd colormap. 'YlOrRd' is a matplotlib
# colormap name, not a colour, so passing it straight to CircleMarker never
# coloured the markers by density. A logarithmic scale is used because the
# densities span orders of magnitude, which would push almost every city to
# the pale end of a linear scale.
density_norm = colors.LogNorm(vmin=df['Population_Density'].min(),
                              vmax=df['Population_Density'].max())

# add one circle marker per city, coloured by its population density
for lat, lon, poi, den in zip(df['Latitude'], df['Longitude'], df['City'], df['Population_Density']):
    # the popup shows this city's density; it previously printed `cluster`,
    # a stale variable left over from the cluster-map loop
    label = folium.Popup(str(poi) + '\n density ' + str(den), parse_html=True)
    marker_color = colors.rgb2hex(cm.YlOrRd(density_norm(den)))
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_density)

map_density