# Clustering Most Populous US Cities
## Part 2: Preprocess Cluster Cities

In [124]:
import pandas as pd
import numpy as np
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

### Import data

In [45]:
# Zip code venue data part 1
# on_bad_lines replaces error_bad_lines=False, which was deprecated in pandas 1.3
# and removed in pandas 2.0. Malformed rows are skipped rather than raising, and
# 'warn' makes pandas report each skipped row.
url = 'https://raw.githubusercontent.com/vjacobsen/Coursera_Capstone/master/Dataset_zips_1.csv'
zips_1 = pd.read_csv(url, on_bad_lines='warn')

# Zip code venue data part 2
url = 'https://raw.githubusercontent.com/vjacobsen/Coursera_Capstone/master/Dataset_zips_2.csv'
zips_2 = pd.read_csv(url, on_bad_lines='warn')

In [425]:
# Append zip 1 and zip 2 dataframes
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. Like append,
# it keeps each part's own row labels, so labels can repeat in the combined frame.
cities_zips = pd.concat([zips_1, zips_2])
# Fix the misspelled column name that comes from the source files
cities_zips = cities_zips.rename(columns={'zip_latitute':'zip_latitude'})
print(cities_zips.shape)
cities_zips.sample(5)

(298390, 8)


Unnamed: 0,zip_code,zip_latitude,zip_longitude,venue,venue_id,venue_latitude,venue_longitude,venue_category
133346,89103.0,36.113211,-115.21849,The Whiskey Attic,546ec54e498e94529d18d87b,36.108642,-115.205297,Whisky Bar
103418,73019.0,35.208566,-97.44451,T.E.A. Cafe,4b6ce10cf964a5202e5b2ce3,35.211766,-97.443784,Asian Restaurant
117773,2205.0,42.350334,-71.053877,Kerasotes Showplace Icon Theater - Boston,59a4e70b23a2e65d0fdfc19a,42.352848,-71.047335,Movie Theater
34902,92619.0,33.66985,-117.765939,W Cafe & Restaurant,5afddea7fd16bb002c49521f,33.662598,-117.773721,Café
10469,15262.0,40.434436,-80.024817,Outback Steakhouse,4e4ca6a8bd413c4cc66b0a1d,40.430459,-80.039365,Steakhouse


In [426]:
# Store the zip code as text so it can later be matched against the text 'Zip'
# column of the city table
cities_zips = cities_zips.assign(zip_code=cities_zips['zip_code'].astype(str))

When pulling venue data from Foursquare by zip code, some venues may have been returned more than once because the search radii of neighboring zip codes overlap. We need to check for and remove these duplicate venues.

In [427]:
# Every repeat of an already-seen venue_id counts as one duplicate
print('Number of duplicate venues: ', cities_zips['venue_id'].duplicated().sum())

Number of duplicate venues:  138571


In [428]:
# Remove duplicates
# Keep only the first row seen for each venue_id
first_occurrence = ~cities_zips['venue_id'].duplicated(keep='first').values
cities_zips = cities_zips[first_occurrence]
cities_zips.shape

(159819, 8)

Add city data back to the venues and zip codes

In [429]:
# Import city data
# 'Zip' is read as text so it is not parsed as a number
zip_column_types = {'Zip': str}
city_data = pd.read_csv('Dataset_Top_100_Cities_Zip.csv', dtype=zip_column_types)
city_data.sample(5)

Unnamed: 0,PopRank,City,State,city_pop_2013,Pop2010,city_pop_chg_pcnt,ChangePercent,SqMi,city_pop_sqmi,city_longitude,city_latitude,Zip,Latitude,Longitude,Timezone,Daylight savings time flag,geopoint,state_name
4202,65,Cincinnati,Ohio,297517,296943,0.19,0 to 5,77.9,3810,39.1399,-84.5064,45273.0,39.166759,-84.53822,-5.0,1.0,"39.166759,-84.53822",Ohio
1309,14,San Francisco,California,837442,805235,4.0,0 to 5,46.9,17179,37.7751,-122.4193,94111.0,37.798228,-122.40027,-8.0,1.0,"37.798228,-122.40027",California
5466,120,Amarillo,Texas,196429,190695,3.01,0 to 5,99.5,1917,35.1978,-101.8287,79175.0,35.401475,-101.895089,-6.0,1.0,"35.401475,-101.895089",Texas
2537,29,Portland,Oregon,609456,583776,4.4,0 to 5,133.4,4375,45.537,-122.65,97290.0,45.580557,-122.374776,-8.0,1.0,"45.580557,-122.374776",Oregon
5757,134,Providence,Rhode Island,177994,178042,-0.03,-5 to 0,18.4,9676,41.8231,-71.4188,2906.0,41.83635,-71.39427,-5.0,1.0,"41.83635,-71.39427",Rhode Island


In [430]:
# Add City to venue data
# Left join: every venue row is kept, and City/State are filled in where the
# venue's zip_code matches a Zip in the city table
city_lookup = city_data[['Zip', 'City', 'State']]
venues_cities = pd.merge(
    cities_zips,
    city_lookup,
    how='left',
    left_on='zip_code',
    right_on='Zip',
)
venues_cities.sample(5)

Unnamed: 0,zip_code,zip_latitude,zip_longitude,venue,venue_id,venue_latitude,venue_longitude,venue_category,Zip,City,State
106251,70827.0,30.433837,-91.082468,I-12 Exit 2: Airline Hwy,4e4918a7ae60fa723315423a,30.423791,-91.075057,Intersection,70827.0,Baton Rouge,Louisiana
39438,79912.0,31.848055,-106.54487,Zino's,4d8e587efa94370498a400c6,31.836713,-106.552288,Greek Restaurant,79912.0,El Paso,Texas
109056,92410.0,34.106965,-117.2975,Del Taco,4bcba6aa3740b7139efd6265,34.10333,-117.300532,Fast Food Restaurant,92410.0,San Bernardino,California
149746,90242.0,33.92018,-118.14291,Starbucks,4af731a8f964a5200c0722e3,33.927598,-118.129375,Coffee Shop,90242.0,Downey,California
11328,77013.0,29.78146,-95.24289,Goodwill,4eaec41a9911d5c951cbeae6,29.774859,-95.232834,Thrift / Vintage Store,77013.0,Houston,Texas


In [431]:
# Format Zip code
# Remove the '.0' left over from the float representation, then left-pad with
# zeros to 5 chars
zip_text = venues_cities['zip_code'].astype(str).str.replace('.0', '', regex=False)
venues_cities['zip_code'] = zip_text.str.rjust(5, fillchar='0')
venues_cities.tail()

Unnamed: 0,zip_code,zip_latitude,zip_longitude,venue,venue_id,venue_latitude,venue_longitude,venue_category,Zip,City,State
159814,77845,30.571905,-96.29882,Expressions Dance And Music,4bd5c7114e32d13a916dc180,30.569867,-96.307863,Dance Studio,77845.0,College Station,Texas
159815,77845,30.571905,-96.29882,Solaris Tan,4ca0f2083244b21ab05b3703,30.569522,-96.308051,Tanning Salon,77845.0,College Station,Texas
159816,77845,30.571905,-96.29882,Suzanne's School of Dance,4c7c4e70744d8cfab16eba37,30.572808,-96.308909,Dance Studio,77845.0,College Station,Texas
159817,77845,30.571905,-96.29882,Brothers Pond Park,4b3bdc36f964a520d97c25e3,30.584161,-96.300614,Park,77845.0,College Station,Texas
159818,77845,30.571905,-96.29882,Laredo Taco Company,58701926e386e36cdbf2a6fe,30.559191,-96.300311,Taco Place,77845.0,College Station,Texas


In [432]:
# pivot_ui is only used by the optional interactive pivot table below, whose call
# is currently commented out.
# NOTE(review): this import presumably fails when pivottablejs is not installed,
# even though nothing else in the visible notebook uses it -- confirm it is still needed.
from pivottablejs import pivot_ui
#pivot_ui(venues_cities)

### Get the frequencies of each category relative to each zip

In [433]:
# Create a function to get frequency and rank
def category_frequency_func(df):
    """Return the relative frequency and rank of each venue category in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``venue_category`` column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct category, most frequent first, with columns
        ``venue_category``, ``cat_frequency`` (share of rows, summing to 1)
        and ``rank`` (1 = most frequent). Categories with equal frequency get
        consecutive ranks in whatever order ``value_counts`` returns them.
    """
    shares = df['venue_category'].value_counts(normalize=True)

    # Build the columns explicitly instead of renaming the output of
    # to_frame().reset_index(): the names value_counts produces differ between
    # pandas versions ('index'/'venue_category' before 2.0,
    # 'venue_category'/'proportion' from 2.0), which would mislabel the columns.
    frequencies = pd.DataFrame({
        'venue_category': shares.index,
        'cat_frequency': shares.values,
    })
    frequencies['rank'] = np.arange(1, len(frequencies) + 1)
    return frequencies

In [463]:
# Apply function to each city
# One group per (State, City, zip_code); each group yields its own frequency table
zip_groups = venues_cities.groupby(['State','City','zip_code'])
category_frequencies = zip_groups.apply(category_frequency_func)
# Index level 3 is the row counter of each per-group table; only the three group keys are kept
category_frequencies = category_frequencies.reset_index(level=3, drop=True)
category_frequencies.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,venue_category,cat_frequency,rank
State,City,zip_code,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,Birmingham,35201,Mexican Restaurant,0.125,1
Alabama,Birmingham,35201,Sandwich Place,0.09375,2
Alabama,Birmingham,35201,Diner,0.0625,3
Alabama,Birmingham,35201,Fast Food Restaurant,0.0625,4
Alabama,Birmingham,35201,Convenience Store,0.03125,5


### Pivot the data from long to wide format
Rank will be a feature in the machine learning model

In [464]:
# Turn the (State, City, zip_code) index back into columns and combine them
# into a single 'State_City_zip' identifier
category_frequencies = category_frequencies.reset_index()
category_frequencies['key'] = category_frequencies['State'].str.cat(
    [category_frequencies['City'], category_frequencies['zip_code']],
    sep='_',
)

# Number of top categories to get rank
n = 5
# Keep only the n most frequent categories of each zip, then index by key
is_top_n = category_frequencies['rank'].le(n)
category_frequencies = category_frequencies.loc[is_top_n].set_index('key')
category_frequencies.head()

Unnamed: 0_level_0,State,City,zip_code,venue_category,cat_frequency,rank
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alabama_Birmingham_35201,Alabama,Birmingham,35201,Mexican Restaurant,0.125,1
Alabama_Birmingham_35201,Alabama,Birmingham,35201,Sandwich Place,0.09375,2
Alabama_Birmingham_35201,Alabama,Birmingham,35201,Diner,0.0625,3
Alabama_Birmingham_35201,Alabama,Birmingham,35201,Fast Food Restaurant,0.0625,4
Alabama_Birmingham_35201,Alabama,Birmingham,35201,Convenience Store,0.03125,5


### Most common categories by zip

In [465]:
# Pivot rank from long to wide: one row per key, one column per rank, holding
# the category name at that rank.
# pivot arguments are passed by keyword because pandas 2.0 no longer accepts
# them positionally.
category_rank = category_frequencies.reset_index().pivot(
    index='key',
    columns='rank',
    values='venue_category',
)

# Rename columns from 1..n to rank_1..rank_n
category_rank.columns = ['rank_'+ str(name) for name in category_rank.columns]
category_rank.head()

Unnamed: 0_level_0,rank_1,rank_2,rank_3,rank_4,rank_5
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama_Birmingham_35201,Mexican Restaurant,Sandwich Place,Diner,Fast Food Restaurant,Convenience Store
Alabama_Birmingham_35203,Chinese Restaurant,Rental Car Location,Wings Joint,Pizza Place,Fried Chicken Joint
Alabama_Birmingham_35204,Football Stadium,Park,Gas Station,Nightclub,Hotel
Alabama_Birmingham_35205,American Restaurant,Coffee Shop,Indian Restaurant,Fast Food Restaurant,BBQ Joint
Alabama_Birmingham_35206,Fried Chicken Joint,Gas Station,Food,Grocery Store,Café


### Category frequency by zip

In [437]:
# Pivot frequency from long to wide: one row per key, one column per venue
# category, 0 where a zip has no entry for that category.
# pivot arguments are passed by keyword because pandas 2.0 no longer accepts
# them positionally.
# NOTE: when the cells run top to bottom, category_frequencies has already been
# cut to the top-n categories of each zip, so every other category shows as 0 here.
# NOTE: this rebinds category_frequencies from the long table to the wide one, so
# the cells above must be re-run before this cell can be executed again.
category_frequencies = category_frequencies.reset_index().pivot(
    index='key',
    columns='venue_category',
    values='cat_frequency',
).fillna(0)
category_frequencies.head()

venue_category,ATM,Acai House,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Gate,Airport Lounge,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama_Birmingham_35201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alabama_Birmingham_35203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0
Alabama_Birmingham_35204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0
Alabama_Birmingham_35205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alabama_Birmingham_35206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### City Data Features

In [438]:
# Select feature columns from city data
city_data['key'] = city_data['State'] + '_' + city_data['City']
city_data_features =  city_data.set_index('key')[['city_pop_chg_pcnt','city_pop_sqmi']]

# city_data appears to repeat each city once per zip, so keep one row per city key.
# De-duplicate on the key itself: DataFrame.drop_duplicates() compares only the
# column values and ignores the index, so it would also merge two different cities
# that happen to share the same figures.
city_data_features = city_data_features[~city_data_features.index.duplicated(keep='first')]
city_data_features.head()

Unnamed: 0_level_0,city_pop_chg_pcnt,city_pop_sqmi
key,Unnamed: 1_level_1,Unnamed: 2_level_1
New York_New York,2.82,27012
California_Los Angeles,2.42,8092
Illinois_Chicago,0.86,11842
Texas_Houston,4.55,3501
Pennsylvania_Philadelphia,1.78,11379


### Zip Attributes

In [440]:
# One row per zip code (first occurrence wins), indexed by the
# 'State_City_zip' key
zip_columns = ['zip_code','zip_latitude','zip_longitude','City','State']
zip_attributes = (
    venues_cities[zip_columns]
    .drop_duplicates(subset='zip_code')
    .assign(key=lambda frame: frame['State'] + '_' + frame['City'] + '_' + frame['zip_code'])
    .set_index('key')
)
print('Number of rows: ',len(zip_attributes))
zip_attributes.head()

Number of rows:  4170


Unnamed: 0_level_0,zip_code,zip_latitude,zip_longitude,City,State
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New York_New York_10072,10072,40.780751,-73.977182,New York,New York
New York_New York_10166,10166,40.754591,-73.976238,New York,New York
New York_New York_10012,10012,40.72596,-73.99834,New York,New York
New York_New York_10039,10039,40.826181,-73.9371,New York,New York
New York_New York_10422,10422,40.828279,-73.869454,New York,New York


### Add other features to data

In [441]:
# Choose the feature matrix used for clustering.
# The two commented-out alternatives would join city-level attributes onto the
# venue features; the active line uses the per-zip category frequencies only,
# so no join takes place.
#features = city_data_features.join(category_frequencies) # Frequency only
#features = city_data_features.join(pd.get_dummies(category_rank)) # Top N Venues only, no frequency
features = category_frequencies # No city attributes
# Random preview of 5 rows (differs from run to run)
features.sample(5)

venue_category,ATM,Acai House,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Gate,Airport Lounge,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
California_Murrieta_92563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Texas_Carrollton_75010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Michigan_Grand Rapids_49508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Louisiana_Lafayette_70505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Texas_Tyler_75709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature Scaling

In [442]:
# Standard Scaler
from sklearn.preprocessing import StandardScaler

# Learn each column's scaling parameters from the feature table, then apply
# them to the same table
scaler = StandardScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)

## Part 3: Cluster Zips

In [471]:
# set number of clusters
kclusters = 3

# build the k-means model with a fixed seed, then fit it on the scaled features
kmeans = KMeans(n_clusters=kclusters, random_state=42)
kmeans.fit(features_scaled)

# peek at the cluster labels assigned to the first 50 rows
kmeans.labels_[:50]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0], dtype=int32)

In [472]:
# Number of rows that received a cluster label (one per row of the feature
# matrix), not the number of clusters
kmeans.labels_.size

4170

### Add cluster labels to data attributes

In [473]:
# ZIP DATA
# kmeans.labels_ follows the row order of `features`, which is sorted by key.
# zip_attributes is in a different row order, so assigning the raw array by
# position would give each zip another zip's cluster. Attach labels by key instead.
cluster_by_key = pd.Series(kmeans.labels_, index=features.index, name='cluster')
zip_attributes['cluster'] = cluster_by_key.reindex(zip_attributes.index).values

# Every zip must have a matching feature row; fail loudly if one is missing
# rather than silently producing NaN clusters.
missing_label = zip_attributes['cluster'].isna()
if missing_label.any():
    raise ValueError(
        'No cluster label for %d zip(s); their keys are missing from the feature matrix'
        % int(missing_label.sum())
    )

# Convert cluster labels to int
zip_attributes['cluster'] = zip_attributes['cluster'].astype(int)
zip_attributes.head()

Unnamed: 0_level_0,zip_code,zip_latitude,zip_longitude,City,State,cluster
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
New York_New York_10072,10072,40.780751,-73.977182,New York,New York,0
New York_New York_10166,10166,40.754591,-73.976238,New York,New York,0
New York_New York_10012,10012,40.72596,-73.99834,New York,New York,0
New York_New York_10039,10039,40.826181,-73.9371,New York,New York,0
New York_New York_10422,10422,40.828279,-73.869454,New York,New York,0


In [474]:
# CITY DATA
#city_attributes =  city_data.set_index('key')[['City',
#                                                   'State',
#                                               'city_latitude',
#                                               'city_longitude']] 
#city_attributes = city_attributes.rename(columns={'city_latitude':'city_longitude','city_longitude':'city_latitude'})
#
#city_attributes = city_attributes.drop_duplicates()
#city_attributes['cluster'] = kmeans.labels_
#
## Convert cluster labes to int
#city_attributes['cluster'] = city_attributes['cluster'].astype(int)
#city_attributes

## Visualize Results

In [475]:
# Approximate geographic centre of the contiguous US, used as the initial map view
latitude = 37.0902
longitude = -95.7129

# create map of US using latitude and longitude values
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=3)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one circle marker per zip, coloured by its cluster
for lat, lon, poi, cluster in zip(zip_attributes['zip_latitude'],
                                  zip_attributes['zip_longitude'],
                                  zip_attributes['City'],
                                  zip_attributes['cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so they index the palette directly
    # (cluster-1 would only work for cluster 0 by wrapping around to the last colour).
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters