# Clustering Most Populous US Cities
## Part 2: Preprocess Cluster Cities

In [1]:
import pandas as pd
import numpy as np
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

import os

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

### Import data

In [2]:
# Zip code venue data part 1
# The venue dataset is split across two CSV files that are downloaded
# directly from GitHub; rows that cannot be parsed are skipped rather than
# raising (error_bad_lines=False).
# NOTE(review): error_bad_lines is deprecated in newer pandas in favour of
# on_bad_lines='skip' -- confirm the target pandas version before upgrading.
url = 'https://raw.githubusercontent.com/vjacobsen/Coursera_Capstone/master/Dataset_zips_1.csv'
zips_1 = pd.read_csv(url, error_bad_lines=False)

# Zip code venue data part 2
# `url` is reused, so after this cell it points at the part-2 file.
url = 'https://raw.githubusercontent.com/vjacobsen/Coursera_Capstone/master/Dataset_zips_2.csv'
zips_2 = pd.read_csv(url, error_bad_lines=False)

In [3]:
# Stack the two venue extracts into a single frame.
# pd.concat is used instead of DataFrame.append (deprecated and later removed
# from pandas); sort=False and the untouched row labels mirror what append
# did by default, so the result is the same on older pandas versions.
cities_zips = pd.concat([zips_1, zips_2], sort=False)

# The source files misspell the zip latitude column; normalise the name.
cities_zips = cities_zips.rename(columns={'zip_latitute':'zip_latitude'})
print(cities_zips.shape)
cities_zips.sample(5)

(298390, 8)


Unnamed: 0,zip_code,zip_latitude,zip_longitude,venue,venue_id,venue_latitude,venue_longitude,venue_category
72294,93552.0,34.561207,-118.0375,SUBWAY,4e4de569bd4101d0d79d9dbd,34.558332,-118.045199,Sandwich Place
73861,94541.0,37.674431,-122.08883,Quiznos,4b035edaf964a520fd4e22e3,37.678649,-122.083039,Sandwich Place
12126,10112.0,40.759291,-73.97979,Hamilton: An American Musical,55c639db498e5c7ce637b1a1,40.759107,-73.986697,Performing Arts Venue
142757,95816.0,38.571661,-121.46827,OBO' italian table & bar,57631a62498e6621476321c0,38.569008,-121.465724,Italian Restaurant
94219,66686.0,39.042939,-95.769657,Zumiez,4c6988b40e98a593efb32459,39.03279,-95.766137,Clothing Store


In [4]:
# Cast the zip code column to text so it can be matched against the
# string-typed 'Zip' column of the city data later on.
# NOTE(review): the sample output above shows zip_code as a float
# (e.g. 93552.0), so the text form presumably keeps the '.0' suffix until
# the formatting cell strips it -- confirm.
cities_zips = cities_zips.astype({'zip_code': str})

When pulling zip data from Foursquare, some venues may have been returned more than once because the search radii of neighbouring zip codes overlap. We need to check for and remove duplicate venues.

In [5]:
# Every repeat of a venue_id after its first occurrence counts as one duplicate.
print('Number of duplicate venues: ',  cities_zips['venue_id'].duplicated().sum())

Number of duplicate venues:  138571


In [6]:
# Keep only the first row seen for each venue_id; later repeats are discarded.
cities_zips = cities_zips[~cities_zips.duplicated(subset=['venue_id'])]
cities_zips.shape

(159819, 8)

Add city data back to venues and zips

In [7]:
# Load the city / zip lookup table from the local CSV.
# 'Zip' is read as text so it is not converted to a number on load.
city_data = pd.read_csv(
    'Dataset_Top_100_Cities_Zip.csv',
    dtype={'Zip': str},
)
city_data.sample(5)

Unnamed: 0,PopRank,City,State,city_pop_2013,Pop2010,city_pop_chg_pcnt,ChangePercent,SqMi,city_pop_sqmi,city_longitude,city_latitude,Zip,Latitude,Longitude,Timezone,Daylight savings time flag,geopoint,state_name
6175,172,Bridgeport,Connecticut,147216,144229,2.07,0 to 5,16.0,9029,41.1874,-73.1957,6604.0,41.178696,-73.20004,-5.0,1.0,"41.178696,-73.20004",Connecticut
4814,93,Baton Rouge,Louisiana,229426,229493,-0.03,-5 to 0,76.9,2982,30.4485,-91.1259,70823.0,30.51589,-91.080373,-6.0,1.0,"30.51589,-91.080373",Louisiana
3025,37,Kansas City,Missouri,467007,459787,1.57,0 to 5,315.0,1460,39.1252,-94.5511,64190.0,39.343225,-94.85161,-6.0,1.0,"39.343225,-94.85161",Missouri
3941,59,Riverside,California,316619,303871,4.2,0 to 5,81.1,3745,33.9381,-117.3932,92508.0,33.890313,-117.32122,-8.0,1.0,"33.890313,-117.32122",California
5537,122,Mobile,Alabama,194899,195111,-0.11,-5 to 0,139.1,1403,30.6684,-88.1002,36628.0,30.658865,-88.177975,-6.0,1.0,"30.658865,-88.177975",Alabama


In [8]:
# Attach City and State to every venue by matching the venue's zip_code
# against the city table's Zip. A left join keeps venues whose zip has no
# match (their City/State are then missing).
venues_cities = pd.merge(
    cities_zips,
    city_data[['Zip','City','State']],
    how='left',
    left_on=['zip_code'],
    right_on=['Zip'],
)
venues_cities.sample(5)

Unnamed: 0,zip_code,zip_latitude,zip_longitude,venue,venue_id,venue_latitude,venue_longitude,venue_category,Zip,City,State
127435,57108.0,43.488472,-96.72258,Your Secret Kitchen,4c61cf1aedd320a15d79ab29,43.488948,-96.729289,American Restaurant,57108.0,Sioux Falls,South Dakota
145361,95051.0,37.346241,-121.9846,Kunjip Restaurant,4aeb99dbf964a520adc321e3,37.345867,-121.978755,Korean Restaurant,95051.0,Santa Clara,California
130726,61103.0,42.303365,-89.08246,Subway,4da70daa0cb66f6587121054,42.304047,-89.096569,Sandwich Place,61103.0,Rockford,Illinois
150361,80004.0,39.81431,-105.12263,King Soopers,4bf02a1fc601a593cc59c3d1,39.811223,-105.136591,Grocery Store,80004.0,Arvada,Colorado
131773,22312.0,38.817362,-77.1537,Royal Palace Kabob,52d04a55498e606dd62bc9f3,38.818047,-77.143693,Afghan Restaurant,22312.0,Alexandria,Virginia


In [9]:
# Format Zip code:
#   1. remove the literal '.0' left over from the float representation
#   2. left-pad with zeros to 5 characters
venues_cities['zip_code'] = (
    venues_cities['zip_code']
    .astype(str)
    .str.replace('.0', '', regex=False)
    .str.rjust(5, fillchar='0')
)
venues_cities.tail()

Unnamed: 0,zip_code,zip_latitude,zip_longitude,venue,venue_id,venue_latitude,venue_longitude,venue_category,Zip,City,State
159814,77845,30.571905,-96.29882,Expressions Dance And Music,4bd5c7114e32d13a916dc180,30.569867,-96.307863,Dance Studio,77845.0,College Station,Texas
159815,77845,30.571905,-96.29882,Solaris Tan,4ca0f2083244b21ab05b3703,30.569522,-96.308051,Tanning Salon,77845.0,College Station,Texas
159816,77845,30.571905,-96.29882,Suzanne's School of Dance,4c7c4e70744d8cfab16eba37,30.572808,-96.308909,Dance Studio,77845.0,College Station,Texas
159817,77845,30.571905,-96.29882,Brothers Pond Park,4b3bdc36f964a520d97c25e3,30.584161,-96.300614,Park,77845.0,College Station,Texas
159818,77845,30.571905,-96.29882,Laredo Taco Company,58701926e386e36cdbf2a6fe,30.559191,-96.300311,Taco Place,77845.0,College Station,Texas


In [10]:
from pivottablejs import pivot_ui
#pivot_ui(venues_cities)

### Get the frequencies of each category relative to each zip

In [11]:
# Create a function to get frequency and rank
def category_frequency_func(df):
    """Return the relative frequency and rank of each venue category in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``venue_category`` column; other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per distinct category, most frequent first, with a fresh
        0..n-1 index and the columns:

        * ``venue_category`` -- the category label
        * ``cat_frequency``  -- share of rows in ``df`` with that category
        * ``rank``           -- 1-based position in the frequency ordering

    Notes
    -----
    ``rank`` is simply the position in ``value_counts`` order, so categories
    with equal frequency still receive distinct, consecutive ranks.

    The frame is built with explicit column names instead of relying on how
    ``value_counts().to_frame().reset_index()`` names its columns, because
    that naming differs between pandas versions.
    """
    counts = df['venue_category'].value_counts(normalize=True)
    return pd.DataFrame(
        {
            'venue_category': counts.index,
            'cat_frequency': counts.values,
            'rank': np.arange(1, len(counts) + 1),
        },
        columns=['venue_category', 'cat_frequency', 'rank'],
    )

In [12]:
# Build the category frequency table separately for every
# (State, City, zip_code) group, then discard the innermost index level
# (the per-group row counter that apply adds), leaving the three group keys.
category_frequencies = (
    venues_cities
    .groupby(['State','City','zip_code'])
    .apply(category_frequency_func)
    .reset_index(level=3, drop=True)
)
category_frequencies.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,venue_category,cat_frequency,rank
State,City,zip_code,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,Birmingham,35201,Mexican Restaurant,0.125,1
Alabama,Birmingham,35201,Sandwich Place,0.09375,2
Alabama,Birmingham,35201,Fast Food Restaurant,0.0625,3
Alabama,Birmingham,35201,Diner,0.0625,4
Alabama,Birmingham,35201,College Theater,0.03125,5


### Pivot the data from long to wide format
Rank will be a feature in the machine learning model

In [13]:
# Number of top categories to get rank
n = 5

# Turn the group keys back into columns, build a 'State_City_zip' key,
# keep only rows ranked within the top n, and index the result by that key.
category_frequencies = (
    category_frequencies
    .reset_index()
    .assign(key=lambda d: d['State'] + '_' + d['City'] + '_' + d['zip_code'])
    .loc[lambda d: d['rank'] <= n]
    .set_index('key')
)
category_frequencies.head()

Unnamed: 0_level_0,State,City,zip_code,venue_category,cat_frequency,rank
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alabama_Birmingham_35201,Alabama,Birmingham,35201,Mexican Restaurant,0.125,1
Alabama_Birmingham_35201,Alabama,Birmingham,35201,Sandwich Place,0.09375,2
Alabama_Birmingham_35201,Alabama,Birmingham,35201,Fast Food Restaurant,0.0625,3
Alabama_Birmingham_35201,Alabama,Birmingham,35201,Diner,0.0625,4
Alabama_Birmingham_35201,Alabama,Birmingham,35201,College Theater,0.03125,5


### Most common categories by zip

In [14]:
# Pivot rank from long to wide: one row per zip key, one column per rank,
# each cell holding the category name found at that rank.
# pivot() is called with keyword arguments because newer pandas versions no
# longer accept index/columns/values positionally; keywords work on old and
# new versions alike.
category_rank = category_frequencies.reset_index().pivot(
    index='key', columns='rank', values='venue_category')

# Rename columns from the bare rank number to 'rank_<number>'
category_rank.columns = ['rank_'+ str(name) for name in category_rank.columns]
category_rank.head()

Unnamed: 0_level_0,rank_1,rank_2,rank_3,rank_4,rank_5
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama_Birmingham_35201,Mexican Restaurant,Sandwich Place,Fast Food Restaurant,Diner,College Theater
Alabama_Birmingham_35203,Chinese Restaurant,Bank,Wings Joint,Deli / Bodega,Mexican Restaurant
Alabama_Birmingham_35204,Football Stadium,Park,Gas Station,Discount Store,Liquor Store
Alabama_Birmingham_35205,American Restaurant,Coffee Shop,BBQ Joint,Indian Restaurant,Italian Restaurant
Alabama_Birmingham_35206,Grocery Store,Café,Gas Station,Food,Fried Chicken Joint


### Category frequency by zip

In [15]:
# Pivot frequencies from long to wide: one row per zip key, one column per
# venue category, cells hold that category's frequency (0 where the
# category is absent for the zip). Because the long table was already cut
# down to the top-n ranks, only those categories carry non-zero values.
# Keyword arguments are used because newer pandas versions no longer accept
# index/columns/values positionally.
# Note: this overwrites the long-format table, so the cell cannot be re-run
# without first re-running the cells that build the long format.
category_frequencies = category_frequencies.reset_index().pivot(
    index='key', columns='venue_category', values='cat_frequency').fillna(0)
category_frequencies.head()

venue_category,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Gate,Airport Lounge,Airport Service,...,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama_Birmingham_35201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alabama_Birmingham_35203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0
Alabama_Birmingham_35204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alabama_Birmingham_35205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alabama_Birmingham_35206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### City Data Features

In [16]:
# Select feature columns from city data
# A 'State_City' key column is added to city_data itself.
city_data['key'] = city_data['State'] + '_' + city_data['City']

# city_data has one row per zip, so each city repeats many times. Duplicates
# are removed while 'key' is still a column: drop_duplicates() ignores the
# index, so de-duplicating after set_index would merge two different cities
# that happen to share the same feature values.
city_data_features = (
    city_data[['key','city_pop_chg_pcnt','city_pop_sqmi']]
    .drop_duplicates()
    .set_index('key')
)
city_data_features.head()

Unnamed: 0_level_0,city_pop_chg_pcnt,city_pop_sqmi
key,Unnamed: 1_level_1,Unnamed: 2_level_1
New York_New York,2.82,27012
California_Los Angeles,2.42,8092
Illinois_Chicago,0.86,11842
Texas_Houston,4.55,3501
Pennsylvania_Philadelphia,1.78,11379


### Zip Attributes

In [17]:
# One row of location attributes per zip code, indexed by the same
# 'State_City_zip' key used for the feature tables. The first venue row
# seen for each zip_code supplies the values.
zip_attributes = (
    venues_cities[['zip_code','zip_latitude','zip_longitude','City','State']]
    .copy()
    .assign(key=lambda d: d['State'] + '_' + d['City'] + '_' + d['zip_code'])
    .drop_duplicates(subset='zip_code')
    .set_index('key')
)
print('Number of rows: ',len(zip_attributes))
zip_attributes.head()

Number of rows:  4170


Unnamed: 0_level_0,zip_code,zip_latitude,zip_longitude,City,State
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New York_New York_10072,10072,40.780751,-73.977182,New York,New York
New York_New York_10166,10166,40.754591,-73.976238,New York,New York
New York_New York_10012,10012,40.72596,-73.99834,New York,New York
New York_New York_10039,10039,40.826181,-73.9371,New York,New York
New York_New York_10422,10422,40.828279,-73.869454,New York,New York


In [18]:
# Lookup from zip code to its 'State_City_zip' key.
# zip_attributes is indexed by key, so pairing the zip_code column with the
# index gives the mapping directly.
zip_key_dict = dict(zip(zip_attributes['zip_code'], zip_attributes.index))

### Add other features to data

In [19]:
# Load the ACS profile table from the local feather file and index it by
# zip code so it can later be re-keyed and joined to the venue features.
acs_features = pd.read_feather('Dataset_acs_profile.feather').set_index('zip_code')
acs_features.sample(5)

Unnamed: 0_level_0,Estimate!!INCOME AND BENEFITS (IN 2018 INFLATION-ADJUSTED DOLLARS)!!Total households!!Median household income (dollars),Estimate!!COMMUTING TO WORK!!Workers 16 years and over!!Mean travel time to work (minutes),Percent Estimate!!EMPLOYMENT STATUS!!Civilian labor force!!Unemployment Rate,Estimate!!GROSS RENT!!Occupied units paying rent!!Median (dollars),Percent Estimate!!SEX AND AGE!!Total population!!Male,Percent Estimate!!SEX AND AGE!!Total population!!Female,Estimate!!SEX AND AGE!!Total population!!Median age (years),Percent Estimate!!SEX AND AGE!!Total population!!Under 5 years,Percent Estimate!!SEX AND AGE!!Total population!!5 to 9 years,Percent Estimate!!SEX AND AGE!!Total population!!10 to 14 years,...,Percent Estimate!!EDUCATIONAL ATTAINMENT!!Population 25 years and over!!Graduate or professional degree,Percent Estimate!!EDUCATIONAL ATTAINMENT!!Population 25 years and over!!Bachelor's degree or higher,Percent Estimate!!EDUCATIONAL ATTAINMENT!!Population 25 years and over!!Associate's degree,"Percent Estimate!!COMMUTING TO WORK!!Workers 16 years and over!!Car, truck, or van -- carpooled","Percent Estimate!!COMMUTING TO WORK!!Workers 16 years and over!!Car, truck, or van -- drove alone",Percent Estimate!!COMMUTING TO WORK!!Workers 16 years and over!!Public transportation (excluding taxicab),Percent Estimate!!COMMUTING TO WORK!!Workers 16 years and over!!Walked,Percent Estimate!!COMMUTING TO WORK!!Workers 16 years and over!!Worked at home,"Percent Estimate!!EDUCATIONAL ATTAINMENT!!Population 25 years and over!!9th to 12th grade, no diploma",Percent Estimate!!EDUCATIONAL ATTAINMENT!!Population 25 years and over!!Less than 9th grade
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69024,51786.0,18.9,1.0,662.0,48.3,51.7,43.1,3.3,10.6,6.9,...,5.3,18.7,19.1,7.2,80.2,0.3,3.2,9.0,3.9,0.8
53705,65385.0,17.9,3.3,982.0,48.1,51.9,37.0,5.6,4.9,4.2,...,45.3,76.5,5.2,5.9,54.1,20.6,6.7,5.9,0.9,0.9
65759,37188.0,30.5,9.6,700.0,48.1,51.9,43.5,5.3,7.0,7.5,...,4.1,13.0,6.5,13.0,75.8,1.3,0.4,10.8,16.4,3.4
64120,28864.0,16.8,3.8,748.0,53.2,46.8,41.5,5.7,9.1,6.7,...,5.6,10.2,23.3,8.8,89.7,4.0,6.3,4.6,4.7,2.3
49863,23214.0,36.2,9.0,773.0,50.6,49.4,32.3,5.6,4.4,21.1,...,6.7,19.7,10.8,8.8,72.1,1.3,9.8,4.6,1.8,3.6


In [20]:
# Interactive inspection only: shows the zip codes present in acs_features
# in a qgrid widget. Nothing downstream depends on this cell.
import qgrid
qgrid.show_grid(acs_features.reset_index()['zip_code'])

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [21]:
# Re-key the ACS table from zip code to the 'State_City_zip' key so it can
# be joined to category_frequencies.
# Labels without an entry in zip_key_dict are left unchanged instead of
# becoming None. They still never match a key in the later join, and the
# cell can now be re-run safely: labels that were already converted are not
# in the dict and therefore stay as they are.
acs_features.index = [zip_key_dict.get(zcode, zcode) for zcode in acs_features.index]

In [22]:
# Feature-set variants that were tried and set aside:
#features = city_data_features.join(category_frequencies) # Frequency only
#features = city_data_features.join(pd.get_dummies(category_rank)) # Top N Venues only, no frequency
#features = category_frequencies # No city attributes

# Current choice: venue category frequencies joined with the ACS columns on
# the shared key index, keeping only rows with no missing values.
features = category_frequencies.join(acs_features).dropna()
features

Unnamed: 0,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Gate,Airport Lounge,Airport Service,...,Percent Estimate!!EDUCATIONAL ATTAINMENT!!Population 25 years and over!!Graduate or professional degree,Percent Estimate!!EDUCATIONAL ATTAINMENT!!Population 25 years and over!!Bachelor's degree or higher,Percent Estimate!!EDUCATIONAL ATTAINMENT!!Population 25 years and over!!Associate's degree,"Percent Estimate!!COMMUTING TO WORK!!Workers 16 years and over!!Car, truck, or van -- carpooled","Percent Estimate!!COMMUTING TO WORK!!Workers 16 years and over!!Car, truck, or van -- drove alone",Percent Estimate!!COMMUTING TO WORK!!Workers 16 years and over!!Public transportation (excluding taxicab),Percent Estimate!!COMMUTING TO WORK!!Workers 16 years and over!!Walked,Percent Estimate!!COMMUTING TO WORK!!Workers 16 years and over!!Worked at home,"Percent Estimate!!EDUCATIONAL ATTAINMENT!!Population 25 years and over!!9th to 12th grade, no diploma",Percent Estimate!!EDUCATIONAL ATTAINMENT!!Population 25 years and over!!Less than 9th grade
Alabama_Birmingham_35203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.9,24.9,3.4,10.8,61.0,1.6,17.5,6.0,16.4,4.6
Alabama_Birmingham_35204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.4,12.0,7.9,20.9,69.7,5.9,2.3,0.7,14.7,6.3
Alabama_Birmingham_35205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18.9,48.4,4.6,8.0,74.7,1.6,10.2,3.4,5.0,2.9
Alabama_Birmingham_35206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.9,14.9,8.0,15.4,77.0,1.0,1.1,2.3,12.7,4.2
Alabama_Birmingham_35207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,13.4,6.3,12.7,75.3,6.5,1.5,0.3,17.5,4.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wisconsin_Milwaukee_53225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.5,18.8,9.0,7.9,79.5,5.7,2.7,3.5,9.0,3.4
Wisconsin_Milwaukee_53226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.6,56.9,8.4,5.3,84.5,1.3,4.2,3.6,1.9,0.6
Wisconsin_Milwaukee_53227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.2,25.7,9.3,8.9,84.0,2.3,1.4,2.9,6.6,2.3
Wisconsin_Milwaukee_53228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11.3,32.8,8.2,7.1,84.5,1.4,0.7,5.3,5.1,2.4


## Feature Scaling

Unnamed: 0,key
0,Alabama_Birmingham_35203
1,Alabama_Birmingham_35204
2,Alabama_Birmingham_35205
3,Alabama_Birmingham_35206
4,Alabama_Birmingham_35207
...,...
3422,Wisconsin_Milwaukee_53225
3423,Wisconsin_Milwaukee_53226
3424,Wisconsin_Milwaukee_53227
3425,Wisconsin_Milwaukee_53228


In [23]:
# Standardise every feature column with scikit-learn's StandardScaler.
from sklearn.preprocessing import StandardScaler

# Fit on the full feature table, then transform that same table.
scaler = StandardScaler().fit(features)
features_scaled = scaler.transform(features)

## Part 3: Cluster Zips

In [24]:
# set number of clusters
kclusters = 3

# run k-means clustering
# random_state fixes the initialisation seed. n_init is pinned explicitly
# because its default differs between scikit-learn releases, which would
# otherwise change the cluster assignments when the library is upgraded;
# 10 is the long-standing default.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=42).fit(features_scaled)

# check cluster labels generated for the first 50 rows
kmeans.labels_[0:50]

array([1, 1, 2, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1], dtype=int32)

In [25]:
# Number of rows that received a cluster label (one per row of
# features_scaled) -- not the number of clusters, which is kclusters.
len(kmeans.labels_)

3427

### Add cluster labels to data attributes

In [33]:
# Display the per-zip attribute table before cluster labels are attached.
zip_attributes

Unnamed: 0_level_0,zip_code,zip_latitude,zip_longitude,City,State
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New York_New York_10072,10072,40.780751,-73.977182,New York,New York
New York_New York_10166,10166,40.754591,-73.976238,New York,New York
New York_New York_10012,10012,40.725960,-73.998340,New York,New York
New York_New York_10039,10039,40.826181,-73.937100,New York,New York
New York_New York_10422,10422,40.828279,-73.869454,New York,New York
...,...,...,...,...,...
Texas_Pearland_77584,77584,29.543654,-95.340360,Pearland,Texas
Texas_College Station_77844,77844,30.652120,-96.341012,College Station,Texas
Texas_College Station_77840,77840,30.614647,-96.326410,College Station,Texas
Texas_College Station_77841,77841,30.572580,-96.327044,College Station,Texas


In [42]:
# ZIP DATA
# Pair each row key of `features` with the cluster label k-means assigned to
# that row (labels_ follows the row order of the fitted data), then expose
# the pairing as a key -> cluster dictionary.
labels_df = pd.DataFrame({'key': features.index, 'cluster': kmeans.labels_},
                         columns=['key', 'cluster'])
labels_dict = labels_df.set_index('key')['cluster'].to_dict()

In [54]:
# Attach the cluster label to every zip; zips that were not part of the
# clustered feature table get a missing value.
# Explicit copies are taken before each column assignment so pandas does not
# raise SettingWithCopyWarning when the frame is the product of an earlier
# filtering step (for example when this cell is re-run).
zip_attributes = zip_attributes.copy()
zip_attributes['cluster'] = [labels_dict.get(key) for key in zip_attributes.index]

# Drop rows with any missing value (this includes zips without a cluster).
zip_attributes = zip_attributes.dropna().copy()

# Convert cluster labels to int
zip_attributes['cluster'] = zip_attributes['cluster'].astype(int)
zip_attributes.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0_level_0,zip_code,zip_latitude,zip_longitude,City,State,cluster
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
New York_New York_10012,10012,40.72596,-73.99834,New York,New York,2
New York_New York_10039,10039,40.826181,-73.9371,New York,New York,1
New York_New York_10040,10040,40.858704,-73.92853,New York,New York,2
New York_New York_10017,10017,40.752159,-73.97231,New York,New York,2
New York_New York_10162,10162,40.769334,-73.94893,New York,New York,0


In [55]:
# CITY DATA
#city_attributes =  city_data.set_index('key')[['City',
#                                                   'State',
#                                               'city_latitude',
#                                               'city_longitude']] 
#city_attributes = city_attributes.rename(columns={'city_latitude':'city_longitude','city_longitude':'city_latitude'})
#
#city_attributes = city_attributes.drop_duplicates()
#city_attributes['cluster'] = kmeans.labels_
#
## Convert cluster labes to int
#city_attributes['cluster'] = city_attributes['cluster'].astype(int)
#city_attributes

## Visualize Results

In [56]:
# Coordinates the map is initially centred on (a view of the US).
latitude = 37.0902
longitude = -95.7129

# create map of US using latitude and longitude values
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=3)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings for folium
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one circle marker per zip code to the map
for lat, lon, poi, cluster in zip(zip_attributes['zip_latitude'],
                                  zip_attributes['zip_longitude'],
                                  zip_attributes['City'],
                                  zip_attributes['cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels run from 0 to kclusters-1, so they index the palette
    # directly (indexing with cluster-1 only worked for cluster 0 through
    # negative-index wraparound).
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters