In [1]:
import numpy as np
import pandas as pd
import json

from geopy.geocoders import Nominatim
import geocoder
import requests 
from bs4 import BeautifulSoup 
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans
import folium

## Part I Data

### Scrape Data from Wikipedia

In [2]:
data = requests.get("https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Shanghai").text
soup = BeautifulSoup(data, 'html.parser')
My_table = soup.find('table',{'class':'wikitable'})
links = My_table.findAll('a')
Districts = []
for link in links:
    Districts.append(link.get('title'))

In [3]:
sh_df = pd.DataFrame()
sh_df['Neighborhood']=Districts
sh_df['Neighborhood'][0]='Huangpu District'
sh_df['Neighborhood'][4]='Putuo District'
sh_df['Neighborhood'][8]='Baoshan District'
sh_df['Neighborhood'][13]='Qingpu District'
sh_df = sh_df.drop(sh_df.index[16])

In [4]:
sh_df.head()

Unnamed: 0,Neighborhood
0,Huangpu District
1,Xuhui District
2,Changning District
3,Jing'an District
4,Putuo District


### Get Coordinates of each neighborhood

In [5]:
def get_latlng(neighborhood):
    lat_lng_coords = None
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Shanghai, China'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

In [6]:
coords = [get_latlng(neighborhood) for neighborhood in sh_df["Neighborhood"].tolist() ]

In [7]:
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

In [8]:
sh_df['Latitude'] = df_coords['Latitude']
sh_df['Longitude'] = df_coords['Longitude']

In [9]:
sh_df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Huangpu District,31.2378,121.4781
1,Xuhui District,31.19594,121.44709
2,Changning District,31.21739,121.42105
3,Jing'an District,31.22,121.41583
4,Putuo District,31.251,121.3897


### Visualize Shanghai with districts as markers

In [10]:
address = 'Shanghai, China'
geolocator = Nominatim(user_agent="bjcn")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Shanghai are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Shanghai are 31.2322758, 121.4692071.


In [11]:
map_SH = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(sh_df['Latitude'], sh_df['Longitude'], sh_df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_SH)  
    
map_SH

## Part II Analysis 

### Population Density Analysis

In [12]:
#read density data
density_df = pd.read_csv("shanghai_district.csv")

In [13]:
density_df

Unnamed: 0,Neighborhood,Area(km^2),Population,Density(/km^2)
0,Huangpu District,20.46,658600,32190
1,Xuhui District,54.76,1089100,19889
2,Changning District,38.3,691100,18044
3,Jing'an District,37.37,1000000,27000
4,Putuo District,54.83,1288000,23491
5,Hongkou District,23.46,809400,34501
6,Yangpu District,60.73,1315200,21657
7,Minhang District,370.75,2537900,6845
8,Baoshan District,270.99,2022900,7465
9,Jiading District,464.2,1568231,3378


In [14]:
sh_df["Density(/km^2)"]= density_df["Density(/km^2)"]

In [15]:
for i in range(0,15):
    sh_df["Density(/km^2)"][i]=int(sh_df["Density(/km^2)"][i].replace(',', ''))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [34]:
kclusters = 3
sh_clustering = sh_df.drop(["Neighborhood"],1)
sh_clustering = sh_clustering.drop(["Latitude"],1)
sh_clustering = sh_clustering.drop(["Longitude"],1)
sh_clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(sh_clustering)

In [17]:
sh_merged = sh_df.copy()
sh_merged["Cluster Labels"] = kmeans.labels_
sh_merged.sort_values(["Cluster Labels"], inplace=True)
sh_merged

Unnamed: 0,Neighborhood,Latitude,Longitude,Density(/km^2),Cluster Labels
7,Pudong,31.23513,121.52759,6845,0
8,Baoshan District,31.41639,121.48,7465,0
9,Minhang District,31.1088,121.37472,3378,0
10,Jiading District,31.36637,121.22153,4523,0
11,Jinshan District,30.92025,121.25199,1362,0
12,Songjiang District,31.03595,121.2146,2906,0
13,Qingpu District,31.15394,121.11408,1804,0
14,Fengxian District,30.83381,121.52128,1687,0
15,Chongming District,31.21739,121.42105,587,0
0,Huangpu District,31.2378,121.4781,32190,1


In [18]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(sh_merged['Latitude'], sh_merged['Longitude'], sh_merged['Neighborhood'], sh_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Get Venue Data from Foursquare API

In [19]:
CLIENT_ID = 'JX3VYJDBGE2PJ1OSMPU3JFLGVHFZ0XA2QKTDD4V5CHES2YEL' # your Foursquare ID
CLIENT_SECRET = 'BKNBCD34ECBQC2JHCRR5GZWQFRCFNSNEPWIFA4ACI1SRVE52' # your Foursquare Secret
VERSION = '20200314' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: JX3VYJDBGE2PJ1OSMPU3JFLGVHFZ0XA2QKTDD4V5CHES2YEL
CLIENT_SECRET:BKNBCD34ECBQC2JHCRR5GZWQFRCFNSNEPWIFA4ACI1SRVE52


In [20]:
radius = 3000
LIMIT = 100

venues = []

for lat, long, neighborhood in zip(sh_df['Latitude'], sh_df['Longitude'], sh_df['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']
            ))

In [21]:
venues_df = pd.DataFrame(venues)
venues_df.columns = ['Neighborhood', 'DistrictLatitude', 'DistrictLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
venues_df.head()

Unnamed: 0,Neighborhood,DistrictLatitude,DistrictLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Huangpu District,31.2378,121.4781,Épices & Foie-gras,31.237557,121.47958,French Restaurant
1,Huangpu District,31.2378,121.4781,The Bund (外滩),31.239316,121.486065,Waterfront
2,Huangpu District,31.2378,121.4781,Campanile Hotel and Restaurant,31.232123,121.479144,Hotel
3,Huangpu District,31.2378,121.4781,The Peninsula Shanghai,31.243049,121.484564,Hotel
4,Huangpu District,31.2378,121.4781,Waldorf Astoria Shanghai on the Bund (外滩华尔道夫酒店),31.235479,121.485378,Hotel


### Competition Analysis

In [22]:
shopmall_df = venues_df[venues_df["VenueCategory"] == "Shopping Mall"]

In [23]:
venues_df[venues_df['VenueCategory'] == "Metro Station"]

Unnamed: 0,Neighborhood,DistrictLatitude,DistrictLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
475,Putuo District,31.251,121.3897,South Qilianshan Road Metro Station (祁连山南路地铁站),31.239508,121.362695,Metro Station
622,Yangpu District,31.26193,121.51904,Longchang Road Metro Station (隆昌路地铁站),31.277634,121.540713,Metro Station
759,Minhang District,31.1088,121.37472,Xinzhuang Metro Station (莘庄地铁站),31.113173,121.380579,Metro Station
766,Jiading District,31.36637,121.22153,Baiyin Road Metro Station (白银路地铁站),31.347281,121.240883,Metro Station
785,Songjiang District,31.03595,121.2146,Songjiang University Town Metro Station (松江大学城...,31.056169,121.228166,Metro Station
795,Qingpu District,31.15394,121.11408,Caoying Road Metro Station (漕盈路地铁站),31.162349,121.091949,Metro Station


In [24]:
shopmall_df.head()

Unnamed: 0,Neighborhood,DistrictLatitude,DistrictLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
20,Huangpu District,31.2378,121.4781,Three on the Bund,31.236266,121.486486,Shopping Mall
41,Huangpu District,31.2378,121.4781,Bund18 (外滩18号),31.24048,121.485575,Shopping Mall
42,Huangpu District,31.2378,121.4781,K11 Art Mall (上海K11购物艺术中心),31.225486,121.469001,Shopping Mall
68,Huangpu District,31.2378,121.4781,IFC Mall (国际金融中心商场),31.238492,121.497902,Shopping Mall
73,Huangpu District,31.2378,121.4781,HKRI TaiKoo Hui (兴业太古汇),31.230226,121.458413,Shopping Mall


In [25]:
venues_df.groupby('Neighborhood')['VenueName'].count()

Neighborhood
Baoshan District       12
Changning District    100
Chongming District    100
Fengxian District       4
Hongkou District      100
Huangpu District      100
Jiading District        4
Jing'an District      100
Minhang District       26
Pudong                100
Putuo District         77
Qingpu District         9
Songjiang District     21
Xuhui District        100
Yangpu District        48
Name: VenueName, dtype: int64

In [26]:
# one hot encoding
sh_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
sh_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [sh_onehot.columns[-1]] + list(sh_onehot.columns[:-1])
sh_onehot = sh_onehot[fixed_columns]

print(sh_onehot.shape)
sh_onehot.head()

(901, 134)


Unnamed: 0,Neighborhoods,American Restaurant,Art Gallery,Art Museum,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,Bar,Beach,...,Vegetarian / Vegan Restaurant,Video Store,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Xinjiang Restaurant,Yoga Studio,Yunnan Restaurant,Zhejiang Restaurant
0,Huangpu District,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Huangpu District,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,Huangpu District,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Huangpu District,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Huangpu District,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
sh_grouped = sh_onehot.groupby(["Neighborhoods"]).mean().reset_index()

print(sh_grouped.shape)
sh_grouped

(15, 134)


Unnamed: 0,Neighborhoods,American Restaurant,Art Gallery,Art Museum,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,Bar,Beach,...,Vegetarian / Vegan Restaurant,Video Store,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Xinjiang Restaurant,Yoga Studio,Yunnan Restaurant,Zhejiang Restaurant
0,Baoshan District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Changning District,0.0,0.01,0.0,0.0,0.0,0.01,0.08,0.03,0.0,...,0.01,0.01,0.0,0.01,0.03,0.01,0.01,0.01,0.02,0.0
2,Chongming District,0.0,0.01,0.0,0.0,0.0,0.01,0.08,0.03,0.0,...,0.01,0.01,0.0,0.01,0.03,0.01,0.01,0.01,0.02,0.0
3,Fengxian District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Hongkou District,0.01,0.01,0.0,0.0,0.01,0.0,0.01,0.01,0.0,...,0.01,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.01,0.0
5,Huangpu District,0.01,0.01,0.01,0.0,0.01,0.0,0.01,0.01,0.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.01,0.0
6,Jiading District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Jing'an District,0.0,0.01,0.0,0.0,0.0,0.01,0.08,0.02,0.0,...,0.01,0.01,0.0,0.01,0.03,0.01,0.01,0.01,0.02,0.0
8,Minhang District,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462
9,Pudong,0.0,0.0,0.0,0.01,0.0,0.0,0.03,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0


In [28]:
sh_mall = sh_grouped[["Neighborhoods","Shopping Mall"]]
sh_mall

Unnamed: 0,Neighborhoods,Shopping Mall
0,Baoshan District,0.166667
1,Changning District,0.02
2,Chongming District,0.02
3,Fengxian District,0.0
4,Hongkou District,0.03
5,Huangpu District,0.05
6,Jiading District,0.0
7,Jing'an District,0.02
8,Minhang District,0.076923
9,Pudong,0.05


In [29]:
# set number of clusters
kclusters = 3
sh_clustering = sh_mall.drop(["Neighborhoods"], 1)
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(sh_clustering)

In [30]:
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:16]

sh_merged = sh_mall.copy()

# add clustering labels
sh_merged["Cluster Labels"] = kmeans.labels_

sh_merged.rename(columns={"Neighborhoods": "Neighborhood"}, inplace=True)
sh_merged

sh_merged.sort_values(["Cluster Labels"], inplace=True)
sh_merged = sh_merged.join(sh_df.set_index("Neighborhood"), on="Neighborhood")
sh_merged

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude,Density(/km^2)
8,Minhang District,0.076923,0,31.1088,121.37472,3378
10,Putuo District,0.077922,0,31.251,121.3897,23491
11,Qingpu District,0.111111,0,31.15394,121.11408,1804
12,Songjiang District,0.095238,0,31.03595,121.2146,2906
1,Changning District,0.02,1,31.21739,121.42105,18044
2,Chongming District,0.02,1,31.21739,121.42105,587
3,Fengxian District,0.0,1,30.83381,121.52128,1687
4,Hongkou District,0.03,1,31.25,121.48917,34501
5,Huangpu District,0.05,1,31.2378,121.4781,32190
6,Jiading District,0.0,1,31.36637,121.22153,4523


In [31]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(sh_merged['Latitude'], sh_merged['Longitude'], sh_merged['Neighborhood'], sh_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Visibility Analysis

In [32]:
from folium.plugins import HeatMap

In [33]:
shm_latlons = shopmall_df[["VenueLatitude","VenueLongitude"]]
map_sh = folium.Map(location=[latitude, longitude], zoom_start=12)
folium.TileLayer('cartodbpositron').add_to(map_sh) #cartodbpositron cartodbdark_matter
HeatMap(shm_latlons).add_to(map_sh)
folium.Marker([latitude, longitude]).add_to(map_sh)
folium.Circle([latitude, longitude], radius=1000, fill=False, color='white').add_to(map_sh)
folium.Circle([latitude, longitude], radius=3000, fill=False, color='yellow').add_to(map_sh)
folium.Circle([latitude, longitude], radius=5000, fill=False, color='cyan').add_to(map_sh)
for lat, lon, poi, cluster in zip(sh_merged['Latitude'], sh_merged['Longitude'], sh_merged['Neighborhood'], sh_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_sh)
map_sh