# <center>Peer-graded Assignment - Week 3</center>
## <center>Segmenting and Clustering Neighborhoods in Toronto</center>
### <center>Yunqian Guo</center>

## Scrape the neighborhood information from Wikipedia

In [1]:
import pandas as pd            # library for data analysis
import requests                # library to handle HTTP requests
import numpy as np
from bs4 import BeautifulSoup  # library to scrape web content

# Download the Wikipedia page that lists the "M" postal codes.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops the notebook on an HTTP error instead of parsing an error page.
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
res.raise_for_status()

# Parse the HTML and take the first <table> element on the page
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0]

# pd.read_html returns a list of DataFrames (one per parsed table); only one
# table was passed in, so keep its single DataFrame
df = pd.read_html(str(table))[0]

# Drop rows whose Borough is "Not assigned". The .copy() makes the result an
# independent frame, so later column assignments do not raise
# SettingWithCopyWarning.
df = df[df.Borough != 'Not assigned'].copy()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [2]:
# Show the rows whose Borough is "Queen's Park"
df.loc[df['Borough'] == "Queen's Park"]

Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,Not assigned


In [3]:
# A row that has a borough but a "Not assigned" neighbourhood takes the
# borough's name as its neighbourhood. This is applied to every such row
# (not only "Queen's Park"), and rows that already have a neighbourhood
# are left untouched.
Queen = df['Borough'] == "Queen's Park"
not_assigned = df['Neighbourhood'] == 'Not assigned'
# Series.where keeps values where the condition holds and takes the Borough
# value elsewhere; assign() builds a new frame rather than writing into a slice.
df = df.assign(Neighbourhood=df['Neighbourhood'].where(~not_assigned, df['Borough']))
# check "Queen's Park" Neighbourhood
df[df.Borough == "Queen's Park"]

Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,Queen's Park


In [4]:
# Collapse the frame to one row per (Postcode, Borough) pair, joining the
# neighbourhoods that share a postcode into one comma-separated string.
# Aggregating only the Neighbourhood column avoids running apply() over whole
# sub-frames, a pattern newer pandas versions warn about.
df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
      .agg(','.join)
      .reset_index()
      .rename(columns={'Neighbourhood': 'Neighborhood'})  # switch to the US spelling used from here on
)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [5]:
# Show (rows, columns) of the grouped Toronto neighborhood dataframe,
# which now holds one row per Postcode/Borough pair
df.shape

(103, 3)

## Read geographical coordinates of each postal code

In [6]:
!conda install -c conda-forge geocoder

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geocoder:        1.38.1-py_1       conda-forge
    ratelim:         0.1.6-py_2        conda-forge

The following packages will be UPDATED:

    

In [7]:
# Load the postal-code coordinate table and round both coordinate columns
# to two decimal places.
# NOTE(review): two decimals of a degree is a coarse position for map markers;
# confirm this precision is intended.
coords_url = "http://cocl.us/Geospatial_data"
geo = pd.read_csv(coords_url).round({'Latitude': 2, 'Longitude': 2})
geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.81,-79.19
1,M1C,43.78,-79.16
2,M1E,43.76,-79.19
3,M1G,43.77,-79.22
4,M1H,43.77,-79.24


In [8]:
# Show (rows, columns) of the coordinate dataframe for comparison with df
geo.shape

(103, 3)

## Merge Toronto Neighborhood data (`df`) with its Geographical Coordinates (`geo`)

In [9]:
# Attach coordinates to every neighborhood row. The key column of geo is
# renamed to match df, so the left join yields a single "Postcode" column and
# keeps every row of df.
geo_keyed = geo.rename(columns={'Postal Code': 'Postcode'})
df_new = df.merge(geo_keyed, on='Postcode', how='left')
df_new.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.81,-79.19
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.78,-79.16
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.76,-79.19
3,M1G,Scarborough,Woburn,43.77,-79.22
4,M1H,Scarborough,Cedarbrae,43.77,-79.24


In [10]:
# Show (rows, columns) of the merged dataframe; a left join keeps every row of df
df_new.shape

(103, 5)

## Map and cluster the neighborhoods in Toronto

In [11]:
# Install folium pinned to version 0.5.0 from conda-forge; --yes skips the
# confirmation prompt. The import and message below run after the install command.
!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Folium installed and imported!')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    altair-3.2.0               |           py36_0         770 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         868 KB

The following NEW packages will be INSTALLED:

    altair:  3.2.0-py36_0 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge


Downloading and Extracting Packages
vincent-0.4.4        | 28 KB    

In [12]:
# Keep only the rows whose Borough name contains "Toronto".
# reset_index() renumbers the rows from 0 and keeps the previous row labels
# in a column named "index".
in_toronto = df_new['Borough'].str.contains("Toronto")
df_Toronto = df_new.loc[in_toronto].reset_index()

In [13]:
# Preview the first rows rather than dumping the whole frame
df_Toronto.head(10)

Unnamed: 0,index,Postcode,Borough,Neighborhood,Latitude,Longitude
0,37,M4E,East Toronto,The Beaches,43.68,-79.29
1,41,M4K,East Toronto,"The Danforth West,Riverdale",43.68,-79.35
2,42,M4L,East Toronto,"The Beaches West,India Bazaar",43.67,-79.32
3,43,M4M,East Toronto,Studio District,43.66,-79.34
4,44,M4N,Central Toronto,Lawrence Park,43.73,-79.39
5,45,M4P,Central Toronto,Davisville North,43.71,-79.39
6,46,M4R,Central Toronto,North Toronto West,43.72,-79.41
7,47,M4S,Central Toronto,Davisville,43.7,-79.39
8,48,M4T,Central Toronto,"Moore Park,Summerhill East",43.69,-79.38
9,49,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.69,-79.4


In [14]:
# Show (rows, columns) of the Toronto-only dataframe
df_Toronto.shape

(38, 6)

In [15]:
# Show the dtype of each column (e.g. to check Latitude/Longitude before plotting)
df_Toronto.dtypes

index             int64
Postcode         object
Borough          object
Neighborhood     object
Latitude        float64
Longitude       float64
dtype: object

##### Get the geographical coordinates of Toronto, Canada

In [16]:
# Look up Toronto's latitude and longitude with the Nominatim geocoder
from geopy.geocoders import Nominatim  # module to convert an address into latitude and longitude values
address = 'Toronto, Canada'
# Nominatim needs an identifying user agent: geopy 1.x emits a deprecation
# warning without one and geopy 2.x refuses to run.
geolocator = Nominatim(user_agent="toronto_neighborhood_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# rather than an AttributeError on the attribute access below
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto are {}, {}'.format(latitude,longitude))



The geographical coordinates of Toronto are 43.653963, -79.387207


##### Create the base map of Toronto

In [17]:
# Create the folium base map centred on (latitude, longitude) at zoom level 12.
# The map is only created here, not displayed.
map_center = [latitude, longitude]
Toronto_map = folium.Map(location=map_center, zoom_start=12)

##### Mark the Toronto postal-code areas on the map

In [18]:
# Feature group that collects one circle marker per row of df_Toronto
Borough = folium.map.FeatureGroup()

# Build a blue, semi-transparent circle marker for each (latitude, longitude)
# pair and add it to the feature group
for lat, lng in zip(df_Toronto['Latitude'], df_Toronto['Longitude']):
    marker = folium.features.CircleMarker(
        [lat, lng],
        radius=5,  # marker size
        fill=True,
        fill_color='blue',
        fill_opacity=0.6,
    )
    Borough.add_child(marker)

# Attach the markers to the map. This call is the cell's last expression, so
# its return value is what the notebook displays.
Toronto_map.add_child(Borough)