# Segmenting and Clustering Neighborhoods in Toronto

In [22]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from geopy.geocoders import Nominatim
import folium

## Scraping Wikipedia page

In [3]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely, and raise_for_status() stops
# the notebook from silently parsing an HTTP error page in the following cells.
wiki_response = requests.get(wiki_url, timeout=30)
wiki_response.raise_for_status()
wiki_page = wiki_response.text

In [4]:
# Parse the downloaded HTML. Naming the parser explicitly avoids bs4's
# "no parser was explicitly specified" warning and keeps the parse result
# independent of which optional parsers happen to be installed.
soup = BeautifulSoup(wiki_page, 'html.parser')

- The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
- Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
- More than one neighborhood can exist in one postal code area. If a postal code already exists in the dataframe, further neighborhood names are appended to that row, separated by commas.
- If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [7]:
# Build the PostalCode / Borough / Neighborhood table from the first <table> on the page.
# Rows are accumulated in a dict keyed by postal code (dicts preserve insertion order,
# so postal codes stay in first-seen order) and the DataFrame is created once at the
# end. This replaces DataFrame.append, which was removed in pandas 2.0 and copied the
# whole frame on every call.
records = {}

table = soup.find('table')
for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Skip rows without data cells (e.g. a header row) and rows that do not have
    # all three columns read below.
    if len(cells) < 3:
        continue
    postcode = cells[0].text.strip()
    borough = cells[1].text.strip()
    neighborhood = cells[2].text.strip()
    # Postal codes without an assigned borough are ignored entirely.
    if borough == 'Not assigned':
        continue
    # A borough without an assigned neighborhood uses the borough name instead.
    if neighborhood == 'Not assigned':
        neighborhood = borough
    if postcode in records:
        # Postal code already seen: extend the existing row instead of adding a new one.
        records[postcode]['Neighborhood'] += ', ' + neighborhood
    else:
        records[postcode] = {
            'PostalCode': postcode,
            'Borough': borough,
            'Neighborhood': neighborhood}

# Passing columns= keeps the three-column layout even if no row qualified.
cities = pd.DataFrame(list(records.values()),
                      columns=['PostalCode', 'Borough', 'Neighborhood'])

In [8]:
# Dimensions of the cities dataframe as (rows, columns)
cities.shape

(103, 3)

In [9]:
# Preview the first rows of the cities dataframe
cities.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


## Get longitude and latitude for each postal code

The geocoder doesn't work, so we use the provided CSV data instead.

In [6]:
# Load the provided postal code -> latitude/longitude lookup table.
geodata = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')
# Preview the first five rows.
geodata.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Attach coordinates to every postal code; the key column is named differently in
# the two frames, so both key columns are kept in the result.
cities_geodata = cities.merge(
    geodata,
    how='inner',
    left_on='PostalCode',
    right_on='Postal Code',
)
cities_geodata.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",M5A,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",M6A,43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,M7A,43.662301,-79.389494


## Mapping neighborhoods

In [20]:
# Look up the coordinates of Toronto; they are used to centre the map.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent='Coursera Capstone')
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [23]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per postal code area, labelled "<neighborhood(s)>, <borough>"
for lat, lng, borough, neighborhood in zip(cities_geodata['Latitude'], cities_geodata['Longitude'], cities_geodata['Borough'], cities_geodata['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option only; CircleMarker has no such parameter,
    # so it is not passed to the marker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto