# Segmenting and Clustering Neighborhoods in Toronto

## For this assignment, we will be required to explore and cluster the neighborhoods in Toronto.

    Table of Contents
    1. Web Scraping

In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests

import warnings
# Silence every warning for the rest of the session. Called without a category,
# this filter also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

  return f(*args, **kwds)
  return f(*args, **kwds)


## 1. Web Scraping

In [2]:
# Download the Wikipedia page that lists the postal codes starting with "M"
# and keep its HTML as text.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() aborts on an HTTP error instead of letting the following
# cells parse an error page as if it were the real table.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

In [3]:
# Parse the downloaded HTML text with BeautifulSoup, using lxml as the parser.
soup = BeautifulSoup(markup=source, features='lxml')

In [4]:
# Grab the first <table> element in the parsed document.
table = soup.find(name='table')

In [5]:
# Collect every <td> cell nested inside that table, in document order.
tds = table.find_all(name='td')

In [6]:
# Walk the <td> cells three at a time: postal code, borough, neighbourhood.
# Rows are collected in a plain list and turned into a DataFrame once at the
# end; growing a DataFrame row by row with DataFrame.append is quadratic and
# the method no longer exists in pandas 2.x.
records = []

# Stopping at len(tds) - 2 skips an incomplete trailing group instead of
# raising IndexError when the number of cells is not a multiple of three.
for i in range(0, len(tds) - 2, 3):
    postcode, borough, neighbourhood = (td.text.strip() for td in tds[i:i + 3])

    # Postal codes without an assigned borough are dropped entirely.
    if borough == 'Not assigned':
        continue

    # A borough without a named neighbourhood uses the borough name for both.
    if neighbourhood == 'Not assigned':
        neighbourhood = borough

    records.append({
        'Postal Code': postcode,
        'Borough': borough,
        'Neighborhood': neighbourhood
    })

# Passing `columns` keeps the column order (and the empty-frame schema) fixed.
postal_code_df = pd.DataFrame(
    records, columns=['Postal Code', 'Borough', 'Neighborhood']
)

In [7]:
# Preview the first five rows of the scraped table.
postal_code_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [8]:
# (rows, columns) of the scraped table.
postal_code_df.shape

(212, 3)

##  Geocoder to Get Latitude and Longitude

In [9]:
import os
import pickle
import googlemaps

In [11]:
# Create the Google Maps client used for geocoding below.
# NOTE(review): `key` is not defined in any cell visible here (the execution
# counts jump from In [9] to In [11]) -- confirm it is set before this cell on
# a fresh kernel, ideally read from an environment variable rather than typed
# into the notebook.
gmaps = googlemaps.Client(key=key)

In [12]:
def get_lat_lng(postal_code_df, client=None, cache_path='postal_code_df.pkl'):
    """Add 'Latitude' and 'Longitude' columns by geocoding each postal code.

    Each distinct value in the 'Postal Code' column is looked up once as
    '<postal code>, Toronto, Ontario'; rows sharing a postal code reuse the
    same coordinates. The frame is modified in place, pickled to
    ``cache_path`` and returned.

    Parameters
    ----------
    postal_code_df : pandas.DataFrame
        Frame with a 'Postal Code' column. Mutated in place.
    client : object, optional
        Object exposing ``geocode(address)``. Defaults to the module-level
        ``gmaps`` client.
    cache_path : str, optional
        Where the resulting frame is pickled. Defaults to
        'postal_code_df.pkl'.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the two coordinate columns added.
    """
    geocoder = gmaps if client is None else client

    # Remember the coordinates per postal code so repeated codes (one row per
    # neighbourhood) do not trigger repeated API calls.
    coords_by_code = {}
    lats = []
    lngs = []

    for postal_code in postal_code_df['Postal Code']:
        if postal_code not in coords_by_code:
            geo_info = '{}, Toronto, Ontario'.format(postal_code)
            geocode_result = geocoder.geocode(geo_info)

            if geocode_result:
                location = geocode_result[0]['geometry']['location']
                coords_by_code[postal_code] = (location['lat'], location['lng'])
            else:
                # An empty result is recorded as missing coordinates instead
                # of raising IndexError and discarding every earlier lookup.
                coords_by_code[postal_code] = (float('nan'), float('nan'))

        lat, lng = coords_by_code[postal_code]
        lats.append(lat)
        lngs.append(lng)

    postal_code_df['Latitude'] = lats
    postal_code_df['Longitude'] = lngs

    # The context manager guarantees the cache file is flushed and closed.
    with open(cache_path, 'wb') as pickle_out:
        pickle.dump(postal_code_df, pickle_out)

    return postal_code_df

In [13]:
# Reuse the geocoded frame from an earlier run when the cache file is present;
# otherwise call the geocoding API (which also writes the cache).
exists = os.path.isfile('postal_code_df.pkl')

if exists:
    # NOTE: pickle.load can execute arbitrary code -- only load a cache file
    # that this notebook wrote itself.
    # The context manager closes the file handle once the frame is loaded.
    with open('postal_code_df.pkl', 'rb') as pickle_in:
        postal_code_df = pickle.load(pickle_in)
else:
    postal_code_df = get_lat_lng(postal_code_df)

In [14]:
# Show the first rows of the frame, now including the Latitude/Longitude columns.
postal_code_df.head(11)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763
5,M6A,North York,Lawrence Manor,43.718518,-79.464763
6,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
7,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
8,M1B,Scarborough,Rouge,43.806686,-79.194353
9,M1B,Scarborough,Malvern,43.806686,-79.194353
