# Segmenting and Clustering Neighborhoods in Toronto

### Part I

In [7]:
from bs4 import BeautifulSoup
import requests

import pandas as pd

In [16]:
# Scrape the Toronto postal-code table from Wikipedia and build a dataframe
# with one row per (Postcode, Borough) pair.
URL_list = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# fails fast instead of parsing an HTTP error page as if it were the article.
list_request = requests.get(URL_list, timeout=30)
list_request.raise_for_status()
# parse the page
soup = BeautifulSoup(list_request.content, 'html.parser')
# get table info: the first <table> carrying the 'wikitable' class
name_table = soup.find('table', attrs={'class': 'wikitable'})
if name_table is None:
    # soup.find returns None when nothing matches; surface that clearly
    # rather than as an AttributeError on the next line.
    raise ValueError("No 'wikitable' table found at " + URL_list)
table_head = name_table.find_all('th')
table_rows = name_table.find_all('tr')
column_names = [header_cell.text.strip() for header_cell in table_head]
# The first <tr> is skipped as the header row; any row without <td> cells
# carries no data and is skipped as well.
rows_list = []
for table_row in table_rows[1:]:
    td_all = table_row.find_all('td')
    if not td_all:
        continue
    rows_list.append([cell.text.strip() for cell in td_all])
# create dataframe with data from parsed page
df = pd.DataFrame(rows_list, columns=column_names)
# drop postal codes that have no borough assigned
df = df.loc[df['Borough'] != 'Not assigned']
# collapse multiple neighbourhoods sharing a postal code into one
# comma-separated string
df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
# A neighbourhood that is still 'Not assigned' falls back to its borough name.
# Use a single .loc assignment: chained indexing (df[col][mask] = ...) may
# write to a temporary copy and leave df unchanged.
unnamed = df['Neighbourhood'] == 'Not assigned'
df.loc[unnamed, 'Neighbourhood'] = df.loc[unnamed, 'Borough']
# take a look at the dataframe
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [17]:
# inspect the last rows of the scraped dataframe as well
df.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
102,M9W,Etobicoke,Northwest


In [11]:
# (number of rows, number of columns) of the dataframe
# NOTE(review): this cell's execution count is out of sequence with the cells
# around it; confirm the shape with Restart Kernel -> Run All.
df.shape

(103, 3)

### Part II

In [19]:
import io

In [28]:
# Download the geospatial CSV and attach its coordinates to each postal code.
# (This is a plain file download, not a Foursquare API call.)
URL_geo = "http://cocl.us/Geospatial_data"
# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# fails fast instead of feeding an HTTP error page to read_csv.
geo_response = requests.get(URL_geo, timeout=30)
geo_response.raise_for_status()
geo_request = geo_response.content
geo_df = pd.read_csv(io.StringIO(geo_request.decode('utf-8')))
# rename cols positionally so the key matches df's 'Postcode' column
# NOTE(review): assumes the CSV has exactly three columns in the order
# postal code, latitude, longitude - confirm against the file.
geo_df.columns = ['Postcode', 'Latitude', 'Longitude']
# Merge only df's original three columns so the cell can be re-run safely:
# once df already holds Latitude/Longitude, merging it whole would produce
# suffixed duplicate columns and the column selection below would fail.
df = pd.merge(geo_df, df[['Postcode', 'Borough', 'Neighbourhood']], on='Postcode')
# some reorder
df = df[['Postcode', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']]
# take a look at the dataframe
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [29]:
# inspect the last rows to check the coordinates after the merge
df.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437
102,M9W,Etobicoke,Northwest,43.706748,-79.594054


### Part III