# Segmenting and Clustering Neighborhoods in Toronto

## 1. Create a dataframe with data from Wikipedia

In [18]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [19]:
# Scrape the Toronto postal-code page from Wikipedia.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops on an HTTP error instead of silently parsing
# an error page as if it were the article.
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
source.raise_for_status()
soup = BeautifulSoup(source.text, 'lxml')

In [103]:
# Create a dataframe from the data scraped
table = soup.find('table', class_='wikitable sortable')
if table is None:
    # find() returns None when nothing matches (for example if the page layout
    # changed); fail with a clear message instead of an AttributeError below.
    raise ValueError("No table with class 'wikitable sortable' was found in the scraped page")

# One list of whitespace-stripped <td> texts per table row.
dataset = [
    [td.text.strip() for td in tr.find_all('td')]
    for tr in table.find_all('tr')
]
# The first row is skipped -- presumably the header row; confirm against the page.
df = pd.DataFrame(data=dataset[1:], columns=['PostalCode', 'Borough', 'Neighborhood'])

In [104]:
# Keep only the rows whose borough is not 'Not assigned', then renumber the index from 0
df_assigned = df.loc[df['Borough'] != 'Not assigned'].reset_index(drop=True)

In [105]:
# Count the rows whose postal code repeats one that already appeared in an earlier row
int(df_assigned['PostalCode'].duplicated().sum())

0

In [108]:
# Count the rows whose neighborhood is still 'Not assigned'
int(df_assigned['Neighborhood'].eq('Not assigned').sum())

0

In [111]:
# Preview the first 10 rows of df_assigned (rendered as the cell output)
df_assigned.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [110]:
# Show the dataframe's dimensions as a (rows, columns) tuple
df_assigned.shape

(103, 3)

## 2. Create a dataframe with geographic coordinates data

In [148]:
# Read the geospatial CSV straight from its URL and preview the first rows
geospatial_url = 'http://cocl.us/Geospatial_data'
df_csvFile = pd.read_csv(geospatial_url)
df_csvFile.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [156]:
# Attach the coordinates to each postal code by joining on the postal code itself.
# Pasting the two frames side by side by row position is only correct when the CSV
# is sorted by postal code and contains exactly the same codes; a key-based merge
# gives the right coordinates regardless of row order or extra/missing codes.
df_copy = df_assigned.sort_values(by=['PostalCode'])  # sorted copy; df_assigned is left untouched
df_geo = df_copy.merge(
    df_csvFile[['Postal Code', 'Latitude', 'Longitude']],
    how='inner',               # keep only postal codes present in both frames
    left_on='PostalCode',
    right_on='Postal Code',
    validate='many_to_one',    # raise if the CSV lists a postal code more than once
).drop(columns='Postal Code')  # the CSV's key column duplicates 'PostalCode'
df_geo.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
