# Segmenting and Clustering Neighborhoods in Toronto

In [18]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [19]:
# Download the Wikipedia page that lists the Canadian postal codes starting
# with "M"; its raw HTML is kept in `source` for parsing in the next cells.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

In [20]:
# The page is HTML, not XML, so parse it with an HTML parser. "html.parser"
# ships with Python and needs no extra dependency.
soup = BeautifulSoup(source, "html.parser")

In [21]:
# Grab the first <table> element in the parsed document.
table = soup.table

In [22]:
# Empty frame that already carries the three column labels the scraped
# rows will be stored under.
columns = [
    "PostalCode",
    "Borough",
    "Neighborhood",
]
df = pd.DataFrame(data=None, columns=columns)

In [23]:
# Collect the stripped text of every table row that has exactly three <td>
# cells (rows without <td> cells, such as a header row, are skipped), then
# build the frame in one shot. Appending with df.loc[len(df)] inside the loop
# re-allocates the frame per row and duplicates rows if this cell is re-run.
rows = []
for tr_cell in table.find_all("tr"):
    row = [td_cell.text.strip() for td_cell in tr_cell.find_all("td")]
    if len(row) == 3:
        rows.append(row)

df = pd.DataFrame(rows, columns=columns)

In [24]:
# Preview the first five scraped rows.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Data Cleaning

In [25]:
# Keep only the rows whose borough is something other than "Not assigned".
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]

In [35]:
# A row that has a borough but a "Not assigned" neighborhood takes the
# borough's name as its neighborhood. (The original cell stated this rule in
# its comment but never applied it.)
neighborhood = df['Neighborhood'].where(
    df['Neighborhood'] != 'Not assigned', df['Borough']
)

# One comma-separated string of neighborhoods per postal code.
temp_df = (
    neighborhood.groupby(df['PostalCode'])
    .agg(', '.join)
    .rename('Neighborhood_joined')
    .reset_index()
)

# Collapse the per-neighborhood rows to one row per postal code / borough and
# attach the combined neighborhood string. Built without in-place mutation so
# re-running the cell always gives the same result.
df_merge = (
    df.drop(columns='Neighborhood')
    .drop_duplicates()
    .merge(temp_df, on='PostalCode')
    .rename(columns={'Neighborhood_joined': 'Neighborhood'})
)

df_merge.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [38]:
# Report how many postal-code rows remain after cleaning.
row_count = len(df_merge)
print("The dataframe has", row_count, "rows.")

The dataframe has 103 rows.


## Finding Lat/Lng (Provided CSV Used in Place of Geocoder)

In [44]:
# The geocoder package would not install (even via
# `!conda install -c conda-forge geocoder --yes`), so the coordinates are read
# from the provided CSV instead.
url = "https://cocl.us/Geospatial_data"

# Rename the key column on load so it matches the 'PostalCode' label used by
# the scraped frame.
coord = pd.read_csv(url).rename(columns={'Postal Code': 'PostalCode'})
coord

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [55]:
# Attach latitude/longitude to the cleaned postal-code frame: index both
# frames by PostalCode and left-join the coordinates onto the neighborhoods.
coord_by_code = coord.set_index("PostalCode")
updated_df = df_merge.set_index("PostalCode").join(coord_by_code, how="left")
updated_df

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...
M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
