In [0]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [0]:
# define the dataframe columns
column_names = ['PostalCode', 'Borough', 'Neighborhood'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

In [0]:
#I'm using Beautifyl Soup to scrape Wikipedia
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

res = requests.get(URL).text
soup = BeautifulSoup(res,'lxml')
for items in soup.find('table', class_='wikitable sortable').find_all('tr')[1::1]:
    data = items.find_all(['th','td'])
    try:
        PostalCode = data[0].text
        Borough = data[1].text
        Neighborhood = data[2].text.split("\n")[0]       
    except: pass
    neighborhoods = neighborhoods.append({'Borough': Borough,
                                          'Neighborhood': Neighborhood,
                                          'PostalCode': PostalCode}, ignore_index=True)

In [122]:
#Here is the input pandas data frame from Wikipedia
neighborhoods.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [0]:
# Remove unassigned neighborhoods
neighborhoods = neighborhoods[neighborhoods.Borough != 'Not assigned']

In [115]:
neighborhoods.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [0]:
# Concatonate neighborhoods in same postal code
grouped = neighborhoods.groupby('PostalCode').agg(lambda x: ','.join(set(x)))

In [118]:
grouped.head()

Unnamed: 0_level_0,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern,Rouge"
M1C,Scarborough,"Highland Creek,Port Union,Rouge Hill"
M1E,Scarborough,"West Hill,Morningside,Guildwood"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [0]:
# This finds all still unassigned neighborhoods and gives them the borough name
mask = grouped.Neighborhood == 'Not assigned'

grouped.loc[mask,'Neighborhood'] = grouped.loc[mask,'Borough']

In [125]:
grouped.loc[mask,:]

Unnamed: 0_level_0,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M7A,Queen's Park,Queen's Park


In [126]:
#Here is the shape of my output dataframe
grouped.shape

(103, 2)

In [133]:
# Import CSV file with lat long data

from google.colab import files
uploaded = files.upload()
import io


Saving Geospatial_Coordinates.csv to Geospatial_Coordinates (1).csv


In [0]:
#Put lat long data into dataframe
gc = pd.read_csv(io.BytesIO(uploaded['Geospatial_Coordinates.csv']))

In [135]:
gc.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [0]:
# Merge with grouped dataframe
grpll = grouped.merge(gc, left_on='PostalCode',right_on='Postal Code', how = 'left')

In [139]:
grpll.head()

Unnamed: 0,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,Scarborough,"Malvern,Rouge",M1B,43.806686,-79.194353
1,Scarborough,"Highland Creek,Port Union,Rouge Hill",M1C,43.784535,-79.160497
2,Scarborough,"West Hill,Morningside,Guildwood",M1E,43.763573,-79.188711
3,Scarborough,Woburn,M1G,43.770992,-79.216917
4,Scarborough,Cedarbrae,M1H,43.773136,-79.239476
