# Imports

In [1]:
import folium  # map rendering library
from sklearn.cluster import KMeans  # import k-means from clustering stage
# Matplotlib and associated plotting modules
import matplotlib.colors as colors
import matplotlib.cm as cm
from bs4 import BeautifulSoup  # web scraping libraries and packages
# transform a JSON file into a pandas dataframe
from pandas.io.json import json_normalize
import requests  # library to handle requests
# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim
import json  # library to handle JSON files
import numpy as np  # library to handle data in a vectorized manner
import pandas as pd  # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
print('Libraries imported.')

Libraries imported.


# Scraping the Wikipedia page

In [2]:
# Download the Wikipedia page that lists the postal codes and parse its HTML.
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of handing an
# error page to the parser.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30)
response.raise_for_status()
toronto_data = response.text
soup = BeautifulSoup(toronto_data, 'html.parser')
print('Data downloaded!')

Data downloaded!


# Wrangling and cleaning

In [3]:
# One accumulator list per output column; they are filled in lockstep below.
postalCodes, boroughs, neighborhoods = [], [], []

# Walk every <tr> of the first table on the page. The first three <td> cells
# of a row are read as postal code, borough and neighborhood, in that order.
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if len(cells) == 0:
        # Row has no <td> cells, so there is nothing to extract from it.
        continue
    for position, column in enumerate((postalCodes, boroughs, neighborhoods)):
        # Drop the trailing newline(s) that follow the cell text.
        column.append(cells[position].text.rstrip('\n'))
        

# Reading into dataframe with additional cleaning

In [4]:
# Assemble the three scraped lists into one frame, one row per scraped table row.
df_toronto = pd.DataFrame(
    {
        "PostalCode": postalCodes,
        "Borough": boroughs,
        "Neighborhood": neighborhoods,
    }
)

# Keep only the rows whose borough is assigned, then renumber the index from 0.
borough_assigned = df_toronto["Borough"] != "Not assigned"
df_toronto = df_toronto[borough_assigned].reset_index(drop=True)


def not_assigned_neighborhood(r):
    """
    Pick the neighborhood label for one row, falling back to the borough.

    r is indexed by the keys 'Neighborhood' and 'Borough'. When the
    'Neighborhood' entry equals "Not assigned" the 'Borough' entry is
    returned; otherwise the 'Neighborhood' entry is returned unchanged.
    """
    neighborhood = r['Neighborhood']
    return r["Borough"] if neighborhood == "Not assigned" else neighborhood
        
# Rows whose neighborhood is "Not assigned" take their borough name instead.
df_toronto["Neighborhood"] = df_toronto.apply(not_assigned_neighborhood, axis=1)

# Sanity check: the "Not assigned" placeholder must be gone from both columns.
for checked_column in ("Borough", "Neighborhood"):
    assert (df_toronto[checked_column] != "Not assigned").all()

# Summary line: number of distinct boroughs and number of rows in the frame.
borough_count = len(df_toronto['Borough'].unique())
row_count = df_toronto.shape[0]
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

df_toronto.head()

The dataframe has 10 boroughs and 103 neighborhoods.


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
# Dimensions of the cleaned frame as (rows, columns).
df_toronto.shape

(103, 3)

# Reading the latitude and longitude from csv

In [13]:
# Load the CSV of postal codes with their latitude/longitude from a remote URL.
geospatial_csv_url = 'https://cocl.us/Geospatial_data'
lat_long = pd.read_csv(geospatial_csv_url)
# Preview the first rows.
lat_long.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Creating the required dataframe 

In [37]:
# Attach coordinates to each postal code. The inner join keeps only postal
# codes present in both frames; the right-hand key column duplicates
# 'PostalCode' after the join, so it is dropped.
df_toronto_coordinates = (
    df_toronto
    .merge(lat_long, how='inner', left_on='PostalCode', right_on='Postal Code')
    .drop(columns='Postal Code')
)
# Changing order of columns
df_toronto_coordinates = df_toronto_coordinates[["PostalCode", "Borough", "Latitude", "Longitude", "Neighborhood"]]

# Concatenating neighborhoods for each combination of the other attributes
df_toronto_coordinates_grouped = (
    df_toronto_coordinates
    .groupby(["PostalCode", "Borough", "Latitude", "Longitude"], as_index=False)
    .agg(lambda names: ", ".join(names))
)

# For dataframe to display as per the question and in exact order
required_postcodes = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]
df_out = df_toronto_coordinates_grouped[
    df_toronto_coordinates_grouped['PostalCode'].isin(required_postcodes)
].reset_index(drop=True)

# Stack the per-postcode slices in the requested order with a single concat.
# DataFrame.append in a loop copies the frame on every iteration and was
# removed in pandas 2.0; pd.concat yields the same rows, order and 0..n-1 index.
df_output = pd.concat(
    [df_out[df_out["PostalCode"] == postcode] for postcode in required_postcodes],
    ignore_index=True,
)

df_output

Unnamed: 0,PostalCode,Borough,Latitude,Longitude,Neighborhood
0,M5G,Downtown Toronto,43.657952,-79.387383,Central Bay Street
1,M2H,North York,43.803762,-79.363452,Hillcrest Village
2,M4B,East York,43.706397,-79.309937,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,43.744734,-79.239476,Scarborough Village
4,M4G,East York,43.70906,-79.363452,Leaside
5,M4M,East Toronto,43.659526,-79.340923,Studio District
6,M1R,Scarborough,43.750072,-79.295849,"Wexford, Maryvale"
7,M9V,Etobicoke,43.739416,-79.588437,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,43.756303,-79.565963,Humber Summit
9,M5V,Downtown Toronto,43.628947,-79.39442,"CN Tower, King and Spadina, Railway Lands, Har..."
