In [305]:
wikipedia_page = 'List_of_postal_codes_of_Canada:_M'
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Use SHIFT+TAB keys to popup inplace code help
%config IPCompleter.greedy = True

# Output multiple statements from one input cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

table_from_top = 1

trace = False
wikipedia_url = 'https://en.wikipedia.org/wiki/{}'.format(wikipedia_page)
page = requests.get(wikipedia_url)
soup = BeautifulSoup(page.content, 'lxml')
tables = soup.find_all('table', {'class': 'wikitable'})
table = tables[table_from_top - 1]

feature_names = []

header_row = table.find('tr')
for header in header_row.find_all('th'):
    feature_name = ' '.join(header.find_all(text=True))
    feature_name.replace('\n', '')
    feature_names.append(feature_name)
    
def has_coords(tag):
    if tag.has_attr('class'):
        if tag['class'][0] == 'latitude' or tag['class'][0] == 'longitude':
            return True
    return False

def get_coords(child):
    coords = []
    for coord in child.find_all(has_coords):
        coords.append(coord.string)
    if coords:
        if trace:
            return 'C = {}'.format(' '.join(coords))
        else:
            return ' '.join(coords)
    else:
        return ''

samples = []
sample_rows = table.find_all('tr')[1:]
for sample_row in sample_rows:
    features = []
    for feature_col in sample_row.find_all('td'):
        feature_value = ''
        text = feature_col.string
        if text:
            if trace:
                features.append('T = {}'.format(text))
            else:
                features.append(text)
            continue
        
        for child in feature_col.children:
            if child.name == 'span':
                if child.has_attr('class'):
                    if child['class'] == 'display:none':
                        continue
                if child.find_all(has_coords):
                    feature_value = get_coords(child)
                    if feature_value:
                        break
                    else:
                        continue
            if child.name == 'sup':
                continue
            if child.name == 'a':
                if child.string[0] == '[':
                    continue            
            if child.name == 'a':
                if trace:
                    feature_value = 'A = {}'.format(child.string)
                else:
                    feature_value = child.string
                break
            if child.name == 'font':
                if trace:
                    feature_value = 'F = {}'.format(child.string)
                else:
                    feature_value = child.string
                break
            try:
                # feature_value = '' for any tags not covered above
                content = child.contents
            except AttributeError:
                # Handle whitespace between child tags, treated as a child string
                if child.isspace():
                    continue
                if trace:
                    feature_value = 'E = {}'.format(child)
                else:
                    feature_value = child
                break
        features.append(feature_value)
    samples.append(dict(zip(feature_names, features)))

'Postcode'

'Borough'

'Neighbourhood'

In [306]:
df = pd.DataFrame(samples)
df.columns = df.columns.str.replace('\n','')
df = df.replace({'\n' : ''})
df1 = df.T.reindex(['Postcode','Borough','Neighbourhood']).T
df = df[df.Borough != 'Not assigned']
df.head()

Unnamed: 0,Borough,Neighbourhood,Postcode
2,North York,Parkwoods,M3A
3,North York,Victoria Village,M4A
4,Downtown Toronto,Harbourfront,M5A
5,Downtown Toronto,Regent Park,M5A
6,North York,Lawrence Heights,M6A


In [307]:
df = df.replace({'Not assigned': ''}, regex=True)
df = df.groupby('Postcode').agg({'Borough':'unique','Neighbourhood':','.join})
df = df.replace({'\n' : ','})
df = df.replace({'\n,' : ','})
df.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,[Scarborough],"Rouge,Malvern"
M1C,[Scarborough],"Highland Creek,Rouge Hill,Port Union"
M1E,[Scarborough],"Guildwood\n,Morningside,West Hill"
M1G,[Scarborough],Woburn
M1H,[Scarborough],Cedarbrae


In [320]:
df['Neighbourhood'] = df['Neighbourhood'].replace('Not assigned', np.nan)
df = df.replace({'Not assigned': ''}, regex=True)
df = df.replace({'\n' : ''}, regex=True)
df['Neighbourhood'] = df['Neighbourhood'].fillna("NaN")
df
df.loc[df['Borough'] == "Queen's Park"]

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,[Scarborough],"Rouge,Malvern"
M1C,[Scarborough],"Highland Creek,Rouge Hill,Port Union"
M1E,[Scarborough],"Guildwood\n,Morningside,West Hill"
M1G,[Scarborough],Woburn
M1H,[Scarborough],Cedarbrae
M1J,[Scarborough],Scarborough Village
M1K,[Scarborough],"East Birchmount Park\n,Ionview,Kennedy Park"
M1L,[Scarborough],"Clairlea,Golden Mile,Oakridge"
M1M,[Scarborough],"Cliffcrest,Cliffside,Scarborough Village West\n"
M1N,[Scarborough],"Birch Cliff,Cliffside West\n"


Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M7A,[Queen's Park],","


In [319]:
#df["Borough"]= df["Borough"].str.replace("[", " ", case = False)
#df["Borough"]= df["Borough"].str.replace("]"," ",case = False)
#df.Neighbourhood = np.where(df.Neighbourhood.isnull(), df.Borough, df.Neighbourhood)
#df.loc[(pd.isnull(df.Neighbourhood), 'Neighbourhood')] = df.Borough
df["Neighbourhood"].fillna(method ='ffill', inplace = True) 
df.loc[df['Borough'] == "Queen's Park"]

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M7A,[Queen's Park],","


In [321]:
def is_missing(Neighbourhood,Borough):    
    if pd.isnull(Neighbourhood):        
        return Borough
    else:
        return Neighbourhood

%timeit df.apply(lambda x: is_missing(x['Neighbourhood'],x['Borough']),axis=1)



6.6 ms ± 145 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [322]:
df.loc[df['Borough'] == "Queen's Park"]

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M7A,[Queen's Park],","


In [323]:
df.shape

(103, 2)