In [3]:
import codecs
import csv
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [6]:
def fetch_wikipedia(page='List_of_postal_codes_of_Canada:_M', verbose=False, timeout=30):
    """Download an English-Wikipedia article and parse it with BeautifulSoup.

    Parameters
    ----------
    page : str
        Article title as it appears after ``/wiki/`` in the URL.
    verbose : bool
        If true, print the prettified HTML of the parsed page.
    timeout : float or tuple
        Passed to ``requests.get`` so a stalled connection cannot hang the
        notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The page parsed with the ``html5lib`` parser.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout``.
    """
    url = 'https://en.wikipedia.org/wiki/' + page
    response = requests.get(url, timeout=timeout)
    # Fail here rather than parsing an error page that contains no tables.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html5lib')
    if verbose:
        print(soup.prettify())

    return soup

def convert_table(html_soup, name='wiki_table', verbose=False, return_df=True):
    """Write every ``wikitable`` in a parsed page to its own CSV file.

    Each table is expanded into a rectangular grid (a cell with ``rowspan`` /
    ``colspan`` is repeated into every position it covers) and saved in the
    current working directory as ``t<index>__<name>.csv``. Newlines inside
    cell text are removed; cells containing commas or quotes are quoted so
    the file can be read back unambiguously.

    Parameters
    ----------
    html_soup : object exposing BeautifulSoup's ``findAll``
        Parsed page to search for ``<table class="wikitable">`` elements.
    name : str
        Label used in the output file names; only the part after the last
        ``/`` is kept.
    verbose : bool
        If true, print a 100-character preview of each table and every row
        as it is written.
    return_df : bool
        If true, return the last table as a DataFrame read back from its
        CSV; otherwise return that CSV's file name.

    Returns
    -------
    pandas.DataFrame or str
        Data (or file name) of the last table found on the page.

    Raises
    ------
    ValueError
        If the page contains no ``wikitable`` tables.
    """
    tables = html_soup.findAll("table", {"class": "wikitable"})
    if not tables:
        raise ValueError("no <table class='wikitable'> found in the page")

    if verbose:
        # show tables
        for table in tables:
            print("###############")
            print(table.text[:100])

    page = name.split('/')[-1]
    fname = None
    for tn, table in enumerate(tables):
        grid = _table_to_grid(table)
        fname = 't{}__{}.csv'.format(tn, page)
        # `with` closes every file, not just the last one; UTF-8 matches the
        # default encoding pd.read_csv uses when reading the file back.
        with open(fname, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f, lineterminator='\n')
            for row in grid:
                if verbose:
                    print(','.join(row))
                writer.writerow(row)

    if return_df:
        return pd.read_csv(fname)

    return fname


def _parse_span(cell, attr):
    """Return a cell's ``rowspan``/``colspan`` as an int >= 1 (1 if absent or malformed)."""
    try:
        return max(int(cell.get(attr, 1)), 1)
    except (TypeError, ValueError):
        return 1


def _table_to_grid(table):
    """Expand one table element into a rectangular list of rows of strings."""
    rows = table.findAll("tr")
    nrows = len(rows)
    filled = {}  # (row, col) -> text for every occupied grid position

    for r, row in enumerate(rows):
        c = 0
        for cell in row.findAll(["td", "th"]):
            # Skip positions already claimed by a rowspan from a row above.
            while (r, c) in filled:
                c += 1
            text = cell.text.replace('\n', '')
            rspan = _parse_span(cell, 'rowspan')
            cspan = _parse_span(cell, 'colspan')
            for dr in range(rspan):
                if r + dr >= nrows:
                    break  # a rowspan may not extend past the table's last row
                for dc in range(cspan):
                    filled[(r + dr, c + dc)] = text
            c += cspan

    ncols = max((col for (_, col) in filled), default=-1) + 1
    return [[filled.get((r, c), '') for c in range(ncols)] for r in range(nrows)]

In [1]:
def postal_codes(raw_df):
    """Collapse a raw postal-code table to one row per (Postcode, Borough).

    Every 'Not assigned' value is treated as missing. A missing
    'Neighbourhood' is filled from the row's 'Borough'; rows that still have
    a missing value in any column are dropped. The remaining rows are sorted
    by 'Neighbourhood' and grouped, with each group's neighbourhoods joined
    into a single ', '-separated string. ``raw_df`` itself is not modified.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain 'Postcode', 'Borough' and 'Neighbourhood' columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'Postcode', 'Borough', 'Neighbourhood'; groups appear in the
        order they are first met in the neighbourhood-sorted rows.
    """
    with_missing = (
        raw_df
        .replace(to_replace='Not assigned', value=np.nan)
        .assign(Neighbourhood=lambda frame: frame['Neighbourhood'].fillna(frame['Borough']))
    )
    assigned_rows = with_missing.dropna(axis=0).sort_values('Neighbourhood')

    by_postcode = assigned_rows.groupby(['Postcode', 'Borough'], as_index=False, sort=False)
    return by_postcode['Neighbourhood'].agg(lambda names: ', '.join(names))

In [2]:
def scrape_wiki(page):
    """Fetch a Wikipedia page and return its cleaned postal-code table.

    Chains the notebook's three steps: download and parse ``page``, convert
    its wikitable to a DataFrame, then tidy that frame with ``postal_codes``.
    """
    return postal_codes(
        convert_table(fetch_wikipedia(page), return_df=True)
    )

In [None]:
# Scrape the postal-code table, then attach the coordinates by matching each
# 'Postcode' against the 'Postal Code' index of the local CSV.
postcodes_df = scrape_wiki(page='List_of_postal_codes_of_Canada:_M')
geo_data = postcodes_df.merge(
    pd.read_csv('./Geospatial_Coordinates.csv', index_col='Postal Code'),
    left_on='Postcode',
    right_index=True,
)

geo_data.head(10)