In [3]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

##### 1. Use beautiful soup to get raw html from given wiki link

In [4]:
wiki_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source = requests.get(wiki_link).text
soup = BeautifulSoup(source, 'lxml')

##### 2. Get tag table which has class "wikitable", assume there is only one table tag like that

In [25]:
table = soup.find('table', attrs={'class':'wikitable'})

##### 3. Parse table contents to Dataframe using pandas __read_html__

In [36]:
html_table = '<table>' + "".join(str(item) for item in table.contents) + '</table>'
raw_df = pd.read_html(html_table, header=0)[0]

In [37]:
raw_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [40]:
row, col = raw_df.shape
print("\tRaw dataframe has %s rows and %s cols" % (row, col))

	Raw dataframe has 288 rows and 3 cols


##### 4. Remove 'Not assigned' Borough

In [41]:
df = raw_df[raw_df['Borough'] != 'Not assigned']
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [42]:
row, col = df.shape
print("Dataframe has %s rows and %s cols after remove 'Not assigned' Borough" % (row, col))

Dataframe has 211 rows and 3 cols after remove 'Not assigned' Borough


##### 4. Replace 'Not assigned' Neighbourhood with Borough

In [45]:
def fill_neighbor(row):
    if row['Neighbourhood'] == 'Not assigned':
        return row['Borough']
    else:
        return row['Neighbourhood']

In [46]:
df['Neighbourhood'] = df.apply(fill_neighbor, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


##### 5. Combine Neighbourhood of the same Postcode and Borough

In [50]:
df = df.groupby(['Postcode','Borough'],as_index=False).agg(lambda x : ', '.join(x))

In [57]:
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [58]:
row, col = df.shape
print(" Final Dataframe has %s rows and %s cols" % (row, col))

 Final Dataframe has 103 rows and 3 cols
