#### General Initializing

In [145]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

!conda install -c conda-forge geopy --yes ;
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Folium installed
Libraries imported.


#### Using the BeautifulSoup lib to get the data from the wikipedia page:

In [147]:
from bs4 import BeautifulSoup

page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text;
soup = BeautifulSoup(page);

#### Finding the table within the page code:

In [148]:
table = soup.find('table', { 'class' : 'wikitable sortable' })
# print (table.prettify())

#### Parsing the table for the cell values and checking the lists returned:

In [149]:
PostalCode =  []
Borough = []
Neighbourhood = []

for row in table.findAll("tr"):
    cells = row.findAll("td")
    #For each "tr", assign each "td" to a variable.
    if len(cells) == 3:
        PostalCode.append(cells[0].find(text=True))
        Borough.append( cells[1].find(text=True).replace('\n',''))
        Neighbourhood.append( cells[2].find(text=True).replace('\n',''))

In [150]:
print(PostalCode[:5])
print(Borough[:5])
print(Neighbourhood[:5])

['M1A', 'M2A', 'M3A', 'M4A', 'M5A']
['Not assigned', 'Not assigned', 'North York', 'North York', 'Downtown Toronto']
['Not assigned', 'Not assigned', 'Parkwoods', 'Victoria Village', 'Harbourfront']


#### Converting lists into a DataFrame:

In [151]:
df = pd.DataFrame();
df['PostalCode'] = PostalCode;
df['Borough'] = Borough;
df['Neighbourhood'] = Neighbourhood;
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Removing not assigned post codes:

In [152]:
df = df[df.Borough != 'Not assigned'].reset_index()
df.head(5)

Unnamed: 0,index,PostalCode,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,Harbourfront
3,5,M5A,Downtown Toronto,Regent Park
4,6,M6A,North York,Lawrence Heights


#### Grouping all Neighbourhoods by PostalCode:

In [186]:
g1 = df.groupby(["PostalCode","Borough"], as_index=False).agg( ', '.join)
g1.head(5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Assigning borough name to unnamed neighbourhoods:

In [194]:
g1.loc[g1['Neighbourhood'] == 'Not assigned', 'Neighbourhood'] = g1['Borough']
g1[g1['PostalCode'].str.match('M7A')]

Unnamed: 0,PostalCode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


#### Sanity check:

In [155]:
g1.head(5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### DataFrame shape:

In [174]:
g1.shape

(103, 3)

#### Getting geo data from csv file:

In [209]:
coord = pd.read_csv('https://cocl.us/Geospatial_data')
coord.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)
coord.head(5)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merging data from both DataFrames based on the PostalCode key:

In [223]:
g2 = pd.merge(g1, coord, on='PostalCode')
g2.head(25)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
