# Second Notebook for Capstone Project (by Glenn Tiffert)

## Import dependencies

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

## Get Wikipedia page and parse with BeautifulSoup

In [2]:
# Download the Wikipedia page listing the "M" postal codes and parse it.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() surfaces an HTTP error here instead of
# letting an error page be parsed and fail later when the table is looked up.
# The Response is named `response` (not `url`) because it is not a URL string.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

In [3]:
# Locate the table whose class attribute is 'wikitable sortable', keep a
# handle on its <tbody>, and print the raw markup for inspection.
table = soup.find('table', {'class': 'wikitable sortable'})
table_body = table.tbody
print(table_body)

<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North_York" tit

In [4]:
# Column names are the whitespace-separated words of the first <tr> (the header row).
columns = table_body.tr.text.split()
rows = table_body.find_all('tr')

# One list of stripped, non-empty <td> texts per <tr>. A row without any <td>
# cells (such as a header row built from <th>) contributes an empty list.
# NOTE(review): filtering out empty strings shifts the remaining values left
# if a row ever contains a blank cell — confirm the table has none.
data = [
    [text for text in (cell.text.strip() for cell in row.find_all('td')) if text]
    for row in rows
]

## Create dataframe, drop rows with unassigned boroughs, replace unassigned neighbourhoods with borough name, re-index

In [5]:
# Build the frame, discard row 0, keep only rows with an assigned borough,
# and renumber the index from 0.
df = (
    pd.DataFrame(data, columns=columns)
    .drop(0)
    .loc[lambda frame: frame['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)

# Where the neighbourhood is unassigned, fall back to the borough name.
# A single .loc assignment replaces the original chained indexing
# (df.Neighbourhood[mask] = ...), which triggers SettingWithCopyWarning and
# does not modify the frame when pandas copy-on-write is active.
unassigned = df['Neighbourhood'] == 'Not assigned'
df.loc[unassigned, 'Neighbourhood'] = df.loc[unassigned, 'Borough']

## Combine neighbourhoods that share a postcode, display final dataframe

In [6]:
# Collapse to one row per postcode: keep the first borough seen for the
# postcode and join all of its neighbourhood names with ', '.
# The columns are selected with a list ([[...]]); the original tuple-style
# selection (['Borough','Neighbourhood']) is rejected by current pandas.
df_new = (
    df.astype(str)
    .groupby('Postcode')[['Borough', 'Neighbourhood']]
    .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
    .reset_index()
)
df_new

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Get shape of dataframe

In [7]:
# Display the (rows, columns) tuple of the grouped frame as the cell result.
df_new.shape

(103, 3)

## Fetch latitude and longitude coordinates for each postcode using the geocoder package (attempt)

In [8]:

# The geocoder attempt below is disabled by wrapping it in a string literal:
# geocoder did not return coordinates after repeated tries, so the notebook
# falls back to Plan B, the supplied csv file.
#
# NOTE(review): because the string is the cell's last expression, the cell
# echoes it as output instead of running silently.
# NOTE(review): defects visible in the disabled code, should it be revived:
#   - the loop iterates range(len(df_new['Postcode'])), so `postal_code` is an
#     integer position, not a postcode string, and that integer is what gets
#     formatted into the query;
#   - both append calls target `lat_cords`; `long_cords` is never filled;
#   - the `while` loop has no retry limit, so it never ends while g.latlng
#     stays None.


"""
import geocoder # import geocoder

lat_cords = []
long_cords = []

for postal_code in range(len(df_new['Postcode'])):
    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
      lat_lng_coords = g.latlng

    lat_cords.append(lat_lng_coords[0])
    lat_cords.append(lat_lng_coords[1])
    
print (lat_cords, long_cords)
"""

"\nimport geocoder # import geocoder\n\nlat_cords = []\nlong_cords = []\n\nfor postal_code in range(len(df_new['Postcode'])):\n    # initialize your variable to None\n    lat_lng_coords = None\n\n    # loop until you get the coordinates\n    while(lat_lng_coords is None):\n      g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))\n      lat_lng_coords = g.latlng\n\n    lat_cords.append(lat_lng_coords[0])\n    lat_cords.append(lat_lng_coords[1])\n    \nprint (lat_cords, long_cords)\n"

## Geocoder did not work. Use supplied .csv file instead

In [9]:
# Read the supplied coordinates CSV and rename its 'Postal Code' column to
# 'Postcode' so it shares a key column name with the postcode dataframe.
url = 'https://cocl.us/Geospatial_data'
coord = pd.read_csv(url).rename(columns={'Postal Code': 'Postcode'})

## Merge the two dataframes

In [10]:
# Attach coordinates to each postcode row; the join keeps only postcodes
# that appear in both frames (inner join, the default).
final = df_new.merge(coord, how='inner', on='Postcode')
final

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
