<h1>Capstone Week 3 - Part 2</h1>

In [6]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

<h1>Let's get the Wikipedia content first</h1>

In [4]:
# Download the Wikipedia page listing Canadian postal codes starting with "M"
# and parse its HTML so the postal-code table can be extracted later.
wikipedia_link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(wikipedia_link, timeout=30)
# Fail fast on an HTTP error (4xx/5xx) instead of parsing an error page and
# hitting a confusing AttributeError later when no <table> is found.
response.raise_for_status()
results = response.text
wikipedia_content = BeautifulSoup(results, 'lxml')

<h1>Now let's extract the data and create our DataFrame</h1>

In [7]:
column_names = ["PostalCode", "Borough", "Neighborhood"]

# Collect one dict per usable table row and build the DataFrame once at the
# end. DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row copies the whole frame on every iteration.
records = []
rows = wikipedia_content.find("table").find_all("tr")
for row in rows[1:]:  # rows[0] is skipped (treated as the header row)
    cells = row.find_all("td")
    if len(cells) < 3:
        # Not a full PostalCode/Borough/Neighborhood row; nothing to extract.
        continue

    # Drop the newlines embedded in the cell text.
    postal, borough, neighbor = (cell.text.replace("\n", "") for cell in cells[:3])

    # Skip rows whose borough is "Not assigned".
    if borough.upper() == "NOT ASSIGNED":
        continue

    # A row with a borough but a "Not assigned" neighborhood uses the borough
    # name as its neighborhood.
    if neighbor.upper() == "NOT ASSIGNED":
        neighbor = borough

    records.append({'PostalCode': postal,
                    'Borough': borough,
                    'Neighborhood': neighbor})

# Passing columns= keeps the expected schema even if no rows were collected.
df = pd.DataFrame(records, columns=column_names)

print(df.head())
print(df.shape)

  PostalCode           Borough      Neighborhood
0        M3A        North York         Parkwoods
1        M4A        North York  Victoria Village
2        M5A  Downtown Toronto      Harbourfront
3        M5A  Downtown Toronto       Regent Park
4        M6A        North York  Lawrence Heights
(212, 3)


<h1>Since more than one neighbourhood can exist in one postal code area, let's combine the rows and separate the neighbourhoods with commas. I assume the Borough is the same for every row of a postal code, so I group only by PostalCode.</h1>

In [8]:
# Collapse the per-neighbourhood rows into one row per postal code:
# keep the first borough seen and join the neighbourhood names with ", ".
aggregation_functions = {
    'Borough': 'first',
    'Neighborhood': lambda names: ', '.join(names),
}
grouped_by_postal_code = df.groupby('PostalCode')
df_new = grouped_by_postal_code.agg(aggregation_functions).reset_index()
df_new.head()

Unnamed: 0,PostalCode,Neighborhood,Borough
0,M1B,"Rouge, Malvern",Scarborough
1,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough
2,M1E,"Guildwood, Morningside, West Hill",Scarborough
3,M1G,Woburn,Scarborough
4,M1H,Cedarbrae,Scarborough


In [9]:
# Sanity check: show the combined neighbourhoods for postal code M5V
# (the row referred to as line 9 in the instructions).
is_m5v = df_new['PostalCode'] == 'M5V'
print(df_new.loc[is_m5v, 'Neighborhood'].values)

[ 'CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara']


In [10]:
# Sanity check: show the combined neighbourhoods for postal code M5A.
is_m5a = df_new['PostalCode'] == 'M5A'
print(df_new.loc[is_m5a, 'Neighborhood'].values)

['Harbourfront, Regent Park']


In [11]:
# Report the (rows, columns) of the aggregated frame.
cleaned_shape = df_new.shape
print('Shape of cleaned data:', cleaned_shape)

Shape of cleaned data: (103, 3)


In [12]:
# Show the full row for postal code M5G as a plain array.
is_m5g = df_new['PostalCode'] == 'M5G'
print(df_new.loc[is_m5g].values)

[['M5G' 'Central Bay Street' 'Downtown Toronto']]


<h1>Part 2 - Let's download the CSV containing latitude/longitude information</h1>

In [13]:
# Download the postal-code -> latitude/longitude CSV and load it.
# Uses requests (already imported by this notebook) rather than `!wget -q`,
# which depends on a wget binary being installed and hides download failures.
geospatial_response = requests.get("https://cocl.us/Geospatial_data", timeout=30)
# Stop here on an HTTP error so the success message below is only printed
# when the file was actually retrieved.
geospatial_response.raise_for_status()

with open('geospatial_data.csv', 'wb') as f:
    f.write(geospatial_response.content)
print("Geospatial Data downloaded")

geospatial_data = pd.read_csv('geospatial_data.csv')


Geospatial Data downloaded


In [14]:
# Preview the first five rows of the coordinates table.
geospatial_data.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<h1>Now let's add the latitude/longitude information from our geospatial dataframe</h1>

In [15]:
# Attach coordinates by matching on the postal-code VALUE.
# Comparing geospatial_data['Postal Code'] == df_new['PostalCode'] directly
# lines the two Series up by index label, so it only gives correct results
# when both frames have identical length and row order; otherwise it raises
# or silently produces NaN. A lookup keyed on the postal code avoids that.
coords_by_postal_code = (
    geospatial_data
    .drop_duplicates(subset='Postal Code')  # Series.map needs a unique index
    .set_index('Postal Code')
)
# Postal codes missing from the geospatial table get NaN coordinates.
df_new["Latitude"] = df_new["PostalCode"].map(coords_by_postal_code["Latitude"])
df_new["Longitude"] = df_new["PostalCode"].map(coords_by_postal_code["Longitude"])


In [16]:
# Preview the first five rows, now including Latitude and Longitude.
df_new.head(n=5)

Unnamed: 0,PostalCode,Neighborhood,Borough,Latitude,Longitude
0,M1B,"Rouge, Malvern",Scarborough,43.806686,-79.194353
1,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough,43.784535,-79.160497
2,M1E,"Guildwood, Morningside, West Hill",Scarborough,43.763573,-79.188711
3,M1G,Woburn,Scarborough,43.770992,-79.216917
4,M1H,Cedarbrae,Scarborough,43.773136,-79.239476


In [17]:
# Spot-check a single postal code (M5G) to confirm its coordinates were attached.
is_m5g = df_new['PostalCode'] == 'M5G'
df_new.loc[is_m5g]


Unnamed: 0,PostalCode,Neighborhood,Borough,Latitude,Longitude
57,M5G,Central Bay Street,Downtown Toronto,43.657952,-79.387383
