In [1]:
#Beautiful Soup is a Python package for parsing HTML and XML documents. 
#It creates a parse tree for parsed pages that can be used to extract data from HTML, which is useful for web scraping. 
#Prettify() function in BeautifulSoup will enable us to view how the tags are nested in the document.

#https://medium.com/analytics-vidhya/web-scraping-wiki-tables-using-beautifulsoup-and-python-6b9ea26d8722
#https://erikrood.com/Python_References/web_scrape.html

## Part1
# Import packages for web scraping & data manipulation

In [40]:
#Packages
#--Web scraping packages
from bs4 import BeautifulSoup
import requests
#Pandas/numpy for data manipulation
import pandas as pd
import numpy as np

1) Using Beautiful Soup for web scraping

In [41]:
# Wikipedia page listing the Canadian postal codes that start with "M"
postal_codes_page_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Download the page. The timeout stops the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors (404, 5xx)
# here instead of letting an error page be parsed as if it were the article.
response = requests.get(postal_codes_page_url, timeout=30)
response.raise_for_status()

# NOTE: despite its name, website_url holds the page's HTML text, not the URL.
# The name is kept because the row-extraction cell refers to it.
website_url = response.text

# Empty list that will receive one tuple per scraped table row
postalcode_Canada = []

# Parse the HTML with Python's built-in parser
soup = BeautifulSoup(website_url, "html.parser")

# Locate the table to scrape: the first <table> with this class (None if absent)
postalcode_Table = soup.find('table',{'class':'wikitable sortable'})

2) Extract the data from the table

In [42]:
# Start from an empty list so that re-running this cell does not append the
# same rows a second time.
postalcode_Canada = []

# Walk the rows of the first <table> on the page and keep those with exactly
# three <td> cells (postal code, borough, neighborhood). Rows with any other
# number of <td> cells (presumably the header row -- verify against the page)
# are skipped by the length check.
page_tables = soup.find_all('table')
if page_tables:
    for row in page_tables[0].find_all('tr'):
        cols = row.find_all('td')
        if len(cols) == 3:
            # The first element is the page's HTML text (website_url); it is kept
            # so the four-column layout expected by the next cell stays intact.
            postalcode_Canada.append((website_url, cols[0].text.strip(), cols[1].text.strip(), cols[2].text.strip()))
else:
    # This used to be hidden by a bare `except: pass`, which also swallowed
    # every other error. Report it; unexpected errors now propagate.
    print('WARNING: no <table> element found in the downloaded page; nothing was scraped.')

# Convert to an array and check how many rows were collected.
# dtype=object stores references to the Python strings. Without it numpy builds
# a fixed-width unicode array whose width is the longest string in the data --
# here the full HTML page, repeated in every cell -- which wastes a lot of memory.
postalcode_array = np.asarray(postalcode_Canada, dtype=object)
len(postalcode_array)

288

3) Convert the array to a dataframe and rename the dataframe's columns

In [43]:
# Turn the scraped array into a dataframe (one dataframe row per table row)
df = pd.DataFrame(postalcode_array)

# Give the four positional columns meaningful names
column_labels = ['URL', 'PostalCode', 'Borough', 'Neighborhood']
df.columns = column_labels

# Keep everything except the leading 'URL' column, then preview the first rows
df1 = df.loc[:, column_labels[1:]]
df1.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [44]:
# Keep only the rows whose borough is something other than "Not assigned"
has_borough = df1['Borough'] != "Not assigned"
df2 = df1.loc[has_borough]
df2.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [46]:
# Collapse the rows that share a postal code and borough into a single row whose
# Neighborhood value is a comma-separated list of the individual neighborhoods.
# Approach adapted from:
# https://stackoverflow.com/questions/54216702/pandas-grouping-by-column-one-and-adding-comma-separated-entries-from-column-two
#
# Series.unique() removes duplicates while keeping first-appearance order. The
# earlier `set(x)` iterated strings in an arbitrary order, so the joined text
# could change from one kernel session to the next.
# groupby sorts the group keys by default, so the result is ordered by PostalCode.
dfResult = (
    df2.groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(lambda names: ', '.join(names.unique()))
    .reset_index()
)
dfResult.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"Morningside, Guildwood, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


6) Replace the Neighborhood value with the Borough value when Neighborhood is "Not assigned" but Borough has a value.

In [47]:
# Where the borough is known but the neighborhood is "Not assigned", use the
# borough name as the neighborhood; every other row keeps its current value.
borough_known = dfResult['Borough'] != "Not assigned"
neighborhood_missing = dfResult['Neighborhood'] == "Not assigned"
dfResult['Neighborhood'] = np.where(
    borough_known & neighborhood_missing,
    dfResult['Borough'],
    dfResult['Neighborhood'],
)
dfResult.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"Morningside, Guildwood, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [48]:
# Show the (rows, columns) dimensions of the grouped dataframe
n_rows, n_cols = dfResult.shape
print((n_rows, n_cols))

(103, 3)


## Part2
Read geospatial data from a CSV file

In [49]:
# Location of the CSV with one latitude/longitude pair per postal code
urlGeospatialData = "http://cocl.us/Geospatial_data"

# Load the CSV straight from the URL and preview the first five rows
df_geoData = pd.read_csv(filepath_or_buffer=urlGeospatialData)
df_geoData.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [50]:
# Attach coordinates to each postal code. The key column is named differently in
# the two frames ('PostalCode' vs 'Postal Code'); the inner join keeps only the
# postal codes that appear in both.
df_merged = dfResult.merge(
    df_geoData,
    how='inner',
    left_on='PostalCode',
    right_on='Postal Code',
)

# Select the final columns explicitly, which leaves out the right-hand
# 'Postal Code' key column
final_columns = ["PostalCode", "Borough", "Neighborhood", "Latitude", "Longitude"]
df_mergedFinal = df_merged[final_columns]
df_mergedFinal.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Morningside, Guildwood, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part3

In [51]:
# Summarise the merged data: the number of distinct boroughs and the number of
# rows (the second figure in the message is the dataframe's row count).
borough_count = len(df_mergedFinal['Borough'].unique())
row_count = df_mergedFinal.shape[0]
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

The dataframe has 11 boroughs and 103 neighborhoods.


1) Use the geopy library to get the latitude and longitude values of Toronto

In [52]:
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!pip install folium
import folium #Folium is a great visualization library. Feel free to zoom into the above map, and click on each circle mark to reveal the name of the neighborhood and its respective borough.



In [53]:
# Look up the coordinates of Toronto with the Nominatim (OpenStreetMap) geocoder
address = 'Toronto'

# Nominatim needs an identifying user agent: calling Nominatim() with no
# user_agent is deprecated in geopy and rejected by newer releases.
geolocator = Nominatim(user_agent="toronto_neighborhoods_notebook")
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; fail with a clear message instead
# of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


  from ipykernel import kernelapp as app


In [55]:
# Create a map centred on the geocoded Toronto coordinates.
# NOTE: the variable is called map_newyork even though it shows Toronto; the
# name is left unchanged in this cell.
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per dataframe row, with a "neighborhood, borough" popup
marker_rows = zip(
    df_mergedFinal['Latitude'],
    df_mergedFinal['Longitude'],
    df_mergedFinal['Borough'],
    df_mergedFinal['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_newyork)

# Display the map as the cell output
map_newyork