# <font color='black'> Battle of the Neighborhood </font>

This notebook is for the Applied Data Science Capstone course on Coursera. It is the last of the 9 courses offered by IBM as part of their Data Science Professional Certificate.

In [30]:
# Print a greeting message for the capstone project notebook
print('Hello Capstone Project!')

Hello Capstone Project!


##  <font color = '#8E44AD'> Import (and install, if needed) all the Python libraries </font>

In [31]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# lifting both display limits makes any bare DataFrame render in full,
# so inspect large frames through .head() / .shape
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes #uncomment if geopy has not been installed
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# json_normalize: transform JSON file into a pandas dataframe.
# It is exposed in the top-level pandas namespace since pandas 1.0; the
# pandas.io.json import path is deprecated and fails on newer releases, so
# prefer the top-level name and fall back only for old installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes #uncomment if folium has not been installed
import folium # map rendering library

#!pip install BeautifulSoup4
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import csv

#to extract the csv file of latitude and longitude
import io

print('Libraries imported.')

Libraries imported.


## <font color='#8E44AD'> Create the data frame </font> 

### <font color = 'lightcoral'> Scrape the table of Canada's postal codes, boroughs, and neighbourhoods - specifically, the postal codes that start with the letter M - from Wikipedia </font>

In [32]:
#send a GET request to the wikipedia page consisting of list of Canada's postal codes
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#a timeout keeps the cell from hanging forever on a stalled connection
source = requests.get(url, timeout=30)
#fail here, with the HTTP status, rather than later while parsing an error page
source.raise_for_status()

#create soup object
soup = BeautifulSoup(source.text, 'lxml')

In [33]:
#scrape the table of Canada's postal codes using soup object
table = soup.find(class_='wikitable')
if table is None:
    #soup.find returns None when nothing matches; report that explicitly
    raise ValueError("No element with class 'wikitable' was found on the page; "
                     "the page layout may have changed.")

#each table row (tr) becomes a list holding the text of its header/data cells
rows = [
    [cell.text.rstrip() for cell in tr.find_all(['th', 'td'])]
    for tr in table.find_all('tr')
]

#the first row holds the column headers; every later row is data
column_names = rows[0] if rows else []
df = rows[1:]

#convert list into Pandas DataFrame
canada_data = pd.DataFrame(data = df, columns = column_names)
canada_data.head()


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### <font color = 'lightcoral'> Only process the cells that have an assigned borough. Ignore cells with a borough that is <i>Not assigned.</i> </font>

In [34]:
# Drop the postal codes without an assigned borough, then order the remaining
# rows by postal code (ascending) so they can later be matched with the
# latitude/longitude data.
has_borough = canada_data['Borough'].ne('Not assigned')
canada_data = canada_data.loc[has_borough].sort_values(by='Postal Code')

In [35]:
# Preview the first rows of canada_data
canada_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
9,M1B,Scarborough,"Malvern, Rouge"
18,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
27,M1E,Scarborough,"Guildwood, Morningside, West Hill"
36,M1G,Scarborough,Woburn
45,M1H,Scarborough,Cedarbrae


###  <font color = 'lightcoral'> More than one neighborhood can exist in one postal code area. However, the updated Wikipedia table (as of 8/2/20) already lists all the neighborhoods of each postal code in a single row. Therefore, there's no need to aggregate any further. </font>

In [36]:
# Compare the row count with the number of distinct postal codes: when the two
# printed numbers match, every postal code occupies exactly one row and no
# further aggregation is needed.
row_count = len(canada_data)
distinct_codes = canada_data['Postal Code'].nunique(dropna=False)

print(row_count)
print(distinct_codes)

103
103


###  <font color = 'lightcoral'> Moreover, after removing all the rows where the borough value is <em> Not assigned </em>, there's no <b>neighbourhood</b> value that is not assigned. </font>

In [37]:
# Select any remaining rows whose Neighbourhood is still 'Not assigned'.
# After the borough filter this selection is expected to be empty.
unassigned_neighbourhood = canada_data['Neighbourhood'].eq('Not assigned')
canada_data.loc[unassigned_neighbourhood]

Unnamed: 0,Postal Code,Borough,Neighbourhood


### <font color = 'lightcoral'> Shape of the table after cleaning up the dataframe </font>

In [38]:
# (number of rows, number of columns) of canada_data
canada_data.shape

(103, 3)

## <font color='#8E44AD'> Complete the dataset by adding latitude and longitude of each postal code </font> 

In [39]:
#read the csv file of latitude and longitude of the respective postal codes

url_ll = 'https://cocl.us/Geospatial_data'
#a timeout keeps the cell from hanging forever on a stalled connection
response_ll = requests.get(url_ll, timeout=30)
#stop on an HTTP error instead of feeding an error page to the CSV parser
response_ll.raise_for_status()
#parse the downloaded text in memory; the raw response keeps its own name so
#that can_latlong only ever refers to the DataFrame
can_latlong = pd.read_csv(io.StringIO(response_ll.text))
can_latlong.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [40]:
#unique postal codes in the csv file
#(number of distinct values in the 'Postal Code' column of can_latlong)

can_latlong['Postal Code'].nunique()

103

### <font color = 'lightcoral'> Join the two tables <em>canada_data</em> and <em>can_latlong</em> using pandas' <em>merge</em> function </font>

In [41]:
# Inner-join the borough/neighbourhood table with the coordinates table on
# their shared 'Postal Code' column.
ca = canada_data.merge(can_latlong, on='Postal Code', how='inner')

### <font color = 'lightcoral'> Final dataset </font>

In [42]:
# Preview the first rows of the merged table
ca.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [43]:
# (number of rows, number of columns) of the merged table
ca.shape

(103, 5)

## <font color='#8E44AD'> Visualize on the map using <em>Folium</em> </font> 

In [44]:
address = 'Toronto, Ontario'

# user_agent identifies this application to the Nominatim geocoding service
geolocator = Nominatim(user_agent="canada_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches the query; report that clearly
# instead of failing on the attribute access below
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [46]:
# create a map centred on the geocoded latitude and longitude values
map_canada = folium.Map(location = [latitude, longitude], zoom_start = 10)

# add one circle marker per row of ca, labelled "<neighbourhood(s)>, <borough>"
for lat, lng, borough, neighborhood in zip(ca['Latitude'], ca['Longitude'], ca['Borough'], ca['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True asks folium to treat the label as text rather than raw HTML
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option, not a CircleMarker one, so it is not
    # repeated in the marker call
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_canada)

map_canada