# Exploring Neighborhoods in Toronto
## Marge Ogle
### Coursera Capstone Segmenting & Clustering project

## Download & Explore Dataset

In [1]:
# Start by importing required libraries
import numpy as np  # library to handle data in a vetorized manner

import pandas as pd  # library for data analysis

import json # Library to handle JSON files

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude & longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # transform JSON file into a panda dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import K-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

print('Libraries have been imported!')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    certifi-2019.11.28         |   py36h9f0ad1d_1         149 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1

In [8]:
# Gets the content out of the URL 

# The Wikipage that contains the Postal Code table 
url = ('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').format()

toronto_data = pd.read_html(url,header=0)

toronto_data[0]

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


In [10]:
# take the table and move it into a Dataframe
# for further processing and analysis
df_table = toronto_data[0]
df_table

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


### Examining the data, we find several Boroughs/Neighborhoods that are not assigned
### as well as Boroughs with multiple Neighborhoods.  All of this will need to be cleaned up.

In [11]:
# replace the Not Assigned values as NaN 
df_table.replace("Not assigned", np.nan, inplace = True)
df_table

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,,
9,M9A,Etobicoke,Islington Avenue


In [12]:
# drop the rows in which Boroughs are 'Not assigned'
df_table.dropna(subset=['Borough'], axis=0, inplace=True)

# reset index because we did drop a few rows.
df_table.reset_index(drop=True, inplace=True)
# Let's see the results df_table
df_table

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Etobicoke,Islington Avenue
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


## Examine the Table and Group accordingly

In [13]:
# Group the table by Postcode 
# Examine the results then group Neighborhoods into a Matching Post code
toronto_df = df_table.groupby('Postcode')
toronto_df
   

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f522b4789e8>

In [14]:
for Postcode, Postcode_df in toronto_df:
    print(Postcode)
    print(Postcode_df)

M1B
  Postcode      Borough Neighbourhood
7      M1B  Scarborough         Rouge
8      M1B  Scarborough       Malvern
M1C
   Postcode      Borough   Neighbourhood
20      M1C  Scarborough  Highland Creek
21      M1C  Scarborough      Rouge Hill
22      M1C  Scarborough      Port Union
M1E
   Postcode      Borough Neighbourhood
32      M1E  Scarborough     Guildwood
33      M1E  Scarborough   Morningside
34      M1E  Scarborough     West Hill
M1G
   Postcode      Borough Neighbourhood
38      M1G  Scarborough        Woburn
M1H
   Postcode      Borough Neighbourhood
42      M1H  Scarborough     Cedarbrae
M1J
   Postcode      Borough        Neighbourhood
53      M1J  Scarborough  Scarborough Village
M1K
   Postcode      Borough         Neighbourhood
65      M1K  Scarborough  East Birchmount Park
66      M1K  Scarborough               Ionview
67      M1K  Scarborough          Kennedy Park
M1L
   Postcode      Borough Neighbourhood
78      M1L  Scarborough      Clairlea
79      M1L  Scarbor

    Postcode Borough Neighbourhood
115      M9N    York        Weston
M9P
    Postcode    Borough Neighbourhood
125      M9P  Etobicoke     Westmount
M9R
    Postcode    Borough         Neighbourhood
136      M9R  Etobicoke     Kingsview Village
137      M9R  Etobicoke  Martin Grove Gardens
138      M9R  Etobicoke      Richview Gardens
139      M9R  Etobicoke          St. Phillips
M9V
    Postcode    Borough     Neighbourhood
173      M9V  Etobicoke    Albion Gardens
174      M9V  Etobicoke  Beaumond Heights
175      M9V  Etobicoke        Humbergate
176      M9V  Etobicoke         Jamestown
177      M9V  Etobicoke       Mount Olive
178      M9V  Etobicoke       Silverstone
179      M9V  Etobicoke     South Steeles
180      M9V  Etobicoke       Thistletown
M9W
    Postcode    Borough Neighbourhood
186      M9W  Etobicoke     Northwest


In [15]:
# For Postcodes with multiple neighbourhoods, join them into 1 cell
toronto_df = df_table.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index()
toronto_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


#### Let's confirm the New size

In [16]:
toronto_df.shape

(103, 3)

## Latitude and Longitude

In [23]:
# Now for latitude & longitude
address = 'Toronto, CA'

geolocator = Nominatim(user_agent='ca_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Toronto are {},{}.'.format(latitude, longitude))


The geographical coordinate of Toronto are 43.653963,-79.387207.


#### Create a Dataframe that contains the matching Postal Codes
#### with the specific Latitude and Longitude

In [None]:
# Geospatial file with the sheet name of Geospatial_Coordinates-1
lat_long_df = pd.read_excel("http://cocl.us/Geopatial_data", "Geospatial_coordinates-1")
lat_long_df.head(5)

In [24]:
# Merge Latitude & Longitude into your toronto dataframe


ModuleNotFoundError: No module named 'geocoder'

In [24]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
    len(df_table['Borough'].unique()),
    df_table.shape[0]))

The dataframe has 10 boroughs and 210 neighborhoods.


In [9]:
# get the latitude and longitude values
import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
    lat_lng_coords = g.latlng
    
latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

ModuleNotFoundError: No module named 'geocoder'