# Segmenting and Clustering Neighborhoods in Toronto

# Solution to the 1st question starts here...

#### We first import and download all the dependencies that we will need for this assignment

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

!conda install html5lib # Will be used to import the table from wikipedia to Jupyter Notebook



print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1

#### In the cell below, we import the table from the Wikipedia page directly into this notebook

In [44]:
# Parse the Wikipedia page of postal codes starting with "M" into DataFrames.
from pandas import read_html

page = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Restrict parsing to tables that carry the "wikitable" class attribute.
wikitable = read_html(page, attrs={"class": "wikitable"})

num_tables = len(wikitable)
print("Extracted {num} wikitables".format(num=num_tables))



Extracted 1 wikitables


In [45]:
# Preview the first five rows of the first parsed table.
first_table = wikitable[0]
first_table.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### The raw table from the Wikipedia page is assigned to a variable so that we can start cleaning the table and removing unwanted rows

In [54]:
# Use the first parsed table (the extraction step reported exactly one) as the raw working table.
rawtable_from_wikipedia = wikitable[0]

In [55]:
# Preview the first five rows of the raw table.
rawtable_from_wikipedia.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### The raw table from the wikipedia page is assigned to another variable so that we can have a copy of the original extract

In [59]:
# Take an independent copy: plain assignment would only create a second name for
# the same DataFrame, so any later in-place change to Toronto_Raw would also
# alter rawtable_from_wikipedia and the original extract would be lost.
Toronto_Raw = rawtable_from_wikipedia.copy()

In [65]:
# Preview the first five rows of the working table.
Toronto_Raw.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [66]:
# (rows, columns) of the table before any rows are filtered out.
Toronto_Raw.shape

(180, 3)

#### Below we create a new dataframe without rows containing 'Not assigned' in 'Borough' column

In [62]:
# Keep only rows whose borough is assigned, then renumber the index from 0.
Toronto_Raw_Clean = (
    Toronto_Raw
    .loc[lambda frame: frame['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)
Toronto_Raw_Clean.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [68]:
# (rows, columns) after dropping the 'Not assigned' boroughs.
Toronto_Raw_Clean.shape

(103, 3)

#### Below we check whether any column contains missing values by using the `count()` method, which counts only non-missing (non-NaN) entries. As you can see, every column's count equals the number of rows (103), so there are no missing values.

In [73]:
# Per-column number of non-missing entries; compare against the row count above.
Toronto_Raw_Clean.count()

Postal Code     103
Borough         103
Neighborhood    103
dtype: int64

#### Below we check whether any postal code appears in the dataframe more than once. It appears that the Wikipedia article has since been updated, as there are no duplicate postal code rows

In [76]:
# Occurrences of each postal code; any count above 1 would indicate a duplicated code.
Toronto_Raw_Clean['Postal Code'].value_counts()

M4A    1
M1V    1
M4T    1
M3M    1
M2L    1
M5L    1
M3J    1
M5W    1
M8W    1
M1X    1
M1K    1
M6B    1
M9L    1
M5P    1
M5V    1
M3C    1
M4G    1
M3N    1
M4M    1
M4X    1
M4N    1
M1L    1
M4W    1
M5E    1
M6M    1
M8Z    1
M9R    1
M6K    1
M7Y    1
M5R    1
M8Y    1
M1M    1
M1W    1
M2N    1
M6R    1
M5S    1
M9C    1
M1R    1
M6H    1
M5J    1
M3B    1
M3A    1
M3L    1
M1H    1
M1E    1
M8V    1
M4E    1
M1S    1
M2K    1
M2P    1
M4R    1
M3H    1
M5H    1
M4H    1
M1T    1
M4C    1
M3K    1
M5M    1
M4K    1
M4P    1
M4B    1
M9V    1
M6P    1
M4L    1
M5C    1
M9A    1
M2R    1
M5K    1
M9B    1
M1B    1
M6S    1
M5T    1
M4V    1
M6E    1
M4Y    1
M9M    1
M8X    1
M5A    1
M6L    1
M2H    1
M7R    1
M1N    1
M1J    1
M5N    1
M5G    1
M7A    1
M6G    1
M2J    1
M5B    1
M6C    1
M6A    1
M4S    1
M9W    1
M4J    1
M5X    1
M9N    1
M1P    1
M1G    1
M9P    1
M6N    1
M2M    1
M6J    1
M1C    1
Name: Postal Code, dtype: int64

#### In the 2 cells below, we check whether any row in the 'Neighborhood' column has a value of 'Not assigned' or is blank (an empty string). Both results are 0, i.e. no row in the "Neighborhood" column is blank or has a value of 'Not assigned'.

In [77]:
# Number of rows whose neighborhood is literally 'Not assigned'.
(Toronto_Raw_Clean['Neighborhood'] == "Not assigned").sum()

0

In [78]:
# Number of rows whose neighborhood is an empty string.
(Toronto_Raw_Clean['Neighborhood'] == "").sum()

0

## Here is the output of using the shape function for the above cleaned dataset

In [79]:
# Final (rows, columns) of the cleaned table.
Toronto_Raw_Clean.shape

(103, 3)

#### the solution to the 1st question ends here....

# Solution to the 2nd question starts here...

### We download the data Geospatial_Coordinates csv file from http://cocl.us/Geospatial_data

In [80]:
# Download the coordinates CSV with the standard library rather than `!wget -q`:
# a failed quiet wget goes unnoticed and the success message is printed anyway,
# whereas urlretrieve raises if the request fails. It also removes the
# dependency on an external wget binary.
from urllib.request import urlretrieve

urlretrieve('http://cocl.us/Geospatial_data', 'Geospatial_Coordinates.csv')
print('Data downloaded!')

Data downloaded!


In [98]:
# Load the downloaded postal-code coordinates file and preview its first rows.
df_Geospatial_Coordinates = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
df_Geospatial_Coordinates.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Using the below line of code, we combine the Geospatial Coordinates file with the Postal Code Toronto data extracted from Wikipedia

In [85]:
# Attach Latitude/Longitude to each postal code row.
# validate="one_to_one" makes pandas raise MergeError if "Postal Code" is
# duplicated on either side, instead of silently multiplying rows in the result.
combined_dataframe = Toronto_Raw_Clean.merge(
    df_Geospatial_Coordinates,
    on="Postal Code",
    how='inner',
    validate="one_to_one",
)

In [86]:
# Preview the first five merged rows.
combined_dataframe.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


#### the solution to the 2nd question ends here....

# Solution to the 3rd question starts here...

### Clustering and Visualizing data...

#### Below we imported the required libraries

In [99]:
# Imports for the clustering/visualisation section.
# NOTE(review): geopy, folium, Nominatim and json_normalize are already
# installed/imported by the notebook's first cell, so the two conda lines and
# those imports below repeat earlier work.
import random # library for random number generation

# Re-runs the geopy install from the first cell.
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# transforms a JSON structure into a pandas dataframe
from pandas.io.json import json_normalize

# Re-runs the pinned folium install from the first cell.
!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be UPDATED:

    ca-certificates: 2020.1.1-0        --> 2020.4.5.1-hecc5488_0     conda-forge
    certifi:         2020.4.5.1-py36_0 --> 2020.4.5.1-py36h9f0ad1d_0 conda-forge
    openssl:         1.1.1g-h7b6447c_0 --> 1.1.1g-h516909a_0         conda-forge

Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Solving environment: done

# All requested packages already installed.

Folium installed
Libraries imported.


#### We keep only the rows containing 'Toronto' in the 'Borough' column and assign them to a new dataframe

In [105]:
# Boolean mask: True where the borough name includes the substring 'Toronto'.
is_toronto_borough = combined_dataframe['Borough'].str.contains('Toronto')

# Select those rows; the original index labels from combined_dataframe are kept.
new_dataframe_toronto = combined_dataframe.loc[is_toronto_borough]
print(new_dataframe_toronto.head())

   Postal Code           Borough                                 Neighborhood  \
2          M5A  Downtown Toronto                    Regent Park, Harbourfront   
4          M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government   
9          M5B  Downtown Toronto                     Garden District, Ryerson   
15         M5C  Downtown Toronto                               St. James Town   
19         M4E      East Toronto                                  The Beaches   

     Latitude  Longitude  
2   43.654260 -79.360636  
4   43.662301 -79.389494  
9   43.657162 -79.378937  
15  43.651494 -79.375418  
19  43.676357 -79.293031  


#### Here is the new dataframe with the 'Borough' column containing the word 'Toronto'

In [106]:
# Display the Toronto-only subset; rows keep their index labels from combined_dataframe.
new_dataframe_toronto

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


#### Below we locate the coordinates for Toronto, Ontario

In [109]:
# Look up the latitude/longitude of Toronto, used below to centre the map.
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, ON are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, ON are 43.6534817, -79.3839347.


#### Below we create the map of Toronto and add a marker for each neighborhood

In [116]:
# Base map centred on the geocoded Toronto coordinates.
Toronto_Map = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle marker per row of the Toronto subset, with the neighborhood name
# shown in the popup.
marker_rows = zip(
    new_dataframe_toronto['Latitude'],
    new_dataframe_toronto['Longitude'],
    new_dataframe_toronto['Neighborhood'],
)
for lat, lng, label in marker_rows:
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color='red',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(Toronto_Map)

Toronto_Map

#### the solution to the 3rd question ends here....