# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import numpy as np # library to handle data as arrays and vectors

import pandas as pd # Tools for handling data structures
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # Conversion of an address to latitudes and longitudes

import requests # library to handle requests
from pandas.io.json import json_normalize # transforming JSON to a pandas dataframe


# Importing Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

## Web scraping the table with postal codes from the Wikipedia page

In [4]:
from bs4 import BeautifulSoup # NextBeautifulSoup is imported for scraping the table off the wikipedia page

In [7]:
# getting data from internet
link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
link_page= requests.get(link).text

# using beautiful soup to parse the HTML/XML codes.
soup = BeautifulSoup(link_page,'lxml')

In [12]:
table = soup.find('table', {'class': 'wikitable sortable'}).tbody #Source info on the wikipedia page reveals that the table is classified as wiki sortable

In [13]:
rows = table.find_all('tr') # Page source also reveals that tr indicates tags for rows and th represents tags for columns

# taking out the sapces in between 
columns = ['PostalCode', 'Borough', 'Neighborhood']
# [v.text.replace('\n','') for v in rows[0].find_all('th')]
print(columns)

['PostalCode', 'Borough', 'Neighborhood']


## Assigning columns to the data frames

In [14]:
df = pd.DataFrame(columns=columns)
df

Unnamed: 0,PostalCode,Borough,Neighborhood


## Preparing the table in csv format

In [33]:
from pandas import DataFrame

for i in range(1, len(rows)):
    tds = rows[i].find_all('td')
    
    if len(tds) == 3:
        values = [tds[0].text, tds[1].text, tds[2].text.replace('\n','')]
    else:
        values = [td.text.replace('\n','') for td in tds]

    # populating the dataframe with the data that we just extarcted from the HTML table
    df = df.append(pd.Series(values, index=columns), ignore_index=True)

export_csv= df.to_csv(r'C:\Users\fhce\Desktop\export_dataframe.csv', index = None, header=True)



In [34]:
print(df)

     PostalCode           Borough  \
0           M1A      Not assigned   
1           M1A      Not assigned   
2           M1A      Not assigned   
3           M1A      Not assigned   
4           M1A      Not assigned   
5           M1A      Not assigned   
6           M1A      Not assigned   
7           M2A      Not assigned   
8           M3A        North York   
9           M4A        North York   
10          M5A  Downtown Toronto   
11          M6A        North York   
12          M6A        North York   
13          M7A  Downtown Toronto   
14          M8A      Not assigned   
15          M9A      Queen's Park   
16          M1B       Scarborough   
17          M1B       Scarborough   
18          M2B      Not assigned   
19          M3B        North York   
20          M4B         East York   
21          M4B         East York   
22          M5B  Downtown Toronto   
23          M5B  Downtown Toronto   
24          M6B        North York   
25          M7B      Not assigned   
2

## Next the not assigned cells are removed

In [38]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M1A,Not assigned,Not assigned
2,M1A,Not assigned,Not assigned
3,M1A,Not assigned,Not assigned
4,M1A,Not assigned,Not assigned


In [40]:
data = df[df.Borough != 'Not assigned'] # Removing the not assigned rows
data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
8,M3A,North York,Parkwoods
9,M4A,North York,Victoria Village
10,M5A,Downtown Toronto,Harbourfront
11,M6A,North York,Lawrence Heights
12,M6A,North York,Lawrence Manor


## Integrating the Neighborhoods with duplicate postal codes

In [41]:
data = data.groupby(['PostalCode','Borough'])['Neighborhood'].apply(lambda x: ', '.join(x)).reset_index()
data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern, Rouge, Malvern, Rouge, Malvern..."
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union, Highla..."
2,M1E,Scarborough,"Guildwood, Morningside, West Hill, Guildwood, ..."
3,M1G,Scarborough,"Woburn, Woburn, Woburn, Woburn"
4,M1H,Scarborough,"Cedarbrae, Cedarbrae, Cedarbrae, Cedarbrae"


## Number of rows in the cleaned table

In [42]:
data.shape[0]

103

## Obtaining the Latitudes and Longitudes

In [46]:
geocoder = pd.read_csv("https://cocl.us/Geospatial_data") 

# Both tables should have the same column name for postal codes before the two tables are merged

geocoder.rename(columns={'Postal Code':'PostalCode'}, inplace=True)

In [47]:
geocoder.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [49]:
geo = pd.merge(data, geocoder, on='PostalCode') #Merging the two tables by postal code


In [50]:
geo.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern, Rouge, Malvern, Rouge, Malvern...",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union, Highla...",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill, Guildwood, ...",43.763573,-79.188711
3,M1G,Scarborough,"Woburn, Woburn, Woburn, Woburn",43.770992,-79.216917
4,M1H,Scarborough,"Cedarbrae, Cedarbrae, Cedarbrae, Cedarbrae",43.773136,-79.239476
