### Segmenting and Clustering Neighborhoods in Toronto

### Import required packages

In [5]:
import pandas as pd # library for data analysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation 

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

#libararies for displaying images

from IPython.display import Image
from IPython.core.display import HTML
from IPython.display import display_html

# transforming json file into a panda dataframe library 
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library 
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

print('All packages and Moduled Installed and Imported!!!.')





Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python-3.7-OpenCE

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    branca-0.4.2               |     pyhd8ed1ab_0          26 KB  conda-forge
    certifi-2021.5.30          |   py37h89c1867_0         141 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    openssl-1.1.1k             |       h7f98852_0         2.1 MB  conda-forge
    python_abi-3.7             |          2_cp37m           4 KB  conda-forge
    vincent-0.4.4              |           

In [108]:
### Scraping the Wikipedia page for the table of postal codes of Canada 

In [148]:
raw_df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [149]:
raw_df

[                                                    0  \
 0                                     M1ANot assigned   
 1                     M1BScarborough(Malvern / Rouge)   
 2   M1CScarborough(Rouge Hill / Port Union / Highl...   
 3   M1EScarborough(Guildwood / Morningside / West ...   
 4                              M1GScarborough(Woburn)   
 5                           M1HScarborough(Cedarbrae)   
 6                 M1JScarborough(Scarborough Village)   
 7   M1KScarborough(Kennedy Park / Ionview / East B...   
 8   M1LScarborough(Golden Mile / Clairlea / Oakridge)   
 9   M1MScarborough(Cliffside / Cliffcrest / Scarbo...   
 10       M1NScarborough(Birch Cliff / Cliffside West)   
 11  M1PScarborough(Dorset Park / Wexford Heights /...   
 12                 M1RScarborough(Wexford / Maryvale)   
 13                          M1SScarborough(Agincourt)   
 14  M1TScarborough(Clarks Corners / Tam O'Shanter ...   
 15  M1VScarborough(Milliken / Agincourt North / St...   
 16     M1WSca

### Read the raw table to one column dataframe

In [150]:
# column_names

column_names = ['temp_info']
temp_table = pd.DataFrame(columns = column_names)

for i in range(0, 8):
    
    for data in raw_df[0][i]:
        temp_info = data
        temp_table = temp_table.append({'temp_info': temp_info}, ignore_index = True)

In [151]:
temp_table.head(160)

Unnamed: 0,temp_info
0,M1ANot assigned
1,M1BScarborough(Malvern / Rouge)
2,M1CScarborough(Rouge Hill / Port Union / Highl...
3,M1EScarborough(Guildwood / Morningside / West ...
4,M1GScarborough(Woburn)
...,...
155,M8VEtobicoke(New Toronto / Mimico South / Humb...
156,M8WEtobicoke(Alderwood / Long Branch)
157,M8XEtobicoke(The Kingsway / Montgomery Road / ...
158,M8YEtobicoke(Old Mill South / King's Mill Park...


In [152]:
temp_table.shape

(160, 1)

### Split one columns in multiple columns

In [153]:
temp_table['postal code'] = temp_table['temp_info'].str.slice(stop=3)

In [154]:
temp_table['temp_info'] = temp_table['temp_info'].str.slice(start=3)

In [155]:
temp_table.head(160)

Unnamed: 0,temp_info,postal code
0,Not assigned,M1A
1,Scarborough(Malvern / Rouge),M1B
2,Scarborough(Rouge Hill / Port Union / Highland...,M1C
3,Scarborough(Guildwood / Morningside / West Hill),M1E
4,Scarborough(Woburn),M1G
...,...,...
155,Etobicoke(New Toronto / Mimico South / Humber ...,M8V
156,Etobicoke(Alderwood / Long Branch),M8W
157,Etobicoke(The Kingsway / Montgomery Road / Old...,M8X
158,Etobicoke(Old Mill South / King's Mill Park / ...,M8Y


In [156]:
temp_table[['Borough', 'Neighborhoods', "crap"]] = temp_table.temp_info.str.split("(", expand=True,)

In [157]:
temp_table

Unnamed: 0,temp_info,postal code,Borough,Neighborhoods,crap
0,Not assigned,M1A,Not assigned,,
1,Scarborough(Malvern / Rouge),M1B,Scarborough,Malvern / Rouge),
2,Scarborough(Rouge Hill / Port Union / Highland...,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek),
3,Scarborough(Guildwood / Morningside / West Hill),M1E,Scarborough,Guildwood / Morningside / West Hill),
4,Scarborough(Woburn),M1G,Scarborough,Woburn),
...,...,...,...,...,...
155,Etobicoke(New Toronto / Mimico South / Humber ...,M8V,Etobicoke,New Toronto / Mimico South / Humber Bay Shores),
156,Etobicoke(Alderwood / Long Branch),M8W,Etobicoke,Alderwood / Long Branch),
157,Etobicoke(The Kingsway / Montgomery Road / Old...,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North),
158,Etobicoke(Old Mill South / King's Mill Park / ...,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...,


In [158]:
temp_table = temp_table.drop(['temp_info', 'crap'], axis=1)
temp_table

Unnamed: 0,postal code,Borough,Neighborhoods
0,M1A,Not assigned,
1,M1B,Scarborough,Malvern / Rouge)
2,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek)
3,M1E,Scarborough,Guildwood / Morningside / West Hill)
4,M1G,Scarborough,Woburn)
...,...,...,...
155,M8V,Etobicoke,New Toronto / Mimico South / Humber Bay Shores)
156,M8W,Etobicoke,Alderwood / Long Branch)
157,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North)
158,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


In [159]:
temp_table['Neighborhoods'] = temp_table['Neighborhoods'].str.replace(')','')

temp_table['Neighborhoods'] = temp_table['Neighborhoods'].str.replace('/',',')


### Show the temp_table

In [160]:
temp_table

Unnamed: 0,postal code,Borough,Neighborhoods
0,M1A,Not assigned,
1,M1B,Scarborough,"Malvern , Rouge"
2,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
3,M1E,Scarborough,"Guildwood , Morningside , West Hill"
4,M1G,Scarborough,Woburn
...,...,...,...
155,M8V,Etobicoke,"New Toronto , Mimico South , Humber Bay Shores"
156,M8W,Etobicoke,"Alderwood , Long Branch"
157,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
158,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


In [161]:
cleaned_df = temp_table.rename(columns={"postal code":"PostalCode", "Neighborhoods":"Neighborhood"})

In [162]:
cleaned_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M1B,Scarborough,"Malvern , Rouge"
2,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
3,M1E,Scarborough,"Guildwood , Morningside , West Hill"
4,M1G,Scarborough,Woburn
...,...,...,...
155,M8V,Etobicoke,"New Toronto , Mimico South , Humber Bay Shores"
156,M8W,Etobicoke,"Alderwood , Long Branch"
157,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
158,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


### Further Processing and Clearning per requirement

In [163]:
cleaned_df_1 = cleaned_df[cleaned_df.Borough != 'Not assigned']

In [164]:
cleaned_df_1

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1B,Scarborough,"Malvern , Rouge"
2,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
3,M1E,Scarborough,"Guildwood , Morningside , West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
...,...,...,...
155,M8V,Etobicoke,"New Toronto , Mimico South , Humber Bay Shores"
156,M8W,Etobicoke,"Alderwood , Long Branch"
157,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
158,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


In [165]:
cleaned_df_1['Neighborhood'] = np.where(cleaned_df_1['Neighborhood']=='Not assigned', cleaned_df_1['Borough'], cleaned_df_1['Neighborhood'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [146]:
cleaned_df_1

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1B,Scarborough,"Malvern , Rouge"
2,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
3,M1E,Scarborough,"Guildwood , Morningside , West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
...,...,...,...
155,M8V,Etobicoke,"New Toronto , Mimico South , Humber Bay Shores"
156,M8W,Etobicoke,"Alderwood , Long Branch"
157,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
158,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


In [166]:
df_cleaned = cleaned_df_1

### Part 2: Getting Coordinates for each Postal code

In [169]:
!conda install -c conda-forge geopy --yes

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python-3.7-OpenCE

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.52         |     pyhd8ed1ab_0          35 KB  conda-forge
    geopy-2.2.0                |     pyhd8ed1ab_0          67 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         102 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.52-pyhd8ed1ab_0
  geopy              conda-forge/noarch::geopy-2.2.0-pyhd8ed1ab_0



Downloading and Extracting Packages
geopy-2.2.0          | 67 KB     | ##################################### | 100% 
geographiclib-1.52   | 35 KB     | #########################

In [172]:
# Install the geocoder package

!conda install -c conda-forge geocoder

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python-3.7-OpenCE

  added / updated specs:
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          59 KB

The following NEW packages will be INSTALLED:

  geocoder           conda-forge/noarch::geocoder-1.38.1-py_1
  ratelim            conda-forge/noarch::ratelim-0.1.6-py_2



Downloading and Extracting Packages
geocoder-1.38.1      | 53 KB     | ##################################### | 100% 
ratelim-0.1.6        | 6 KB      | ##################################### |

In [175]:
# import geocoder # import geocoder

# for postal_code in df_cleaned['PostalCode']:


#     # initialize your variable to None
#     lat_lng_coords = None

#     # loop until you get the coordinates
#     while(lat_lng_coords is None):
#       g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#       lat_lng_coords = g.latlng

#     df_cleaned['latitude'] = df_cleaned['latitude'].append(lat_lng_coords[0])
#     df_cleaned['longitude'] = df_cleaned['longitude'].append(lat_lng_coords[1])
    
### This takes too long to run, will get "GeoSpatial Dataset" avaiable to finish this assignment. 



### Get the geospatial data

In [177]:
lat_lon = pd.read_csv('https://cocl.us/Geospatial_data')

In [178]:
lat_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [179]:
lat_lon.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)
df_merged = pd.merge(df_cleaned,lat_lon,on='PostalCode')
df_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
88,M8V,Etobicoke,"New Toronto , Mimico South , Humber Bay Shores",43.605647,-79.501321
89,M8W,Etobicoke,"Alderwood , Long Branch",43.602414,-79.543484
90,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North",43.653654,-79.506944
91,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,...",43.636258,-79.498509


In [180]:
df_toronto = df_merged[df_merged['Borough'].str.contains('Toronto', regex=False)]

In [182]:
df_toronto.head() # Getting the rows from the DF which contains 'Toronto' in the Borough

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
40,M4J,East YorkEast Toronto,The Danforth East,43.685347,-79.338106
41,M4K,East Toronto,"The Danforth West , Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar , The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923


In [183]:
### Getting the Geolocation of Toronto

address = 'Toronto, ON'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
#print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

print('The geographical coordinate of Toronto are {}, {}'.format(latitude, longitude))

The geographical coordinate of Toronto are 43.6534817, -79.3839347


### Visualizing Neighborhoods using Folium 

In [187]:
map_toronto = folium.Map(location=[latitude,longitude],zoom_start=12)

for lat,lng,borough,neighbourhood in zip(df_toronto['Latitude'],df_toronto['Longitude'],df_toronto['Borough'],df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_toronto)
map_toronto

### To cluster neighborhoods based on geolocation

In [190]:
k=5
toronto_clustering = df_toronto.drop(['PostalCode','Borough','Neighborhood'],1)
kmeans = KMeans(n_clusters = k,random_state=0).fit(toronto_clustering)
kmeans.labels_
df_toronto.insert(0, 'Cluster Labels', kmeans.labels_)

In [192]:
df_toronto.head()

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,4,M4E,East Toronto,The Beaches,43.676357,-79.293031
40,4,M4J,East YorkEast Toronto,The Danforth East,43.685347,-79.338106
41,4,M4K,East Toronto,"The Danforth West , Riverdale",43.679557,-79.352188
42,4,M4L,East Toronto,"India Bazaar , The Beaches West",43.668999,-79.315572
43,4,M4M,East Toronto,Studio District,43.659526,-79.340923


In [211]:
### Visualize the clusters

# create map
map_clusters = folium.Map(location=[latitude,longitude],zoom_start=13)

# set color scheme for the clusters
colors_array = cm.rainbow(np.linspace(0, 1, len(range(k))))

rainbow = [colors.rgb2hex(color) for color in colors_array]

#print(rainbow)

# add markers to the map
markers_colors = []
for lat, lon, neighbourhood, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighborhood'], df_toronto['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

['#8000ff', '#00b5eb', '#80ffb4', '#ffb360', '#ff0000']
