# Segmentation and Clustering of Neighbourhood

### Import Required Module for Python

In [2]:
import itertools
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize

### Code to scrape the Wikipedia page of "List of postal codes of Canada"

In [3]:
req_canada=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup_canada=BeautifulSoup(req_canada.content,'lxml')
table_canada=soup_canada.find_all('table')
df=pd.read_html(str(table_canada))

### convert the list to dataframe using panda

In [4]:
neighbor_canada=pd.DataFrame(df[0])
neighbor_canada.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Ignore cells with a borough that is Not assigned

In [5]:
neighbor_canada.drop(neighbor_canada[neighbor_canada['Borough']=="Not assigned"].index,axis=0, inplace=True)
neighbor_canada.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### If more than one neighborhood exists in one postal code area, these rows will be combined into one row with the neighborhoods separated with a comma

In [6]:
df_canada = neighbor_canada.groupby(['Postal Code','Borough'], sort=False).agg(', '.join)
df_canada.reset_index(inplace=True)


### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [7]:
df_canada.loc[df_canada['Neighbourhood']=="Not assigned",'Neighbourhood']=df_canada.loc[df_canada['Neighbourhood']=="Not assigned",'Borough']
df_canada.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
df_canada.shape

(103, 3)

### Find out the geographical coordinates of each postal code

In [9]:
latitude_longitude = pd.read_csv('https://cocl.us/Geospatial_data')


### Merge the latitude and longitude to the dataframe in separate columns

In [10]:
df_with_coordinate = pd.merge(df_canada,latitude_longitude,on='Postal Code')

### Find out the neighbourhood of Toronto based on "Borough" column

In [11]:
df_toronto = df_with_coordinate[df_with_coordinate['Borough'].str.contains('Toronto')]
df_toronto

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


## Part 3

### Get the latitude and longitude values of Toronto

In [12]:
from geopy.geocoders import Nominatim
geo_locator = Nominatim(user_agent="toronto_explorer")
location_toronto = geo_locator.geocode("Toronto, ON")
latitude_toronto = location_toronto.latitude
longitude_toronto = location_toronto.longitude
print('latitude:',latitude_toronto,',', 'longitude:',longitude_toronto)

latitude: 43.6534817 , longitude: -79.3839347


### To visualize the map of Toronto using coordinates 

In [13]:
import folium
Toronto_map=folium.Map(location=[latitude_toronto,longitude_toronto])
Toronto_map


### Explore the Toronto neighbourhood

In [15]:
for latitude_toronto, longitude_toronto, borough, neighborhood in zip(
        df_toronto['Latitude'], 
        df_toronto['Longitude'], 
        df_toronto['Borough'], 
        df_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    
    folium.CircleMarker(
        [latitude_toronto, longitude_toronto],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
        parse_html=False).add_to(Toronto_map)  

Toronto_map   

### Using K Means Clustering Method, cluster the Toronto dataset

In [16]:
from sklearn.cluster import KMeans
kmeans=KMeans(n_clusters=5)
kmeans.fit(df_toronto.drop(['Postal Code','Borough','Neighbourhood'],axis=1))

df_toronto['Cluster Labels']=kmeans.labels_


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


### Now, Visualize the toronto map based on cluster data 

In [17]:
for latitude_toronto, longitude_toronto, neighborhood, cluster in zip(
        df_toronto['Latitude'], 
        df_toronto['Longitude'], 
        df_toronto['Neighbourhood'],
        df_toronto['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    
    folium.CircleMarker(
        [latitude_toronto, longitude_toronto],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
        parse_html=False).add_to(Toronto_map)  

Toronto_map  

### That's the end of this assignment.