# 1. import necessaries library for setting up the environment

In [1]:
#main library for data analysis
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
#library for json file
import json 
#convert address to latitude, longitude values 
from geopy.geocoders import Nominatim
#library for handle request
import requests
#transform json file into pandas df
from pandas.io.json import json_normalize
#matplotlib for plotting module
import matplotlib.cm as cm
import matplotlib.colors as colors
#slkearn for KMeans Cluster
from sklearn.cluster import KMeans
#folium for mapping 
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    altair-3.1.0               |           py36_0         724 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    certifi-2019.6.16          |           py36_1         149 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be 

# 2. Web scrapping 

In [2]:
#import Beautifulsoup for scrapping wiki
from bs4 import BeautifulSoup

In [3]:
#store wiki page in url
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#open url with beautiful soup
res = requests.get(url)
soup = BeautifulSoup(res.text, 'html.parser')

In [5]:
#print(soup)

#### if you print soup you get text in html so we needed to scrape them into table with 3 columns
#### Create a [ ] so we can stored data within 3 cloumns 

In [4]:
#create an empty list to stored 
postalCode_list = [] 
borough_list = [] 
neighborhood_list = []
#create for loop to find tag<tr> and tag<td>
for i in soup.find('table').find_all('tr'):
    x = i.find_all('td')
    if (len(x) > 0):
        postalCode_list.append(x[0].text)
        borough_list.append(x[1].text)
        neighborhood_list.append(x[2].text.rstrip('\n'))
print( 'This is postalCode_list',postalCode_list[0:5])
print('This is borough_list',borough_list[0:5])
print('This is neihborhood_list',neighborhood_list[0:5])

This is postalCode_list ['M1A', 'M2A', 'M3A', 'M4A', 'M5A']
This is borough_list ['Not assigned', 'Not assigned', 'North York', 'North York', 'Downtown Toronto']
This is neihborhood_list ['Not assigned', 'Not assigned', 'Parkwoods', 'Victoria Village', 'Harbourfront']


#### Let's put these in DataFrame

In [5]:
df = [('PostalCode', postalCode_list), ('Borough', borough_list),('Neighborhood', 
       neighborhood_list)]

In [6]:
df1 = pd.DataFrame.from_dict(dict(df))
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### As we can see there are 'Not assigned' in Borough, Neighborhood columns, we don't want to deal with missing valued

In [8]:
df1_dropna = df1[df1['Borough'] != 'Not assigned'].reset_index(drop = True)
df1_dropna.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### From the instruction,
#### 'More than one neighborhood can exist in one postal code area. 
#### For example, in the table on the Wikipedia page, 
#### you will notice that M5A is listed twice and has two neighborhoods: 
#### Harbourfront and Regent Park. These two rows will be combined 
#### into one row with the neighborhoods separated with a comma 
#### as shown in row 11 in the above table
##### let's grouped neighbood by Postal and Borough

In [9]:
df1_dropna_grouped = df1_dropna.groupby(['PostalCode','Borough'],
as_index = False).agg(lambda x: ','.join(x))
df_toronto = df1_dropna_grouped
df_toronto.loc[80:85]

Unnamed: 0,PostalCode,Borough,Neighborhood
80,M6M,York,"Del Ray,Keelesdale,Mount Dennis,Silverthorn"
81,M6N,York,"The Junction North,Runnymede"
82,M6P,West Toronto,"High Park,The Junction South"
83,M6R,West Toronto,"Parkdale,Roncesvalles"
84,M6S,West Toronto,"Runnymede,Swansea"
85,M7A,Queen's Park,Not assigned


#### According to the instruction,
#### 'If a cell has a borough but a Not assigned neighborhood, 
#### then the neighborhood will be the same as the borough. 
#### So for the 9th cell in the table on the Wikipedia page, 
#### the value of the Borough and the Neighborhood columns will be Queen's Park.'
#### 85th has 'Not assigned' valued in Neighborhood, so we decied to assign <br><br> the value to be the same as Borough

In [10]:
neighborhood_na = df1_dropna_grouped['Neighborhood'] == 'Not assigned'
#located the 'Not assigned' and assign to the same value as Borough
df1_dropna_grouped.loc[neighborhood_na, 'Neighborhood'] = df1_dropna_grouped.loc[neighborhood_na, 'Borough']
df1_dropna_grouped.loc[80:85]

Unnamed: 0,PostalCode,Borough,Neighborhood
80,M6M,York,"Del Ray,Keelesdale,Mount Dennis,Silverthorn"
81,M6N,York,"The Junction North,Runnymede"
82,M6P,West Toronto,"High Park,The Junction South"
83,M6R,West Toronto,"Parkdale,Roncesvalles"
84,M6S,West Toronto,"Runnymede,Swansea"
85,M7A,Queen's Park,Queen's Park


In [11]:
Cleaned_df = df1_dropna_grouped
Cleaned_df.shape

(103, 3)

# 3. Getting Coordinates from csv file

#### After we have built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood.

In [12]:
!wget -q -O 'Geospatial_data.csv' https://cocl.us/Geospatial_data
print('download completed!')

download completed!


In [13]:
coor = pd.read_csv('Geospatial_data.csv')
coor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### As you can see in coor DataFrame has the same column as our df_toronto DataFrame so let's merge them together to get a full df 

In [14]:
Cleaned_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [15]:
#First we indexes by df.PostalCode = coor.Postal Code and inner join(intersect)
df_toronto_1 = Cleaned_df.set_index('PostalCode')
coor_1 = coor.set_index('Postal Code')
df_toronto_full = pd.concat([df_toronto_1, coor_1], axis = 1, join = 'inner')
df_toronto_full.reset_index(inplace = True)
df_toronto_full.rename(columns = {'index':'PostalCode'}, inplace = True)
Toronto_df = df_toronto_full

In [96]:
Toronto_df.loc[1:11]
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(Toronto_df['Borough'].unique()),
        Toronto_df.shape[0]))

The dataframe has 11 boroughs and 103 neighborhoods.


# 4. Explore and Cluster
##### In this part i do alot of experiment on each df 
##### the reult maybe not accurate for KMeans (set k = feature size of feature set)

In [87]:
#Use geopy library to get the latitude and longitude values of Toronto.
#In order to define an instance of the geocoder, we need to define a user_agent. 
#We will name our agent toronto_neigh-agent, as shown below.
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_neigh-agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [88]:
#Mapping Toronto with Toronto_df which contain all the Borough
map_toronto_all = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, long, post, borough, neigh in zip(Toronto_df['Latitude'], Toronto_df['Longitude'], Toronto_df['PostalCode'], Toronto_df['Borough'], Toronto_df['Neighborhood']):
    label = "{} ({}): {}".format(borough, post, neigh)
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_all)
    
map_toronto_all

In [98]:
# Mapping with KMeans Clustering
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=12)

X = Toronto_df['Latitude']
Y = Toronto_df['Longitude']
Z = np.stack((X, Y), axis=1)

kmeans = KMeans(n_clusters=11, random_state=0).fit(Z)

clusters = kmeans.labels_
colors = ['red', 'green', 'blue', 'yellow', 'black', 'grey', 'y', 'navy', 'darksalmon', 'lightpink', 'forestgreen']
Toronto_df['Cluster'] = clusters

for latitude, longitude, borough, cluster in zip(Toronto_df['Latitude'], Toronto_df['Longitude'], Toronto_df['Borough'], Toronto_df['Cluster']):
    label = folium.Popup(borough, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color=colors[cluster],
        fill_opacity=0.7).add_to(toronto_map)  

toronto_map

##### According to the instruction, explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto and then replicate the same analysis we did to the New York City data. 

In [89]:
#Using only Borough cols that contains word 'Toronto'
Borough_Toronto = Toronto_df[Toronto_df['Borough'].str.contains('Toronto')]
#reset index apply to df
Borough_Toronto.reset_index(drop = True, inplace = True)
Borough_Toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049


In [99]:
#feature size of feature set and shape
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(Borough_Toronto['Borough'].unique()),
        Borough_Toronto.shape[0]))
print('This is a shape of New df:', Borough_Toronto.shape)

The dataframe has 4 boroughs and 38 neighborhoods.
This is a shape of New df: (38, 6)


In [90]:
# create map of Borough_Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Marker
for lat, lng, borough, neighborhood in zip(Borough_Toronto['Latitude'], Borough_Toronto['Longitude'], Borough_Toronto['Borough'], Borough_Toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

In [100]:
# Mapping with KMeans Clustering
Borough_Toronto_map = folium.Map(location=[latitude, longitude], zoom_start=12)

X = Borough_Toronto['Latitude']
Y = Borough_Toronto['Longitude']
Z = np.stack((X, Y), axis=1)

kmeans = KMeans(n_clusters=4, random_state=0).fit(Z)

clusters = kmeans.labels_
colors = ['red', 'green', 'blue', 'yellow']
Borough_Toronto['Cluster'] = clusters

for latitude, longitude, borough, cluster in zip(Borough_Toronto['Latitude'], Borough_Toronto['Longitude'], Borough_Toronto['Borough'], Borough_Toronto['Cluster']):
    label = folium.Popup(borough, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color=colors[cluster],
        fill_opacity=0.7).add_to(Borough_Toronto_map)  

Borough_Toronto_map

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
