# For Capstone Project 
### Week 3 Final 
#### 1. Setting up the environment

In [17]:
# Numerical / tabular stack.
import numpy as np
import pandas as pd
# Passing None lifts pandas' display limits so frames are rendered without
# column/row truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json

# Installation of geopy is left commented out (presumably already available
# in this environment - confirm before a fresh run).
#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

# Web scraping / HTTP helpers.
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
# NOTE(review): newer pandas releases expose this as pd.json_normalize and
# deprecate this import path - confirm against the installed pandas version.
from pandas.io.json import json_normalize

# Matplotlib colour-map utilities.
import matplotlib.cm as cm
import matplotlib.colors as colors

# Clustering.
from sklearn.cluster import KMeans

# IPython shell escape: installs the pinned folium 0.5.0 from conda-forge
# into the kernel's environment before the import below.
!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  49.40 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  32.88 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  39.16 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  48.53 MB/s
Libraries imported.


#### 2. Getting data from Wikipedia using BeautifulSoup

In [4]:
# Source page: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')
table = soup.find('table')
if table is None:
    # soup.find returns None when nothing matches; without this guard the
    # next line would fail with an opaque AttributeError.
    raise ValueError('No <table> element found on the postal code page')
table_rows = table.findAll('tr')

# l collects one [postcode, borough, neighborhood] record per usable row
l = []
for tr in table_rows:
    # Strip surrounding whitespace from every cell. This removes the trailing
    # newline on the last cell without chopping a real character when the
    # newline is absent.
    raw_row = [cell.text.strip() for cell in tr.find_all('td')]

    # Skip header rows (no <td> cells), malformed rows with fewer than the
    # three expected cells, and postcodes whose borough is not assigned.
    if len(raw_row) < 3 or raw_row[1] == 'Not assigned':
        continue

    # Keep exactly the three columns the DataFrame below expects.
    edited_row = raw_row[:3]

    # An unassigned neighborhood falls back to the borough name.
    if edited_row[2] == 'Not assigned':
        edited_row[2] = edited_row[1]

    l.append(edited_row)


df = pd.DataFrame(l, columns=["Postcode", "Borough", 'Neighborhood'])

# One row per (Postcode, Borough); neighborhoods sharing a postcode are
# joined into a single comma-separated string.
df2 = (
    df.groupby(['Postcode', 'Borough'])['Neighborhood']
      .agg(', '.join)
      .reset_index()
)
df2.head(15)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [5]:
# (rows, columns) of the grouped frame - one row per Postcode/Borough pair.
df2.shape


(103, 3)

#### 3. Get geographical coordinates from the CSV file at "http://cocl.us/Geospatial_data"

In [6]:
# Location of the CSV holding one latitude/longitude pair per postal code.
GEOSPATIAL_CSV_URL = "http://cocl.us/Geospatial_data"

# Read the coordinates table straight from the URL.
geo_data = pd.read_csv(GEOSPATIAL_CSV_URL)

# Preview the first few rows.
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### 4. Combine the two data frames

In [18]:
# Coordinates-only view, kept under its original name for any cell that
# still refers to it.
df_geo = geo_data.drop(columns=['Postal Code'])

# Join on the postal code itself rather than gluing the frames side by side:
# a positional concat silently pairs a postcode with the wrong coordinates
# whenever the two tables differ in row order or row count.
#   how='left'             -> keep every row of df2, in df2's order
#   validate='many_to_one' -> raise if a postal code appears more than once
#                             in geo_data, which would duplicate rows
df3 = (
    df2.merge(
        geo_data,
        how='left',
        left_on='Postcode',
        right_on='Postal Code',
        validate='many_to_one',
    )
    .drop(columns=['Postal Code'])
)

# Show a preview only; display.max_rows is unlimited in this notebook, so
# displaying df3 itself would dump every row into the output.
df3.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


#### 5. Clustering the neighborhoods in Toronto

In [19]:
# Base map for the markers.
toronto_map = folium.Map(location=[43.65, -79.4], zoom_start=12)

# Feature matrix for clustering: one (latitude, longitude) row per record.
X = df3['Latitude']
Y = df3['Longitude']
Z = np.stack((X, Y), axis=1)

# One fill colour per cluster. Deliberately NOT named `colors`, which would
# rebind the `matplotlib.colors` module imported under that name.
cluster_colors = ['red', 'green', 'blue', 'yellow']

# Derive the cluster count from the palette so every label is guaranteed to
# have a colour (no IndexError if one is changed without the other).
kmeans = KMeans(n_clusters=len(cluster_colors), random_state=0).fit(Z)

clusters = kmeans.labels_
df3['Cluster'] = clusters

# Draw one circle per row, filled with its cluster's colour; the popup shows
# the borough name.
for latitude, longitude, borough, cluster in zip(df3['Latitude'], df3['Longitude'], df3['Borough'], df3['Cluster']):
    label = folium.Popup(borough, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color=cluster_colors[cluster],
        fill_opacity=0.7).add_to(toronto_map)

toronto_map