# IBM Coursera Capstone Week 3 Assignment
## Clustering Neighbourhoods in Toronto
### Oct 19, 2019

### Part 1

#### Import required libraries

In [1]:
from io import StringIO

import pandas as pd
# import urllib.request, urllib.parse, urllib.error
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate

#### Scrape the table from Wikipedia and turn it into a dataframe

In [2]:
# Download the Wikipedia list of Canadian postal codes starting with 'M'.
res = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
# Fail loudly on an HTTP error instead of parsing an error page as data.
res.raise_for_status()

soup = BeautifulSoup(res.content, 'lxml')
# The first <table> element on the page is taken as the postcode table.
table = soup.find_all('table')[0]
# read_html wants a file-like object; passing literal HTML text is deprecated.
df = pd.read_html(StringIO(str(table)))[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Data cleanup
#### 1. Remove postcodes whose borough is "Not assigned"

In [4]:
# Keep only the rows whose borough is known. The explicit copy makes df an
# independent frame, so later in-place edits do not act on a slice of the
# scraped table (which triggers SettingWithCopyWarning). The original row
# labels are left as they are.
df = df[df.Borough != 'Not assigned'].copy()
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


#### 2. If a row has a borough but a 'Not assigned' neighbourhood, use the borough name as the neighbourhood

In [7]:
# Wherever the neighbourhood is the 'Not assigned' placeholder, use that row's
# borough name instead. A boolean mask updates every matching row in a single
# vectorised assignment rather than a Python-level loop over the rows.
unassigned = df['Neighbourhood'] == 'Not assigned'
df.loc[unassigned, 'Neighbourhood'] = df.loc[unassigned, 'Borough']

df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


#### 3. Merge neighbourhoods with the same postcode

In [13]:
# One row per (Postcode, Borough) pair: the remaining column values of each
# group are joined into a single comma-separated string, then the group keys
# are turned back into ordinary columns.
df_2 = (
    df.groupby(['Postcode', 'Borough'])
    .agg(', '.join)
    .reset_index()
)
df_2

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


#### Shape (rows, columns) of the cleaned-up dataframe

In [14]:
# (rows, columns) of df_2, the frame with neighbourhoods merged per postcode.
df_2.shape

(103, 3)

### Part 2

#### Fetch the csv

In [15]:
# Fetch the postal-code coordinates CSV and save it next to the notebook.
# Using requests instead of a shell call keeps the cell portable and lets an
# HTTP failure stop the cell before the success message is printed.
geo_response = requests.get('http://cocl.us/Geospatial_data', timeout=30)
geo_response.raise_for_status()
with open('toneighbourhood_location.csv', 'wb') as geo_file:
    geo_file.write(geo_response.content)
print('Data downloaded!')

Data downloaded!


In [16]:
# Read the downloaded coordinates file into a frame and preview the first rows.
location_df = pd.read_csv(filepath_or_buffer='toneighbourhood_location.csv')
location_df.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Create two empty columns for Lat and Long

In [29]:
# Add placeholder coordinate columns to df_2. NaN (rather than an empty
# string) keeps both columns float64, so coordinates written into them later
# stay numeric instead of being stored as generic Python objects.
df_2['Latitude'] = float('nan')
df_2['Longitude'] = float('nan')

df_2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",,
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",,
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",,
3,M1G,Scarborough,Woburn,,
4,M1H,Scarborough,Cedarbrae,,


#### Match latitude and longitude into df_2 by postcode

In [31]:
# Look up each postcode's coordinates in location_df and write them to df_2.
# A keyed lookup replaces the nested row-by-row loops and avoids chained
# assignment (df_2[col][j] = ...), which pandas warns about and which has no
# effect when Copy-on-Write is enabled.
coords_by_postcode = (
    location_df
    # If a postal code is listed more than once, the last listing wins.
    .drop_duplicates(subset='Postal Code', keep='last')
    .set_index('Postal Code')
)
# Postcodes with no entry in location_df end up as NaN.
df_2['Latitude'] = df_2['Postcode'].map(coords_by_postcode['Latitude'])
df_2['Longitude'] = df_2['Postcode'].map(coords_by_postcode['Longitude'])

df_2.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.8067,-79.1944
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7845,-79.1605
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7636,-79.1887
3,M1G,Scarborough,Woburn,43.771,-79.2169
4,M1H,Scarborough,Cedarbrae,43.7731,-79.2395
5,M1J,Scarborough,Scarborough Village,43.7447,-79.2395
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.7279,-79.262
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.7111,-79.2846
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.7163,-79.2395
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.6927,-79.2648
