<h1 align="center" style="color:blue"> Segmenting and Clustering Neighborhoods in Toronto</h1>

<h3 align="center" style="color:green"> *** Applied Data Science Capstone (Week3, course) ***</h3> 

In [57]:
# import libraries
import pandas as pd
import numpy as np

<h1 style="color:green"> Part I: Data Preparation  </h1>

## Scrape Wikipedia page

In [58]:
# Wikipedia page listing the Canadian postal codes that start with "M"
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# pd.read_html fetches the page and returns a list of DataFrames, one per HTML
# table it parses; the first entry (html[0]) becomes the working frame `dt`.
# NOTE(review): this relies on the postal-code table being the first table on
# the page -- confirm if the page layout changes.
html = pd.read_html(wiki_url)
dt = pd.DataFrame(html[0])
dt.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Transform DataFrame

### 3 columns: PostalCode, Borough, Neighbourhood

In [59]:
# Rename the 'Postcode' column to 'PostalCode'. Rebinding `dt` to the renamed
# frame gives the same result as renaming in place.
dt = dt.rename(columns={'Postcode': 'PostalCode'})
dt.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Ignore cells with a borough that is "Not assigned"

In [60]:
# Keep only the rows whose borough has actually been assigned.
borough_assigned = dt['Borough'].ne('Not assigned')
dt = dt.loc[borough_assigned]
dt.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


### Replace "Not assigned" neighbourhoods with the borough name

In [61]:
# Where a neighbourhood is 'Not assigned', fall back to the borough name of the
# same row. Series.mask aligns the replacement values row by row on the index,
# which Series.replace(scalar, Series) does not do (recent pandas rejects that
# call outright). assign() returns a new frame, so nothing is written in place
# into the filtered slice produced earlier (no SettingWithCopyWarning, and the
# result is kept under copy-on-write).
dt = dt.assign(
    Neighbourhood=dt['Neighbourhood'].mask(
        dt['Neighbourhood'] == 'Not assigned', dt['Borough']
    )
)

### Combining data

In [62]:
# Collapse rows that share a postal code and borough into a single row whose
# Neighbourhood value is the comma-separated list of the original names.
dt = (
    dt.groupby(['PostalCode', 'Borough'])['Neighbourhood']
    .agg(','.join)
    .reset_index()
)
# Preview only: showing the first rows keeps the cell output short instead of
# rendering the entire frame.
dt.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


### Shape of the DataFrame

In [63]:
# (number of rows, number of columns) of `dt`
dt.shape

(103, 3)

In [64]:
# Report the row count of `dt` on its own line.
print('Number of rows: ', len(dt))

Number of rows:  103


<h1 style="color:green"> Part II: Get the latitude and the longitude coordinates </h1>

## Download & Pre-process GeoSpatial data

In [65]:
# Download the geospatial dataset with wget: -q suppresses progress output and
# -O writes the response to 'Geo_data.csv' in the working directory.
!wget -q -O 'Geo_data.csv' https://cocl.us/Geospatial_data
# List the working directory to confirm the file was written.
!ls

Geo_data.csv


In [66]:
# Load the downloaded CSV into `geo`, then print its column names, dtypes and
# non-null counts.
geo = pd.read_csv('Geo_data.csv')
geo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
Postal Code    103 non-null object
Latitude       103 non-null float64
Longitude      103 non-null float64
dtypes: float64(2), object(1)
memory usage: 2.5+ KB


In [67]:
# Give the key column the same name it has in `dt` ('PostalCode') so the two
# frames can be joined on it; rebinding replaces the in-place rename.
geo = geo.rename(columns={'Postal Code': 'PostalCode'})
geo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
PostalCode    103 non-null object
Latitude      103 non-null float64
Longitude     103 non-null float64
dtypes: float64(2), object(1)
memory usage: 2.5+ KB


In [68]:
# First ten rows of the coordinate table
geo.head(10)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


## Merge Data

In [69]:
# First ten rows of the neighbourhood table, shown for comparison before merging
dt.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [70]:
# Attach Latitude/Longitude to every postal code. A left join keeps all rows of
# `dt`, including any postal code that has no match in `geo`.
# Dropping geo's non-key columns from `dt` first makes the cell safe to re-run:
# without it a second run would merge again and yield Latitude_x/Latitude_y.
geo_value_cols = geo.columns.difference(['PostalCode'])
dt = dt.drop(columns=geo_value_cols, errors='ignore').merge(
    geo,
    on="PostalCode",
    how="left",
    validate="many_to_one",  # fail loudly if geo lists a postal code twice
)
dt.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848
