# Segmenting and Clustering Neighborhoods in Toronto
### 1. Start by creating a new notebook for this assignment.
#### First, let's install and import all the dependencies that we will need later.

In [1]:
import numpy as np  # library to handle data in a vectorized manner

import pandas as pd  # library for data analysis
# show every column and every row when a dataframe is displayed (no truncation)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json  # library to handle JSON files

# NOTE: the `!conda install` lines in this cell are active, so they run on every execution of the cell;
# comment them out once the packages are already present in the environment.
!conda install -c conda-forge geopy --yes                # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim  # convert an address into latitude and longitude values

import requests  # library to handle requests
from pandas.io.json import json_normalize  # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes        # uncomment this line if you haven't completed the Foursquare API lab
!conda install -c conda-forge libxslt==1.1.28 --yes
!conda install -c conda-forge html5lib --yes
!conda install -c conda-forge BeautifulSoup4 --yes
import html5lib  # HTML parser library
import folium  # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.21.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

#### The next step is to scrape the Wikipedia page and wrangle the data of Toronto neighborhoods.

In [2]:
# Download the Wikipedia list of Toronto ("M") postal codes and parse its HTML tables.
# NOTE: `bs` is imported here but not called directly in this cell; the parsing is done by pd.read_html.
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# the context manager closes the HTTP response even if reading it fails
with urlopen(url) as file:
    html = file.read()
# pd.read_html returns a list of dataframes, one per table found in the page;
# header=0 uses the first row of each table as the column names
toronto_data = pd.read_html(html, header=0)

# the first dataframe holds the Postcode / Borough / Neighbourhood table
toronto_df = toronto_data[0]
toronto_df.head(11)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


#### Remove all records from the dataframe where the borough is 'Not assigned'.

In [3]:
# keep only the rows with an assigned borough, i.e. drop every row whose Borough is 'Not assigned'
has_borough = toronto_df['Borough'].ne('Not assigned')
df_toronto = toronto_df.loc[has_borough]
df_toronto.head(11)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [4]:
# where the neighbourhood is 'Not assigned', use the row's borough as the neighbourhood instead
missing_name = df_toronto['Neighbourhood'].eq('Not assigned')
neighborhoods = df_toronto['Neighbourhood'].mask(missing_name, df_toronto['Borough'])

# dataframe with postcode, borough and the filled-in neighbourhood column
df_toronto_n = df_toronto[['Postcode', 'Borough']].assign(Neighbourhood=neighborhoods)
df_toronto_n.head(11)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


#### Combine neighborhoods that share the same postal code into one row, with the neighborhoods separated by commas.

In [5]:
# collapse all rows sharing a Postcode/Borough pair into one row,
# joining their neighbourhood names into a single ', '-separated string
grouped_names = df_toronto_n.groupby(['Postcode', 'Borough'])['Neighbourhood']
neighborhoods_tor = grouped_names.agg(', '.join).reset_index()
neighborhoods_tor.head(11)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


#### Use the `.shape` attribute to show the number of rows and columns of the dataframe.

In [6]:
# (number of rows, number of columns) of the grouped dataframe
neighborhoods_tor.shape

(103, 3)