# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto (Part 1)

### Importing all the libraries and dependencies needed

In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Lift pandas' display limits so frames are shown with every column and every row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): newer pandas releases expose this as pd.json_normalize and deprecate
# the pandas.io.json import path - confirm against the installed pandas version.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # installs folium 0.5.0 on every run; comment this line out if it is already installed
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                       

In [4]:
from bs4 import BeautifulSoup

### Load and Scrape the Required Data

##### Scraping and parsing of the data are done with the BeautifulSoup package

In [5]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
# The timeout stops the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() surfaces HTTP errors (4xx/5xx) here rather than letting an
# error page be parsed as if it contained the expected table.
page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
page.raise_for_status()
# Parse the raw response body with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, 'html.parser')

In [6]:
# Take the first <tbody> found in the parsed page
# (presumably the postal-code table - verify if the page layout changes).
table = soup.find('tbody')
# Every <tr> inside that table body.
rows = table.select('tr')
# Reduce each <tr> to its plain text; cell boundaries survive as newline characters.
row = [tr.get_text() for tr in rows]

### Cleaning of Data

In [7]:
# Load the scraped row texts into a one-column frame (column 0 holds the raw text).
df = pd.DataFrame(row)
# Break each row's text apart on newlines so every piece gets its own column.
df1 = df[0].str.split(pat='\n', expand=True)
# The first record carries the header text: promote it to column labels ...
df2 = df1.rename(columns=df1.iloc[0])
# ... and then remove that record from the data itself.
df3 = df2.drop(index=df2.index[0])
df3.head()

Unnamed: 0,Unnamed: 1,Postal Code,Unnamed: 3,Borough,Unnamed: 5,Neighborhood,Unnamed: 7
1,,M1A,,Not assigned,,Not assigned,
2,,M2A,,Not assigned,,Not assigned,
3,,M3A,,North York,,Parkwoods,
4,,M4A,,North York,,Victoria Village,
5,,M5A,,Downtown Toronto,,"Regent Park, Harbourfront",


### Drop rows with a borough that is "Not assigned"

In [12]:
# Keep only the rows whose Borough is not "Not assigned", then renumber the index from 0.
df4 = df3.loc[df3['Borough'].ne('Not assigned')].reset_index(drop=True)
df4.head(10)

Unnamed: 0,Unnamed: 1,Postal Code,Unnamed: 3,Borough,Unnamed: 5,Neighborhood,Unnamed: 7
0,,M3A,,North York,,Parkwoods,
1,,M4A,,North York,,Victoria Village,
2,,M5A,,Downtown Toronto,,"Regent Park, Harbourfront",
3,,M6A,,North York,,"Lawrence Manor, Lawrence Heights",
4,,M7A,,Downtown Toronto,,"Queen's Park, Ontario Provincial Government",
5,,M9A,,Etobicoke,,"Islington Avenue, Humber Valley Village",
6,,M1B,,Scarborough,,"Malvern, Rouge",
7,,M3B,,North York,,Don Mills,
8,,M4B,,East York,,"Parkview Hill, Woodbine Gardens",
9,,M5B,,Downtown Toronto,,"Garden District, Ryerson",


### Combining Neighbourhoods that share the same Postal Code and Borough

In [11]:
# Collapse rows that share a postal code and borough into a single row by
# comma-joining the string values of the remaining columns.
# sort=False keeps the groups in order of first appearance; reset_index()
# turns the group keys back into ordinary columns.
df5 = (
    df4
    .groupby(['Postal Code', 'Borough'], sort=False)
    .agg(','.join)
    .reset_index()
)
df5.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Assigning the Borough value to the Neighbourhood where the Neighbourhood is "Not assigned"

In [15]:
# For rows whose Neighborhood is "Not assigned", use the Borough as the Neighborhood.
# The write goes through .loc on df5 itself: the rows yielded by DataFrame.iterrows()
# are copies, so assigning to them (as a row-by-row loop would) leaves df5 unchanged.
# Avoiding the loop also keeps the notebook-level name `row` from being overwritten.
not_assigned = df5["Neighborhood"] == "Not assigned"
df5.loc[not_assigned, "Neighborhood"] = df5.loc[not_assigned, "Borough"]

In [18]:
# Show up to the first 12 rows of df5 to inspect the Neighborhood column after the previous step.
df5.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Print the number of rows in the dataframe

In [21]:
# (number of rows, number of columns) of df5.
df5.shape

(103, 3)