# Segmenting and Clustering Neighborhoods in Toronto - Part 2

## Importing required Libraries

In [2]:
from bs4 import BeautifulSoup   #Python package for parsing HTML and XML documents

import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# NOTE: the install below is active, so it runs every time this cell is executed.
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

%matplotlib inline

# import k-means from clustering stage
from sklearn.cluster import KMeans

# NOTE: the install below is active, so it runs every time this cell is executed.
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.0.2p             |       h470a237_1         3.1 MB  conda-forge
    certifi-2018.10.15         |        py36_1000         138 KB  conda-forge
    geopy-1.17.0               |             py_0          49 KB  conda-forge
    ca-certificates-2018.10.15 |       ha4d7672_0         135 KB  conda-forge
    conda-4.5.11               |        py36_1000         651 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.1 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0            conda-forge
    geopy:           

### Assigning the Wikipedia article URL to __*url*__

In [3]:
# Wikipedia article listing the Canadian postal codes that begin with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

### Parsing the table using *requests* and *BeautifulSoup*

In [4]:
# Download the article. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the article.
req = requests.get(url, timeout=30)
req.raise_for_status()
data = req.text

# Parse the HTML and keep the first <table> element of the document.
soup = BeautifulSoup(data, "html.parser")
table = soup.table
if table is None:
    # Fail here with a clear message rather than later with an AttributeError.
    raise ValueError('No <table> element found at {}'.format(url))

### __*table*__ now holds all the table data from the Wikipedia article

In [5]:
# Empty list that will collect one [postal code, borough, neighborhood] entry per table row
table_rows = list()

#### Iterating over *'tr'* and *'td'* tags in __'table'__

Within each __*'td'*__ cell, search for an __*'a'*__ tag and take its text. Grayed-out and Not Assigned cells contain no link, so this lets us filter them out.

Grayed Out and Not Assigned cells will be replaced by __*'None'*__

Extracting table data into __*table_rows*__

In [6]:
# Start from an empty accumulator so that re-running this cell does not append
# every row a second time.
table_rows = []

trs = table.find_all('tr')
for tr in trs:
    td = tr.find_all('td')
    # Rows without <td> cells are skipped, as are rows that do not have the
    # three cells indexed below (which would otherwise raise IndexError).
    if len(td) < 3:
        continue
    # strip() removes any whitespace/newline surrounding the cell text.
    postal_code = td[0].getText().strip()
    # A cell without a link is recorded as the string 'None'; later cells
    # filter on exactly that string.
    district = td[1].find('a')
    dist_name = 'None' if district is None else district.getText()
    Neighborhood = td[2].find('a')
    Neig_hood = 'None' if Neighborhood is None else Neighborhood.getText()
    table_rows.append([postal_code, dist_name, Neig_hood])

#### Creating Dataframe from __*table_rows*__ and assigning column names

In [7]:
# One DataFrame row per scraped table row, with explicit column labels
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(data=table_rows, columns=column_names)

#### Filtering out the rows which have 'None'.
'None' indicates grayed-out and Not Assigned cells

In [8]:
# Keep only the rows whose Neighborhood is not the 'None' placeholder
has_neighborhood = df['Neighborhood'] != 'None'
df1 = df[has_neighborhood]

In [9]:
# Keep only the rows whose Borough is not the 'None' placeholder
has_borough = df1['Borough'] != 'None'
df2 = df1[has_borough]

In [10]:
# Renumber the rows from 0; the previous index is kept as a column named 'index'
df3 = df2.reset_index(drop=False)

In [11]:
# Discard the 'index' column
df4 = df3.drop(columns=['index'])

#### Below step joins values in Neighborhood column based on postalCode and Borough

In [12]:
# One row per (PostalCode, Borough) pair, with its neighborhoods joined by commas
p_codes = (
    df4.groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(','.join)
    .reset_index()
)

In [13]:
# Preview the first rows of the grouped frame
p_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [14]:
# (number of rows, number of columns) of the grouped frame
p_codes.shape

(84, 3)

In [15]:
# Take a real copy of 'p_codes'. A plain assignment would only create a second
# name for the same DataFrame, so columns added to df5 would also appear in p_codes.
df5 = p_codes.copy()

### Creating 2 functions to get Latitude and Longitude

I'm using the geopy "geolocator" tool (Nominatim) instead of the geocoder that was given to us as a sample.

I'm using the Neighborhood (only the first one listed for each postal code) and the Borough to get the latitude and longitude.

In [17]:
# Function for getting Latitude
def lat(elem, geolocator=None):
    """Return the latitude geocoded for one (Neighborhood, Borough) pair.

    Parameters
    ----------
    elem : two-item iterable
        ``(neighborhood, borough)``. ``neighborhood`` may be a comma-separated
        list; only its first entry is used for the lookup.
    geolocator : object with a ``geocode(query)`` method, optional
        Geocoder to use. When omitted, a ``Nominatim`` instance is created.
        Passing one in lets callers reuse a single geocoder across rows.

    Returns
    -------
    float
        Latitude of "<first neighborhood>,<borough>"; if that lookup finds
        nothing, the latitude of the borough alone; NaN if both lookups fail.
    """
    n, b = elem
    first_neighborhood = n.split(",")[0]
    if geolocator is None:
        # An explicit user agent is required: geopy 1.x warns about the
        # default one and geopy 2.x refuses it.
        geolocator = Nominatim(user_agent="toronto_neighborhood_clustering")
    location = geolocator.geocode('{},{}'.format(first_neighborhood, b))
    if location is None:
        # Fall back to the borough on its own.
        location = geolocator.geocode(b)
    if location is None:
        # Nothing found at all: return NaN instead of raising AttributeError.
        return float('nan')
    return location.latitude

In [18]:
# Function for getting Longitude
def lng(elem, geolocator=None):
    """Return the longitude geocoded for one (Neighborhood, Borough) pair.

    Parameters
    ----------
    elem : two-item iterable
        ``(neighborhood, borough)``. ``neighborhood`` may be a comma-separated
        list; only its first entry is used for the lookup.
    geolocator : object with a ``geocode(query)`` method, optional
        Geocoder to use. When omitted, a ``Nominatim`` instance is created.
        Passing one in lets callers reuse a single geocoder across rows.

    Returns
    -------
    float
        Longitude of "<first neighborhood>,<borough>"; if that lookup finds
        nothing, the longitude of the borough alone; NaN if both lookups fail.
    """
    n, b = elem
    first_neighborhood = n.split(",")[0]
    if geolocator is None:
        # An explicit user agent is required: geopy 1.x warns about the
        # default one and geopy 2.x refuses it.
        geolocator = Nominatim(user_agent="toronto_neighborhood_clustering")
    location = geolocator.geocode('{},{}'.format(first_neighborhood, b))
    if location is None:
        # Fall back to the borough on its own.
        location = geolocator.geocode(b)
    if location is None:
        # Nothing found at all: return NaN instead of raising AttributeError.
        return float('nan')
    return location.longitude

#### Applying the functions and getting Latitude and Longitude

In [19]:
# Geocode each row's (Neighborhood, Borough) pair and store the latitude
place_columns = df5[['Neighborhood', 'Borough']]
df5['latitude'] = place_columns.apply(func=lat, axis=1)



In [21]:
# Geocode each row's (Neighborhood, Borough) pair and store the longitude
place_columns = df5[['Neighborhood', 'Borough']]
df5['longitude'] = place_columns.apply(func=lng, axis=1)



In [24]:
# Display the frame with the added latitude and longitude columns.
# The whole frame is shown; the import cell sets display.max_rows to None.
df5

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M1B,Scarborough,"Rouge,Malvern",43.80493,-79.165837
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.790117,-79.173334
2,M1E,Scarborough,"Morningside,West Hill",43.782601,-79.204958
3,M1G,Scarborough,Woburn,43.759824,-79.225291
4,M1H,Scarborough,Cedarbrae,43.756467,-79.226692
5,M1J,Scarborough,Scarborough Village,43.743742,-79.211632
6,M1K,Scarborough,"Ionview,Kennedy Park",43.73599,-79.276515
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.708823,-79.295986
8,M1M,Scarborough,"Cliffcrest,Cliffside",43.721939,-79.236232
9,M1N,Scarborough,Birch Cliff,43.702112,-79.260091
