# Segmenting and Clustering Neighborhoods in Toronto - Part 1

## Importing required Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

#### Assigning the Wikipedia article URL to the variable __url__

In [2]:
# Address of the Wikipedia article "List of postal codes of Canada: M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

### Parsing the Wikipedia table using *requests* and *BeautifulSoup*

In [3]:
# Download the article. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops on an HTTP error response
# instead of silently parsing an error page.
req = requests.get(url, timeout=30)
req.raise_for_status()
data = req.text

# Parse the HTML and keep the first <table> element found in the document.
soup = BeautifulSoup(data, "html.parser")
table = soup.table
if table is None:
    # Fail here with a clear message rather than later with an
    # AttributeError on None when the rows are iterated.
    raise ValueError("No <table> element found at " + url)

### __*'table'*__ now holds the first `<table>` element of the Wikipedia article

In [4]:
# Accumulator for the scraped rows; starts out empty.
table_rows = list()

### Iterating over *'tr'* and *'td'* tags in __'table'__
When inside a __*'td'*__ tag, search for an __*'a'*__ tag and take its link text. Cells that contain no __*'a'*__ tag are treated as empty, which is how the grayed-out and 'Not assigned' cells are filtered.

Grayed Out and Not Assigned cells will be replaced by __*'None'*__

Extracting HTML table data into __*table_rows*__
 

In [5]:
def _link_text_or_none(cell):
    """Return the text of the first <a> tag inside ``cell``.

    Returns the placeholder string 'None' (not the ``None`` object) when the
    cell contains no <a> tag, so later cells can filter on that string.
    """
    link = cell.find('a')
    return 'None' if link is None else link.get_text()


# Start from an empty list inside this cell so that re-running the cell does
# not append a second copy of every row.
table_rows = []

trs = table.find_all('tr')
for tr in trs:
    cells = tr.find_all('td')
    # Rows without <td> cells (e.g. a header row) carry no data, and rows with
    # fewer than three cells cannot supply postal code, borough and
    # neighborhood, so both are skipped instead of raising IndexError.
    if len(cells) < 3:
        continue
    # Raw cell text may carry surrounding whitespace/newlines; trim it.
    postal_code = cells[0].get_text().strip()
    # NOTE(review): a cell that has plain text but no link is also mapped to
    # 'None' and is therefore dropped later - confirm this is intended.
    borough_name = _link_text_or_none(cells[1])
    neighborhood_name = _link_text_or_none(cells[2])
    table_rows.append([postal_code, borough_name, neighborhood_name])

#### Creating a DataFrame from __*table_rows*__ and assigning column names

In [6]:
# Turn the scraped rows into a DataFrame with one named column per field.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(data=table_rows, columns=column_names)

In [7]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Filtering out the rows that contain 'None'
__*'None'*__ indicates grayed out and Not assigned cells

In [8]:
# Keep only the rows whose Neighborhood is not the 'None' placeholder string.
has_neighborhood = df['Neighborhood'].ne('None')
df1 = df.loc[has_neighborhood]

In [9]:
# Keep only the rows whose Borough is not the 'None' placeholder string.
has_borough = df1['Borough'].ne('None')
df2 = df1.loc[has_borough]

In [10]:
# Renumber the rows from 0; the previous row labels are moved into a new
# 'index' column (drop=False is the default, spelled out here).
df3 = df2.reset_index(drop=False)

In [11]:
# Remove the 'index' column, leaving the remaining columns untouched.
df4 = df3.drop(columns=['index'])

#### The step below joins the Neighborhood values that share the same PostalCode and Borough

In [12]:
# Collect the Neighborhood values of each (PostalCode, Borough) pair and join
# them into one comma-separated string; reset_index() turns the group keys
# back into ordinary columns.
neighborhoods_by_code = df4.groupby(['PostalCode', 'Borough'])['Neighborhood']
p_codes = neighborhoods_by_code.apply(lambda names: ','.join(names)).reset_index()

In [13]:
# Preview the first ten aggregated rows instead of rendering the whole frame,
# which keeps the saved notebook output small and readable.
p_codes.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside"
9,M1N,Scarborough,Birch Cliff


In [14]:
# Size of the aggregated frame as (number of rows, number of columns).
p_codes.shape

(84, 3)