# Segmenting and Clustering Neighborhoods in Toronto Notebook - Q1 #

Let's start by importing the libraries needed for web scraping

In [1]:
import requests
from bs4 import BeautifulSoup

Load the webpage and turn it into a soup object

In [2]:
# Download the Wikipedia page for the list of Canadian postal codes starting with "M".
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error (4xx/5xx) instead of
# silently parsing an error page in the cells below.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()

# NOTE: despite its name, `website_url` holds the HTML text of the page (name kept as-is).
website_url = response.text

# Parse the HTML with the lxml parser so the table can be searched
soup = BeautifulSoup(website_url, 'lxml')

Extract the data table

In [3]:
# Locate the <table> element whose class attribute is 'wikitable sortable'
my_table = soup.find('table', attrs={'class': 'wikitable sortable'})

Split the rows

In [4]:
# Collect every <tr> element found inside the table (header row included)
rows = my_table.find_all(name='tr')

Assign the cells of each row to the corresponding Postcode, Borough and Neighborhood lists, applying these rules:
* Ignore rows whose borough is **Not assigned**
* If a postal code is listed more than once (one row per neighborhood), these rows will be combined into one row with the neighborhoods separated by a comma
* A **Not assigned** neighborhood will be the same as the borough

_We are ignoring the first row as it has the headers_

In [5]:
# Initiate lists (parallel: the n-th entry of each list describes the same postcode)
postcode = []
borough = []
neighborhood = []

# Placeholder text the table uses for a missing borough / neighborhood
NOT_ASSIGNED = 'Not assigned'


def _parse_row(row):
    """Return the (postcode, borough, neighborhood) texts of one table row.

    Surrounding whitespace (such as the trailing newline of the last cell) is
    stripped from every cell, and a 'Not assigned' neighborhood is replaced by
    the borough of the same row.

    Raises IndexError if the row has fewer than three <td> cells.
    """
    cells = row.find_all('td')
    code = cells[0].text.strip()
    boro = cells[1].text.strip()
    hood = cells[2].text.strip()
    if hood == NOT_ASSIGNED:  # Neighborhood is 'Not assigned' -> use the borough
        hood = boro
    return code, boro, hood


# The index of the first row with non 'Not Assigned' borough
# (stays at 1 if every row has a 'Not assigned' borough)
i = 1

# Find the index of the first row with non 'Not Assigned' borough
for j in range(1, len(rows)):  # Starts from 1 because first row is header
    if rows[j].find_all('td')[1].text.strip() != NOT_ASSIGNED:  # Borough is assigned
        i = j  # Update the index of the first row with non 'Not Assigned' borough
        break  # Leave for loop
print("The index of the first row with non 'Not Assigned' borough: " + str(i))

# Parse the first row with non 'Not Assigned' borough and append data to the lists.
# The neighborhood value is always a string here: the borough text is substituted
# inside _parse_row, so no Tag object is ever appended to the list.
code, boro, hood = _parse_row(rows[i])
postcode.append(code)
borough.append(boro)
neighborhood.append(hood)

print("First row with non 'Not Assigned' borough: " + str(postcode + borough + neighborhood))

# Start with the next row and traverse rows
for k in range(i + 1, len(rows)):
    code, boro, hood = _parse_row(rows[k])
    if code == postcode[-1]:
        # Same postcode as the previous kept row: merge into its neighborhood text
        neighborhood[-1] += ', ' + hood
    elif boro != NOT_ASSIGNED:
        # New postcode with an assigned borough: start a new entry
        postcode.append(code)
        borough.append(boro)
        neighborhood.append(hood)
    # Rows with a new postcode and a 'Not assigned' borough are skipped

The index of the first row with non 'Not Assigned' borough: 3
First row with non 'Not Assigned' borough: ['M3A', 'North York', 'Parkwoods']


Import pandas library and zip the lists to convert to a dataframe

In [6]:
import pandas as pd

# Column labels, in the same order as the zipped lists
column_names = ['Postcode', 'Borough', 'Neighborhood']

# Pair up the n-th postcode, borough and neighborhood into one record per row
list_of_tuples = [
    (code, boro, hood)
    for code, boro, hood in zip(postcode, borough, neighborhood)
]
df = pd.DataFrame(data=list_of_tuples, columns=column_names)

# Preview the first 12 rows
df.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


There are 103 rows in the dataframe as seen below

In [7]:
# Show the dataframe dimensions as (number of rows, number of columns)
df.shape

(103, 3)