# First Assignment - Explore and cluster the neighborhoods in Toronto

## Scraping data from Wikipedia

In [2]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd

# Source page: Wikipedia's list of Canadian postal codes starting with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Use the response as a context manager so the underlying connection is
# closed as soon as the body has been read (the original left it open).
with urlopen(url) as page:
    # Prefer the charset announced by the server; fall back to UTF-8,
    # which is what the original code always assumed.
    charset = page.headers.get_content_charset() or "utf-8"
    html = page.read().decode(charset)

# Parse the downloaded markup; later cells query this object for the table.
toronto_soup = BeautifulSoup(html, "html.parser")

## Creating DataFrame

In [4]:
# Locate the postal-code table on the scraped page.
table = toronto_soup.find('table', attrs={'class': 'wikitable sortable'})
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("Postal code table ('wikitable sortable') not found on the page")

table_rows = table.find_all('tr')
# Column names come from the <th> cells, with trailing whitespace stripped.
headers = [h.get_text().rstrip() for h in table.find_all('th')]

# Collect one list of cell texts per data row. Rows without any <td>
# (e.g. the header row, which only has <th>) are skipped.
l = []
for tr in table_rows:
    cells = tr.find_all('td')
    if cells:
        l.append([cell.text.rstrip() for cell in cells])

df = pd.DataFrame(l, columns=headers)
# dropna() returns a new frame; the original call discarded the result,
# so rows with missing cells were never actually removed.
df = df.dropna()
print("DataFrame original shape: ", df.shape)
df.head()

DataFrame original shape:  (180, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Dropping rows whose borough is 'Not assigned'

In [5]:
# Keep only the rows whose Borough is something other than the
# 'Not assigned' placeholder; the original index labels are preserved.
df_filtered = df.loc[df['Borough'].ne('Not assigned')]
print("DataFrame filtered shape: ", df_filtered.shape)
df_filtered.head()

DataFrame filtered shape:  (103, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Checking for duplicate Postal Codes

In [6]:
# Check for repeated postal codes.
# `duplicate` is a boolean Series: True for every row whose 'Postal Code'
# already appeared in an earlier row (first occurrences are False).
duplicate = df_filtered.duplicated(subset=['Postal Code'])
# Flag kept from the original cell: 0 when no duplicates, 1 otherwise.
duplicate_values = 0
if duplicate.any():
    duplicate_values += 1
    # Show the offending postal codes. The original indexed with
    # `~duplicate`, which printed the NON-duplicated codes instead.
    print(df_filtered['Postal Code'][duplicate], end='\n\n')
if duplicate_values == 0:
    print("No duplicate values any longer")  # there are no duplicates
    

No duplicate values any longer


## Looking for 'Not assigned' neighbourhoods

In [7]:
# List rows that have a borough but still carry the placeholder neighbourhood.
# The scraped table spells the placeholder 'Not assigned' (lower-case "a");
# the original compared against 'Not Assigned', which could never match.
print(df_filtered.loc[df_filtered['Neighbourhood'] == 'Not assigned'])

Empty DataFrame
Columns: [Postal Code, Borough, Neighbourhood]
Index: []


## Dataframe shape

In [8]:
# Report the cleaned frame's dimensions as a (rows, columns) tuple.
print("Final data frame shape:", (len(df_filtered.index), len(df_filtered.columns)))

Final data frame shape: (103, 3)
