# Segmenting and Clustering Neighborhoods in Toronto project

## Part 1

In [1]:
# import the necessary libraries to run BeautifulSoup and requests
!pip install bs4
!pip install lxml
!pip install html5lib
from bs4 import BeautifulSoup
import requests
import lxml
import pandas as pd
import numpy as np
import html5lib
print("Libraries are installed!")

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1272 sha256=74e61011cefac7b9f7a0e72cd651ee5e64251614709c6db233d12c6768f11383
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/0a/9e/ba/20e5bbc1afef3a491f0b3bb74d508f99403aabe76eda2167ca
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1
Libraries are installed!


#### Create a variable called 'url' and define it as the Wikipedia site for Canadian postal codes

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
url

'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#### Read in the url into a variable called "Data

In [3]:
Data = requests.get(url).text

#### Create a BeautifulSoup object

In [4]:
soup = BeautifulSoup(Data,'html5lib')

#### Create an empty list called 'table_contents' and create a variable called 'table' that has the table from the Wikipedia page

In [5]:
table_contents=[]
table = soup.find('table')       

#### Extract information from the table and populate it into the table_contents list

In [6]:
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split(')')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

#### Create a dataframe called 'DataFrame' and populate it with the contents of the list 'table_contents'

In [7]:
DataFrame = pd.DataFrame(table_contents)
DataFrame

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York(Parkwoods,Parkwoods
1,M4A,North York(Victoria Village,Victoria Village
2,M5A,Downtown Toronto(Regent Park / Harbourfront,"Regent Park, Harbourfront"
3,M6A,North York(Lawrence Manor / Lawrence Heights,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park(Ontario Provincial Government,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke(The Kingsway / Montgomery Road / Old...,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto(Church and Wellesley,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
101,M8Y,Etobicoke(Old Mill South / King's Mill Park / ...,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


#### Update the Borough information for some of the Boroughs so that it doesn't include the long text.

In [8]:
DataFrame['Borough']=DataFrame['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

#### What is the shape of the dataframe?

In [9]:
DataFrame.shape

(103, 3)

#### Group DataFrame by postcode

In [12]:
df_postalcode = DataFrame.groupby(['PostalCode','Borough'])['Neighborhood'].apply(', '.join).reset_index()
df_postalcode.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough(Malvern / Rouge,"Malvern, Rouge"
1,M1C,Scarborough(Rouge Hill / Port Union / Highland...,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough(Guildwood / Morningside / West Hill,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough(Woburn,Woburn
4,M1H,Scarborough(Cedarbrae,Cedarbrae


#### What is the shape of the dataframe after grouping?

In [17]:
df_postalcode.shape

(103, 3)

#### Create a variable for the csv file called 'csv_path' and read it in as dataframe 'latlong_df'

In [18]:
csv_path = 'https://cocl.us/Geospatial_data'
latlong_df = pd.read_csv(csv_path, index_col='Postal Code')
# show first 5 lines of the dataframe
latlong_df.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


#### Find the Latitude and Longitude for each PostalCode and create a new dataframe called 'Toronto_df' that combines all the data

In [20]:
Toronto_df = df_postalcode.join(latlong_df, on='PostalCode')
Toronto_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough(Malvern / Rouge,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough(Rouge Hill / Port Union / Highland...,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough(Guildwood / Morningside / West Hill,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough(Woburn,Woburn,43.770992,-79.216917
4,M1H,Scarborough(Cedarbrae,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York(Weston,Weston,43.706876,-79.518188
99,M9P,Etobicoke(Westmount,Westmount,43.696319,-79.532242
100,M9R,Etobicoke(Kingsview Village / St. Phillips / M...,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke(South Steeles / Silverstone / Humber...,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
