# Segmenting and Clustering Neighborhoods in Toronto (Latitude and Longitude) 

#### Importing libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv

In [2]:
pd.set_option('max_colwidth', 800)

#### Getting the source of the webpage and assigining the variable source to it and iniatilizing the beautifulsoup object to soup

In [3]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text 
soup = BeautifulSoup(source, 'lxml')

#### Initializing the csv_writer object and writing the name of the columns on it as the first row

In [4]:
csv_file = open('toronto_postal_codes.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Postal Code', 'Borough', 'Neighbourhood'])

35

#### Scraping the webpage to extract the data table

In [5]:
table = soup.find('table', class_ = 'wikitable sortable') # Gets the table from the webpage
rows = table.find_all('tr') # Gets the table rows

postcodes = [] # Initializes the raw postcodes list
boroughs = [] # Initializes the raw boroughs list
neighbourhoods = [] # Initializes the raw neighbourhoods list

for row in rows:    
    columns = row.find_all('td')
    try :
        if columns[1].text != 'Not assigned':  # To skip if the borough name is 'Not Assigned'
            
            postcode = columns[0].text
            postcodes.append(postcode)
            
            borough = columns[1].text
            boroughs.append(borough)
            
            neighbourhood = columns[2].text.split('\n')[0] # Removing the newline character at the end     
            
            if neighbourhood == 'Not assigned': # Assigning the same name to neighbourhood if it is 'Not Assigned'
                neighbourhood = borough            
                
            neighbourhoods.append(neighbourhood)
             
    except Exception as e : # To skip the first row which contains column names
        pass 
    
postcode_explored = [] # Initializing the list of explored postcodes
for index_i, postcode_i in enumerate(postcodes) :   
    if postcode_i not in postcode_explored :
        nbds = neighbourhoods[index_i]
        for index_f, postcode_f in enumerate(postcodes) :
            if postcode_i == postcode_f and index_i != index_f:
                nbds = nbds + ', ' + neighbourhoods[index_f] # Concatenating the neighbourhood names
        csv_writer.writerow([postcode_i, boroughs[index_i], nbds]) # Writing the rows in the csv file
        postcode_explored.append(postcode_i)

#### Closing the csv file

In [6]:
csv_file.close()

#### Creating the pandas dataframe

In [7]:
df = pd.read_csv('toronto_postal_codes.csv')

#### Getting the shape of the dataframe

In [8]:
df.shape

(103, 3)

#### First five rows of the dataframe

In [9]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


#### Getting the Latitudes and Longitudes from the csv file into a pandas dataframe

In [10]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Set indexes of two dataframes to its Postal Code columns and merge them

In [11]:
df1 = df.set_index('Postal Code')
coors1 = df_coors.set_index('Postal Code')
df_coors = pd.concat([df1, coors1], axis=1, join='inner')
df_coors.head()

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
M7A,Queen's Park,Queen's Park,43.662301,-79.389494


#### Resetting the index

In [13]:
df_coors.reset_index(inplace=True)
df_coors.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


#### Creating a test dataframe to check the coordinates are added as required by the question

In [14]:
# create a new test dataframe
column_names = ["Postal Code", "Borough", "Neighborhood", "Latitude", "Longitude"]
test_df = pd.DataFrame(columns=column_names)

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

for postcode in test_list:
    test_df = test_df.append(df_coors[df_coors["Postal Code"]==postcode], ignore_index=True, sort= True)
    
test_df

Unnamed: 0,Borough,Latitude,Longitude,Neighborhood,Neighbourhood,Postal Code
0,Downtown Toronto,43.657952,-79.387383,,Central Bay Street,M5G
1,North York,43.803762,-79.363452,,Hillcrest Village,M2H
2,East York,43.706397,-79.309937,,"Woodbine Gardens, Parkview Hill",M4B
3,Scarborough,43.744734,-79.239476,,Scarborough Village,M1J
4,East York,43.70906,-79.363452,,Leaside,M4G
5,East Toronto,43.659526,-79.340923,,Studio District,M4M
6,Scarborough,43.750072,-79.295849,,"Maryvale, Wexford",M1R
7,Etobicoke,43.739416,-79.588437,,"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",M9V
8,North York,43.756303,-79.565963,,Humber Summit,M9L
9,Downtown Toronto,43.628947,-79.39442,,"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",M5V
