## Data science project - Segmenting and Clustering Neighborhoods in Toronto

### Data cleanup



In [41]:
import pandas as pd

# Fetch every HTML table found on the Wikipedia page "List of postal codes of Canada: M"
page_content_list = pd.read_html(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
)

# The page holds several tables; the postal code table is the first one
df = page_content_list[0]

# Sanity check: report the dimensions, then preview the first rows
print(f"df shape: {df.shape}")
df.head()


df shape: (180, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [42]:
# Drop the rows whose Borough is "Not assigned".
# The explicit .copy() makes the result an independent DataFrame rather than a
# filtered slice of the scraped table, so the .loc assignment below does not
# trigger pandas' SettingWithCopyWarning.
df = df[df['Borough'] != 'Not assigned'].copy()

# Any remaining "Not assigned" neighbourhoods take the Borough value of their own row.
# The mask is computed once and reused on both sides of the assignment.
neighbourhood_missing = df['Neighbourhood'] == 'Not assigned'
df.loc[neighbourhood_missing, 'Neighbourhood'] = df.loc[neighbourhood_missing, 'Borough'].values

print("df shape: " + str(df.shape))
df.head()

df shape: (103, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [43]:
# Parsed coordinate tables, keyed by the CSV path exactly as it was passed in.
# getCoords is called once per postal code, so caching avoids re-reading and
# re-parsing the same file on every call. Re-running this cell resets the cache.
_COORDS_CACHE = {}


def _load_coords(csv_path: str):
    """Return the coordinates table for csv_path, reading the file only on first use."""
    if csv_path not in _COORDS_CACHE:
        _COORDS_CACHE[csv_path] = pd.read_csv(csv_path)
    return _COORDS_CACHE[csv_path]


def getCoords(postal_code: str, csv_path: str = "Geospatial_Coordinates.csv"):
    """Look up the coordinates of a postal code in the geospatial CSV file.

    Parameters
    ----------
    postal_code : str
        Value to match against the 'Postal Code' column.
    csv_path : str, optional
        CSV file with 'Postal Code', 'Latitude' and 'Longitude' columns.
        Defaults to "Geospatial_Coordinates.csv" in the working directory.

    Returns
    -------
    tuple
        (latitude, longitude) of the first row matching postal_code.

    Raises
    ------
    IndexError
        If no row matches postal_code.
    """
    # Here the provided preloaded CSV file was used because the geocoder was so unreliable.
    coords = _load_coords(csv_path)

    # Filter once and read both coordinate columns from the same selection.
    hits = coords.loc[coords['Postal Code'] == postal_code, ['Latitude', 'Longitude']]
    if hits.empty:
        # Same exception type as indexing an empty result with [0], but with a useful message.
        raise IndexError(f"postal code {postal_code!r} not found in {csv_path!r}")

    # Return the wanted coordinates as a tuple.
    return (hits['Latitude'].values[0], hits['Longitude'].values[0])



In [44]:

# Look up the (latitude, longitude) pair of every postal code, in row order.
postal_code_list = df['Postal Code'].to_list()
coord_pairs = [getCoords(postal_code) for postal_code in postal_code_list]
lat_coords = [lat for lat, _ in coord_pairs]
long_coords = [lon for _, lon in coord_pairs]

# Renumber the rows into a fresh frame first: the new columns are then added to
# an independent DataFrame instead of a filtered slice (which would trigger
# SettingWithCopyWarning), and no inplace mutation is needed.
df = df.reset_index(drop=True)

# The lists are in row order, so they are assigned positionally.
df['Latitude'] = lat_coords
df['Longitude'] = long_coords

# Let's display
df.head(12)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
