# Segmenting and Clustering Neighborhoods in Toronto - 1


In [2]:
from io import StringIO

import bs4 as bs
import numpy as np
import pandas as pd
import requests

### Importing the data from Wikipedia: scraping the page with Beautiful Soup and locating the table

In [3]:

# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever on a stalled connection.
request = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were data.
request.raise_for_status()
bsoup = bs.BeautifulSoup(request.content, 'lxml')
# The postal-code table is taken to be the first <table> element on the page.
table = bsoup.find('table')
if table is None:
    raise ValueError("No <table> element found at " + url)
# read_html returns a list of DataFrames (one per parsed table). The markup is
# wrapped in StringIO because passing literal HTML strings is deprecated in
# recent pandas releases.
df = pd.read_html(StringIO(str(table)))
# The parsed frame is used directly; serialising it to JSON and reading it back
# added nothing.
data = df[0]

In [4]:
# Preview the first five rows of the scraped table.
data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
# Summarise the index range, column dtypes and non-null counts of the scraped table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Postal Code   180 non-null    object
 1   Borough       180 non-null    object
 2   Neighborhood  180 non-null    object
dtypes: object(3)
memory usage: 4.3+ KB


In [7]:
# Count the null entries in each column. Placeholder strings such as
# 'Not assigned' are ordinary values and are not counted here.
data.isnull().sum()

Postal Code     0
Borough         0
Neighborhood    0
dtype: int64

##### Keeping only the rows whose Borough is not 'Not assigned'

In [19]:

# Keep only the rows whose Borough is something other than the 'Not assigned' placeholder.
new_data = data.loc[data['Borough'].ne('Not assigned')]
new_data

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


##### Grouping data

In [20]:

# Collapse to one row per (Borough, Postal Code) pair; the values of every
# remaining column within a group are concatenated with ','.
new_data = (
    new_data
    .groupby(by=['Borough', 'Postal Code'], as_index=False)
    .aggregate(','.join)
)
new_data

Unnamed: 0,Borough,Postal Code,Neighborhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,"North Toronto West, Lawrence Park"
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park, Summerhill East"
...,...,...,...
98,York,M6C,Humewood-Cedarvale
99,York,M6E,Caledonia-Fairbanks
100,York,M6M,"Del Ray, Mount Dennis, Keelsdale and Silverthorn"
101,York,M6N,"Runnymede, The Junction North"


In [26]:
# Select the rows whose Neighborhood is still the 'Not assigned' placeholder.
raw_data = new_data.loc[new_data['Neighborhood'].eq('Not assigned')]
raw_data.head()

Unnamed: 0,Borough,Postal Code,Neighborhood


#### No Neighborhood is 'Not assigned' here, but in case one ever is, the cell below fills it with the row's Borough value

In [29]:
# Wherever the Neighborhood is the 'Not assigned' placeholder, use the row's
# Borough instead; every other Neighborhood is kept as it is.
new_data['Neighborhood'] = np.where(
    new_data['Neighborhood'].eq('Not assigned'),
    new_data['Borough'],
    new_data['Neighborhood'],
)

In [31]:
# (rows, columns) of the grouped table.
new_data.shape

(103, 3)

# In order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood. 

In [32]:
# CSV that provides a Latitude and Longitude for each Postal Code.
# NOTE(review): this is a plain-http short link — confirm it still resolves to the expected file.
geospatial_url = "http://cocl.us/Geospatial_data"
geo_df = pd.read_csv(geospatial_url)
# Preview the first five rows of the coordinates table.
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [33]:
# Attach the coordinates to every (Borough, Postal Code) row.
# how='inner' is the pd.merge default, spelled out here: only postal codes
# present in both frames are kept.
# validate='many_to_one' makes pandas raise a MergeError if geo_df lists a
# Postal Code more than once, which would otherwise silently duplicate rows.
Merged_data = pd.merge(
    new_data,
    geo_df,
    on='Postal Code',
    how='inner',
    validate='many_to_one',
)
Merged_data.head()

Unnamed: 0,Borough,Postal Code,Neighborhood,Latitude,Longitude
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197
2,Central Toronto,M4R,"North Toronto West, Lawrence Park",43.715383,-79.405678
3,Central Toronto,M4S,Davisville,43.704324,-79.38879
4,Central Toronto,M4T,"Moore Park, Summerhill East",43.689574,-79.38316
