In [9]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [10]:
# Wikipedia list of Canadian postal codes beginning with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error page
# be parsed as if it were the postal-code table.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text

### Parse wiki doc

In [11]:
# Parse the downloaded page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=html, features='html.parser')

In [12]:
# Build the postal-code table from every <tr> element in the parsed page.
# A row is kept when it has at least three <td> cells, its first cell starts
# with 'M', and its borough cell does not contain "Not assigned".
# Cell text is stored exactly as scraped (no whitespace stripping).
records = []
for tr in soup.find_all('tr'):
    cell_texts = [td.text for td in tr.find_all('td')]
    # Rows with fewer than three cells cannot fill all three columns; skip
    # them instead of raising IndexError when indexing the missing cells.
    if len(cell_texts) < 3:
        continue
    postal_code, borough, neighborhood = cell_texts[:3]
    if postal_code.startswith('M') and "Not assigned" not in borough:
        records.append([postal_code, borough, neighborhood])

# Create the frame once from the collected rows rather than appending row by row.
df = pd.DataFrame(records, columns=['PostalCode','Borough','Neighborhood'])
       

In [13]:
# Preview the first rows of the scraped table.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods\n
1,M4A,North York,Victoria Village\n
2,M5A,Downtown Toronto,Harbourfront\n
3,M5A,Downtown Toronto,Regent Park\n
4,M6A,North York,Lawrence Heights\n


In [14]:
# Sanity check: list the column names.
df.columns

Index(['PostalCode', 'Borough', 'Neighborhood'], dtype='object')

In [15]:
# Sanity check: (rows, columns) of the scraped table.
df.shape

(212, 3)

#### Replace unwanted text '\n' at the end of Neighborhood

In [16]:
# Remove the newline characters carried by the scraped Neighborhood text.
cleaned_neighborhood = df['Neighborhood'].str.replace('\n','')
df['Neighborhood'] = cleaned_neighborhood

In [17]:
# Preview the first ten rows.
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


#### If a borough is assigned but its neighborhood is "Not assigned", use the borough name as the neighborhood

In [18]:
# Inspect the rows whose Borough contains 'Park'.
df[df['Borough'].str.contains('Park')]

Unnamed: 0,PostalCode,Borough,Neighborhood
6,M7A,Queen's Park,Not assigned


In [19]:
# Check: postal codes of the rows whose Neighborhood is 'Not assigned'.
df.loc[df['Neighborhood']=='Not assigned','PostalCode']


6    M7A
Name: PostalCode, dtype: object

In [20]:
# Check: boroughs of the rows whose Neighborhood is 'Not assigned'.
df.loc[df['Neighborhood']=='Not assigned','Borough']

6    Queen's Park
Name: Borough, dtype: object

In [21]:
# Where the neighborhood is 'Not assigned', fall back to the borough name.
# The row mask is computed once, before the assignment, and used on both sides.
unassigned = df['Neighborhood']=='Not assigned'
df.loc[unassigned,'Neighborhood'] = df.loc[unassigned,'Borough']

In [22]:
# Show the rows whose Borough contains 'Park' to verify the Neighborhood value.
df[df['Borough'].str.contains('Park')]

Unnamed: 0,PostalCode,Borough,Neighborhood
6,M7A,Queen's Park,Queen's Park


#### Merge neighborhoods that share the same postal code into a single row, separated by commas

In [23]:
# Example: the rows whose PostalCode contains 'M5A'.
df[df['PostalCode'].str.contains('M5A')]

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park


In [24]:
# Collapse rows sharing (PostalCode, Borough) into one row whose Neighborhood
# is the ', '-joined text of the group's neighborhoods; sort=False keeps the
# groups in first-appearance order. The last line displays the 'M5A' rows.
grouped = df.groupby(['PostalCode','Borough'],sort=False)
dfNew = grouped.agg(lambda names: ', '.join(names)).reset_index()
dfNew[dfNew['PostalCode'].str.contains('M5A')]

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"


In [25]:
# Sanity check: (rows, columns) of the merged table.
dfNew.shape

(103, 3)

## Merge Lat and Long

The `Geospatial_Coordinates.csv` file contains the latitude and longitude of each postal code in Canada.

In [26]:
# Load the coordinates file and rename its three columns so the postal-code
# column is called 'PostalCode'.
coordinate_columns = ['PostalCode','Latitude','Longitude']
cord = pd.read_csv("Geospatial_Coordinates.csv")
cord.columns = coordinate_columns
cord.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [27]:
# Add empty coordinate columns. NaN (not '') is the placeholder so that the
# columns are float-typed and postal codes without a coordinate match read as
# missing values rather than empty strings.
dfNew['Latitude'] = float('nan')
dfNew['Longitude'] = float('nan')
dfNew.columns

Index(['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'], dtype='object')

#### Assign lat and lon

In [28]:
# Look up each postal code's coordinates in `cord` and write them to dfNew.
# Series.map does the lookup in one vectorised pass instead of rescanning
# dfNew for every row of `cord`; postal codes absent from `cord` get NaN.
# keep='last' mirrors the row-by-row loop this replaces, where a later
# duplicate postal code in `cord` overwrote an earlier one.
coords_by_code = (
    cord.drop_duplicates(subset='PostalCode', keep='last')
        .set_index('PostalCode')
)
dfNew['Latitude'] = dfNew['PostalCode'].map(coords_by_code['Latitude'])
dfNew['Longitude'] = dfNew['PostalCode'].map(coords_by_code['Longitude'])


In [29]:
# Preview the first twelve rows, including the coordinate columns.
dfNew.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7533,-79.3297
1,M4A,North York,Victoria Village,43.7259,-79.3156
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.6543,-79.3606
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.7185,-79.4648
4,M7A,Queen's Park,Queen's Park,43.6623,-79.3895
5,M9A,Etobicoke,Islington Avenue,43.6679,-79.5322
6,M1B,Scarborough,"Rouge, Malvern",43.8067,-79.1944
7,M3B,North York,Don Mills North,43.7459,-79.3522
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.7064,-79.3099
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.6572,-79.3789


### Cluster Toronto Neighborhood

In [30]:
import folium 
import matplotlib.cm as cm
import matplotlib.colors as colors

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [31]:
address = 'Toronto'

# Geocode the address to get a centre point for the map.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when no match is found; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
# The message names the address actually looked up (it previously said
# "New York City" for a Toronto lookup).
print('The geographical coordinates of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of New York City are 43.653963, -79.387207.


In [32]:
# Keep only the rows whose Borough name contains 'Toronto'.
in_toronto = dfNew['Borough'].str.contains('Toronto')
filter_df = dfNew.loc[in_toronto]

In [35]:
# Preview: only the records whose Borough contains 'Toronto'.
filter_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.6543,-79.3606
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.6572,-79.3789
15,M5C,Downtown Toronto,St. James Town,43.6515,-79.3754
19,M4E,East Toronto,The Beaches,43.6764,-79.293
20,M5E,Downtown Toronto,Berczy Park,43.6448,-79.3733


In [38]:
# create map of Toronto centred on the geocoded latitude and longitude
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Coerce the coordinates to numbers and skip rows that have none, so a postal
# code without a coordinate match (empty string or NaN) does not abort the map.
lat_values = pd.to_numeric(filter_df['Latitude'], errors='coerce')
lng_values = pd.to_numeric(filter_df['Longitude'], errors='coerce')
has_coords = lat_values.notna() & lng_values.notna()

# add one circle marker per remaining row, labelled "<neighborhood>, <borough>"
for lat, lng, borough, neighborhood in zip(
        lat_values[has_coords],
        lng_values[has_coords],
        filter_df.loc[has_coords, 'Borough'],
        filter_df.loc[has_coords, 'Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup argument, not a CircleMarker option, so it is
    # passed only to the Popup above.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto