# Install libraries

In [None]:
!pip3 install beautifulsoup4 lxml requests pandas sklearn folium  

# Import libraries

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

from sklearn.cluster import KMeans
import folium

# Display options for Pandas

In [None]:
# Lift pandas' display limits so rendered frames show every column and every row.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

# Scrape Toronto Neighborhoods data

Get content of https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M as source to be scraped:

In [None]:
# Download the Wikipedia page. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() stops us from silently parsing an
# HTTP error page as if it were the postal-code table.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

# Parse the HTML with the lxml backend.
soup = BeautifulSoup(source, 'lxml')

Select only the HTML snippet containing the table rows:

In [None]:
# CSS selector: <tr> elements that are direct children of the <tbody>
# of any table carrying the `wikitable` class.
row_selector = 'table.wikitable > tbody > tr'
table_rows = soup.select(row_selector)

Remove the first row, which is the table header:

In [None]:
# Keep every row except the one at position 0 (the header row).
table_rows_without_header = [
    row for position, row in enumerate(table_rows) if position > 0
]

Collect the table data into a Python list of tuples:

In [None]:
# Build one (postal code, borough, neighborhood) tuple per table row.
scraped_list = []
for tr in table_rows_without_header:
    td = tr.find_all('td')
    # Skip rows that lack the three expected data cells (e.g. a row made only of
    # <th> cells) instead of failing with IndexError.
    if len(td) < 3:
        continue
    # Strip every cell, not just the last one: stray whitespace or newlines on the
    # postal code or borough would break the later 'Not assigned' comparison and
    # the merge on 'Postal Code'.
    scraped_list.append(tuple(cell.text.strip() for cell in td[:3]))

Create a dataframe from the scraped list:

In [None]:
# Column labels, in the same order as the fields of each scraped tuple.
column_names = ['Postal Code', 'Borough', 'Neighborhood']
scraped_neighborhoods_raw = pd.DataFrame(data=scraped_list, columns=column_names)

Filter out rows where the _Borough_ column has the value `Not assigned`:

In [None]:
# Boolean mask: True for rows whose borough is anything other than 'Not assigned'.
has_assigned_borough = scraped_neighborhoods_raw['Borough'].ne('Not assigned')
scraped_neighborhoods_filtered_na_boroughs = scraped_neighborhoods_raw[has_assigned_borough]

Combine neighborhoods with the same _Postal Code_ into a single row:

In [None]:
# One row per (postal code, borough) pair, in first-seen order (sort=False);
# the neighborhoods of each pair are joined into one comma-separated string.
scraped_neighborhoods_clean = (
    scraped_neighborhoods_filtered_na_boroughs
    .groupby(['Postal Code', 'Borough'], as_index=False, sort=False)
    .agg({'Neighborhood': ', '.join})
)

For all rows where _Borough_ is known but _Neighborhood_ is `Not assigned`, the neighborhood is set to the same value as the borough:

In [None]:
# Rows whose neighborhood is still the 'Not assigned' placeholder.
neighborhood_not_assigned = scraped_neighborhoods_clean['Neighborhood'] == 'Not assigned'

# Take the borough of exactly those rows and write it into their neighborhood
# (the assignment aligns on the row index).
borough_for_not_assigned_neighborhoods = scraped_neighborhoods_clean.loc[neighborhood_not_assigned, 'Borough']
scraped_neighborhoods_clean.loc[neighborhood_not_assigned, 'Neighborhood'] = borough_for_not_assigned_neighborhoods

In [None]:
# Show the (rows, columns) shape of the cleaned dataframe as the cell output.
scraped_neighborhoods_clean.shape

# Obtaining latitude and longitude coordinates for neighborhoods

Because the `geocoder` library is very unstable, let's load the data from the CSV file at http://cocl.us/Geospatial_data:

In [None]:
# Location of the CSV file that is loaded into `geospatial_data`.
geospatial_data_url = 'http://cocl.us/Geospatial_data'
geospatial_data = pd.read_csv(geospatial_data_url)

Now, let's merge the two existing dataframes into a single one:

In [None]:
# Inner join on the shared 'Postal Code' column: only postal codes present in
# both frames are kept.
neighborhoods_full_data = scraped_neighborhoods_clean.merge(
    geospatial_data,
    how='inner',
    on='Postal Code',
)