# Answer 1

Before we start with the main lab content, let's download all the dependencies that we will need.

In [2]:
import requests #request API with python
from bs4 import BeautifulSoup # Beautiful Soup is a Python library for pulling data out of HTML and XML files.
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library
print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    altair-3.1.0               |           py36_0         724 KB  conda-forge
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    certifi-2019.6.16          |           py36_1         149 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be 

In [11]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
wiki_response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
# Fail loudly on an HTTP error instead of silently parsing an error page.
wiki_response.raise_for_status()
wiki_text = wiki_response.text # entire page as text
soup = BeautifulSoup(wiki_text, 'lxml') # parse the page with the lxml parser so it can be searched
table_dataset = soup.find('table', {'class': 'wikitable'}) # first <table> whose class is "wikitable"
# soup.find returns None when nothing matches; stop here with a clear message
# rather than letting a later cell fail on the string "None".
if table_dataset is None:
    raise ValueError('No table with class "wikitable" was found on the Wikipedia page')
print(table_dataset)

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

In [21]:
# Parse the HTML of the scraped table; read_html returns a list of dataframes.
dfs = pd.read_html(io=str(table_dataset))
# Concatenate that list into a single dataframe and preview it.
df = pd.concat(objs=dfs)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [22]:
# The scraped table calls its first column 'Postcode'; the rest of the notebook
# refers to it as 'Postalcode', so rename it here.
df = df.rename(columns={'Postcode': 'Postalcode'})
df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [23]:
# Keep only the rows whose borough is assigned (drop 'Not assigned' boroughs),
# then renumber the index from 0.
neighborhoods_data = (
    df
    .loc[df['Borough'].ne('Not assigned')]
    .reset_index(drop=True)
)
neighborhoods_data.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [45]:
# One postal code can cover several neighbourhoods (on the Wikipedia page M5A is
# listed twice: Harbourfront and Regent Park). Collapse every (Postalcode, Borough)
# pair into a single row whose neighbourhoods are joined with ', '.
# sort=False keeps the groups in order of first appearance.
neighborhoods_data = (
    neighborhoods_data
    .groupby(by=['Postalcode', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
)
neighborhoods_data.head(n=12)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M5H,Downtown Toronto,"Adelaide, King, Richmond"
1,M1S,Scarborough,Agincourt
2,M1V,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St..."
3,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
4,M8W,Etobicoke,"Alderwood, Long Branch"
5,M3H,North York,"Bathurst Manor, Downsview North, Wilson Heights"
6,M2K,North York,Bayview Village
7,M5M,North York,"Bedford Park, Lawrence Manor East"
8,M5E,Downtown Toronto,Berczy Park
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [40]:
# .shape is (rows, columns); the recorded output below is (103, 3), i.e. 103 rows.
neighborhoods_data.shape

(103, 3)

In [46]:
# If a row has a borough but its neighbourhood is 'Not assigned', use the borough
# name as the neighbourhood (on the Wikipedia page this is the Queen's Park row).
# The rows yielded by DataFrame.iterrows() may be copies, so assigning to them is
# not guaranteed to change the dataframe; write through .loc with a boolean mask.
unassigned_mask = neighborhoods_data['Neighbourhood'] == 'Not assigned'
neighborhoods_data.loc[unassigned_mask, 'Neighbourhood'] = neighborhoods_data.loc[unassigned_mask, 'Borough']
# Show a preview rather than the whole frame.
neighborhoods_data.head(10)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M5H,Downtown Toronto,"Adelaide, King, Richmond"
1,M1S,Scarborough,Agincourt
2,M1V,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St..."
3,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
4,M8W,Etobicoke,"Alderwood, Long Branch"
5,M3H,North York,"Bathurst Manor, Downsview North, Wilson Heights"
6,M2K,North York,Bayview Village
7,M5M,North York,"Bedford Park, Lawrence Manor East"
8,M5E,Downtown Toronto,Berczy Park
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

In [47]:
# Report the dataframe's dimensions as (number of rows, number of columns).
neighborhoods_data.shape

(103, 3)

# Answer 2

Now that you have built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood.

Before we start with the main lab content, let's download all the dependencies that we will need.

In [50]:
import io #Core tools for working with streams

First, we need to load the CSV file that contains the geographical coordinates of each postal code from http://cocl.us/Geospatial_data

In [65]:
# Download the CSV with the coordinates of each postal code.
geo_response = requests.get("http://cocl.us/Geospatial_data")
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
geo_response.raise_for_status()
csv_file_content = geo_response.content # raw bytes of the CSV file
# Decode the bytes as UTF-8 and wrap them in a text stream so pandas can read them.
lat_long_df = pd.read_csv(io.StringIO(csv_file_content.decode('utf-8')))
print("data loaded")

data loaded


In [66]:
# Preview the first three rows of the coordinates table to check its columns.
lat_long_df.head(3)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711


Rename the `Postal Code` column to `Postalcode` so that it matches the key column of the neighbourhoods dataframe and the two can be merged.

In [67]:
# Name the key column 'Postalcode' so it can be used as the merge key
# with neighborhoods_data.
lat_long_df = lat_long_df.rename(columns={'Postal Code': 'Postalcode'})
lat_long_df.head(1)

Unnamed: 0,Postalcode,Latitude,Longitude
0,M1B,43.806686,-79.194353


Then we can merge the tables

In [68]:
# Attach the coordinates to each postal code (inner join on 'Postalcode').
# Select only the three base columns first: this cell overwrites neighborhoods_data,
# so on a re-run the frame already holds Latitude/Longitude and merging it again
# would create Latitude_x/Latitude_y columns and break the column selection below.
neighborhoods_base = neighborhoods_data[['Postalcode', 'Borough', 'Neighbourhood']]
neighborhoods_data = pd.merge(lat_long_df, neighborhoods_base, on='Postalcode')
# Put the descriptive columns first and the coordinates last.
neighborhoods_data = neighborhoods_data[['Postalcode', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']]
neighborhoods_data.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


As a result, we still have the same 103 rows, but now with two additional columns (Latitude and Longitude).

In [70]:
# Dimensions after the merge, as (number of rows, number of columns).
neighborhoods_data.shape

(103, 5)