## Segmenting and Clustering Neighborhoods in Toronto - Part I

## Pre-processing the data

In [4]:
# importing the necessary libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [5]:
# downloading the data
wikipedia_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
raw_wikipedia_page = requests.get(wikipedia_link).text
# using beautiful soup to parse the HTML/XML codes
soup = BeautifulSoup(raw_wikipedia_page,'xml')

## Processing data - extracting data from wikipedia

In [6]:
table = soup.find('table')
Postcode = []
Borough = []
Neighborhood = []

for tr_cell in table.find_all('tr'):
    counter = 1
    Postcode_var = -1
    Borough_var = -1
    Neighborhood_var = -1
    
    for td_cell in tr_cell.find_all('td'):
        if counter == 1:
            Postcode_var = td_cell.text
        if counter == 2:
            Borough_var = td_cell.text
            tag_a_Borough = td_cell.find('a')
        if counter == 3:
            Neighborhood_var = str(td_cell.text).strip()
            tag_a_Neighborhood = td_cell.find('a')
            
        counter +=1
    
    if (Postcode_var == 'Not assigned' or Borough_var == 'Not assigned' or Neighborhood_var == 'Not assigned'):
        continue
    try:
        if ((tag_a_Borough is None) or (tag_a_Neighborhood is None)):
            continue
    except:
        pass
    if(Postcode_var == -1 or Borough_var == -1 or Neighborhood_var == -1):
        continue
        
    Postcode.append(Postcode_var)
    Borough.append(Borough_var)
    Neighborhood.append(Neighborhood_var)

## Integrating postal codes with more than one neighborhood

In [7]:
unique_p = set(Postcode)
print('number of unique Postal codes:', len(unique_p))
Postcode_u = []
Borough_u = []
Neighborhood_u = []

for postcode_unique_element in unique_p:
    p_var = ''; b_var = ''; n_var = '';
    for postcode_idx, postcode_element in enumerate(Postcode):
        if postcode_unique_element == postcode_element:
            p_var = postcode_element;
            b_var = Borough[postcode_idx]
            if n_var == '':
                n_var = Neighborhood[postcode_idx]
            else:
                n_var = n_var + ',' + Neighborhood[postcode_idx]
    Postcode_u.append(p_var)
    Borough_u.append(b_var)
    Neighborhood_u.append(n_var)
                        

number of unique Postal codes: 77


## Creating the Pandas dataframe

In [8]:
toronto_dict = {'Postcode':Postcode_u, 'Borough':Borough_u, 'Neighborhood':Neighborhood_u}
df_toronto = pd.DataFrame.from_dict(toronto_dict)
df_toronto.to_csv('toronto_part1.csv')
df_toronto.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M5X,Downtown Toronto,"First Canadian Place,Underground city"
1,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
2,M6S,West Toronto,"Runnymede,Swansea"
3,M3K,North York,CFB Toronto
4,M2M,North York,"Newtonbrook,Willowdale"
5,M9L,North York,Humber Summit
6,M2K,North York,Bayview Village
7,M5M,North York,Bedford Park
8,M4Y,Downtown Toronto,Church and Wellesley
9,M5S,Downtown Toronto,University of Toronto


In [9]:
df_toronto.shape

(77, 3)

## Segmenting and Custering Neighborhoods in Toronto - Part II

## Extracting the latitude and longitude coordinates of each neighborhood

In [10]:
!pip install geocoder




Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 16.2MB/s ta 0:00:01
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [11]:
import geocoder
latitude = []
longitude = []
for elem in Postcode_u:
# initialize your variable to None
    lat_lng_coords = None

# Loop until you get the coordinates
    while (lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(elem))
        lat_lng_coords = g.latlng
        
    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])
    print(elem, 'is Received')

M5X is Received
M1L is Received
M6S is Received
M3K is Received
M2M is Received
M9L is Received
M2K is Received
M5M is Received
M4Y is Received
M5S is Received
M5E is Received
M5T is Received
M4X is Received
M1P is Received
M1R is Received
M6L is Received
M3C is Received
M1C is Received
M4W is Received
M4H is Received
M4J is Received
M5J is Received
M9A is Received
M6R is Received
M6J is Received
M5P is Received
M1T is Received
M8X is Received
M8Y is Received
M4C is Received
M6K is Received
M5L is Received
M4A is Received
M6A is Received
M3L is Received
M1B is Received
M3A is Received
M5A is Received
M6H is Received
M2H is Received
M1E is Received
M2J is Received
M1K is Received
M5K is Received
M1J is Received
M4L is Received
M9N is Received
M9R is Received
M8Z is Received
M5C is Received
M6P is Received
M1G is Received
M1M is Received
M9M is Received
M4G is Received
M2R is Received
M9V is Received
M4N is Received
M2L is Received
M3J is Received
M4K is Received
M3H is Received
M1V is R

## Create a pandas dataframe with the latitudes and longitudes

In [12]:
toronto_dict = {'Postcode':Postcode_u, 'Borough':Borough_u, 'Neighborhood':Neighborhood_u,'Latitude':latitude, 'Longitude':longitude}
df_toronto = pd.DataFrame.from_dict(toronto_dict)
df_toronto.to_csv('toronto_base.csv')
df_toronto.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M5X,Downtown Toronto,"First Canadian Place,Underground city",43.64828,-79.381461
1,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.713133,-79.285055
2,M6S,West Toronto,"Runnymede,Swansea",43.649885,-79.474929
3,M3K,North York,CFB Toronto,43.739026,-79.46732
4,M2M,North York,"Newtonbrook,Willowdale",43.791475,-79.413605
5,M9L,North York,Humber Summit,43.7595,-79.556852
6,M2K,North York,Bayview Village,43.781015,-79.380542
7,M5M,North York,Bedford Park,43.73546,-79.419164
8,M4Y,Downtown Toronto,Church and Wellesley,43.666585,-79.381302
9,M5S,Downtown Toronto,University of Toronto,43.66311,-79.401801


## Segmenting and Custering Neighborhoods in Toronto - Part III

## Exploring Neighborhoods in Toronto

In [13]:
# importing new libraries
import numpy as np
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

!conda config --add channels conda-forge

!conda install -c conda-forge geopy -- yes

from geopy.geocoders import Nominatim

import requests

!conda install -c conda-forge folium

import folium

print('Libraries imported')

Solving environment: failed

PackagesNotFoundError: The following packages are not available from current channels:

  - yes

Current channels:

  - https://conda.anaconda.org/conda-forge/linux-64
  - https://conda.anaconda.org/conda-forge/noarch
  - https://repo.anaconda.com/pkgs/main/linux-64
  - https://repo.anaconda.com/pkgs/main/noarch
  - https://repo.anaconda.com/pkgs/free/linux-64
  - https://repo.anaconda.com/pkgs/free/noarch
  - https://repo.anaconda.com/pkgs/r/linux-64
  - https://repo.anaconda.com/pkgs/r/noarch
  - https://repo.anaconda.com/pkgs/pro/linux-64
  - https://repo.anaconda.com/pkgs/pro/noarch

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.


Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium


The following packages will be downloaded:

    package       

## Load the pandas dataframe (created in part 2 of this assignment)

In [14]:
df_toronto=pd.read_csv('toronto_base.csv')
df_toronto.head(12)

Unnamed: 0.1,Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,0,M5X,Downtown Toronto,"First Canadian Place,Underground city",43.64828,-79.381461
1,1,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.713133,-79.285055
2,2,M6S,West Toronto,"Runnymede,Swansea",43.649885,-79.474929
3,3,M3K,North York,CFB Toronto,43.739026,-79.46732
4,4,M2M,North York,"Newtonbrook,Willowdale",43.791475,-79.413605
5,5,M9L,North York,Humber Summit,43.7595,-79.556852
6,6,M2K,North York,Bayview Village,43.781015,-79.380542
7,7,M5M,North York,Bedford Park,43.73546,-79.419164
8,8,M4Y,Downtown Toronto,Church and Wellesley,43.666585,-79.381302
9,9,M5S,Downtown Toronto,University of Toronto,43.66311,-79.401801


## Create a map of Toronto

In [15]:
toronto_latitude = 43.6532; toronto_longitude = -79.3832
map_toronto = folium.Map(location = [toronto_latitude, toronto_longitude], zoom_start =10.7)

# adding markers to the map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label,parse_html=True)
    folium.CircleMarker([lat, lng],radius=5,popup=label,color='blue',fill=True, fill_color='#3186cc', fill_opacity=0.7).add_to(map_toronto)
    
map_toronto

## Create a new dataframe with neighborhoods in Scarborough

In [16]:
# @hiddel_cell
CLIENT_ID = 'DMWCRXUQYSNJX051ME1DNS5JS2RQO15XHBTGIXYH3ZK15NE1'
CLIENT_SECRET ='2DYQH1IYEPETF540XRZYCITFNETWSE1V3V1KGXCCB43CLZ1Z'
VERSION = '20180604'

In [17]:
scarborough_data = df_toronto[df_toronto['Borough'] == 'Scarborough'].reset_index(drop=True)
scarborough_data.head(10)

Unnamed: 0.1,Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,1,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.713133,-79.285055
1,13,M1P,Scarborough,"Dorset Park,Scarborough Town Centre,Wexford He...",43.759975,-79.268974
2,14,M1R,Scarborough,"Maryvale,Wexford",43.75071,-79.30056
3,17,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.78573,-79.15875
4,26,M1T,Scarborough,Tam O'Shanter,43.784725,-79.299047
5,35,M1B,Scarborough,"Rouge,Malvern",43.811525,-79.195517
6,40,M1E,Scarborough,"Morningside,West Hill",43.76569,-79.175256
7,42,M1K,Scarborough,"Ionview,Kennedy Park",43.726245,-79.26367
8,44,M1J,Scarborough,Scarborough Village,43.743125,-79.23175
9,51,M1G,Scarborough,Woburn,43.768359,-79.21759


## Create a map of Scarborough and its neighborhoods

In [18]:
address_scar = 'Scarborough, Toronto'
latitude_scar = 43.773077
longitude_scar = -79.257774
print('The geographical coordinate of Scarborough are {}, {}.'.format(latitude_scar, longitude_scar))

The geographical coordinate of Scarborough are 43.773077, -79.257774.


In [19]:
map_scarb = folium.Map(location=[latitude_scar, longitude_scar], zoom_start=12)

# adding markers to the map
for lat, lng, label in zip(scarborough_data['Latitude'], scarborough_data['Longitude'], scarborough_data['Neighborhood']):
    label = folium.Popup(label,parse_html=True)
    folium.CircleMarker([lat, lng],radius=5,popup=label,color='blue',fill=True, fill_color='#3186cc', fill_opacity=0.7).add_to(map_scarb)
    
map_scarb

## Get the top 100 venues in the neighborhood 'Woburn', from Scarborough

In [21]:
neighborhood_latitude = scarborough_data.loc[9, 'Latitude']
neighborhood_longitude = scarborough_data.loc[9,'Longitude']
neighborhood_name = scarborough_data.loc[9, 'Neighborhood']
print('Latitude and longitude values of "{}" are {},{}.'.format(neighborhood_name,neighborhood_latitude,neighborhood_longitude))

Latitude and longitude values of "Woburn" are 43.76835912100006,-79.21758999999997.


In [22]:
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude_scar, longitude_scar, VERSION,
                                                                                                                          radius, LIMIT)

In [23]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5d6c8ea09ba3e5002c8b928b'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Scarborough City Centre',
  'headerFullLocation': 'Scarborough City Centre, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 42,
  'suggestedBounds': {'ne': {'lat': 43.7775770045, 'lng': -79.25155367954714},
   'sw': {'lat': 43.7685769955, 'lng': -79.26399432045285}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '5085ec39e4b0b1ead2eb0818',
       'name': 'Disney Store',
       'location': {'address': '300 Borough Drive',
        'crossStreet': 'in Scarborough Town Centre',
        'lat': 43.775537,
        'lng': -79.256833,
        'labeledLa

In [24]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [25]:
import json
from pandas.io.json import json_normalize

venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)

filtered_columns = ['venue.name','venue.categories','venue.location.lat','venue.location.lng']
nearby_venues = nearby_venues.loc[:,filtered_columns]

nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(10)

Unnamed: 0,name,categories,lat,lng
0,Disney Store,Toy / Game Store,43.775537,-79.256833
1,American Eagle Outfitters,Clothing Store,43.775908,-79.258352
2,SEPHORA,Cosmetics Shop,43.775017,-79.258109
3,DAVIDsTEA,Tea Room,43.776613,-79.258516
4,Tommy Hilfiger Company Store,Clothing Store,43.776015,-79.257369
5,Coliseum Scarborough Cinemas,Movie Theater,43.775995,-79.255649
6,St. Andrews Fish & Chips,Fish & Chips Shop,43.771865,-79.252645
7,Chipotle Mexican Grill,Mexican Restaurant,43.77641,-79.258069
8,Hot Topic,Clothing Store,43.77545,-79.257929
9,Shoppers Drug Mart,Pharmacy,43.773305,-79.251662


In [26]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

42 venues were returned by Foursquare.
