# ASSIGN NEIGHBORHOODS OF TORONTO

In [1]:
# Path module to open privat and local json file
# from pathlib import Path
from os import path
# working with json and tranforming json file into a pandas dataframe library
import json
from pandas.io.json import json_normalize

import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
import folium # plotting library

# regex
import re


In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

## Pre-Processing the data:
Let's import, clear and rename the data frame

In [3]:
series_of_dataframes = pd.read_html(url)    # import raw data
df_first = series_of_dataframes[0]          # first dataframe from sereies of dataframes

# Select not assigned (na)
# na_nan_series = (df_first['Borough']=='Not assigned') & (df_first['Neighborhood'].isnull())
na_series = df_first['Borough']=='Not assigned'

# define clear of na's dataframe
df = df_first[na_series==False].rename(columns={'Postal code': 'PostalCode'})
print (df.head())

  PostalCode           Borough                                  Neighborhood
2        M3A        North York                                     Parkwoods
3        M4A        North York                              Victoria Village
4        M5A  Downtown Toronto                    Regent Park / Harbourfront
5        M6A        North York             Lawrence Manor / Lawrence Heights
6        M7A  Downtown Toronto  Queen's Park / Ontario Provincial Government


## Pre-Processing the data (continued):
1) If more than one neighborhood exist in one postal code area they need to be combined.

2) If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

lets merge this properly:

In [4]:
# 1) First point (see above)

# let's check shape of initial DataFrame:
print ('The inititial Dataframe has {} number of rows'.format(df_first.shape[0]))
print ('The filtered Dataframe has {} number of rows'.format(df.shape[0]))
'''
One can see some empties were drop!
'''

# Check if PostalCode is unique for every row and there are no doubled postal codes
doubled = df['PostalCode'].unique().shape
if (df.shape[0]==doubled[0]):
    print ('PostalCode is OK, none of its values is doubled')
else:
    print ('some incongruences found, please check consistency')

# Check if neighborhoods are repited:
# let's split neighborhoods "hyphen-separated" into columns
neighborhoods = df['Neighborhood'].str.split(pat="/", expand=True)
# let's check duplicated rows
duplic = neighborhoods[neighborhoods.duplicated(keep=False)] 
print('Duplicated found in:')
print (duplic)

# let's check this rows in detail:
print (df_first.iloc[11])
print (df_first.iloc[20])
print (df_first.iloc[65])
print (df_first.iloc[74])
print (df_first.iloc[83])
print (df_first.iloc[91])
print (df_first.iloc[92])
print (df_first.iloc[109])

'''
CONCLUSION ON THIS:
There are neighborhoods so big that they have several Postal codes so:
    Nothing is wrong with them
'''

# let's continue and separate them with comma as asked
df['Neighborhood'] = df['Neighborhood'].str.replace('/', ',')
print ('Dataframe now comma-separated\n', df.head())



The inititial Dataframe has 180 number of rows
The filtered Dataframe has 103 number of rows
PostalCode is OK, none of its values is doubled
Duplicated found in:
              0     1     2     3     4     5     6     7
11    Don Mills  None  None  None  None  None  None  None
20    Don Mills  None  None  None  None  None  None  None
65    Downsview  None  None  None  None  None  None  None
74    Downsview  None  None  None  None  None  None  None
83    Downsview  None  None  None  None  None  None  None
91   Willowdale  None  None  None  None  None  None  None
92    Downsview  None  None  None  None  None  None  None
109  Willowdale  None  None  None  None  None  None  None
Postal code            M3B
Borough         North York
Neighborhood     Don Mills
Name: 11, dtype: object
Postal code            M3C
Borough         North York
Neighborhood     Don Mills
Name: 20, dtype: object
Postal code            M3K
Borough         North York
Neighborhood     Downsview
Name: 65, dtype: object
P

In [5]:
# 2) Second point (see above)

# let's check if a borough is given but neighborhood is empty
na_nan_series = ((df['Borough']!='Not assigned')|(df['Borough']!='')) & (df['Neighborhood'].isnull())
test = df[na_nan_series]
if (test.empty):
    print ('None of the given neighborhoods is unnamed')
else:
    print ('error')
    print ('###############')

# continue...

None of the given neighborhoods is unnamed


## Conclusion
The Data Frame is now clear of empties and in the format in which was asked.
Let's conclude this part with dimensions of resulting Data Frame and head & tail methods:

In [6]:
# shape of resulting Data Frame:
print ('The resulting shape of my Data Frame is: {}'.format(df.shape))
print ('Here head:')
print (df.head())
print ('and tail:')
print (df.tail())

The resulting shape of my Data Frame is: (103, 3)
Here head:
  PostalCode           Borough                                  Neighborhood
2        M3A        North York                                     Parkwoods
3        M4A        North York                              Victoria Village
4        M5A  Downtown Toronto                    Regent Park , Harbourfront
5        M6A        North York             Lawrence Manor , Lawrence Heights
6        M7A  Downtown Toronto  Queen's Park , Ontario Provincial Government
and tail:
    PostalCode           Borough  \
160        M8X         Etobicoke   
165        M4Y  Downtown Toronto   
168        M7Y      East Toronto   
169        M8Y         Etobicoke   
178        M8Z         Etobicoke   

                                          Neighborhood  
160    The Kingsway , Montgomery Road , Old Mill North  
165                               Church and Wellesley  
168              Business reply mail Processing CentrE  
169  Old Mill South , 

# GEOCODER

Here we need to get the latitude and the longitude coordinates of each neighborhood.
For this we are going to merge dataframes (one aout of wikipedia and the other sourced from Coursera as CSV).
I was not able to bring anything of geocoder to yield and I gave up! RRRRhhh.... :/

In [7]:
# !cat ../data/Geospatial_Coordinates.csv
geo_df = pd.read_csv('../data/Geospatial_Coordinates.csv')
geo_df = geo_df.rename(columns={'Postal Code': 'PostalCode'})

geo_df = df.merge(geo_df, how='outer', on='PostalCode')
print (geo_df)


    PostalCode           Borough  \
0          M3A        North York   
1          M4A        North York   
2          M5A  Downtown Toronto   
3          M6A        North York   
4          M7A  Downtown Toronto   
..         ...               ...   
98         M8X         Etobicoke   
99         M4Y  Downtown Toronto   
100        M7Y      East Toronto   
101        M8Y         Etobicoke   
102        M8Z         Etobicoke   

                                          Neighborhood   Latitude  Longitude  
0                                            Parkwoods  43.753259 -79.329656  
1                                     Victoria Village  43.725882 -79.315572  
2                           Regent Park , Harbourfront  43.654260 -79.360636  
3                    Lawrence Manor , Lawrence Heights  43.718518 -79.464763  
4         Queen's Park , Ontario Provincial Government  43.662301 -79.389494  
..                                                 ...        ...        ...  
98     The Kin

# CLUSTERING

We are going to cluster the best place to live in in Toronto where it should be easier to study, to determine which postal codes have the best of these venues to go to and understand their features (conclusion), so first:
  - download venues per postal code and features
  - join dataframes so features are joined with latitude & longitude
  - make 3 groups to study
  - from these 3 groups observe features



## Get credentials from local json file
For the very first we are going to download credentials from same json file I have stored in my computer.
PLEASE, understand that these credentials are personal and I don't want to post them in here!

In [8]:
# Get credentials from json file
#########################################################################

# Open json file and return content of it
def readFile(nameFile):
    Path = path.abspath(nameFile)
    with open(Path, 'r') as file:
        content = file.read()
        return content


## Globals initialization
To read the data we call to this json file 

In [9]:
# Globals initialization
#########################################################################
# read json file and save it
id_key = readFile('credentials.json')
id_key = json.loads(id_key)
# # inizializate id credentials
client_id = id_key["client_id"]
client_secret = id_key["client_secret"]
version = str(20180602)
# object for URL from foursquare's API
main_URL = 'https://api.foursquare.com/v2/'

## Classes
We are going to generate a class with the links to the Foursquare API. This will simplify the code and make it easier to understand

In [10]:
# Classes---
##########################################################################
'''
class Link() -> generates a link to query the json file out of foursquare's API:
    method venue -> generates this link for venues and needs of: 
        option,         venues
        location,       as: latitud, longitud
        query           as: coffee, chinese food, windsurf...
'''
class Link:

    global main_URL, client_id, client_secret, version

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def venue(self, **kwargs):
        self.option = '/' + self.option + '?'
        self.location = 'll=' + self.location
        self.query = '&query=' + self.query
        url = str(main_URL+'venues'+self.option+self.location+self.query+'&client_id='+client_id+'&client_secret='+client_secret+'&v='+version)
        # kwargs should have the same name as in Foursquare
        for key, value in kwargs.items():
            append = '&' + key + '=' + str(value)
            url = url + append
        return (url)

    # TODO: Finish it...
    def explore(self):
        self.option = '/' + self.option + '?'
        self.location = 'll=' + self.location
        self.query = '&query=' + self.query
        url = str(main_URL+'venues'+self.option+self.location+self.query+'&client_id='+client_id+'&client_secret='+client_secret+'&v='+version)
        return (url)



## Functions
    - function to avoid generating testing variables of json indenxed files (only for test)
    - function that extracts the category of the venue
    - function to pass a link and get a request and a json data out of it
    - function to keep only columns that include venue name, and anything that is associated with location

In [11]:
# function to avoid generating testing variables of json indenxed files (only for test):
def view_json(var):
    var = json.dumps(var, sort_keys=True, indent=2)
    return print (var)


In [12]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']


In [13]:
# function to pass a link and get a request and a json data out of it
def get_df(link):
    results = requests.get(link).json()
    # print (results)
    # Pretty printing:
    json_string = json.dumps(results, sort_keys=True, indent=2)
    # print(json_string)
    # generate a python variable to access data:
    jdata = json.loads(json_string)
    jdata = jdata['response']['venues']
    # jdata = jdata['response']['groups']
    # view_json(jdata)              # object of a self-made function which prints json beautified!
    # lets import this data into pandas
    dataframe = json_normalize(jdata)
    # print (df.columns)
    # print (df.head())
    return dataframe


In [14]:
# keep only columns that include venue name, and anything that is associated with location
def filter_df(dataframe):
    filtered_columns = ['name', 'categories'] + [col for col in dataframe.columns if col.startswith('location.')] + ['id']
    dataframe = dataframe.loc[:, filtered_columns]  
    dataframe = dataframe.rename(columns=lambda x: re.sub('location.','',x))
    return dataframe


##  Request and view information from url
Import data of Toronto, clean it and assign it to postal codes for later continue with clustering

In [15]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Let's import, clear and rename the data frame
series_of_dataframes = pd.read_html(url)    # import raw data
df_first = series_of_dataframes[0]          # first dataframe from sereies of dataframes

# Select not assigned (na)
# na_nan_series = (df_first['Borough']=='Not assigned') & (df_first['Neighborhood'].isnull())
na_series = df_first['Borough']=='Not assigned'

# define clear of na's dataframe
df = df_first[na_series==False].rename(columns={'Postal code': 'PostalCode'})
# print (df.head())

# let's continue and separate them with comma as asked
df['Neighborhood'] = df['Neighborhood'].str.replace('/', ',')
# print ('Dataframe now comma-separated\n', df.head())

# !cat ../data/Geospatial_Coordinates.csv
geo_df = pd.read_csv('./Geospatial_Coordinates.csv')
geo_df = geo_df.rename(columns={'Postal Code': 'PostalCode'})

geo_df = df.merge(geo_df, how='outer', on='PostalCode')
print (geo_df)

    PostalCode           Borough  \
0          M3A        North York   
1          M4A        North York   
2          M5A  Downtown Toronto   
3          M6A        North York   
4          M7A  Downtown Toronto   
..         ...               ...   
98         M8X         Etobicoke   
99         M4Y  Downtown Toronto   
100        M7Y      East Toronto   
101        M8Y         Etobicoke   
102        M8Z         Etobicoke   

                                          Neighborhood   Latitude  Longitude  
0                                            Parkwoods  43.753259 -79.329656  
1                                     Victoria Village  43.725882 -79.315572  
2                           Regent Park , Harbourfront  43.654260 -79.360636  
3                    Lawrence Manor , Lawrence Heights  43.718518 -79.464763  
4         Queen's Park , Ontario Provincial Government  43.662301 -79.389494  
..                                                 ...        ...        ...  
98     The Kin

## Get a DataFrame from the mean position of boroughs in Toronto
Procedure:

    - try to group positions from geo_df for every Borough
    - find the mean lat and lng from each
    - do a search from every Borough

In [16]:
# initialization of dictionary and data frame
u_list = {
    'Borough': '',
    'Latitude': 0.0,
    'Longitude': 0.0,
    'Position': ''
}
boroughs_mean_pos = pd.DataFrame(columns=u_list)

Now we are going to find the mean position for the borough from all positions we get from the districts

In [17]:
boroughs_mean_pos = pd.DataFrame(columns=u_list)
for borough in geo_df['Borough'].unique():
    df_inter = geo_df[(geo_df['Borough']==borough)]
    # u_list = (x, y, z) x=borough, y=latitude, z=longitude
    x=borough
    y=df_inter['Latitude'].mean()
    z=df_inter['Longitude'].mean()
    p=str(str(y) + ',' + str(z))
    u_list = {
        'Borough': x,
        'Latitude': y,
        'Longitude': z,
        'Position': p
    }
    u_list = pd.Series(u_list)
    boroughs_mean_pos = boroughs_mean_pos.append(u_list, ignore_index=True)

print (boroughs_mean_pos)

            Borough   Latitude  Longitude  \
0        North York  43.750727 -79.429338   
1  Downtown Toronto  43.654597 -79.383972   
2         Etobicoke  43.660043 -79.542074   
3       Scarborough  43.766229 -79.249085   
4         East York  43.700303 -79.335851   
5              York  43.690797 -79.472633   
6      East Toronto  43.669436 -79.324654   
7      West Toronto  43.652653 -79.449290   
8   Central Toronto  43.701980 -79.398954   
9       Mississauga  43.636966 -79.615819   

                                Position  
0        43.750727425,-79.42933832499999  
1   43.65459717894736,-79.38397156842105  
2        43.660042975,-79.54207355000001  
3   43.76622889411766,-79.24908523529415  
4         43.70030348,-79.33585115999999  
5          43.6907968,-79.47263340000002  
6        43.669436479999995,-79.32465436  
7  43.652652933333336,-79.44928976666667  
8  43.701979788888885,-79.39895405555555  
9          43.6369656,-79.61581899999999  


Now we have it, we can generate a map with folium's library right on Toronto's center

In [18]:
# generate map centered on the middle of Toronto:
#################################################################
latTO = 43.6813         # --> Old city Center
lonTO = -79.4003        # --> Old city Center
locationTO = str(str(latTO) + ',' + str(lonTO))
mapTO = folium.Map(location=[latTO, lonTO], zoom_start=12)
mapTO

Now we are going to cluster the information on the following lines.

First want to say that though the code is clear, may confuse a bit the 'i' variable. It is just A COUNTER to see in which borough was comming out the third error. This error was on one borough in which was no data in the response out of the API. Need to say that I had no interest to know why, after I gotr the result I was waiting for.


## Clustering

This code is made out of a for loop which iterate boroughs number of times (9 times then as Toronto has 9 boroughs). 

In it are going to be generated links for the API, data frames out its json (json response) and markers for the final map. 

In between these data frames are filtered for convinience (please see filter_df and get_category_type methods above).

In [19]:
i=0

for pos in (boroughs_mean_pos['Position']):

    print ('i=',i)
    # cafeine = Link(option='search', location=pos, query='coffee').venue(radius=5000,limit=80) # radius in meters?
    # print (cafeine)

    coffeeTO = Link(option='search', location=pos, query='coffee').venue(radius=5000,limit=80) # radius in meters?
    libraryTO = Link(option='search', location=pos, query='library').venue(radius=5000,limit=80) # radius in meters?
    studyTO = Link(option='search', location=pos, query='study').venue(radius=5000,limit=80) # radius in meters?

    df_coffee = get_df(coffeeTO)
    df_library = get_df(libraryTO)
    df_study = get_df(studyTO)


    # Pandas data preparing:
    #################################################################
    # keep only columns that include venue name, and anything that is associated with location
    try:
        # filter the category for each row
        df_coffee = filter_df(df_coffee)
        df_coffee['categories'] = df_coffee.apply(get_category_type, axis=1)

        # instantiate a feature group for matches in the dataframe
        coffees = folium.map.FeatureGroup()
        for latitude, longitude, in zip(df_coffee.lat, df_coffee.lng):
            libraries.add_child(
                folium.CircleMarker(
                    [latitude, longitude],
                    radius=5, # define how big you want the circle markers to be
                    color='black',
                    fill=True,
                    fill_color='red',
                    fill_opacity=0.6
                )
            )
    except Exception as e:
        print (df_coffee.head())
        print (e)
        pass



    try:
        df_library = filter_df(df_library)
        df_library['categories'] = df_library.apply(get_category_type, axis=1)
        libraries = folium.map.FeatureGroup()
        for latitude, longitude, in zip(df_library.lat, df_library.lng):
            libraries.add_child(
                folium.CircleMarker(
                    [latitude, longitude],
                    radius=5, # define how big you want the circle markers to be
                    color='black',
                    fill=True,
                    fill_color='blue',
                    fill_opacity=0.6
                )
            )
    except Exception as e:
        print (df_library.head())
        print (e)
        pass



    try:
        df_study = filter_df(df_study)
        df_study['categories'] = df_study.apply(get_category_type, axis=1)
        studies = folium.map.FeatureGroup()
        for latitude, longitude, in zip(df_study.lat, df_study.lng):
            studies.add_child(
                folium.CircleMarker(
                    [latitude, longitude],
                    radius=10, # define how big you want the circle markers to be
                    color='black',
                    fill=True,
                    fill_color='green',
                    fill_opacity=0.6
                )
            )
    except Exception as e:
        print (df_study.head())
        print (e)
        pass

    i+=1

    mapTO.add_child(coffees)
    mapTO.add_child(libraries)
    mapTO.add_child(studies)

    
mapTO

i= 0
                                   name                   categories  \
0  Mountain View Estates Coffee company  Professional & Other Places   
1                Timothy's World Coffee                  Coffee Shop   
2                Timothy's World Coffee                  Coffee Shop   
3                           Coffee Time                  Coffee Shop   
4                           Coffee Time                  Coffee Shop   

                                   address  cc        city country  \
0                    1260 Martingrove Road  CA     Toronto  Canada   
1  5650 Yonge Street,North American Centre  CA  Willowdale  Canada   
2                      700 mt pleasant ave  CA     Toronto  Canada   
3                                      NaN  CA         NaN  Canada   
4                                      NaN  CA     Toronto  Canada   

     crossStreet  distance                                   formattedAddress  \
0      Bethridge      3402  [1260 Martingrove Road (Bethridg

# CONCLUSIONS

Green (big) Points are study centers, blues are libraries and reds are coffee shops.

Where would be the best place for you to study?

For me would be a nice place close to the school to look for a flat where coffees and libraries are also in the surroundings (I would rather have looked for party I guess, but I was once young... NOT ANYMORE :(

Hope yo enjoy your search!