In [1]:
import pandas as pd
from pandas.io.json import json_normalize
import requests
from bs4 import BeautifulSoup
import folium
import numpy as np
import warnings
warnings.filterwarnings('ignore')

<h1>Data wrangling</h1>
<p>This is the first part of this project. Here, Toronto neighborhood data is scapped from Wikipedia. Further, this data is cleaned for exploratory analysis </p>

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html_data = requests.get(url).text

In [3]:
soup = BeautifulSoup(html_data, "html5lib")
table = soup.find_all("table")[0]

In [4]:
df = pd.DataFrame(columns=["PostalCode", "Borough", "Neighborhood"])

for row in table.find_all("tr"):
    for col in row.find_all("td"):
        PC = col.text[0:4].strip()
        TXT = col.text[4:len(col.text)].strip().replace(")", "").split("(")
        
        try:
            TXT[1]
        except:
            TXT.append(np.nan)

        
        df = df.append({"PostalCode": PC, "Borough": TXT[0], "Neighborhood": TXT[1]}, ignore_index=True)

In [5]:
df_updated = df[df["Borough"]!="Not assigned"].reset_index()
df_updated.drop("index", axis=1, inplace=True)

In [6]:
df_updated.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Queen's Park,Ontario Provincial Government


In [7]:
#No Duplicate postal codes
print("Total dataframe rows are: ", df_updated.shape[0], "\nTotal duplicate postal codes: ", df_updated.duplicated(subset=["PostalCode"]).value_counts())

Total dataframe rows are:  103 
Total duplicate postal codes:  False    103
dtype: int64


In [8]:
#No Not assigned Neighborhood
df_updated[df["Neighborhood"]=="Not assigned"]

Unnamed: 0,PostalCode,Borough,Neighborhood


In [9]:
print("Number of rows: ", df_updated.shape[0], ", Number of columns: ", df_updated.shape[1])

Number of rows:  103 , Number of columns:  3


In [10]:
geocodes = pd.read_csv("Geospatial_Coordinates.csv")
geocodes.rename(columns = {"Postal Code":"PostalCode"}, inplace=True)
geocodes.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
df_merged = df_updated.merge(geocodes, on="PostalCode")

In [12]:
df_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


<h1>Exploratory analysis</h1>
<p>This part of the project deals with explore the categories of the different locations withing downtown Toronto. Foursquare RESTful API calls has been utilized to retrieve locations data.</p>

In [13]:
latitude = 43.6534817
longitude = -79.3839347
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Borough'], df_merged['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

In [14]:
CLIENT_ID = 'CRMXJD3G0DZV2VRXR52OQ5TY5LAVAVOPEQ0FDMQ1MCTLQ0B3' # your Foursquare ID
CLIENT_SECRET = 'BTGV2LNLRFVMZU2QB2OBR2OQSRHK0OBPC5FWMABX0GUZYW50' # your Foursquare Secret
ACCESS_TOKEN = '55CLD0NFKP3RMC5BFHFICIFZ50NV1TDH5VKB1EZ522QZNH54' # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 30

In [15]:
toronto_data = df_merged[df_merged['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [16]:
borough_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(borough_toronto)

borough_toronto

In [17]:
toronto_data.loc[0, 'Borough']

'Downtown Toronto'

In [18]:
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            latitude, 
            longitude, 
            radius, 
            LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=CRMXJD3G0DZV2VRXR52OQ5TY5LAVAVOPEQ0FDMQ1MCTLQ0B3&client_secret=BTGV2LNLRFVMZU2QB2OBR2OQSRHK0OBPC5FWMABX0GUZYW50&v=20180604&ll=43.6534817,-79.3839347&radius=500&limit=30'

In [19]:
results = requests.get(url).json()

In [20]:
grouped_toronto = toronto_data.groupby(["Borough"]).count()

In [21]:
venues = results['response']['groups'][0]['items']

In [22]:
venue_data = pd.DataFrame(columns = ["Venue", "PostalCode", "Category", "Venue Latitude", "Venue Longitude"])

for i in venues:
    try:
        name = i['venue']['name']
        PostalCode = i['venue']['location']['postalCode'][0:3]
        category = i['venue']['categories'][0]['name']
        lat = i['venue']['location']['lat']
        lon = i['venue']['location']['lng']
        venue_data = venue_data.append({"Venue": name, "PostalCode": PostalCode, "Category": category, "Venue Latitude": lat, "Venue Longitude": lon}, ignore_index=True)
    except:
        pass


In [205]:
toronto_venues = df_merged.merge(venue_data, on="PostalCode")

In [206]:
toronto_venues.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Venue,Category,Venue Latitude,Venue Longitude
0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,Indigo,Bookstore,43.653515,-79.380696
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,CF Toronto Eaton Centre,Shopping Mall,43.654447,-79.380952
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,LUSH,Cosmetics Shop,43.653557,-79.3804
3,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,Crepe Delicious,Fast Food Restaurant,43.654536,-79.380889
4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,SEPHORA,Cosmetics Shop,43.653527,-79.380154


In [25]:
borough_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, Neighborhood, category in zip(toronto_venues['Venue Latitude'], toronto_venues['Venue Longitude'], toronto_venues['Neighborhood'], toronto_venues['Category']):
    label = '{}, {}'.format(Neighborhood, category)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(borough_toronto)

borough_toronto

<h1>Clustering</h1>
<p>This section is inteded to cluster neighbourhoods of toronto based on frequency of different locations.</p>

In [217]:
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [207]:
hoods = toronto_venues[["Neighborhood", "Category"]]
dummy_df = pd.get_dummies(hoods["Category"], prefix=None)
dummy_df.insert(loc = 0, column = "Neighborhood", value = hoods["Neighborhood"])

In [208]:
freq_data = dummy_df.groupby("Neighborhood").mean()
toronto_venues.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Venue,Category,Venue Latitude,Venue Longitude
0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,Indigo,Bookstore,43.653515,-79.380696
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,CF Toronto Eaton Centre,Shopping Mall,43.654447,-79.380952
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,LUSH,Cosmetics Shop,43.653557,-79.3804
3,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,Crepe Delicious,Fast Food Restaurant,43.654536,-79.380889
4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,SEPHORA,Cosmetics Shop,43.653527,-79.380154


In [210]:
kclusters = 3
km = KMeans(n_clusters=kclusters)
km.fit_predict(freq_data)

array([2, 1, 1, 0])

In [211]:
freq_data.insert(0, 'Cluster Labels', km.labels_)

In [212]:
freq_data = freq_data.reset_index()

In [213]:
freq_data.head()

Unnamed: 0,Neighborhood,Cluster Labels,Art Museum,Bookstore,Bubble Tea Shop,Clothing Store,Coffee Shop,Concert Hall,Cosmetics Shop,Department Store,...,Hotel,Monument / Landmark,Plaza,Poke Place,Restaurant,Seafood Restaurant,Shopping Mall,Sushi Restaurant,Tea Room,Vegetarian / Vegan Restaurant
0,Central Bay Street,2,0.166667,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,...,0.166667,0.0,0.0,0.166667,0.0,0.0,0.0,0.166667,0.166667,0.0
1,"Garden District, Ryerson",1,0.0,0.083333,0.0,0.333333,0.0,0.0,0.166667,0.0,...,0.0,0.0,0.083333,0.0,0.0,0.0,0.083333,0.0,0.0,0.0
2,Richmond / Adelaide / King,1,0.0,0.0,0.0,0.0,0.142857,0.142857,0.0,0.0,...,0.0,0.142857,0.142857,0.0,0.142857,0.142857,0.0,0.0,0.0,0.142857
3,St. James Town,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [232]:
toronto_venues_new = toronto_venues.replace("Garden District, Ryerson", int(freq_data[freq_data["Neighborhood"]=="Garden District, Ryerson"]["Cluster Labels"])).replace("Richmond / Adelaide / King",int(freq_data[freq_data["Neighborhood"]=="Richmond / Adelaide / King"]["Cluster Labels"])).replace("St. James Town",int(freq_data[freq_data["Neighborhood"]=="St. James Town"]["Cluster Labels"])).replace("Central Bay Street",int(freq_data[freq_data["Neighborhood"]=="Central Bay Street"]["Cluster Labels"]))

In [233]:
toronto_venues_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Venue,Category,Venue Latitude,Venue Longitude
0,M5B,Downtown Toronto,1,43.657162,-79.378937,Indigo,Bookstore,43.653515,-79.380696
1,M5B,Downtown Toronto,1,43.657162,-79.378937,CF Toronto Eaton Centre,Shopping Mall,43.654447,-79.380952
2,M5B,Downtown Toronto,1,43.657162,-79.378937,LUSH,Cosmetics Shop,43.653557,-79.3804
3,M5B,Downtown Toronto,1,43.657162,-79.378937,Crepe Delicious,Fast Food Restaurant,43.654536,-79.380889
4,M5B,Downtown Toronto,1,43.657162,-79.378937,SEPHORA,Cosmetics Shop,43.653527,-79.380154


In [234]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_venues_new['Venue Latitude'], toronto_venues_new['Venue Longitude'], toronto_venues_new['Category'], toronto_venues_new["Neighborhood"]):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [242]:
toronto_venues_new = toronto_venues_new.rename(columns = {"Neighborhood":"Clusters"})

In [245]:
toronto_venues_new.insert(loc=0, column = "Neighborhood", value=toronto_venues["Neighborhood"])

In [246]:
toronto_venues_new.head()

Unnamed: 0,Neighborhood,PostalCode,Borough,clusters,Latitude,Longitude,Venue,Category,Venue Latitude,Venue Longitude
0,"Garden District, Ryerson",M5B,Downtown Toronto,1,43.657162,-79.378937,Indigo,Bookstore,43.653515,-79.380696
1,"Garden District, Ryerson",M5B,Downtown Toronto,1,43.657162,-79.378937,CF Toronto Eaton Centre,Shopping Mall,43.654447,-79.380952
2,"Garden District, Ryerson",M5B,Downtown Toronto,1,43.657162,-79.378937,LUSH,Cosmetics Shop,43.653557,-79.3804
3,"Garden District, Ryerson",M5B,Downtown Toronto,1,43.657162,-79.378937,Crepe Delicious,Fast Food Restaurant,43.654536,-79.380889
4,"Garden District, Ryerson",M5B,Downtown Toronto,1,43.657162,-79.378937,SEPHORA,Cosmetics Shop,43.653527,-79.380154


In [248]:
toronto_venues_new[toronto_venues_new["clusters"]==0]

Unnamed: 0,Neighborhood,PostalCode,Borough,clusters,Latitude,Longitude,Venue,Category,Venue Latitude,Venue Longitude
12,St. James Town,M5C,Downtown Toronto,0,43.651494,-79.375418,Hudson's Bay,Department Store,43.65204,-79.380391


In [249]:
toronto_venues_new[toronto_venues_new["clusters"]==1]

Unnamed: 0,Neighborhood,PostalCode,Borough,clusters,Latitude,Longitude,Venue,Category,Venue Latitude,Venue Longitude
0,"Garden District, Ryerson",M5B,Downtown Toronto,1,43.657162,-79.378937,Indigo,Bookstore,43.653515,-79.380696
1,"Garden District, Ryerson",M5B,Downtown Toronto,1,43.657162,-79.378937,CF Toronto Eaton Centre,Shopping Mall,43.654447,-79.380952
2,"Garden District, Ryerson",M5B,Downtown Toronto,1,43.657162,-79.378937,LUSH,Cosmetics Shop,43.653557,-79.3804
3,"Garden District, Ryerson",M5B,Downtown Toronto,1,43.657162,-79.378937,Crepe Delicious,Fast Food Restaurant,43.654536,-79.380889
4,"Garden District, Ryerson",M5B,Downtown Toronto,1,43.657162,-79.378937,SEPHORA,Cosmetics Shop,43.653527,-79.380154
5,"Garden District, Ryerson",M5B,Downtown Toronto,1,43.657162,-79.378937,UNIQLO ユニクロ,Clothing Store,43.65591,-79.380641
6,"Garden District, Ryerson",M5B,Downtown Toronto,1,43.657162,-79.378937,Nordstrom,Clothing Store,43.655041,-79.380966
7,"Garden District, Ryerson",M5B,Downtown Toronto,1,43.657162,-79.378937,lululemon athletica,Clothing Store,43.653286,-79.380764
8,"Garden District, Ryerson",M5B,Downtown Toronto,1,43.657162,-79.378937,Apple Eaton Centre,Electronics Store,43.652832,-79.380555
9,"Garden District, Ryerson",M5B,Downtown Toronto,1,43.657162,-79.378937,HomeSense,Furniture / Home Store,43.653149,-79.379418


In [250]:
toronto_venues_new[toronto_venues_new["clusters"]==2]

Unnamed: 0,Neighborhood,PostalCode,Borough,clusters,Latitude,Longitude,Venue,Category,Venue Latitude,Venue Longitude
13,Central Bay Street,M5G,Downtown Toronto,2,43.657952,-79.387383,Poke Guys,Poke Place,43.654895,-79.385052
14,Central Bay Street,M5G,Downtown Toronto,2,43.657952,-79.387383,Textile Museum of Canada,Art Museum,43.654396,-79.3865
15,Central Bay Street,M5G,Downtown Toronto,2,43.657952,-79.387383,Japango,Sushi Restaurant,43.655268,-79.385165
16,Central Bay Street,M5G,Downtown Toronto,2,43.657952,-79.387383,Tsujiri,Tea Room,43.655374,-79.385354
17,Central Bay Street,M5G,Downtown Toronto,2,43.657952,-79.387383,Chatime 日出茶太,Bubble Tea Shop,43.655542,-79.384684
18,Central Bay Street,M5G,Downtown Toronto,2,43.657952,-79.387383,Marriott Downtown at CF Toronto Eaton Centre,Hotel,43.654728,-79.382422
