<H2>Assigment<H2>
    First part: scraping and creating dataframe

In [116]:
#installing libraries
! pip install BeautifulSoup4
! pip install lxml
! pip install geocoder
! pip install folium

from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors



In [117]:
#taking source code of page
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')

In [118]:
#scraping part:
#geting table
table = soup.find('table',class_='wikitable sortable').tbody
rows = table.find_all('tr')

#creating dataframe
toronto_df = pd.DataFrame()

#declaring lists
columns=[]
postcodes=[]
boroughs=[]
neighborhoods=[]

#extracting column names
columns = [value.text.rstrip() for value in rows[0].find_all('th')]

#getting values for every column
for row in rows[1:]:
    row_list = [value.text.rstrip() for value in row.find_all('td')]
    for column, value in zip([postcodes, boroughs, neighborhoods], row_list):
        column.append(value)
        
#adding values to dataframe
for column, values in zip(columns,[postcodes, boroughs, neighborhoods]):
    toronto_df[column] = values

In [119]:
#showing our dataframe
toronto_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Not assigned
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Queen's Park


In [120]:
#dropping rows
toronto_df_filtered = toronto_df[toronto_df['Neighbourhood'] != 'Not assigned'].reset_index(drop=True)
toronto_df_filtered.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


<h2>Second part: coordinates<h2>

In [121]:
import geocoder

In [122]:
Latitude = []
Longitude = []
for borough,neighborhood in zip(toronto_df_filtered['Borough'],toronto_df_filtered['Neighbourhood']):
    g = geocoder.osm('{}, {}, Toronto, ON, Canada'.format(neighborhood,borough))
    Latitude.append(g.lat)
    Longitude.append(g.lng)

In [123]:
toronto_df_filtered['Latitude'] = Latitude
toronto_df_filtered['Longitude'] = Longitude

In [124]:
toronto_df_filtered.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7588,-79.320197
1,M4A,North York,Victoria Village,43.732658,-79.311189
2,M5A,Downtown Toronto,Harbourfront,,
3,M6A,North York,Lawrence Heights,43.722778,-79.450933
4,M6A,North York,Lawrence Manor,43.722079,-79.437507


In [125]:
#using csv file
toronto_df_c = pd.read_csv('http://cocl.us/Geospatial_data')
toronto_df_c.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [126]:
#shaping data
cleaned_toronto_df = toronto_df[toronto_df['Borough']!='Not assigned'].reset_index(drop=True)
merged_df = cleaned_toronto_df.merge(right=toronto_df_c, right_on='Postal Code', left_on='Postcode')
merged_df.drop('Postcode',axis=1,inplace=True)
merged_df = merged_df[['Postal Code','Borough','Neighbourhood','Latitude','Longitude']]
merged_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763


In [127]:
import folium

In [128]:
#now, when we have longitudes and latitudes lets make a map
toronto_map = folium.Map(location=[merged_df['Latitude'].mean(),merged_df['Longitude'].mean()],tiles='Stamen Terrain',zoom_start=11)
for lat, lng, label in zip(merged_df['Latitude'], merged_df['Longitude'], merged_df['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        popup=label,
        parse_html=False).add_to(toronto_map)
toronto_map

<h2>Third part: Exploring and clustering the neighborhoods in Toronto with foresquare<h2>

In [148]:
CLIENT_ID = ''
CLIENT_SECRET = ''
VERSION = '20180605'
LIMIT = 10

In [130]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        results = requests.get(url).json()["response"]['groups'][0]['items']
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [131]:
#looking for venues
toronto_venues = getNearbyVenues(names=merged_df['Neighbourhood'],
                                   latitudes=merged_df['Latitude'],
                                   longitudes=merged_df['Longitude']
                                  )

Parkwoods
Victoria Village
Harbourfront
Lawrence Heights
Lawrence Manor
Not assigned
Queen's Park
Rouge
Malvern
Don Mills North
Woodbine Gardens
Parkview Hill
Ryerson
Garden District
Glencairn
Cloverdale
Islington
Martin Grove
Princess Gardens
West Deane Park
Highland Creek
Rouge Hill
Port Union
Flemingdon Park
Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Bloordale Gardens
Eringate
Markland Wood
Old Burnhamthorpe
Guildwood
Morningside
West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor
Downsview North
Wilson Heights
Thorncliffe Park
Adelaide
King
Richmond
Dovercourt Village
Dufferin
Scarborough Village
Fairview
Henry Farm
Oriole
Northwood Park
York University
East Toronto
Harbourfront East
Toronto Islands
Union Station
Little Portugal
Trinity
East Birchmount Park
Ionview
Kennedy Park
Bayview Village
CFB Toronto
Downsview East
The Danforth West
Riverdale
Design Exchange
Toronto 

In [132]:
print(toronto_venues.shape)
toronto_venues.head()

(1426, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [133]:
venues_per_neighborhood = toronto_venues[['Neighborhood','Venue Category']].groupby(['Neighborhood']).count().sort_values('Venue Category', ascending = False)
venues_per_neighborhood.head()

Unnamed: 0_level_0,Venue Category
Neighborhood,Unnamed: 1_level_1
St. James Town,20
Runnymede,14
Adelaide,10
Little Portugal,10
Railway Lands,10


In [135]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

In [137]:
toronto_onehot.shape

(1426, 177)

In [138]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [139]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide----
          venue  freq
0    Steakhouse   0.2
1  Concert Hall   0.1
2     Speakeasy   0.1
3   Opera House   0.1
4   Coffee Shop   0.1


----Agincourt----
                        venue  freq
0                      Lounge  0.25
1                Skating Rink  0.25
2   Latin American Restaurant  0.25
3              Breakfast Spot  0.25
4  Modern European Restaurant  0.00


----Agincourt North----
              venue  freq
0               Gym  0.33
1        Playground  0.33
2              Park  0.33
3  Hakka Restaurant  0.00
4            Market  0.00


----Albion Gardens----
                  venue  freq
0         Grocery Store  0.14
1            Beer Store  0.14
2   Fried Chicken Joint  0.14
3  Fast Food Restaurant  0.14
4           Pizza Place  0.14


----Alderwood----
                venue  freq
0         Pizza Place  0.22
1         Coffee Shop  0.11
2      Sandwich Place  0.11
3  Athletics & Sports  0.11
4                 Pub  0.11


----Bathurst Manor----
               

In [140]:
#making a function to...
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [141]:
#checking and sorting what we get
num_top_venues = 10
indicators = ['st', 'nd', 'rd']
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Steakhouse,Opera House,Concert Hall,Speakeasy,Hotel,Asian Restaurant,Plaza,Vegetarian / Vegan Restaurant,Coffee Shop,Construction & Landscaping
1,Agincourt,Latin American Restaurant,Lounge,Breakfast Spot,Skating Rink,Construction & Landscaping,Convenience Store,Comic Shop,Eastern European Restaurant,Concert Hall,Drugstore
2,Agincourt North,Park,Gym,Playground,Curling Ice,Drugstore,Dog Run,Discount Store,Diner,Dessert Shop,Department Store
3,Albion Gardens,Sandwich Place,Grocery Store,Fast Food Restaurant,Pharmacy,Pizza Place,Fried Chicken Joint,Beer Store,Women's Store,Dance Studio,Discount Store
4,Alderwood,Pizza Place,Skating Rink,Athletics & Sports,Pharmacy,Pub,Sandwich Place,Coffee Shop,Gym,Creperie,Diner


In [142]:
#Now starting with clustering
# seting number of clusters
kclusters = 10

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# runing k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# checking cluster labels for each row in the dataframe
kmeans.labels_[0:10]

array([3, 4, 5, 4, 4, 3, 0, 3, 4, 3], dtype=int32)

In [143]:
# adding more clustering labels
try:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
except:
    pass
toronto_merged = merged_df
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')
toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,5.0,Park,Food & Drink Shop,Electronics Store,Drugstore,Dog Run,Discount Store,Diner,Dessert Shop,Department Store,Deli / Bodega
1,M4A,North York,Victoria Village,43.725882,-79.315572,4.0,Coffee Shop,French Restaurant,Financial or Legal Service,Intersection,Hockey Arena,Portuguese Restaurant,Pizza Place,Construction & Landscaping,Deli / Bodega,Dog Run
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,3.0,Breakfast Spot,Park,Historic Site,Restaurant,Spa,Bakery,Farmers Market,Coffee Shop,Gym / Fitness Center,Curling Ice
3,M6A,North York,Lawrence Heights,43.718518,-79.464763,0.0,Clothing Store,Women's Store,Boutique,Furniture / Home Store,Coffee Shop,Vietnamese Restaurant,Accessories Store,Airport Food Court,Department Store,Electronics Store
4,M6A,North York,Lawrence Manor,43.718518,-79.464763,0.0,Clothing Store,Women's Store,Boutique,Furniture / Home Store,Coffee Shop,Vietnamese Restaurant,Accessories Store,Airport Food Court,Department Store,Electronics Store


In [146]:
toronto_clusters = folium.Map(location=[merged_df['Latitude'].mean(),merged_df['Longitude'].mean()],tiles='Stamen Terrain',zoom_start=11)
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    try:
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=rainbow[int(cluster-1)],
            fill=True,
            fill_color=rainbow[int(cluster-1)],
            fill_opacity=0.7).add_to(toronto_clusters)
    except:
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color='black',
            fill=True,
            fill_color='black',
            fill_opacity=0.7).add_to(toronto_clusters)

In [147]:
#some final analizing steps
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, merged_df.shape[1]))]]
for n in range(0,10):
    print(toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, merged_df.shape[1]))]])

              Borough
3          North York
4          North York
163  Downtown Toronto
164  Downtown Toronto
165  Downtown Toronto
166  Downtown Toronto
167  Downtown Toronto
168  Downtown Toronto
169  Downtown Toronto
              Borough
3          North York
4          North York
163  Downtown Toronto
164  Downtown Toronto
165  Downtown Toronto
166  Downtown Toronto
167  Downtown Toronto
168  Downtown Toronto
169  Downtown Toronto
              Borough
3          North York
4          North York
163  Downtown Toronto
164  Downtown Toronto
165  Downtown Toronto
166  Downtown Toronto
167  Downtown Toronto
168  Downtown Toronto
169  Downtown Toronto
              Borough
3          North York
4          North York
163  Downtown Toronto
164  Downtown Toronto
165  Downtown Toronto
166  Downtown Toronto
167  Downtown Toronto
168  Downtown Toronto
169  Downtown Toronto
              Borough
3          North York
4          North York
163  Downtown Toronto
164  Downtown Toronto
165  Downt