# Q1: generate dataframe of postcode, borough and neighborhood
## Web scraping to acquire the table on the web page https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [227]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import lxml

from io import StringIO

# Download the Wikipedia page that lists the Toronto ("M") postal codes.
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
# Stop here on an HTTP error instead of trying to parse an error page.
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')
# The postal-code table is the first <table> element on the page.
table = soup.find_all('table')[0]
# read_html expects a file-like object; passing raw markup as a string is
# deprecated in recent pandas releases, so wrap it in StringIO.
df = pd.read_html(StringIO(str(table)))[0]
df.head(15)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


## data cleaning

- [x] The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
- [x] Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
- [x] More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.
- [x] If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
- [x] Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making.
- [x] In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

In [228]:
# Clean the scraped table without in-place mutation, so re-running this cell
# on its own output gives the same frame.
df = df.rename(columns={'Postal Code': 'PostalCode'})

# Keep only the postal codes that have a borough assigned.
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)

# A neighbourhood that is "Not assigned" takes the name of its borough.
unnamed = df['Neighborhood'] == 'Not assigned'
df.loc[unnamed, 'Neighborhood'] = df.loc[unnamed, 'Borough']

# Merge rows that share a postal code into one row with a comma-separated
# neighbourhood list. The merged frame is written back to df; previously it
# was stored in `result` only, so df never received the combined rows.
df = (
    df.groupby(['PostalCode', 'Borough'], sort=False, as_index=False)
      .agg({'Neighborhood': ', '.join})
)
result = df  # name kept for any cell that still refers to `result`
df.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [229]:
# Distinct borough names. Evaluated but not rendered, because only the last
# expression of a cell is displayed.
df.loc[:, 'Borough'].unique()
# Number of rows per borough.
df.groupby(by='Borough').count()

Unnamed: 0_level_0,PostalCode,Neighborhood
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
Central Toronto,9,9
Downtown Toronto,19,19
East Toronto,5,5
East York,5,5
Etobicoke,12,12
Mississauga,1,1
North York,24,24
Scarborough,17,17
West Toronto,6,6
York,5,5


### Till now, dataframe has been cleaned


In [230]:
# Report how many postal codes remain after cleaning.
row_count = df.shape[0]
print('number of rows in my dataframe:', row_count)

number of rows in my dataframe: 103


# Q2: complete the above table with location data
## read csv file for locations and append Latitude and Longitude columns to neighborhood table


In [231]:
# Read the latitude/longitude of every postal code from the CSV, then build a
# lookup table keyed by postal code. location_data keeps its original columns.
location_data = pd.read_csv('http://cocl.us/Geospatial_data')
df_location = location_data.set_index('Postal Code')
df_location.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [237]:
import numpy as np
# Attach the coordinates of each postal code to the neighbourhood table.
# One label-based lookup replaces the previous row-by-row loop; as before, a
# postal code that is missing from df_location raises KeyError.
postlist = df['PostalCode']
coordinates = df_location.loc[postlist.tolist(), ['Latitude', 'Longitude']]
Latitude_list = coordinates['Latitude'].tolist()
Longitude_list = coordinates['Longitude'].tolist()
# Assign plain lists so the values are matched by position, not by index label
# (coordinates is indexed by postal code, df by row number).
df['Latitude'] = Latitude_list
df['Longitude'] = Longitude_list
df.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


# Q3: Clustering and visualization
## show the map in center of Toronto 

In [238]:
import folium
import requests 
import json 
import matplotlib.cm as cm
import matplotlib.colors as colors
import pandas as pd
import sklearn

from pandas.io.json import json_normalize 
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim 

# Resolve the coordinates of Toronto; they are used as the centre of the maps.
address = 'Toronto, Ontario Canada'
# Nominatim's usage policy requires an application-specific user agent; recent
# geopy versions refuse to build the geocoder without one.
geolocator = Nominatim(user_agent='toronto_neighborhood_explorer')
location = geolocator.geocode(address)
Toronto_latitude = location.latitude
Toronto_longitude = location.longitude
print('The geograpical coordinate of Toronto Canada are {}, {}.'.format(Toronto_latitude, Toronto_longitude))

The geograpical coordinate of Toronto Canada are 43.6534817, -79.3839347.


  


In [235]:
# Preview the first rows of df.
# NOTE(review): the saved output of this cell comes from execution 235, which
# ran before the Latitude/Longitude columns were added in execution 237, so it
# does not show those columns. Re-run the notebook top to bottom to refresh it.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## map of toronto

In [240]:
# Base map centred on Toronto, with one marker per postal code.
map_toronto = folium.Map(location=[Toronto_latitude, Toronto_longitude], zoom_start=10)

# Appearance shared by every marker on this layer.
marker_style = dict(
    radius=4,
    color='blue',
    fill=True,
    fill_color='#87cefa',
    fill_opacity=0.5,
    parse_html=False,
)

points = zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood'])
for lat, lng, borough, neighborhood in points:
    # The popup text reads "<neighbourhood>, <borough>".
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker([lat, lng], popup=label, **marker_style).add_to(map_toronto)

map_toronto

## Explore the boroughs whose names contain the word 'Toronto'. They are: 'Downtown Toronto', 'East Toronto', 'West Toronto' and 'Central Toronto'

In [241]:
# Rows whose borough name contains the word "Toronto".
in_toronto = df['Borough'].str.contains('Toronto')
# Distinct matching borough names (evaluated, not displayed).
df.loc[in_toronto, 'Borough'].unique()


toronto_borough = df.loc[in_toronto].reset_index(drop=True)
print(toronto_borough.shape)
toronto_borough.head(15)

(39, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


## mark them in Toronto map

In [242]:
# Overlay the "Toronto" boroughs on the existing map with solid blue markers.
highlight_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='blue',
    fill_opacity=0.7,
    parse_html=False,
)
borough_points = zip(
    toronto_borough['Latitude'],
    toronto_borough['Longitude'],
    toronto_borough['Neighborhood'],
)
for lat, lng, label in borough_points:
    # The neighbourhood name becomes the popup text.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng], popup=label, **highlight_style).add_to(map_toronto)
map_toronto

In [243]:
# Foursquare API
import os
import warnings

# Foursquare API
# Credentials are read from the environment so they never end up in the
# notebook or its version history. The key pair that used to be hard-coded
# here has been published and should be revoked in the Foursquare console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare requests will be rejected.'
    )
VERSION = '20180615'  # API version date sent as the `v` query parameter
LIMIT = 30            # maximum number of venues requested per location
radius=500            # search radius sent as the `radius` query parameter
print('Your credentails:')
print('CLIENT_ID: Hidden')
print('CLIENT_SECRET: Hidden')

Your credentails:
CLIENT_ID: Hidden
CLIENT_SECRET: Hidden


In [244]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query Foursquare for venues around each location and tabulate them.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to explore; iterated in
        parallel with ``zip``.
    radius : int, default 500
        Search radius sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue, with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but has these columns) when no venue is found.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue', 'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and encode the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # do not hang the notebook on a stalled connection
        )
        # Surface API failures as an HTTP error rather than a KeyError below.
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        for v in results:
            venue = v['venue']
            rows.append((name, lat, lng,
                         venue['name'],
                         venue['location']['lat'],
                         venue['location']['lng'],
                         venue['categories'][0]['name']))

    # Passing the column names to the constructor keeps the schema intact even
    # when `rows` is empty; assigning `.columns` afterwards raised ValueError
    # in that case.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [245]:
# Collect the venues around every "Toronto" borough postal code.
venue_query = dict(
    names=toronto_borough['Neighborhood'],
    latitudes=toronto_borough['Latitude'],
    longitudes=toronto_borough['Longitude'],
)
toronto_venues = getNearbyVenues(**venue_query)
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
3,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
4,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


## Since I like eating, let me find out which neighborhood has the most restaurants. It does not matter whether they are Chinese, Italian, French, etc. The most important thing is that there is some food waiting for me...
## As the next step, I will cluster the restaurants into different groups and mark them on the map to get a general idea of where they are and which neighborhood is surrounded by the most restaurants. Looking at the table, the neighborhood 'Little Portugal, Trinity' has the most restaurants. After clustering, I expect 'Little Portugal, Trinity' to be located somewhere close to the center of a cluster on the map.

In [247]:
# Venue count per neighbourhood (first five neighbourhoods in sorted order).
toronto_venues.groupby(by='Neighborhood').count().head(5)

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,30,30,30,30,30,30
"Brockton, Parkdale Village, Exhibition Place",23,23,23,23,23,23
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",16,16,16,16,16,16
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",18,18,18,18,18,18
Central Bay Street,30,30,30,30,30,30


In [70]:
# Keep only the venues whose category name mentions "Restaurant".
is_restaurant = toronto_venues['Venue Category'].str.contains("Restaurant")
toronto_restaurant = toronto_venues[is_restaurant]
# Rank the neighbourhoods by their number of restaurants, largest first.
restaurants_per_neighborhood = toronto_restaurant.groupby('Neighborhood').count()
restaurants_per_neighborhood.sort_values(by='Venue Category', ascending=False)

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Little Portugal, Trinity",13,13,13,13,13,13
"The Danforth West, Riverdale",12,12,12,12,12,12
"University of Toronto, Harbord",10,10,10,10,10,10
Central Bay Street,9,9,9,9,9,9
"Kensington Market, Chinatown, Grange Park",9,9,9,9,9,9
Stn A PO Boxes,9,9,9,9,9,9
Davisville,9,9,9,9,9,9
"St. James Town, Cabbagetown",9,9,9,9,9,9
"Richmond, Adelaide, King",9,9,9,9,9,9
"First Canadian Place, Underground city",8,8,8,8,8,8


In [172]:
# (rows, columns) of the restaurant table; the row count is the number of
# restaurant venues that get clustered below.
toronto_restaurant.shape

(196, 7)

# Q3: Clustering the above 196 restaurants and showing them on the map to decide which neighbourhood is ideal for a food lover to live in

In [248]:
# First five restaurant rows; the index keeps the row labels of toronto_venues.
toronto_restaurant.head(n=5)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
5,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant
21,"Regent Park, Harbourfront",43.65426,-79.360636,Cluny Bistro & Boulangerie,43.650565,-79.357843,French Restaurant
31,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,Sushi Box,43.66296,-79.38658,Sushi Restaurant
32,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,Mercatto,43.660391,-79.387664,Italian Restaurant
36,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,Como En Casa,43.66516,-79.384796,Mexican Restaurant


## clustering

In [250]:
# The former `import sklearn.cluster.k_means_` is gone: it pointed at a private
# module that was deprecated in scikit-learn 0.22 and later removed, and
# nothing here used it. KMeans is already imported from sklearn.cluster.
num_clusters = 5
# Cluster on the venue coordinates only.
X = toronto_restaurant.loc[:, ['Venue Latitude', 'Venue Longitude']]

# scaling data
from sklearn.preprocessing import StandardScaler
cluster_dataset = StandardScaler().fit_transform(X)
# A fixed random_state makes the cluster labels, and therefore the map colours,
# reproducible across Restart & Run All.
k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12, random_state=0)
k_means.fit(cluster_dataset)
labels = k_means.labels_

## add grouped restaurant clusters (colourful dots) to map and also location of neighborhood 'Little Portugal, Trinity' to map (the black dot)

In [253]:
# Map of the clustered restaurants, centred on Toronto.
map_toronto_restaurant = folium.Map(location=[Toronto_latitude, Toronto_longitude], zoom_start=12)

# Black dot: the neighbourhood with the most restaurants. The filter is
# evaluated once and reused for both coordinates.
top_neighborhood = 'Little Portugal, Trinity'
top_rows = toronto_restaurant[toronto_restaurant['Neighborhood'] == top_neighborhood]
lat1 = top_rows['Neighborhood Latitude'].unique()
lon1 = top_rows['Neighborhood Longitude'].unique()
folium.CircleMarker(
    location=[lat1[0], lon1[0]],
    radius=5,
    popup=top_neighborhood,  # was the placeholder text 't'
    color='black',
    fill=True,
    fill_color='black',
    fill_opacity=0.7,
).add_to(map_toronto_restaurant)

# One evenly spaced rainbow colour per cluster. The earlier x/ys arithmetic
# only served to produce a list of length num_clusters.
colors_array = cm.rainbow(np.linspace(0, 1, num_clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# One marker per restaurant, coloured by its cluster label.
for lat, lon, cluster in zip(X['Venue Latitude'], X['Venue Longitude'], labels):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7,
    ).add_to(map_toronto_restaurant)

map_toronto_restaurant

## Conclusion:##
### 'Little Portugal, Trinity' has some restaurants around, which proves that clustering results are good. However when you look at the above map, a better food-around-place could be somewhere above Toronto Union Station.

### So, if my future work is located in the purple cluster range, I go right away! Here concludes my wk3 assignment.