# Coursera Capstone Week 3 Notebook

## Part 1

Install Packages if not installed

In [None]:
!pip install lxml # requirement for pandas read_html

Importing Packages

In [None]:
import pandas as pd

**Get the Toronto Data table from Wikipedia**

In [None]:
# Scrape the HTML tables on the Wikipedia page; header=0 uses each table's first row as column names.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
toronto_page = pd.read_html(url, header=0)  # list of DataFrames, one per table found
# The first table is taken to be the postal-code table — TODO confirm the page layout has not changed.
toronto_df = toronto_page[0]
toronto_df

As seen above, the table from the Wikipedia page has been imported into a DataFrame.


Now rename the `Postal Code` column to `PostalCode`.

In [None]:
# Rebind instead of renaming in place, so the frame scraped in the previous cell
# (still referenced by toronto_page[0]) is left untouched and this cell can be re-run safely.
toronto_df = toronto_df.rename(columns={'Postal Code':'PostalCode'})
toronto_df

Remove Not Assigned Boroughs and set Neighbourhood to Borough when Not Assigned

In [None]:
# Keep only the rows that have an assigned borough and renumber the index from zero.
toronto_df = toronto_df.loc[toronto_df['Borough'].ne('Not assigned')].reset_index(drop=True)
# Where the neighbourhood is 'Not assigned', fall back to the borough name of the same row.
toronto_df['Neighborhood'] = toronto_df['Neighborhood'].mask(
    toronto_df['Neighborhood'].eq('Not assigned'),
    toronto_df['Borough'],
)
toronto_df

**Group all neighborhoods with the same postal code**

In [None]:
# One row per postal code. Only the neighbourhood names are concatenated;
# the borough is taken once, so a postal code listed on several rows does not
# end up with a repeated borough such as "North York, North York".
toronto_df_grouped = (
    toronto_df
    .groupby('PostalCode', as_index=False)
    .agg({'Borough': 'first', 'Neighborhood': ', '.join})
)
toronto_df_grouped

Get the unique column data

In [None]:
# Count the distinct values found in each column of the grouped table.
postalcodes, boroughs, neighbourhoods = (
    toronto_df_grouped[column].nunique()
    for column in ('PostalCode', 'Borough', 'Neighborhood')
)
print(f'Unique Postalcodes in Toronto Canada : {postalcodes}')
print(f'Unique Boroughs in Toronto Canada : {boroughs}')
print(f'Unique Neighbourhoods in Toronto Canada :{neighbourhoods}')

**As we can see above, the number of unique PostalCode is equal to the number of rows, thus satisfying the question condition**


***Finally, Getting the shape of the Dataframe***

In [None]:
# (rows, columns) of the grouped table; grouping by PostalCode leaves one row per postal code.
toronto_df_grouped.shape

## Part 2

Importing Packages

In [None]:
import pandas as pd

#### **Read the CSV file to get Latitude and Longitude of Every Postal Code**

In [None]:
# Load the coordinates table from a CSV file located next to the notebook.
csv_loc = './Geospatial_Coordinates.csv'
# NOTE: this rebinds `postalcodes`, which held an integer count in Part 1, to a DataFrame.
postalcodes = pd.read_csv(csv_loc)
postalcodes

Rename Postal Code to PostalCode for Merging with main toronto_df

In [None]:
# Rebind instead of renaming in place so the cell stays idempotent and the
# frame displayed by the previous cell is not silently modified.
postalcodes = postalcodes.rename(columns={'Postal Code':'PostalCode'})
postalcodes

#### Merging the postal codes with the neighbourhood details

In [None]:
# Inner join: keep only postal codes that have both neighbourhood details and
# coordinates. A right join would also keep coordinate rows with no matching
# neighbourhood, leaving NaN in Borough/Neighborhood and breaking the later
# string filter on Borough.
toronto_neighborhoods = pd.merge(toronto_df_grouped, postalcodes, how='inner', on='PostalCode')
toronto_neighborhoods

## Part 3

Installing Packages

In [None]:
# Uncomment to install the Part 3 dependencies into the kernel's environment.
# Note: the PyPI package that provides `sklearn` is `scikit-learn`, and `folium`
# is imported in the next cell as well.
# %pip install geopy
# %pip install folium
# %pip install matplotlib
# %pip install scikit-learn

Importing Packages

In [None]:
from geopy.geocoders import Nominatim
import folium
import requests
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

Keep only the rows whose Borough name contains "Toronto"

In [None]:
# Keep rows whose borough name contains "Toronto" (case-insensitive).
# na=False treats a missing borough as "no match"; without it a NaN in the
# mask makes the boolean indexing raise.
toronto_neighborhoods = toronto_neighborhoods[
    toronto_neighborhoods['Borough'].str.contains("Toronto", case=False, na=False)
]
toronto_neighborhoods

Let's get Toronto's coordinates

In [None]:
# Look up the coordinates of the city centre; they are used to centre the maps below.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Geocoding returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

Now, let's map Toronto

In [None]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per neighbourhood; clicking it shows the neighbourhood name
for lat, lng, label in zip(toronto_neighborhoods['Latitude'], toronto_neighborhoods['Longitude'], toronto_neighborhoods['Neighborhood']):
    # parse_html belongs to the Popup (escapes the label text); it is not a CircleMarker option
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

#### Setting up the Foursquare API credentials

In [None]:

import os
import warnings

# Credentials are read from the environment so that they never end up in the
# notebook file or its history. Any key that was previously committed in this
# cell should be considered leaked and be regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare requests will be rejected until they are.'
    )
VERSION = '20180605' # Foursquare API version
radius = 500 # search radius passed to the venue lookup
LIMIT = 150 # maximum number of venues requested per location

Function to Get Venues Near all Locations

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius):
    """Query the Foursquare "explore" endpoint once per location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location; they are consumed in parallel.
    radius : number
        Search radius sent to the API as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but still has these columns) when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    Venues that come back without any category are skipped, because the
    analysis below is driven entirely by the category name.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    explore_url = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # a timeout keeps one stalled request from hanging the whole notebook
        response = requests.get(explore_url, params=params, timeout=30)
        response.raise_for_status()

        # tolerate a response without groups/items instead of raising KeyError/IndexError
        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            if not categories:
                continue
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # passing the column names here keeps the schema even when `rows` is empty
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [None]:
# Fetch the venues around every Toronto neighbourhood (one API request per row).
toronto_venues = getNearbyVenues(
    names=toronto_neighborhoods['Neighborhood'],
    latitudes=toronto_neighborhoods['Latitude'],
    longitudes=toronto_neighborhoods['Longitude'],
    radius=radius,
)

Seeing the Venues Data

In [None]:
# Only the last expression of a cell is displayed, so the shape has to be printed explicitly.
print(toronto_venues.shape)
toronto_venues

Let's check how many venues were returned for each neighborhood

In [None]:
# Non-null count of every column per neighbourhood, i.e. the number of venues returned for it.
toronto_venues.groupby('Neighborhood').count()

#### Let's find out how many unique categories can be curated from all the returned venues

In [None]:
# dropna=False keeps the count identical to len(Series.unique()), which counts a missing value too.
print(f"There are {toronto_venues['Venue Category'].nunique(dropna=False)} uniques categories.")

### Analyzing Each Neighborhood

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot

And let's examine the new dataframe size.

In [None]:
# (number of venues, number of columns) of the one-hot encoded table
toronto_onehot.shape

Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [None]:
# Average every indicator column per neighbourhood: the share of each category among its venues.
toronto_grouped = toronto_onehot.groupby('Neighborhood', as_index=False).mean()
toronto_grouped

Let's confirm the new size

In [None]:
# (number of neighbourhoods with at least one venue, number of columns)
toronto_grouped.shape

#### Let's print each neighborhood with its top 5 most common venue categories

In [None]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    # Turn the neighbourhood's single row into a (venue, freq) table; the first
    # transposed row holds the neighbourhood name itself and is dropped.
    temp = (
        toronto_grouped[toronto_grouped['Neighborhood'] == hood]
        .T
        .reset_index()
        .set_axis(['venue', 'freq'], axis=1)
        .iloc[1:]
        .astype({'freq': float})
        .round({'freq': 2})
    )
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

#### Let's put that into a pandas dataframe
First, let's write a function to sort the venues in descending order.

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first element of `row` is skipped (in this notebook it holds the
    neighbourhood name); the remaining values are ranked in descending order
    and the index labels of the leading ones are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

Now let's create the new dataframe and display the top 8 venues for each neighborhood.

In [None]:
num_top_venues = 8

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues: 1st, 2nd, 3rd, then 4th, 5th, ...
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # explicit bounds check instead of a bare `except:` that would hide any other error
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill every row with the names of its most frequent venue categories
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

### Cluster Neighborhoods

Now, Let's Cluster them into 5 clusters

In [None]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=42).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

Let's create a new dataframe that includes the cluster as well as the top 8 venues for each neighborhood.

In [None]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged  = toronto_neighborhoods

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Resetting the Index

In [None]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old index instead of keeping it as a column.
toronto_merged = toronto_merged.reset_index(drop=True)
toronto_merged

#### Let's map out this data

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    # a neighbourhood without venues has no cluster (NaN after the join): skip it
    if pd.isna(cluster):
        continue
    # the label may arrive as a float when the column contains NaN; list indices must be ints
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # index by the label itself rather than relying on rainbow[-1] for cluster 0
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Examining the Clusters

#### For Cluster 1 - This may be the high-end living/work area: along with coffee shops and bars, specialty restaurants serving Indian and Japanese food are among the most common venues

In [None]:
# Rows labelled 0, showing the Borough column plus everything from the cluster label onwards.
toronto_merged[toronto_merged['Cluster Labels'] == 0].iloc[:, [1, *range(5, toronto_merged.shape[1])]]

#### For Cluster 2 - This may be the shopping area, as it is close to downtown, and banks, hotels and diners are common alongside the coffee shops

In [None]:
# Rows labelled 1, showing the Borough column plus everything from the cluster label onwards.
toronto_merged[toronto_merged['Cluster Labels'] == 1].iloc[:, [1, *range(5, toronto_merged.shape[1])]]

#### For Cluster 3 - Seeing the location near the Airport and mostly Tourist Places, This may be a Tourist Area

In [None]:
# Rows labelled 2, showing the Borough column plus everything from the cluster label onwards.
toronto_merged[toronto_merged['Cluster Labels'] == 2].iloc[:, [1, *range(5, toronto_merged.shape[1])]]

#### For Cluster 4 - Seeing that Grocery Shops and Parks are highly visited here, These may be the Living Areas

In [None]:
# Rows labelled 3, showing the Borough column plus everything from the cluster label onwards.
toronto_merged[toronto_merged['Cluster Labels'] == 3].iloc[:, [1, *range(5, toronto_merged.shape[1])]]

#### For Cluster 5 - This may be the school/college area, as a bookstore, a college gym and a college quad are among the most common venues

In [None]:
# Rows labelled 4, showing the Borough column plus everything from the cluster label onwards.
toronto_merged[toronto_merged['Cluster Labels'] == 4].iloc[:, [1, *range(5, toronto_merged.shape[1])]]