# Capstone Data Project

## Yong (Daniel) Shen

### This notebook will be primarily used for the Coursera IBM Data Science Capstone Project

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Print a greeting message for the capstone course.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


## 1. Segmenting and Clustering Toronto Data

#### A. Wikipedia scraping and cleaning data using Pandas

In [3]:
# Scrape the Wikipedia list of "M" postal codes with BeautifulSoup; the raw row
# strings collected here are cleaned with Pandas in the following cells.

import requests
from bs4 import BeautifulSoup

wiki = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(wiki.content,'lxml')
table = soup.find_all('tr')

# For every <tr>, stringify its list of <td> cells and re-parse that string so that
# get_text() removes the HTML tags, leaving text such as "[M1A, Not assigned, ...]".
list_rows = [
    BeautifulSoup(str(row.find_all('td')), "lxml").get_text()
    for row in table
]

In [4]:
# Put the scraped row strings into a single-column DataFrame so they can be cleaned.
df = pd.DataFrame(list_rows)
# Inspecting df showed extra rows that are not needed, so only the rows at
# positions 1 through 287 are kept in a second dataframe.
df1 = df.iloc[1:288]
df1.head()

Unnamed: 0,0
1,"[M1A, Not assigned, Not assigned\n]"
2,"[M2A, Not assigned, Not assigned\n]"
3,"[M3A, North York, Parkwoods\n]"
4,"[M4A, North York, Victoria Village\n]"
5,"[M5A, Downtown Toronto, Harbourfront\n]"


In [5]:
#Split each raw row string on commas into separate columns, then strip the brackets
#from the first column and the trailing "\n]" characters from the third column.
df1 = df1[0].str.split(',', expand=True)
df1[0] = df1[0].str.strip('[]')
df1[2] = df1[2].str.strip('\n]')
#We are only interested in the first 3 columns so we'll take the first three with the following code
df1 = df1.iloc[:, 0:3]
df1.columns = ['PostalCode', 'Borough', 'Neighborhood']
#NOTE(review): the stripped result on the next line is not assigned back, so the 'Borough'
#column itself is unchanged by it; later cells compare against literals that start with a
#space (" Not assigned", ' North York'), so assigning it here would require updating them too.
df1['Borough'].str.strip()
#df1.head()
#Display the (rows, columns) of the cleaned frame as the cell output.
df1.shape

(287, 3)

In [6]:
#Now we can get rid of the rows whose Borough is "Not assigned".
#The comparison is made on the stripped text so that it matches whether or not the
#split values carry surrounding whitespace.
borough_not_assigned = df1['Borough'].str.strip() == "Not assigned"
#Filtering with the inverted mask builds a new frame (no in-place drop), and
#reset_index(drop=True) renumbers the remaining rows from 0.
df1 = df1[~borough_not_assigned].reset_index(drop=True)
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [7]:
#Now let's assign the borough name to the neighborhood name if the neighborhood name is unassigned.
#The comparison ignores surrounding whitespace. Selecting through .loc with a boolean mask
#addresses rows by label and columns by name, so it does not depend on the index being 0..n-1.
neighborhood_not_assigned = df1['Neighborhood'].str.strip() == "Not assigned"
df1.loc[neighborhood_not_assigned, 'Neighborhood'] = df1.loc[neighborhood_not_assigned, 'Borough']

In [8]:
# Collapse rows that share a PostalCode/Borough pair into one row whose Neighborhood
# field is the comma-joined list of the original neighborhood names.
df2 = df1.groupby(['PostalCode','Borough'])['Neighborhood'].apply(list)
df3 = df2.to_frame().reset_index()
df3['Neighborhood'] = [','.join(str(name) for name in names) for names in df3['Neighborhood']]
df3.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
#Show the dimensions (rows, columns) of the cleaned dataframe through its .shape attribute
df3.shape

(103, 3)

#### B. Importing geographic location data

In [10]:
# Read the latitude/longitude table for the postal codes.
dfLatLong = pd.read_csv('http://cocl.us/Geospatial_data')
# The 'Postal Code' column of the csv appears to line up with df3's 'PostalCode';
# Series.equals confirms that both hold the same values in the same order.
equal = dfLatLong['Postal Code'].equals(df3['PostalCode'])
print(equal)

True


In [11]:
#Attach the latitude/longitude to each postal code. The rows are matched on the postal
#code value itself instead of on row position, so the coordinates stay correct even if
#the two tables are ever ordered differently.
geo_columns = dfLatLong[['Postal Code', 'Latitude', 'Longitude']].rename(columns={'Postal Code': 'PostalCode'})
#how='left' keeps every row of df3 in its current order; validate='many_to_one' makes
#pandas raise if a postal code occurs more than once in the coordinates table.
df4 = df3.merge(geo_columns, on='PostalCode', how='left', validate='many_to_one')
df4.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [14]:
#I'm using the Nominatim geocoder to find the location of Toronto, Ontario, Canada and visualizing it with Folium
#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
address = 'Toronto, On'

geolocator = Nominatim(user_agent="tdot")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

#creating a map of Toronto using folium
!conda install -c conda-forge folium
import folium
map_tdot = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df4['Latitude'], df4['Longitude'], df4['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tdot)  
map_tdot

The geograpical coordinate of Toronto are 43.653963, -79.387207.
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    folium-0.10.0              |             py_0          59 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    altair-3.2.0               |           py36_0         770 KB  conda-forge
    ------------------------------------------------------------
                                          

### We'll explore the neighborhoods within North York

In [25]:
#We'll create another dataframe that holds only the rows whose borough is North York.
#The comparison is made on the stripped text so the filter matches whether or not the
#Borough values carry surrounding whitespace.
NorthYork = df4.loc[df4['Borough'].str.strip() == 'North York']
#reset_index returns a new frame numbered from 0; NorthYork keeps df4's original index.
df_NorthYork = NorthYork.reset_index(drop=True)
df_NorthYork

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M2H,North York,Hillcrest Village,43.803762,-79.363452
1,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
2,M2K,North York,Bayview Village,43.786947,-79.385975
3,M2L,North York,"Silver Hills, York Mills",43.75749,-79.374714
4,M2M,North York,"Newtonbrook, Willowdale",43.789053,-79.408493
5,M2N,North York,Willowdale South,43.77012,-79.408493
6,M2P,North York,York Mills West,43.752758,-79.400049
7,M2R,North York,Willowdale West,43.782736,-79.442259
8,M3A,North York,Parkwoods,43.753259,-79.329656
9,M3B,North York,Don Mills North,43.745906,-79.352188


In [28]:
#Like in the lab, I'll define a function that gets the nearby venues. But first, I import the requests library and set up the Foursquare API info.
import os
import requests

# The credentials are read from environment variables so that no secret is stored in the
# notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the
# kernel; a missing variable raises KeyError here rather than failing later in a request.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Defaults for the venue search: `limit` is sent as the request's limit parameter,
# `radius` as its radius parameter.
limit = 100
radius = 500

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "venues/explore" endpoint once per location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated together with zip(); each triple is one location to query.
    radius : optional
        Value sent as the request's ``radius`` parameter (default 500).

    The module-level CLIENT_ID, CLIENT_SECRET, VERSION and limit are read for every request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. When nothing is returned (including
        empty input) the frame is empty but still has these columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Passing the query as a dict lets requests build and escape the URL.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # make the GET request; the timeout keeps a stalled connection from hanging the cell
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue with an empty category list gets None instead of raising IndexError
                categories[0]['name'] if categories else None))

    # Naming the columns in the constructor also works when venue_rows is empty.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

In [30]:
# Collect the venues around every North York neighborhood, then report the
# (rows, columns) of the resulting table.
NorthYork_venues = getNearbyVenues(
    df_NorthYork['Neighborhood'],
    df_NorthYork['Latitude'],
    df_NorthYork['Longitude'],
)
print(NorthYork_venues.shape)

(241, 7)


### Now we analyze the neighborhoods with k-means clustering

In [32]:
#Do some one-hot encoding on the venues info: one indicator column per venue category
venue_dummies = pd.get_dummies(NorthYork_venues[['Venue Category']], prefix="", prefix_sep="")
#If a venue category happens to be called "Neighborhood", rename its indicator column so it
#is not overwritten by the neighborhood-name column added below (a no-op when there is no clash).
NorthYork_onehot = venue_dummies.rename(columns={'Neighborhood': 'Neighborhood (venue category)'})
#Put the neighborhood name in the first position, ahead of all category indicators.
NorthYork_onehot.insert(0, 'Neighborhood', NorthYork_venues['Neighborhood'])

NorthYork_onehot.shape

(241, 109)

In [36]:
#Group the rows by neighborhood and take the mean of every category indicator, i.e. the
#frequency with which each venue category appears in that neighborhood
NorthYork_grouped = NorthYork_onehot.groupby('Neighborhood', as_index=False).mean()
NorthYork_grouped.shape

(22, 109)

In [37]:
#Helper that ranks the categories of one row from most to least frequent
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first element of `row` is skipped; the remaining values are sorted in
    descending order and the index labels of the leading entries are returned
    as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [65]:
#Here, I'll display the top 10 venues in each neighborhood

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # the first three ranks take 'st'/'nd'/'rd' from indicators; every later rank takes 'th'
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe for the data
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = NorthYork_grouped['Neighborhood']

# fill each row's venue columns with that neighborhood's most frequent categories
for ind in np.arange(NorthYork_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(NorthYork_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Downsview North, Wilson Heights",Coffee Shop,Fried Chicken Joint,Diner,Middle Eastern Restaurant,Pet Store,Pharmacy,Pizza Place,Deli / Bodega,Bridal Shop,Restaurant
1,Bayview Village,Chinese Restaurant,Bank,Café,Japanese Restaurant,Fried Chicken Joint,French Restaurant,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Furniture / Home Store
2,"Bedford Park, Lawrence Manor East",Hobby Shop,Italian Restaurant,Coffee Shop,Greek Restaurant,Sandwich Place,Fast Food Restaurant,Grocery Store,Indian Restaurant,Comfort Food Restaurant,Juice Bar
3,"CFB Toronto, Downsview East",Airport,Park,Bus Stop,Women's Store,Diner,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop
4,Don Mills North,Caribbean Restaurant,Gym / Fitness Center,Café,Basketball Court,Japanese Restaurant,Women's Store,Discount Store,Concert Hall,Construction & Landscaping,Convenience Store


In [66]:
#Now we import the KMeans clustering algorithm from scikit-learn and do k-means clustering
from sklearn.cluster import KMeans

kclusters = 5
# Cluster on the category frequencies only. The column is named through the `columns`
# keyword because recent pandas versions no longer accept the axis as a positional argument.
NorthYork_grouped_clustering = NorthYork_grouped.drop(columns='Neighborhood')

# run k-means clustering (random_state fixes the seed so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(NorthYork_grouped_clustering)

In [69]:
#I create a new dataframe with the new data from clustering included
# add clustering labels
# kmeans was fitted on the rows of NorthYork_grouped, and neighborhoods_venues_sorted lists
# its neighborhoods in that same row order, so the labels line up row by row.
# The labels go onto a copy (drop returns a new frame, and errors='ignore' makes it a no-op
# when the column is absent), so re-running this cell does not fail on an existing column.
venues_with_clusters = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
venues_with_clusters.insert(0, 'Cluster Labels', kmeans.labels_)

# how='right' keeps one row for every neighborhood that has a venue summary
NorthYork_merged = df_NorthYork.join(venues_with_clusters.set_index('Neighborhood'), on='Neighborhood', how='right')
NorthYork_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M2H,North York,Hillcrest Village,43.803762,-79.363452,0,Golf Course,Pool,Athletics & Sports,Mediterranean Restaurant,Dog Run,Women's Store,Dim Sum Restaurant,Comfort Food Restaurant,Concert Hall,Construction & Landscaping
1,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,0,Clothing Store,Fast Food Restaurant,Coffee Shop,Asian Restaurant,Food Court,Gift Shop,Japanese Restaurant,Bakery,Women's Store,Tea Room
2,M2K,North York,Bayview Village,43.786947,-79.385975,0,Chinese Restaurant,Bank,Café,Japanese Restaurant,Fried Chicken Joint,French Restaurant,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Furniture / Home Store
5,M2N,North York,Willowdale South,43.77012,-79.408493,0,Coffee Shop,Ramen Restaurant,Restaurant,Pizza Place,Sandwich Place,Sushi Restaurant,Café,Movie Theater,Fast Food Restaurant,Indonesian Restaurant
6,M2P,North York,York Mills West,43.752758,-79.400049,0,Park,Convenience Store,Bank,Bar,Women's Store,Discount Store,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Cosmetics Shop
7,M2R,North York,Willowdale West,43.782736,-79.442259,0,Pizza Place,Coffee Shop,Butcher,Home Service,Pharmacy,Dim Sum Restaurant,Clothing Store,Comfort Food Restaurant,Concert Hall,Construction & Landscaping
8,M3A,North York,Parkwoods,43.753259,-79.329656,4,Park,Food & Drink Shop,Women's Store,Diner,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop
9,M3B,North York,Don Mills North,43.745906,-79.352188,0,Caribbean Restaurant,Gym / Fitness Center,Café,Basketball Court,Japanese Restaurant,Women's Store,Discount Store,Concert Hall,Construction & Landscaping,Convenience Store
10,M3C,North York,"Flemingdon Park, Don Mills South",43.7259,-79.340923,0,Beer Store,Asian Restaurant,Gym,Coffee Shop,Discount Store,Chinese Restaurant,Concert Hall,Japanese Restaurant,Italian Restaurant,Restaurant
11,M3H,North York,"Bathurst Manor, Downsview North, Wilson Heights",43.754328,-79.442259,0,Coffee Shop,Fried Chicken Joint,Diner,Middle Eastern Restaurant,Pet Store,Pharmacy,Pizza Place,Deli / Bodega,Bridal Shop,Restaurant


In [68]:
#Finally, I visualize the clusters with folium map
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# one evenly spaced colour from the rainbow colormap per cluster, converted to hex strings
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add one marker per neighborhood; the popup shows its name and cluster number
for lat, lon, poi, cluster in zip(NorthYork_merged['Latitude'], NorthYork_merged['Longitude'], NorthYork_merged['Neighborhood'], NorthYork_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # the label is used directly as the palette index, so cluster 0 takes the first colour
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters