In [9]:
import numpy as np  # library to handle data in a vectorized manner

import pandas as pd  # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import requests  # library to handle requests

from bs4 import BeautifulSoup  # library used for scraping

from sklearn.cluster import KMeans  # import k-means from clustering stage

import folium  # map rendering library

from geopy.geocoders import Nominatim  # convert an address into latitude and longitude values

import json # library to handle JSON files

import seaborn as sns 

from matplotlib import pyplot as plt
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
%matplotlib inline

In [10]:
response = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(response.text, "html.parser")  # pass the html object and the type of parser as arguments

In [11]:
table_data = iter(soup.find_all('td'))
data = []

# This loop will keep repeating till there is data available in the iterator
while True:
    try:
        postal_code = next(table_data).text
        borough = next(table_data).text
        neighborhood = next(table_data).text

        data.append([postal_code, borough, neighborhood])

    # Exception will occur when there are no more elements left to iterate through
    except StopIteration:
        break

In [12]:
df_postal = pd.DataFrame(data)
df_postal.columns =["Postal Code", "Borough", "Neighborhood"]  # set the column names
cols_to_check = ['Postal Code','Borough', 'Neighborhood']
df_postal[cols_to_check] = df_postal[cols_to_check].replace({'\n':''}, regex=True)  # remove "/n" character from every cell
df_postal = df_postal.iloc[:180] 
df_postal.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [13]:
df_postal.loc[df_postal['Neighborhood']=="Not assigned",'Neighborhood']=df_postal.loc[df_postal['Neighborhood']=="Not assigned",'Borough']
df_postal.rename(columns = {'Neighborhood':'Neighborhoods'}, inplace = True) 
df_postal.head()

Unnamed: 0,Postal Code,Borough,Neighborhoods
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [17]:
df_COVID19 = pd.read_csv(r"C:\Users\Chengyu\Desktop\Coursera\IBM Data Science\Capstone\Toronto_COVID-19.csv")
df_COVID19.head()

Unnamed: 0,_id,Outbreak Associated,Age Group,Neighbourhood Name,FSA,Source of Infection,Classification,Episode Date,Reported Date,Client Gender,Outcome,Currently Hospitalized,Currently in ICU,Currently Intubated,Ever Hospitalized,Ever in ICU,Ever Intubated
0,44294,Sporadic,50-59,Malvern,M1B,Institutional,CONFIRMED,2020-03-25,2020-03-27,MALE,RESOLVED,No,No,No,No,No,No
1,44295,Sporadic,20-29,Malvern,M1B,Community,CONFIRMED,2020-03-20,2020-03-28,MALE,RESOLVED,No,No,No,Yes,No,No
2,44296,Sporadic,60-69,Malvern,M1B,Travel,CONFIRMED,2020-03-04,2020-03-08,FEMALE,RESOLVED,No,No,No,Yes,Yes,Yes
3,44297,Outbreak Associated,50-59,Rouge,M1B,N/A - Outbreak associated,CONFIRMED,2020-05-02,2020-05-04,FEMALE,RESOLVED,No,No,No,No,No,No
4,44298,Sporadic,30-39,Rouge,M1B,Close contact,CONFIRMED,2020-05-31,2020-06-06,FEMALE,RESOLVED,No,No,No,No,No,No


In [21]:
df_CDN = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",'Sheet1')
df_CDN.head()

ValueError: No tables found matching pattern 'Sheet1'

In [None]:
df_cases = pd.read_csv('COVID19 cases.csv')
df_cases.sort_values('Reported Date', inplace=True)
df_cases.head()

In [None]:
geo_coord = pd.read_csv('https://cocl.us/Geospatial_data')
geo_coord.head()

In [None]:
df_comb = pd.merge(df_postal, geo_coord, on='Postal Code')
df_comb.head()

In [None]:
df_comb2 = pd.merge(df_COVID19, df_CDN, on='CDN')
df_comb2.head()

In [None]:
df_merged = pd.merge(df_comb, df_comb2, on='Borough')

df_merged.head()

In [None]:
total_cases = df_cases['Episode Date'].value_counts()
total_cases = pd.DataFrame(total_cases)
total_cases.reset_index(level=0, inplace=True)
total_cases.columns = ['Episode Date','Total Cases']
total_cases.sort_values('Episode Date',inplace=True)
total_cases.head()

In [None]:
df_cases =pd.merge(total_cases,df_cases,on='Episode Date')
df_cases.head()

In [None]:
plt.scatter(df_cases['Episode Date'],df_cases['Total Cases'])
# This shows us that the relationship is not linear and the cases already reached their peak!

In [None]:
cor = df_merged.corr()
sns.heatmap(cor)

In [None]:
sns.pairplot(df_merged)

In [None]:
address = 'Toronto ON'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

In [None]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
for lat, lng, borough, neighborhood in zip(
        df_comb['Latitude'], 
        df_comb['Longitude'], 
        df_comb['Borough'], 
        df_comb['Neighborhoods']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

map_toronto

In [None]:
CLIENT_ID = '2OERGOTEXIMMFKY3NDQRAHCR2HVZGTKOKBKTYSEVSL0LFBJH' # your Foursquare ID
CLIENT_SECRET = 'DVYCWS4AZXDG54JMIL143HMZES1RKEBMCWAT1MHP3AM32YOZ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [None]:
df_merged.loc[2, 'Neighborhood']

In [None]:
neighborhood_latitude = df_merged.loc[2, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_merged.loc[2, 'Longitude'] # neighborhood longitude value

neighborhood_name = df_merged.loc[2, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

In [None]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

# get the result to a json file
results = requests.get(url).json()

In [None]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [None]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=100):
    
    categoryId = '4bf58dd8d48988d196941735'
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&categoryId={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT,
            categoryId)
            
        # make the GET request
        results = requests.get(url).json()["response"]
        
        # return only relevant information for each nearby venue
    venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])

    return(nearby_venues.head(5))

In [None]:
df_features = df_merged[['Rate per 100,000 people','Case Count']]
# set number of clusters
kclusters = 4

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_features)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

df_merged['Cluster Labels'] = kmeans.labels_

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Neighborhood'], df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [None]:
df_merged.drop('Neighborhood', axis=1, inplace=True)
df_merged.loc[df_merged['Cluster Labels'] == 0, df_merged.columns[[1] + list(range(5, df_merged.shape[1]))]]

In [None]:
df_merged.loc[df_merged['Cluster Labels'] == 2, df_merged.columns[[1] + list(range(5, df_merged.shape[1]))]]

In [None]:
df_merged.loc[df_merged['Cluster Labels'] == 3, df_merged.columns[[1] + list(range(5, df_merged.shape[1]))]]