# Final Capstone Project
# Study Covid-19 spread over Toronto City and identifying venues which are more significant in spreading  using Foursquare API

## Import lib

In [2]:
# library to handle requests
import requests 

# library for web scraping 
#!pip install bs4
from bs4 import BeautifulSoup

# library for data analsysis
import pandas as pd
import re
import numpy as np

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# convert an address into latitude and longitude values
#!pip install geopy
from geopy.geocoders import Nominatim

# map rendering library
#!pip install folium
import folium

#!pip install statsmodels
import statsmodels as sm

## 1. Data Collection
### 1.1 Collect Covid-19 Data for Toronto city
#### Data is collected from Toronto.ca website link https://www.toronto.ca/home/covid-19/covid-19-latest-city-of-toronto-news/covid-19-status-of-cases-in-toronto/

#### 1.1.1 Convert data in Pandas DataFrame df_covid19

In [3]:
df_covid19 = pd.read_csv("CityofToronto_COVID-19_NeighbourhoodData1.csv")

In [4]:
df_covid19.shape

(141, 4)

In [5]:
df_covid19

Unnamed: 0,Neighbourhood ID,Neighbourhood Name,"Rate per 100,000 people",Case Count
0,138.0,Eglinton East,430.277485,98
1,47.0,Don Valley Village,255.073750,69
2,38.0,Lansing-Westgate,222.717149,36
3,9.0,Edenbridge-Humber Valley,656.581912,102
4,44.0,Flemingdon Park,606.392194,133
...,...,...,...,...
136,113.0,Weston,1823.032459,328
137,95.0,Annex,281.727052,86
138,94.0,Wychwood,557.530142,80
139,37.0,Willowdale West,307.038262,52


In [41]:
df_covid19.describe()

Unnamed: 0,Neighbourhood ID,"Rate per 100,000 people",Case Count,latitude,longitude
count,140.0,140.0,141.0,104.0,104.0
mean,70.5,549.582185,110.156028,43.15826,-76.72667
std,40.5586,429.295864,111.665206,8.703777,24.316156
min,1.0,78.824129,14.0,-41.489857,-114.122925
25%,35.75,237.160368,35.0,43.670853,-79.45145
50%,70.5,375.474328,74.0,43.706596,-79.383135
75%,105.25,723.236999,144.0,43.763361,-79.300355
max,140.0,1823.032459,607.0,53.478282,146.232511


# 2. Data Prepration 
### 2.1 Use geopy library to get the latitude and longitude values of all the Neighbourhood

In [6]:
def getcoord(address):
    geolocator = Nominatim(user_agent="to_explorer")
    location = geolocator.geocode(address)
    if location :
        latitude = location.latitude
        longitude = location.longitude
        return(latitude,longitude)

In [7]:
df_covid19['latitude'] = np.nan
df_covid19['longitude'] = np.nan

In [8]:
df_covid19.head()

Unnamed: 0,Neighbourhood ID,Neighbourhood Name,"Rate per 100,000 people",Case Count,latitude,longitude
0,138.0,Eglinton East,430.277485,98,,
1,47.0,Don Valley Village,255.07375,69,,
2,38.0,Lansing-Westgate,222.717149,36,,
3,9.0,Edenbridge-Humber Valley,656.581912,102,,
4,44.0,Flemingdon Park,606.392194,133,,


In [9]:
for x in range(df_covid19.shape[0]):
    #re.sub('\/*[a-z,A-Z,+,$]','',df_covid19.loc[x]['Neighbourhood Name'])
    loc = df_covid19.loc[x]['Neighbourhood Name']+", Canada"
    #print(loc)
    coord = getcoord(loc)
    if coord:
        df_covid19.loc[x,'latitude'] = coord[0]
        df_covid19.loc[x,'longitude'] = coord[1]

In [10]:
df_covid19

Unnamed: 0,Neighbourhood ID,Neighbourhood Name,"Rate per 100,000 people",Case Count,latitude,longitude
0,138.0,Eglinton East,430.277485,98,43.739465,-79.232100
1,47.0,Don Valley Village,255.073750,69,43.792673,-79.354722
2,38.0,Lansing-Westgate,222.717149,36,,
3,9.0,Edenbridge-Humber Valley,656.581912,102,43.672223,-79.514685
4,44.0,Flemingdon Park,606.392194,133,43.718432,-79.333204
...,...,...,...,...,...,...
136,113.0,Weston,1823.032459,328,43.700161,-79.516247
137,95.0,Annex,281.727052,86,43.670338,-79.407117
138,94.0,Wychwood,557.530142,80,43.682122,-79.423839
139,37.0,Willowdale West,307.038262,52,43.761510,-79.410923


In [11]:
df_covid19.shape

(141, 6)

## 2.2 Clean DataFrame , remove Nan and duplicates if any

In [12]:
df_clean = df_covid19.dropna().reset_index(drop=True)

In [13]:
df_clean.shape

(104, 6)

In [14]:
df_clean = df_clean.drop_duplicates(subset='Neighbourhood Name', keep="last")

In [15]:
df_clean.shape

(104, 6)

In [16]:
df_clean

Unnamed: 0,Neighbourhood ID,Neighbourhood Name,"Rate per 100,000 people",Case Count,latitude,longitude
0,138.0,Eglinton East,430.277485,98,43.739465,-79.232100
1,47.0,Don Valley Village,255.073750,69,43.792673,-79.354722
2,9.0,Edenbridge-Humber Valley,656.581912,102,43.672223,-79.514685
3,44.0,Flemingdon Park,606.392194,133,43.718432,-79.333204
4,59.0,Danforth East York,174.621653,30,43.686433,-79.300355
...,...,...,...,...,...,...
99,113.0,Weston,1823.032459,328,43.700161,-79.516247
100,95.0,Annex,281.727052,86,43.670338,-79.407117
101,94.0,Wychwood,557.530142,80,43.682122,-79.423839
102,37.0,Willowdale West,307.038262,52,43.761510,-79.410923


# 3. Data Exploration
## 3.1 Create Map of toronto and superimposing neibhorhood with Covid-19 Rates per 100k people

In [17]:
address = 'Toronto City, Canada'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


In [20]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood,CovidRate in zip(df_clean['latitude'], df_clean['longitude'], df_clean['Neighbourhood Name'],df_clean['Rate per 100,000 people']):
    label = '{},{}'.format(neighborhood,CovidRate)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## 3.2 Explore neighborhoods and segment them using the Foursquare API
### 3.2.1 Get the top 100 venues that are within a radius of 500 meters of each neighbourho

In [21]:
# Define Foursquare API credentials
CLIENT_ID = 'EAVHEM2AWPMWZ3DS0KH4KXNFXHCUT4K5244QDKTF3NUWRGXP' # your Foursquare ID
CLIENT_SECRET = '5A1BZ1MG5G1PTKAL1KBJNMJN0SFSYZKXDAAAD2OUUICFXMJK' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: EAVHEM2AWPMWZ3DS0KH4KXNFXHCUT4K5244QDKTF3NUWRGXP
CLIENT_SECRET:5A1BZ1MG5G1PTKAL1KBJNMJN0SFSYZKXDAAAD2OUUICFXMJK


In [22]:
# function to get list of venuews for each neighbourhood
def getNearbyVenues(names, latitudes, longitudes, Rate, CaseCount, radius=500):
    
    venues_list=[]
    for name, lat, lng, rt, cc in zip(names, latitudes, longitudes, Rate, CaseCount):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            rt,
            cc,
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Rate per 100,000 people',
                  'Case Count',
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [23]:
toronto_venues = getNearbyVenues(names=df_clean['Neighbourhood Name'],
                                   latitudes=df_clean['latitude'],
                                   longitudes=df_clean['longitude'],
                                   Rate = df_clean['Rate per 100,000 people'],
                                   CaseCount = df_clean['Case Count'],
                                  )

Eglinton East
Don Valley Village
Edenbridge-Humber Valley
Flemingdon Park
Danforth East York
Agincourt North
Mount Pleasant East
Woburn
Forest Hill North
Milliken
Thorncliffe Park
Markland Wood
Kingsway South
Centennial Scarborough
Church-Yonge Corridor
Taylor-Massey
Kennedy Park
Yonge-Eglinton
Junction Area
Mount Dennis
Englemount-Lawrence
South Riverdale
North St. James Town
Banbury-Don Mills
Yonge-St.Clair
Mount Pleasant West
Henry Farm
Humber Heights-Westmount
Runnymede-Bloor West Village
South Parkdale
Guildwood
Tam O'Shanter-Sullivan
Newtonbrook West
Little Portugal
Bayview Woods-Steeles
Niagara
Cliffcrest
L'Amoreaux
Trinity-Bellwoods
Elms-Old Rexdale
Willowdale East
High Park-Swansea
North Riverdale
Dufferin Grove
Black Creek
Alderwood
Rexdale-Kipling
Keelesdale-Eglinton West
Ionview
Bayview Village
Regent Park
Palmerston-Little Italy
Steeles
Morningside
Rosedale-Moore Park
West Hill
Maple Leaf
Forest Hill South
University
Caledonia-Fairbank
Leaside-Bennington
Lawrence Park Sout

In [24]:
print(toronto_venues.shape)
toronto_venues.head()

(2009, 9)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,"Rate per 100,000 people",Case Count,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Eglinton East,43.739465,-79.2321,430.277485,98,Anjappar Authentic Chettinadu Restaurant,43.741592,-79.226799,Indian Restaurant
1,Eglinton East,43.739465,-79.2321,430.277485,98,Dairy Queen,43.739506,-79.236894,Ice Cream Shop
2,Eglinton East,43.739465,-79.2321,430.277485,98,Dairy Queen,43.73958,-79.236991,Ice Cream Shop
3,Eglinton East,43.739465,-79.2321,430.277485,98,Subway,43.738284,-79.236792,Sandwich Place
4,Eglinton East,43.739465,-79.2321,430.277485,98,Eglinton GO Station,43.739701,-79.232281,Train Station


In [25]:
#count of unique venue categories
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 243 uniques categories.


#### 3.2.2 Let's check how many venues were returned for each neighborhood

In [26]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,"Rate per 100,000 people",Case Count,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Agincourt North,27,27,27,27,27,27,27,27
Alderwood,6,6,6,6,6,6,6,6
Annex,42,42,42,42,42,42,42,42
Banbury-Don Mills,5,5,5,5,5,5,5,5
Bay Street Corridor,100,100,100,100,100,100,100,100
...,...,...,...,...,...,...,...,...
Wychwood,53,53,53,53,53,53,53,53
Yonge-Eglinton,72,72,72,72,72,72,72,72
Yonge-St.Clair,56,56,56,56,56,56,56,56
York University Heights,18,18,18,18,18,18,18,18


In [27]:
# Save file to avoid calling API again 
toronto_venues.to_csv("Toronto_Covid19_Data_withVenue.csv",index=False)

In [28]:
toronto_venues.shape

(2009, 9)

## 3.3 Analyze Each Neighborhood

In [29]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,American Restaurant,Animal Shelter,Art Gallery,Art Museum,Arts & Crafts Store,Arts & Entertainment,Asian Restaurant,Athletics & Sports,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
toronto_onehot.shape

(2009, 243)

### 3.3.1 Group rows by neighborhood and by adding each category

In [31]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').sum().reset_index()

In [32]:
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,American Restaurant,Animal Shelter,Art Gallery,Art Museum,Arts & Crafts Store,Arts & Entertainment,Asian Restaurant,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store
0,Agincourt North,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,Alderwood,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Annex,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,Banbury-Don Mills,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Bay Street Corridor,1,1,0,0,1,1,0,0,1,...,0,1,0,1,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,Wychwood,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
90,Yonge-Eglinton,1,0,0,0,0,0,2,0,0,...,0,1,1,0,1,0,0,1,0,0
91,Yonge-St.Clair,1,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
92,York University Heights,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [33]:
toronto_grouped.shape

(94, 243)

### 3.3.2 Add Infection Rate for each Neighborhood

In [34]:
toronto_grouped['Infection Rate'] = ""

In [35]:
for x in range(df_clean.shape[0]):
    for y in range(toronto_grouped.shape[0]):
         if (df_clean.loc[x,'Neighbourhood Name'] == toronto_grouped.loc[y,'Neighborhood']):
            #print(df_clean.loc[x,'Neighbourhood Name'])
            #print(toronto_grouped.loc[y,'Neighborhood'])
            toronto_grouped.loc[y,'Infection Rate'] = df_clean.loc[x,'Rate per 100,000 people']
            
        

In [36]:
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,American Restaurant,Animal Shelter,Art Gallery,Art Museum,Arts & Crafts Store,Arts & Entertainment,Asian Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Infection Rate
0,Agincourt North,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,291.966
1,Alderwood,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,365.024
2,Annex,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,281.727
3,Banbury-Don Mills,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,144.43
4,Bay Street Corridor,1,1,0,0,1,1,0,0,1,...,1,0,1,0,0,0,0,0,2,220.956
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,Wychwood,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,557.53
90,Yonge-Eglinton,1,0,0,0,0,0,2,0,0,...,1,1,0,1,0,0,1,0,0,135.398
91,Yonge-St.Clair,1,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,199.553
92,York University Heights,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1554.74


In [37]:
toronto_grouped.shape

(94, 244)

# 4. Data Analysis 
## 4.1 Model Ordinary Least Square Regression using Venues Categories as independent variable and Covid-19 Infection Rate as dependent variable using statsmodels lib

In [38]:
import statsmodels.api as sm
X = toronto_grouped.iloc[:, 1:-1].astype(float)
y = toronto_grouped['Infection Rate'].astype(float)

In [39]:
tX = sm.add_constant(X)
est = sm.OLS(y, tX).fit()
est.summary()

0,1,2,3
Dep. Variable:,Infection Rate,R-squared:,0.972
Model:,OLS,Adj. R-squared:,0.562
Method:,Least Squares,F-statistic:,2.374
Date:,"Fri, 14 Aug 2020",Prob (F-statistic):,0.137
Time:,09:10:24,Log-Likelihood:,-531.39
No. Observations:,94,AIC:,1239.0
Df Residuals:,6,BIC:,1463.0
Df Model:,87,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,328.8579,226.720,1.450,0.197,-225.907,883.623
Yoga Studio,-100.6587,87.662,-1.148,0.295,-315.159,113.842
Afghan Restaurant,218.6143,98.658,2.216,0.069,-22.792,460.021
American Restaurant,100.0018,190.103,0.526,0.618,-365.163,565.166
Animal Shelter,-52.5166,98.274,-0.534,0.612,-292.984,187.951
Art Gallery,-86.8222,87.725,-0.990,0.361,-301.477,127.833
Art Museum,8.7475,22.319,0.392,0.709,-45.866,63.361
Arts & Crafts Store,278.7631,266.870,1.045,0.336,-374.243,931.769
Arts & Entertainment,-42.4121,51.289,-0.827,0.440,-167.911,83.087

0,1,2,3
Omnibus:,52.931,Durbin-Watson:,2.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6282.822
Skew:,0.0,Prob(JB):,0.0
Kurtosis:,43.052,Cond. No.,1.33e+16


In [40]:
selcted_features = pd.DataFrame(est.pvalues).astype(float)
selcted_features.columns= ['P-value']
selcted_features = selcted_features[selcted_features['P-value']<=0.05].reset_index()
selcted_features

Unnamed: 0,index,P-value
0,Bookstore,0.036599
1,Boutique,0.01618
2,Brewery,0.029301
3,Burger Joint,0.029436
4,Cuban Restaurant,0.029301
5,Department Store,0.040129
6,Farmers Market,0.040342
7,Fish Market,0.011843
8,Furniture / Home Store,0.004812
9,Gift Shop,0.040856


# Conclusion:

#### In this study Covid-19 Data provided by Toronto.ca been analyzed. The data was having 140 Rows for different neighborhood with infection rate for each neighborhood. The Infection rate is provided per 100K people. The infection rate among different neighborhood varies in a wide range with min value of 78 and max value of 1823 with a std.deviation of 429. The data is further annotated by geo coordinates using geopy lib for each neighborhood lib. Using these coordinates a map is created and studied by superimposing different neighborhood with Infection rate over a map of Toronto. These geo coordinates further used to gather list of venues located within 500 meter for each neighborhood using foursquare lib. A total of 243 unique venue categories were identified in the data. To analyze these venue categories data is transformed using one-hot coding and added the similar venues categories. This data is modeled using Ordinary Least Square Regression Model from Statsmodels. All the venue categories count is used as independent variables and the Infection rate is used as depended variable.  The model showed the R2squared value of ~0.97 and provided p-Values for each venue category. The p-Values represent the significance of independent variable in predicting dependent variable. If a p-Value of an independent variable is < 0.05 then it is highly significance in predicting the depended variable. With this exercise all the independent variables with p<0.05 are filtered and reported to full fill the goal of this project. There are total 15 venue categories which are found to be highly significant in spreading the virus as listed 
### Bookstore, Boutique, Brewery, Burger Joint, Cuban Restaurant, Department Store, Farmers Market,Fish Market, Furniture / Home Store, Gift Shop,Karaoke Bar,Modern European Restaurant,Plaza,Poke Place,Vietnamese Restaurant


# Thank You