# Density clustering algorithm from Montreal Gazette data
Import the data, cluster it with different algorithms based on the frequency and severity of accidents, and export the results as JSON or GeoJSON.

In [16]:
# Import dependencies
import csv
import pandas as pd
import numpy as np
import json
import folium

from folium import plugins
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.colors as clrs
import matplotlib.cm as cm

from sklearn.cluster import DBSCAN, KMeans
from geojson import Feature, FeatureCollection, Point
from geopy.geocoders import Nominatim


ModuleNotFoundError: No module named 'seaborn'

In [5]:
def get_color(radius, vmin=2, vmax=7, cmap='YlOrRd'):
    """Map a scalar value to a hex colour string on a matplotlib colormap.

    Parameters
    ----------
    radius : float
        Value to colour. The map-drawing cell passes the square root of a
        cluster's point count.
    vmin, vmax : float, optional
        Lower and upper bounds of the colour scale. The defaults (2 and 7)
        are the values that were previously hard-coded.
    cmap : str, optional
        Name of the matplotlib colormap; 'YlOrRd' runs from yellow to red.

    Returns
    -------
    str
        The colour as a hex string such as '#rrggbb'.
    """
    norm = clrs.Normalize(vmin=vmin, vmax=vmax)
    mappable = cm.ScalarMappable(norm=norm, cmap=cmap)
    rgb = mappable.to_rgba(radius)[:-1]  # drop the alpha channel
    # Returned as hex because, per the original note, folium does not take RGB tuples.
    return clrs.rgb2hex(rgb)


In [6]:
# Load the raw collision records and report how many rows were read.
df = pd.read_csv('../data/collisions.csv', encoding='latin1')
print(len(df))

# Keep only rows with status == 'OK' and type == 'bike', and only the columns
# used downstream: the lat1/lng1 coordinates and the nb_grave / nb_leger /
# nb_mort count columns.
is_ok_bike = (df['status'] == 'OK') & (df['type'] == 'bike')
wanted_columns = ['lat1', 'lng1', 'nb_grave', 'nb_leger', 'nb_mort']
df = df.loc[is_ok_bike, wanted_columns]
print(len(df))

17965
4133


In [7]:
# Column totals over the rows with at least one death (nb_mort != 0).
# The reduction is spelled out along axis=0 because np.sum(DataFrame) relies on
# pandas' axis=None behaviour, which is deprecated and slated to return a single
# scalar instead of one total per column.
# Note: the lat1/lng1 totals are printed too but carry no meaning.
print(df[df['nb_mort'] != 0].sum(axis=0))

lat1         956.063292
lng1       -1546.204715
nb_grave       0.000000
nb_leger       5.000000
nb_mort       21.000000
dtype: float64


The difference between (lat1,lng1) and (lat,lng) is that (lat1,lng1) is the center of the intersection, while (lat,lng) refers to a specific part of the intersection. We will proceed with the center of the intersection. We then expand the dataset by the frequency of accidents: each row's coordinates are repeated once per weighted count, where nb_leger counts for 1, nb_grave counts for 3 and nb_mort counts for 6.

In [8]:
# Expand the collisions into one (lat, lng) point per weighted count so that
# the clustering sees severity as point density:
#     repeats = 3 * nb_grave + nb_leger + 6 * nb_mort
# Columns are addressed by label; integer positions on a label-indexed row
# (row[0], row[1], ...) are deprecated in pandas. The repetition is vectorised
# with np.repeat instead of a per-row Python loop.
severity_weight = 3 * df['nb_grave'] + df['nb_leger'] + 6 * df['nb_mort']
# Truncate to whole repeats; a non-positive weight contributes no points,
# as range() did in the loop-based version.
repeats = severity_weight.astype(int).clip(lower=0).to_numpy()
coords = df[['lat1', 'lng1']].to_numpy()
# Kept as a list of [lat, lng] pairs so `geo_list` still has its previous shape.
geo_list = np.repeat(coords, repeats, axis=0).tolist()
df_geo = pd.DataFrame(geo_list, columns=['lat', 'lng'])


Now we have our list of latitude and longitude points, and we can start to compute clusters. We used a hyperparameter search to compare clusterings with different cluster sizes.

In [9]:
# Disabled sketch of a DBSCAN hyperparameter sweep: the code is wrapped in a
# string literal so it never executes, and the trailing ';' suppresses the
# cell's output. As written it would not run if enabled: `hyperparameter` is
# not defined in the visible cells, and the loop variable `param` is never
# used, so every iteration would fit the same eps=2, min_samples=20 model.
'''
for paramloop in hyperparameter:
    for param in paramloop:
        db = DBSCAN(eps=2,min_samples=20)
        y_db = db.fit_predict(df_geo)
''';

In [10]:
# Density-cluster the expanded points. df_geo holds raw lat/lng columns, so
# eps is expressed in the same units as those coordinates.
db = DBSCAN(eps=0.00025, min_samples=5).fit(df_geo)
labels = db.labels_

# Boolean mask flagging the points DBSCAN reported as core samples.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Number of distinct cluster labels; -1 marks noise and is not a cluster.
n_clusters_ = len(set(labels) - {-1})

In [11]:
# Report how many clusters were found (the noise label is already excluded).
print(f"{n_clusters_}")

172


In [12]:
# Summarise each cluster as [mean latitude, mean longitude, number of points].
# Only labels 0 .. n_clusters_ - 1 are visited, so noise (-1) is left out.
cluster_means = []
for label in range(n_clusters_):
    in_cluster = labels == label
    centroid = df_geo[in_cluster].mean(axis=0)
    cluster_means.append([centroid['lat'], centroid['lng'], np.sum(in_cluster)])

In [13]:
# Centre the base map on McGill University, looked up through Nominatim.
# An application-specific user_agent is passed explicitly: current geopy
# releases refuse to build a Nominatim geocoder with the default one.
geolocator = Nominatim(user_agent="montreal-bike-collision-hotspots")
location = geolocator.geocode("McGill University Montreal Quebec")
if location is None:
    # geocode() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on `location.raw` below.
    raise RuntimeError("Nominatim returned no result for 'McGill University Montreal Quebec'")
lat_mon = float(location.raw['lat'])
lng_mon = float(location.raw['lon'])
# NOTE(review): recent folium releases may no longer bundle the Stamen tile
# sets - confirm that "Stamen Toner" still resolves in the installed version.
m = folium.Map(location=[lat_mon, lng_mon], tiles="Stamen Toner", zoom_start=14)



In [14]:
# Draw one filled circle per cluster on the base map. The marker radius grows
# with the square root of the cluster's point count, and the colour comes from
# get_color applied to that same square root. The map is then written as HTML.
for center_lat, center_lng, size in cluster_means:
    marker = folium.CircleMarker(
        location=(center_lat, center_lng),
        radius=np.sqrt(10 * size),
        color=get_color(np.sqrt(size)),
        fill=True,
        fill_opacity=0.8,
    )
    marker.add_to(m)
m.save('../hotspots.html')

In [15]:
# Persist the per-cluster summaries (centre coordinates and point count,
# stored under 'weight') as a CSV file.
means_columns = ['lat', 'lng', 'weight']
df_means = pd.DataFrame(data=cluster_means, columns=means_columns)
df_means.to_csv('../data/cluster_means.csv')