In [1]:
%load_ext dotenv
%reload_ext dotenv
%dotenv

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import folium
from folium.plugins import HeatMap
from shapely.geometry import MultiPoint

In [3]:
import os
import sys

from os.path import dirname
UTILS_PATH=os.environ.get('REPO') + "/notebooks/commons"
sys.path.append(dirname(UTILS_PATH))

from commons import check_args_variables, gen_paths_variables, coordinates_bounds

In [4]:
if __name__ == "__main__":
    source = "PLACES"
    city = "manizales"
    
    base_path = os.environ.get('REPO')
    
    source_path = base_path + gen_paths_variables(source, city)

In [5]:
MSG = """
Hay {} publicaciones de {} en total
"""

external_variables = pd.read_parquet(source_path)
print(MSG.format(external_variables.shape[0], city))
external_variables.head()


Hay 2588 publicaciones de manizales en total



Unnamed: 0,lat,lon,amenity,name
0,5.045667899999999,-75.5289915,university,Universidad Antonio Nariño Manizales
1,5.045667899999999,-75.5289915,university,UAN Universidad Antonio Nariño
2,5.001526,-75.59880199999999,university,Mànizales
3,4.9857958,-75.60692829999999,university,CUN - Corporación Unificada Nacional de Educac...
4,4.983538500000001,-75.6056244,university,JUZGADO PRIMERO PROMISCUO MUNICIPAL DE CHINCHINÁ


In [6]:
from sklearn.cluster import DBSCAN

In [7]:
coords = external_variables[['lat','lon']].copy()
coords.loc[:, "lat"] = coords["lat"].apply(float)
coords.loc[:, "lon"] = coords["lon"].apply(float)
coords = coords.values

kms_per_radian = 6371.0088
epsilon = 0.1 / kms_per_radian

min_samples = 20

In [8]:
db = DBSCAN(eps=epsilon, min_samples=min_samples, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))

In [9]:
centroids = []
clusters_size = len(db.labels_)

for n in range(clusters_size):
    cluster = external_variables[['lat','lon']][db.labels_ == n].copy()
    if len(cluster) > 0:
        cluster.loc[:, "cluster_id"] = n
        
        multi_point = MultiPoint(cluster.values)
        cluster.loc[:, "cluster_latitude"] = multi_point.centroid.x
        cluster.loc[:, "cluster_longitude"] = multi_point.centroid.y
        
    centroids.append(cluster)
        
clusters_df = pd.concat(centroids)
clusters_df = clusters_df.reset_index(drop=True)
clusters_df.head()

Unnamed: 0,lat,lon,cluster_id,cluster_latitude,cluster_longitude
0,5.066951299999999,-75.5172298,0.0,5.067697,-75.517177
1,5.0676711,-75.5181413,0.0,5.067697,-75.517177
2,5.0683391,-75.5178292,0.0,5.067697,-75.517177
3,5.0673616,-75.5195778,0.0,5.067697,-75.517177
4,5.067478100000001,-75.5197086,0.0,5.067697,-75.517177


In [10]:
clusters_coordinates = clusters_df[["cluster_id", "lat", "lon"]].groupby(["lat", "lon"]).count()

coordinates = list(zip(*clusters_coordinates.index))
clusters_coordinates.loc[:, "lat"] = coordinates[0]
clusters_coordinates.loc[:, "lon"] = coordinates[1]

clusters_coordinates.loc[:, "count"] = clusters_coordinates["cluster_id"].apply(lambda e: float(e*100))

clusters_coordinates = clusters_coordinates.reset_index(drop=True)

clusters_coordinates.head()

Unnamed: 0,cluster_id,lat,lon,count
0,1,4.982349,-75.60545499999999,100.0
1,1,4.9823659,-75.6054491,100.0
2,1,4.982439,-75.605238,100.0
3,1,4.982473000000001,-75.605088,100.0
4,1,4.9825221,-75.605402,100.0


In [11]:
folium_hmap = folium.Map(location=[
    np.mean([coordinates_bounds[city]["lat"]["lower"], coordinates_bounds[city]["lat"]["upper"]]),
    np.mean([coordinates_bounds[city]["lon"]["lower"], coordinates_bounds[city]["lon"]["upper"]])
], zoom_start=13, tiles="OpenStreetMap")

max_amount = clusters_coordinates['count'].max()

hm_wide = HeatMap( 
    clusters_coordinates[["lat", "lon", "count"]],
    min_opacity=0.2,
    max_val=max_amount,
    radius=8, blur=6, 
    max_zoom=15, 
    gradient={.1:'yellow', .5: 'orange',  1: 'red'}
)

folium_hmap.add_child(hm_wide)