In [192]:
%matplotlib notebook

import os
import sys
from operator import itemgetter
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from ipyleaflet import (Map, GeoJSON)

sns.set()

from sklearn.cluster import DBSCAN

import matplotlib.dates as mdates
from matplotlib.colors import rgb2hex
import json

def get_geojson(features):
    return {
        'type': 'FeatureCollection',
        'features': features
    }

def save_geojson(features, directory, file_name):
    if not os.path.exists(directory): os.makedirs(directory)
    f = os.path.join(directory, file_name + '.geojson')
    geojson = {
      'type': 'FeatureCollection',
      'features': features
    }
    with open(f, 'w') as outfile:
        json.dump(geojson, outfile, indent = 4)
    print('Saved to ' + f)

def to_geojson(df, groupby, lat, lng, cols, dumps=True):

    def get_features(row, color):
        properties = { k: str(v) for k,v in zip(cols,[row[col] for col in cols]) }
        properties['marker-color'] = rgb2hex(color[:3])
        return {
            'type': 'Feature',
                'geometry': {
                'type': 'Point',
                'coordinates': [row[lng], row[lat]]
            },
            'properties': properties
        }

    clusters = df.groupby(groupby)

    features = []
    colors = plt.cm.Spectral(np.linspace(0, 1, len(clusters)))
    for name, group in clusters:
        i = np.random.randint(colors.shape[0])
        color = colors[i]
        group.apply(lambda row: features.append(get_features(row, color)), axis=1)
        colors = np.delete(colors, i, 0)

    if dumps:
        return json.dumps(get_geojson(features))
    return get_geojson(features)

In [197]:
nb_dir = os.path.normpath(os.path.join(os.getcwd(), '..'))
os.listdir(nb_dir)
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

f = 'data/UTSEUS-shanghai-flickr.sqlite'

In [198]:
import sqlite3
conn = sqlite3.connect(f)
cursor = conn.cursor()
cursor.execute("""
SELECT * FROM photos
""")
photos = pd.DataFrame(cursor.fetchall())

In [199]:
photos.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,12786061,31.239682,121.497266,23804952@N00,15,22726052.0,2004,9,25,Needle in the Sky,sky 2004 architecture shanghai pearltower dscp8,"Pearl TV Tower, Shanghai",http://farm1.staticflickr.com/9/12786061_a6b55...
1,21048909,31.23438,121.494541,40264825@N00,16,22726050.0,2005,3,22,Skywards,holga cityscape shanghai,Holga Shanghai Places,http://farm1.staticflickr.com/16/21048909_3a5c...
2,21048962,31.23438,121.494541,40264825@N00,16,22726050.0,2005,6,23,Bund2,holga cityscape shanghai,Holga Shanghai Places,http://farm1.staticflickr.com/16/21048962_98b0...
3,21048995,31.23438,121.494541,40264825@N00,16,22726050.0,2005,6,23,RoomWithaView,holga cityscape shanghai hotelrooms,Holga Shanghai Places\n\nCamera: Holga 120N\n,http://farm1.staticflickr.com/15/21048995_8cd6...
4,21049047,31.23438,121.494541,40264825@N00,16,22726050.0,2004,11,30,"Bund, Early Morning (Shanghai)",holga cityscape shanghai,Camera: Holga 120N,http://farm1.staticflickr.com/15/21049047_51f8...


In [200]:
# Don't forget to remove "sample" to get full dataset
data_dbscan = photos[[1, 2]].sample(500)
data_dbscan.columns = ['latitude', 'longitude']
data_dbscan.head()
X = data_dbscan.values

In [201]:
kms_per_radian = 6371.0088

def compute_dbscan(meters):
    epsilon = (meters * 0.001) / kms_per_radian
    db = DBSCAN(eps=epsilon, algorithm='ball_tree', metric='haversine').fit(np.radians(X))
    return db.labels_

computings = map(compute_dbscan, range(100, 300))
n_computings = map(lambda x: (len(set(x)), x), computings)
labels = max(n_computings, key=itemgetter(0))[1]

for label in set(labels):
    df = data_dbscan[(labels == label)]
    data_dbscan.loc[df.index, 'cluster_num'] = int(label)

In [202]:
len(set(labels))

26

In [203]:
data_dbscan.sample(n=100)

Unnamed: 0,latitude,longitude,cluster_num
6941,31.207075,121.435616,-1.0
42971,31.247057,121.507308,-1.0
30548,31.242636,121.464285,-1.0
15416,31.241855,121.486180,4.0
4930,31.235311,121.432822,-1.0
38817,31.216152,121.470016,-1.0
42588,31.229830,121.472053,11.0
42275,31.200000,121.500000,5.0
2467,31.251258,121.475894,-1.0
13060,31.274630,121.504005,-1.0


In [204]:
to_geojson(data_dbscan, 'cluster_num', 'latitude', 'longitude', ['cluster_num'])

'{"features": [{"geometry": {"coordinates": [121.47907, 31.235591], "type": "Point"}, "properties": {"marker-color": "#feec9f", "cluster_num": "-1.0"}, "type": "Feature"}, {"geometry": {"coordinates": [121.472886, 31.217291], "type": "Point"}, "properties": {"marker-color": "#feec9f", "cluster_num": "-1.0"}, "type": "Feature"}, {"geometry": {"coordinates": [121.500291, 31.249231], "type": "Point"}, "properties": {"marker-color": "#feec9f", "cluster_num": "-1.0"}, "type": "Feature"}, {"geometry": {"coordinates": [121.492836, 31.230722], "type": "Point"}, "properties": {"marker-color": "#feec9f", "cluster_num": "-1.0"}, "type": "Feature"}, {"geometry": {"coordinates": [121.050577, 31.112505], "type": "Point"}, "properties": {"marker-color": "#feec9f", "cluster_num": "-1.0"}, "type": "Feature"}, {"geometry": {"coordinates": [121.4261, 31.1721], "type": "Point"}, "properties": {"marker-color": "#feec9f", "cluster_num": "-1.0"}, "type": "Feature"}, {"geometry": {"coordinates": [121.685241, 