In [54]:
import re
import numpy as np
import pandas as pd

import numpy.linalg as LA 
from sklearn.cluster import DBSCAN


In [55]:
data = pd.read_csv('../data/sample_users_location_time.tsv', sep='\t')
data.head()

Unnamed: 0,from_user_id,created_at,coordinates
0,2537269060,Sat Jan 10 01:01:47 +0000 2015,"{u'type': u'Point', u'coordinates': [-117.1306..."
1,2329502492,Sat Jan 10 01:01:47 +0000 2015,"{u'type': u'Point', u'coordinates': [-117.1701..."
2,366331312,Sat Jan 10 01:01:47 +0000 2015,"{u'type': u'Point', u'coordinates': [-121.2075..."
3,1007081804,Sat Jan 10 01:01:50 +0000 2015,"{u'type': u'Point', u'coordinates': [-117.0861..."
4,89506217,Sat Jan 10 01:01:53 +0000 2015,"{u'type': u'Point', u'coordinates': [-121.3433..."


In [56]:
lon_re = "{u'type': u'Point', u'coordinates': \[(.*),\s.*\]}"
data['longitude'] = data['coordinates'].str.extract(lon_re)

lat_re = "{u'type': u'Point', u'coordinates': \[.*,\s(.*)\]}"
data['latitude'] = data['coordinates'].str.extract(lat_re)
data = data.drop('coordinates', 1)

data.head()

Unnamed: 0,from_user_id,created_at,longitude,latitude
0,2537269060,Sat Jan 10 01:01:47 +0000 2015,-117.130628,32.736346
1,2329502492,Sat Jan 10 01:01:47 +0000 2015,-117.170173,32.699702
2,366331312,Sat Jan 10 01:01:47 +0000 2015,-121.207559,38.238465
3,1007081804,Sat Jan 10 01:01:50 +0000 2015,-117.086182,32.778781
4,89506217,Sat Jan 10 01:01:53 +0000 2015,-121.343304,37.657208


In [57]:
filtered = data[['from_user_id', 'longitude', 'latitude']] \
                .groupby('from_user_id') \
                .filter(lambda x: len(x) > 3) \
                .dropna()
filtered.head()


Unnamed: 0,from_user_id,longitude,latitude
0,2537269060,-117.130628,32.736346
7,506724954,-117.099371,32.74269
16,52483167,-117.076995,32.905226
18,74901100,-116.970399,32.630514
22,2537269060,-117.13065,32.736294


In [58]:
def cluster(locations, xmin = 1, epsilon = 0.3, neigh_samples = 3, random_state = 42):
    """ Devuelve el cluster de los datos para un usuario
        INPT: 
        locations es una matriz de coordenadas
        epsilon, neigh_samples: datos para DBSCAN
        OUTPT:
            (cluster, labels) si el numero de filas en locations > xmin
                               cluster es una lista de n_clusters matrices de len(n_clusters) x 2
            (none, none) en otro caso
    """
    db = DBSCAN(eps = epsilon, min_samples = neigh_samples, random_state = random_state).fit(locations)
    labels = db.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

    clusters = [locations[labels == i] for i in xrange(n_clusters)]

    return clusters

In [60]:
def getCentroid(cluster):
    """
        Devuelve el centroide de las coordenadas del cluster
        INPT:
            cluster es el cluster obtenido para ese usuario
        OUTPT:
            centroid si cluster es distinto de None
            None en otro caso
    """
    try:
        return np.mean(cluster, axis = 0)
    except:
        return None

def maxCluster(clusters):
    """ Deuvelve el cluster mas grande para un usuario
        INPT:
            clusters: cluster obtenido para key
    """
    try:
        return clusters[np.argmax(map(len, clusters))]
    except:
        return None

In [64]:
for uid, val in filtered.groupby('from_user_id'):
    mtx = np.matrix(val[['longitude', 'latitude']].values)
    c = cluster(mtx)
    print uid, c

 76 [matrix([['-121.775835', '36.231731'],
        ['-121.776149', '36.231866'],
        ['-121.775856', '36.231743'],
        ['-121.775892', '36.231763']], dtype=object)]
20267224 [matrix([['-118.263054', '34.145047'],
        ['-118.263102', '34.145023'],
        ['-118.263103', '34.145022'],
        ['-118.263103', '34.145022']], dtype=object)]
22212936 [matrix([['-117.147518', '32.792603'],
        ['-117.14766', '32.792616'],
        ['-117.147469', '32.792647'],
        ['-117.147383', '32.792696'],
        ['-117.147733', '32.792567']], dtype=object)]
24551649 [matrix([['-117.155258', '32.772831'],
        ['-117.155319', '32.77278'],
        ['-117.154517', '32.773539'],
        ['-117.154323', '32.76891']], dtype=object)]
52483167 [matrix([['-117.076995', '32.905226'],
        ['-117.076987', '32.905228'],
        ['-117.076986', '32.905226'],
        ['-117.076996', '32.905226']], dtype=object)]
64958473 [matrix([['-118.297093', '34.166883'],
        ['-118.297106', '34.1668