In [1]:
from sklearn.cluster import MeanShift
import numpy as np
import pandas as pd

In [2]:
# Grab only the first line of the raw dump so its column layout can be inspected.
with open('checkins.dat', 'r') as input_file:
    line = next(input_file, '')

In [4]:
# Split the first line on the '|' delimiter to see the raw (still padded) fields.
line.split('|')

['   id    ',
 ' user_id ',
 ' venue_id ',
 '     latitude      ',
 '     longitude     ',
 '     created_at      \n']

In [5]:
# Same split as above, with the surrounding whitespace removed from every field.
row = list(map(str.strip, line.split('|')))

In [6]:
# Display the cleaned header fields.
row

['id', 'user_id', 'venue_id', 'latitude', 'longitude', 'created_at']

In [7]:
import csv

# Convert the '|'-delimited dump into a regular CSV file.
# newline='' is required by the csv module when writing: without it the
# writer's own '\r\n' terminators get translated again on Windows and every
# record is followed by a blank line.
with open('checkins.dat') as dat_file, \
        open('checkins.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    for line in dat_file:
        row = [field.strip() for field in line.split('|')]
        # Keep only lines that split into exactly six fields and have
        # non-empty values in the latitude (index 3) and longitude (index 4)
        # positions; anything else is dropped.
        if len(row) == 6 and row[3] and row[4]:
            csv_writer.writerow(row)

In [8]:
# Load the converted file; a comma is already read_csv's default separator.
data = pd.read_csv('checkins.csv')

In [9]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
0,984222,15824,5222,38.895112,-77.036366,2012-04-21 17:43:47
1,984234,44652,5222,33.800745,-84.41052,2012-04-21 17:43:43
2,984291,105054,5222,45.523452,-122.676207,2012-04-21 17:39:22
3,984318,2146539,5222,40.764462,-111.904565,2012-04-21 17:35:46
4,984232,93870,380645,33.448377,-112.074037,2012-04-21 17:38:18


In [18]:
# Column dtypes of the loaded frame.
# NOTE(review): the recorded output lists created_at as datetime64[ns], which
# only holds once the created_at conversion cell has been executed; run in
# file order this cell would presumably still show the dtype read_csv
# produced — confirm the intended cell order.
data.dtypes

id                     int64
user_id                int64
venue_id               int64
latitude             float64
longitude            float64
created_at    datetime64[ns]
dtype: object

In [11]:
# Parse the timestamp column in one vectorized call. Passing the whole Series
# to pd.to_datetime avoids invoking the parser once per row, which is what
# Series.apply(pd.to_datetime) does.
data['created_at'] = pd.to_datetime(data['created_at'])

In [12]:
# (rows, columns) of the loaded frame.
data.shape

(396634, 6)

In [23]:
# Work on the coordinates of the first 100000 rows only.
# .copy() makes data_head an independent frame: a 'labels' column is assigned
# to it later, and doing that on a chained slice of `data` triggers pandas'
# SettingWithCopyWarning.
data_head = data.iloc[:100000][['latitude', 'longitude']].copy()

In [25]:
# Cluster the check-in coordinates with MeanShift (bandwidth 0.1, expressed in
# the same units as the coordinate columns).
# The feature columns are selected explicitly so that re-running this cell
# after a 'labels' column has been added to data_head still clusters on
# latitude/longitude only.
clustering = MeanShift(bandwidth=0.1).fit(data_head[['latitude', 'longitude']])

In [28]:
# Largest cluster label assigned by the fit.
clustering.labels_.max()

3230

In [29]:
# Attach each row's cluster label; labels_ is in the same row order as data_head.
data_head['labels'] = clustering.labels_

In [31]:
# Display the fitted cluster centres (one [latitude, longitude] row per cluster).
clustering.cluster_centers_

array([[  40.7177164 ,  -73.99183542],
       [  33.44943805, -112.00213969],
       [  33.44638027, -111.90188756],
       ...,
       [ -37.8229826 ,  145.1811902 ],
       [ -41.2924945 ,  174.7732353 ],
       [ -45.0311622 ,  168.6626435 ]])

In [33]:
# Sorted array of the distinct cluster labels.
labels_unique = np.unique(clustering.labels_)

In [35]:
# Number of distinct clusters found.
n_clusters_ = labels_unique.shape[0]

In [36]:
# Shorthand alias for the fitted cluster centres.
cluster_centers = clustering.cluster_centers_

In [43]:
# First row / first column of data_head as a 1x1 array.
data_head.iloc[[0], [0]].values

array([[38.8951118]])

In [46]:
# Number of rows per cluster label, kept as a one-column frame named 'latitude'.
counts = data_head.groupby('labels')[['latitude']].count()

In [59]:
# First coordinate (latitude) of each cluster centre, one value per row of counts.
counts['centers_la'] = clustering.cluster_centers_[:, 0]

In [60]:
# Second coordinate (longitude) of each cluster centre, one value per row of counts.
counts['centers_lo'] = clustering.cluster_centers_[:, 1]

In [63]:
# Keep only clusters that contain more than 15 rows.
counts_new = counts.loc[counts['latitude'] > 15]

In [64]:
# Write just the centre coordinates of the retained clusters, without the index.
counts_new.to_csv(
    'counts_new.csv',
    columns=['centers_la', 'centers_lo'],
    index=False,
)

In [70]:
# Office coordinates, one [latitude, longitude] pair per row.
offices = np.array([
    [33.751277, -118.188740],
    [25.867736, -80.324116],
    [51.503016, -0.075479],
    [52.378894, 4.885084],
    [39.366487, 117.036146],
    [-33.868457, 151.205134],
])

In [71]:
# Display the office coordinate array.
offices

array([[ 3.37512770e+01, -1.18188740e+02],
       [ 2.58677360e+01, -8.03241160e+01],
       [ 5.15030160e+01, -7.54790000e-02],
       [ 5.23788940e+01,  4.88508400e+00],
       [ 3.93664870e+01,  1.17036146e+02],
       [-3.38684570e+01,  1.51205134e+02]])

In [73]:
# Number of clusters that passed the size filter.
len(counts_new)

592

In [81]:
# Centres of the retained clusters as an (n_clusters, 2) array of [latitude, longitude].
centers = counts_new[['centers_la', 'centers_lo']].values

In [93]:
# For every (cluster centre, office) pair record
# [euclidean distance in coordinate space, office, centre],
# iterating centres in the outer loop and offices in the inner one.
dists = [
    [np.linalg.norm(center - office), office, center]
    for center in centers
    for office in offices
]

In [96]:
from heapq import nsmallest

# Centre of the cluster closest to any office.
# Rank on the distance only: each entry is [distance, office, centre], and
# without a key two entries with equal distances would be compared on their
# numpy arrays, which raises "truth value of an array is ambiguous".
nsmallest(20, dists, key=lambda entry: entry[0])[0][2]

array([-33.86063043, 151.20477593])

In [97]:
# Write the coordinates of the cluster centre nearest to an office,
# space-separated, to the answer file.
# The minimum is chosen on the distance field alone; comparing whole
# [distance, office, centre] entries would fall through to comparing numpy
# arrays whenever two distances are equal, which raises ValueError.
closest_center = min(dists, key=lambda entry: entry[0])[2]
with open("PA_clusters_answer.txt", "w") as fout:
    fout.write(" ".join(str(num) for num in closest_center))

In [98]:
# Show the 20 closest (office, cluster centre) pairs, ranked by distance only
# so that equal distances never force a comparison of the numpy arrays stored
# alongside them (which would raise ValueError).
nsmallest(20, dists, key=lambda entry: entry[0])

[[0.007834758163107856,
  array([-33.868457, 151.205134]),
  array([-33.86063043, 151.20477593])],
 [0.009353316185992226,
  array([52.378894,  4.885084]),
  array([52.37296399,  4.89231722])],
 [0.022674066158385495,
  array([ 25.867736, -80.324116]),
  array([ 25.84567226, -80.3188906 ])],
 [0.05005829482278787,
  array([51.503016, -0.075479]),
  array([51.50299126, -0.12553729])],
 [0.07084773242719973,
  array([  33.751277, -118.18874 ]),
  array([  33.80987796, -118.14892381])],
 [0.13410903336184654,
  array([ 25.867736, -80.324116]),
  array([ 25.78581242, -80.21793804])],
 [0.16740596425035326,
  array([ 25.867736, -80.324116]),
  array([ 25.70534972, -80.28342874])],
 [0.18887596060185083,
  array([ 25.867736, -80.324116]),
  array([ 26.01009825, -80.19999059])],
 [0.19577945647763628,
  array([  33.751277, -118.18874 ]),
  array([  33.88832534, -118.04892817])],
 [0.21181053682436798,
  array([  33.751277, -118.18874 ]),
  array([  33.87298601, -118.36209115])],
 [0.222233290