In [1]:
import matplotlib.pyplot as plt
import os
import time
import pandas as pd
import numpy as np
class bounding_box:
    def __init__(self, _lat_min, _lon_min,_lat_max,_lon_max, _name=None):
        self.lat_min = _lat_min
        self.lon_min = _lon_min
        self.lat_max = _lat_max
        self.lon_max = _lon_max
        self.name = _name

In [20]:
# read the SNAP gowalla dataset
gowalla_df = pd.read_csv("data/gowalla_checkins.txt", delimiter=' ', header=0, names = ['uid','latitude','longitude','pid','date','time'])

Unnamed: 0,uid,latitude,longitude,pid,date,time
0,36606,32.924213,-96.795741,3338530,2010-10-21,13:53:26.000
1,36606,32.909843,-96.80552,78273,2010-10-20,17:42:31.000
2,36606,32.924213,-96.795741,3338530,2010-10-20,13:50:51.000
3,36606,32.793329,-96.779209,2336788,2010-10-20,03:30:45.000
4,36606,32.794833,-96.780163,67600,2010-10-20,03:30:36.000


In [21]:
#NY_downtown Salt_lake_downtown Cook_downtown Harris_downtown Miami_downtown LA_downtown SF
NY = bounding_box(40.69985199, -74.02004242, 40.80183029, -73.9427948 , "NY") 
Salt_lake = bounding_box(40.67640305, -112.00101471, 40.78819656, -111.85135651, "Salt_lake") 
Cook = bounding_box(41.82378006, -87.77555084,41.93557358, -87.62589264, "Cook") 
Harris = bounding_box(29.69077682, -95.43946075,29.80257225, -95.28980255, "Harris")  
Miami = bounding_box(25.74535751, -80.33049774,25.85715294, -80.18083954, "Miami") 
LA = bounding_box(33.96725082, -118.36177063,34.07904434, -118.21211243, "LA") 
SF = bounding_box(37.7079277 , -122.50764465,37.81974792, -122.35796356, "SF") 
Tulsa = bounding_box(36.09810257, -96.06762695, 36.20989609, -95.91796875, 'Tulsa')
Milwaukee = bounding_box(42.98300171, -87.98133087, 43.09479523, -87.84003448, 'Milwaukee')
Fargo = bounding_box(46.82130432, -96.86463165, 46.9330864, -96.71497345, 'Fargo') 
Boston = bounding_box(42.24830246, -71.20856476, 42.47189713, -70.90927887, 'Boston') 
Kansas_city = bounding_box(39.03318024, -94.6626358, 39.14497375, -94.5129776, 'Kansas_city') 
Phoenix = bounding_box(33.39250183, -112.1488266, 33.50429535, -111.9991684, 'Phoenix') 

bboxes = [NY, Salt_lake, Cook, Harris, Miami, LA, SF, Tulsa, Milwaukee, Fargo, Boston,Kansas_city, Phoenix]

In [22]:
for bbox in bboxes:
    print( bbox.name,  (bbox.lat_min + bbox.lat_max)/2, (bbox.lon_min + bbox.lon_max)/2 )

NY 40.75084114 -73.98141860999999
Salt_lake 40.732299805 -111.92618561
Cook 41.87967682 -87.70072174
Harris 29.746674535 -95.36463165
Miami 25.801255225 -80.25566864
LA 34.02314758 -118.28694153000001
SF 37.76383781 -122.432804105
Tulsa 36.153999330000005 -95.99279785
Milwaukee 43.03889847000001 -87.910682675
Fargo 46.87719536 -96.78980255
Boston 42.360099795 -71.05892181499999
Kansas_city 39.089076995 -94.5878067
Phoenix 33.44839859 -112.0739975


In [23]:
for bbox in bboxes:
    df = gowalla_df[(gowalla_df['latitude'] > bbox.lat_min) & (gowalla_df['latitude'] < bbox.lat_max) & (gowalla_df['longitude'] > bbox.lon_min) & (gowalla_df['longitude'] < bbox.lon_max)]
    df.to_csv('gowalla'+bbox.name, index=False)
    print(bbox.name)

NY
Salt_lake
Cook
Harris
Miami
LA
SF
Tulsa
Milwaukee
Fargo
Boston
Kansas_city
Phoenix


## Various ideas to capture distribution of datasets from public POI data

#### entropy profile
Compute entropy over 2d hsitrograms of variaous binnings
Evaluate whether they are different enough between cities.
Example. Harris county has higher skewness (i.e. dense pockets) which require finer cells to capture the resolution.

#### hot pockets
Calculate the number of bins with atleast c points in it.
Intuition is that it corresponds to the signal-to-noise ratio of DP. Essentially we can learn a relation between eps and the hotpockets at x threshold.

#### Nearest how far
Calcualte nearest neighbor distance for each point. 
Calculate average/median/skewness of these distances.

In [24]:
from scipy.stats import entropy
from rtree import index

In [26]:
# entropy profile
for _bins in [512]:
# for _bins in [64, 256, 512, 1024]:
    for bbox in bboxes:
        df = pd.read_csv('gowalla'+bbox.name)
        H, xedges, yedges = np.histogram2d(df.iloc[:, 2].values, df.iloc[:, 3].values, bins=_bins)
        x_probs = np.true_divide(H,np.sum(H)) # convert the histogram to probability
        x_probs = x_probs.ravel()
        ent = entropy(x_probs)
        print("{0:<15}".format(bbox.name), '\t', _bins, np.sum(H),'\t', ent)

NY              	 512 100538.0 	 8.349732946613312
Salt_lake       	 512 6976.0 	 5.874959424303732
Cook            	 512 24793.0 	 7.3405223689656705
Harris          	 512 20653.0 	 7.036337115173779
Miami           	 512 5372.0 	 5.5973731049470095
LA              	 512 15265.0 	 6.860816965068247
SF              	 512 149820.0 	 7.79794281221878
Tulsa           	 512 4307.0 	 5.792334496603348
Milwaukee       	 512 6231.0 	 6.21496345770543
Fargo           	 512 270.0 	 4.007187265076273
Boston          	 512 40299.0 	 7.403692236311014
Kansas_city     	 512 15735.0 	 6.171420523321162
Phoenix         	 512 6567.0 	 4.833425784656036


In [35]:
# hot pockets vs avg noise (2/eps)
eps = [0.2]
avg_noise = 2.0/np.array(eps)
print("Avg Noise:", avg_noise)
for _bins in [512]:
    for bbox in bboxes:
        df = pd.read_csv('gowalla'+bbox.name)
        H, xedges, yedges = np.histogram2d(df.iloc[:, 2].values, df.iloc[:, 3].values, bins=_bins)
        H = H.ravel()
        hot = [] 
        for val in avg_noise:
            hot.append((H >= val).sum())

        print("{0:<15}".format(bbox.name), '\t', _bins, np.sum(H),'\t', hot[0])

Avg Noise: [10.]
NY              	 512 100538.0 	 2382
Salt_lake       	 512 6976.0 	 149
Cook            	 512 24793.0 	 679
Harris          	 512 20653.0 	 530
Miami           	 512 5372.0 	 112
LA              	 512 15265.0 	 355
SF              	 512 149820.0 	 2771
Tulsa           	 512 4307.0 	 106
Milwaukee       	 512 6231.0 	 179
Fargo           	 512 270.0 	 5
Boston          	 512 40299.0 	 918
Kansas_city     	 512 15735.0 	 364
Phoenix         	 512 6567.0 	 122


In [29]:
import math
def dist_in_kmeters(Lat1, Long1, Lat2, Long2):
    x = Lat2 - Lat1
    y = (Long2 - Long1) * math.cos((Lat2 + Lat1)*0.00872664626)  
    return 111.319 * math.sqrt(x*x + y*y)

In [31]:
idx = index.Index() # rtree index intialization
for bbox in bboxes:
        df = pd.read_csv('gowalla'+bbox.name)
        i = 0
        coord_pairs = [(i,j) for i,j in zip(df.iloc[:, 2].values,df.iloc[:, 3].values)]
        for coord in coord_pairs:
            i+=1
            idx.insert(i,coord)
            
#         assert len(df) == idx.count((bbox.lat_min, bbox.lon_min, bbox.lat_max, bbox.lon_max))
        
        cum_nn1_dist = 0 
        cum_nn2_dist = 0
        for coord in coord_pairs:
            nns_ids = list(idx.nearest(coord, 2))
            try:
                nn_1 = coord_pairs[nns_ids[1]]
#             nn_2 = coord_pairs[nns_ids[2]]
#                 print(nn_1)
                cum_nn1_dist += dist_in_kmeters(coord[0], coord[1], nn_1[0], nn_1[1])
#             cum_nn2_dist += dist_in_kmeters(coord[0], coord[1], nn_2[0], nn_2[1])
            except:
                continue
        cum_nn1_dist = cum_nn1_dist / len(df)
#         cum_nn2_dist = cum_nn2_dist / len(df)
        print("{0:<15}".format(bbox.name), '\t',cum_nn1_dist, '\t', cum_nn2_dist)
#             break
#         break
#         for i in range(len(df)): 
#             idx.insert(i, (non_unif_qs2[i][0],non_unif_qs2[i][1]))


NY              	 13691569.787143717 	 0
Salt_lake       	 13778944.697372658 	 0
Cook            	 2044142.9917044085 	 0
Harris          	 4413807.091733104 	 0
Miami           	 9021037.888770834 	 0
LA              	 21730316.096072465 	 0
SF              	 15869701.778604556 	 0
Tulsa           	 4556160.424671883 	 0
Milwaukee       	 1587721.865093697 	 0
Fargo           	 6806332.186358199 	 0
Boston          	 15592801.247122241 	 0
Kansas_city     	 2889027.289944852 	 0
Phoenix         	 13984776.279203761 	 0
