In [2]:
import pandas as pd
from geopy import distance
from tqdm import tqdm
import gmplot
from matplotlib import pyplot as plt
from sklearn.cluster import DBSCAN
import numpy as np
import requests
import json

In [3]:
filepath = "/home/yash/Documents/CB/location_data/"
filename = "original_points/User3.json"

# Less -> More accuracy
accuracy_threshold = 50

tdiff_threshold = 1200 # In seconds

geodist_threshold = 50 # In metres

visit_threshold = 300 # In seconds

## Load Dataset and clean it

In [14]:
# Read file into dataframe
df = pd.read_json(filepath+filename, orient='records')

# Drop useless columns
df.drop(['activity', 'altitude', 'verticalAccuracy', 'heading', 'velocity'], axis=1, inplace=True)

# Rename weirdly named columns
df.rename(index = int, columns = {'latitudeE7':'latitude',
                                  'longitudeE7': 'longitude',
                                  'timestampMs' : 'timestamp'}, inplace =True)
# Add offset for GMT to IST
df['timestamp'].apply(lambda x: x+pd.Timedelta(hours=5, minutes=30))

# Apply accuracy filter
df = df[df['accuracy'] <= accuracy_threshold]

# Apply the time range filter
df = df[df['timestamp'].dt.year==2014]

# Arrange the dataframe in ascending time order
df = df.reindex(index=df.index[::-1])
df['index'] = list(range(df.shape[0])) 
df.set_index('index', inplace=True)

# Initialise time difference and geodist columns
df['tdiff'] = 0
df['geodist'] = 0

# Lat and Long in the form 19045972, 73025802
# Convert to 19.045972, 73.025802
df['latitude'] = df['latitude'].apply(lambda x: x/10000000)
df['longitude'] = df['longitude'].apply(lambda x: x/10000000)
df['location'] = list(zip(df.latitude, df.longitude))

# Add additional date and time related columns
df['day'] = df['timestamp'].dt.weekday
df['week'] = df['timestamp'].dt.week
df['month'] = df['timestamp'].dt.month
df['year'] = df['timestamp'].dt.year

df

Unnamed: 0_level_0,accuracy,latitude,longitude,timestamp,tdiff,geodist,location,day,week,month,year
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,43,18.971995,72.808788,2014-01-01 00:00:57.565,0,0,"(18.9719951, 72.8087883)",2,1,1,2014
1,21,18.972108,72.808674,2014-01-01 00:01:57.742,0,0,"(18.9721078, 72.8086744)",2,1,1,2014
2,43,18.971751,72.808505,2014-01-01 00:02:57.792,0,0,"(18.9717509, 72.8085047)",2,1,1,2014
3,43,18.971751,72.808505,2014-01-01 00:03:58.400,0,0,"(18.9717509, 72.8085047)",2,1,1,2014
4,43,18.971751,72.808505,2014-01-01 00:04:59.187,0,0,"(18.9717509, 72.8085047)",2,1,1,2014
5,43,18.971751,72.808505,2014-01-01 00:05:59.552,0,0,"(18.9717509, 72.8085047)",2,1,1,2014
6,27,18.972165,72.808686,2014-01-01 00:06:59.713,0,0,"(18.9721652, 72.8086857)",2,1,1,2014
7,37,18.972115,72.808769,2014-01-01 00:07:59.736,0,0,"(18.9721151, 72.808769)",2,1,1,2014
8,36,18.972122,72.808761,2014-01-01 00:08:59.699,0,0,"(18.9721224, 72.8087605)",2,1,1,2014
9,37,18.970088,72.810233,2014-01-01 00:11:59.934,0,0,"(18.9700879, 72.8102332)",2,1,1,2014


## Calculate Differences between consecutive points

In [166]:
# Returns time difference between 2 timestamps in seconds
def timeDiff(t1, t2):
    return round(pd.Timedelta(t2 - t1).seconds)

# Returns geographic distance between coordinates in metres
def geoDist(l1, l2):
    return round(distance.vincenty(l1, l2).m )

In [167]:
# Generate two series objects from the timestamps in such a way that both can be subtracts(timeDiff)
# Combine the two series objects wuth the function tdiff

ts = df['timestamp']
ts1 = ts.shift(periods=1)[1:]
ts2 = ts[1:]

df['tdiff'] = ts1.astype(object).combine(ts2, func=timeDiff)
df['tdiff'][0] = 0

location = df['location']
location1 = location.shift(periods=1)[1:]
location2 = location[1:]

df['geodist'] = location1.astype(object).combine(location2, func=geoDist)
df['geodist'][0] = 0

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0_level_0,accuracy,latitude,longitude,timestamp,velocity,tdiff,geodist,location,day,week,month,year
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,24,19.107731,72.839502,2018-01-02 08:57:52.707,,0,0,"(19.1077314, 72.8395022)",1,1,1,2018
1,23,19.107733,72.839515,2018-01-02 08:59:38.334,,105,1,"(19.1077325, 72.839515)",1,1,1,2018
2,23,19.107736,72.839508,2018-01-02 09:00:52.363,,74,1,"(19.107736, 72.8395077)",1,1,1,2018
3,7,19.107695,72.839625,2018-01-02 09:01:12.739,1.0,20,13,"(19.107695, 72.8396248)",1,1,1,2018
4,4,19.107770,72.839625,2018-01-02 09:01:28.768,1.0,16,8,"(19.1077703, 72.8396251)",1,1,1,2018
5,24,19.108127,72.839717,2018-01-02 09:02:34.905,,66,41,"(19.1081265, 72.839717)",1,1,1,2018
6,25,19.108166,72.839665,2018-01-02 09:03:40.444,,65,7,"(19.1081657, 72.839665)",1,1,1,2018
7,25,19.108166,72.839665,2018-01-02 09:04:40.556,,60,0,"(19.1081657, 72.839665)",1,1,1,2018
8,24,19.108158,72.839648,2018-01-02 09:05:00.248,,19,2,"(19.1081577, 72.8396477)",1,1,1,2018
9,25,19.108107,72.839703,2018-01-02 09:05:40.349,,40,8,"(19.1081072, 72.8397035)",1,1,1,2018


## Generate Staypoints

In [168]:
# Start processing each row and compare with the previous row's tdiff and geodist
# If it fits within tdiff and geodist bounds, consider the point to be a part of the staypoint

# Set the first entry to be the parent SP

df['visit_time'] = 0

parent = 0
location = df['location'][parent] # Location of current parent
visit_time = 0 # Intialising the visit_time
counter = 1 # Number points forming the new staypoint
lat = df['latitude'][parent] # Location of the parent
long = df['longitude'][parent]

for i in tqdm(df.index[1:]):
    
    # Check if the entry's tdiff and geodist fall within constraints
    
    # If yes, add its time to total time at that SP
    if (df['geodist'][i]<=geodist_threshold and df['tdiff'][i]<=tdiff_threshold and geoDist(location, df['location'][i])<=geodist_threshold):
        visit_time += df['tdiff'][i]
        lat += df['latitude'][i] # Summing the location of all points forming the SP
        long += df['longitude'][i]
        counter += 1
        df.drop(i, inplace=True)
    
    # If no, make the parent entry SP and set this to be the parent entry
    else:
        
        if visit_time < visit_threshold:
            df.drop(parent, inplace=True)
            
        else:
            # Set visit time and location of the newly formed SP
            df.loc[parent, 'visit_time'] = visit_time
            lat /= counter
            long /= counter
            df.loc[parent, 'latitude'] = lat
            df.loc[parent, 'longitude'] = long
            df['location'][parent] = (lat,long)
            

        # Set new parent
        parent = i
        lat = df['latitude'][parent]
        long = df['longitude'][parent]
        location = df['location'][parent]
        counter = 1
        visit_time = 0

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████| 46573/46573 [07:18<00:00, 106.29it/s]


Unnamed: 0_level_0,accuracy,latitude,longitude,timestamp,velocity,tdiff,geodist,location,day,week,month,year,visit_time
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
6,25,19.108144,72.839677,2018-01-02 09:03:40.444,,65,7,"(19.1081441, 72.8396771)",1,1,1,2018,516
31,27,19.076308,72.837945,2018-01-02 09:46:02.766,,60,235,"(19.0763081429, 72.8379451486)",1,1,1,2018,4472
68,25,19.076639,72.862119,2018-01-02 11:14:26.272,,199,740,"(19.0766390211, 72.8621188368)",1,1,1,2018,1910
93,27,19.076658,72.862114,2018-01-02 11:58:51.500,,59,57,"(19.0766583469, 72.8621140219)",1,1,1,2018,6138
157,35,19.076439,72.862100,2018-01-02 13:52:19.541,,646,23,"(19.07643896, 72.86209954)",1,1,1,2018,1551
164,50,19.076470,72.862119,2018-01-02 15:27:25.627,,1658,22,"(19.0764698, 72.86211918)",1,1,1,2018,474
169,23,19.076636,72.862178,2018-01-02 16:03:21.963,,1681,19,"(19.0766362, 72.862178173)",1,1,1,2018,7183
243,31,19.076630,72.862176,2018-01-03 02:11:23.414,,29267,23,"(19.0766299577, 72.8621761488)",2,1,1,2018,23683
456,29,19.076636,72.862130,2018-01-03 09:55:56.094,,4094,10,"(19.0766364842, 72.8621301904)",2,1,1,2018,13148
585,26,19.076666,72.862132,2018-01-03 13:56:45.315,,61,47,"(19.0766659875, 72.8621323208)",2,1,1,2018,5174


### Rearrange Dataframe

In [169]:
df['index'] = list(range(df.shape[0])) 
df.set_index('index', inplace=True)

# Generate two series objects from the timestamps in such a way that both can be subtracts(timeDiff)
# Combine the two series objects with the function tdiff

ts = df['timestamp']
ts1 = ts.shift(periods=1)[1:]
ts2 = ts[1:]

df['tdiff'] = ts1.astype(object).combine(ts2, func=timeDiff)
df['tdiff'] = df['tdiff'] - df['visit_time'].shift(1)[1:]
df['tdiff'][0] = 0

location = df['location']
location1 = location.shift(periods=1)[1:]
location2 = location[1:]

df['geodist'] = location1.astype(object).combine(location2, func=geoDist)
df['geodist'][0] = 0

df['cluster_id'] = 0
df['tag'] = "Unknown"

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0_level_0,accuracy,latitude,longitude,timestamp,velocity,tdiff,geodist,location,day,week,month,year,visit_time,cluster_id,tag
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,25,19.108144,72.839677,2018-01-02 09:03:40.444,,0,0,"(19.1081441, 72.8396771)",1,1,1,2018,516,0,Unknown
1,27,19.076308,72.837945,2018-01-02 09:46:02.766,,2026,3529,"(19.0763081429, 72.8379451486)",1,1,1,2018,4472,0,Unknown
2,25,19.076639,72.862119,2018-01-02 11:14:26.272,,831,2544,"(19.0766390211, 72.8621188368)",1,1,1,2018,1910,0,Unknown
3,27,19.076658,72.862114,2018-01-02 11:58:51.500,,755,2,"(19.0766583469, 72.8621140219)",1,1,1,2018,6138,0,Unknown
4,35,19.076439,72.862100,2018-01-02 13:52:19.541,,670,24,"(19.07643896, 72.86209954)",1,1,1,2018,1551,0,Unknown
5,50,19.076470,72.862119,2018-01-02 15:27:25.627,,4155,4,"(19.0764698, 72.86211918)",1,1,1,2018,474,0,Unknown
6,23,19.076636,72.862178,2018-01-02 16:03:21.963,,1682,19,"(19.0766362, 72.862178173)",1,1,1,2018,7183,0,Unknown
7,31,19.076630,72.862176,2018-01-03 02:11:23.414,,29298,1,"(19.0766299577, 72.8621761488)",2,1,1,2018,23683,0,Unknown
8,29,19.076636,72.862130,2018-01-03 09:55:56.094,,4189,5,"(19.0766364842, 72.8621301904)",2,1,1,2018,13148,0,Unknown
9,26,19.076666,72.862132,2018-01-03 13:56:45.315,,1301,3,"(19.0766659875, 72.8621323208)",2,1,1,2018,5174,0,Unknown


### Save and visualise Staypoints

In [20]:

#df.to_csv(filepath+'stay_points/Lx2018SP.csv')

gmap = gmplot.GoogleMapPlotter(19.05, 72.8, 16)
gmap.scatter(df['latitude'], df['longitude'], 'black' , marker=True)
gmap.draw('CB2018SP_compare.html')
del gmap


### Function for clustering

In [182]:
# Form clusters
# Takes dataframe of staypoints as parameter and adds a column cluster_id to it
# Uses DBSCAN, param: min_smaples and epsilon
def form_clusters(df, index, epsilon=0.03, min_samples=2):
        
    coords = df.loc[index].as_matrix(columns=['latitude', 'longitude'])
  
    kms_per_radian = 6371.0088
    epsilon = epsilon / kms_per_radian
    db = DBSCAN(eps=epsilon, min_samples=min_samples, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
    cluster_labels = db.labels_
    num_clusters = len(set(cluster_labels))
  
    df.loc[index,'cluster_id'] = cluster_labels
  
    print('Number of clusters: {}'.format(num_clusters))

## Tagging

In [170]:
df['cluster_id'] = 0
df['tag'] = "Unknown"

for year in df.year.drop_duplicates():
    
    for week in df[df['year']==year].week.drop_duplicates():
        
        temp = df.loc[ (df['year']==year) & (df['week']==week) ]
        form_clusters(df, temp.index)
        temp = df.loc[ (df['year']==year) & (df['week']==week) ]
        
        if temp.cluster_id.max() >= 1:
        
            # Create a df to reduce the clusters to centroids

            centroids = pd.DataFrame()
            centroids['cluster_id'] = list(range(temp.cluster_id.max()+1)) 
            centroids.set_index('cluster_id', inplace=True)
            centroids['latitude'] = 0.0
            centroids['longitude'] = 0.0
            centroids['week_visit_time'] = 0
            centroids['week_frequency'] = 0
            centroids['tag'] = "Unknown"

            for i in centroids.index:
                rows = temp[temp['cluster_id']==i]
                centroids['latitude'][i] = rows['latitude'].mean()
                centroids['longitude'][i] = rows['longitude'].mean()
                centroids['week_visit_time'][i] = rows['visit_time'].sum()
                centroids['week_frequency'][i] = rows.shape[0]
                centroids['tag'] = 'Unknown'
            
            ## Identify places with max time spent as home and second max as work
            home_id = centroids['week_visit_time'].idxmax()
            t = centroids.drop(home_id)
            work_id = t['week_visit_time'].idxmax()
            print(home_id, work_id)
            df.loc[(df['year']==year) & (df['week']==week) & (df['cluster_id']==home_id) , 'tag'] = 'Home'
            centroids.loc[home_id, 'tag'] = 'Home'
            df.loc[(df['year']==year) & (df['week']==week) & (df['cluster_id']==work_id) , 'tag'] = 'Work'
            centroids.loc[work_id, 'tag'] = 'Work'
            
            for i in centroids.index:
                if centroids.loc[i,'tag'] == 'Unknown':
                    centroids.loc[i,'tag'] = getTag(centroids.loc[i, 'latitude'], centroids.loc[i, 'longitude'])
                    df.loc[(df['year']==year) & (df['week']==week) & (df['cluster_id']==i), 'tag'] = centroids.loc[i,'tag']
            
            print(centroids)

            
for i in df.index:
    if df.loc[i, 'tag'] == 'Unknown':
        df.loc[i, 'tag'] = getTag(df.loc[i, 'latitude'], df.loc[i, 'longitude'])
                                                                      
                                                                      
            
df['tag'].value_counts()

Number of clusters: 4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0 2
             latitude  longitude  week_visit_time  week_frequency      tag
cluster_id                                                                
0           19.076621  72.862154           205498              20     Home
1           19.030842  73.015227             6753               2  Unknown
2           19.107239  72.837193            49728              10     Work
Number of clusters: 4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0 1
             latitude  longitude  week_visit_time  week_frequency      tag
cluster_id                                                                
0           19.076637  72.862154           357100              12     Home
1           19.107182  72.837147            98396              23     Work
2           19.121024  72.870435            18976               3  Unknown
Number of clusters: 3


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


1 0
             latitude  longitude  week_visit_time  week_frequency   tag
cluster_id                                                             
0           19.107226  72.837159           101574              29  Work
1           19.076598  72.862209           285409              21  Home
Number of clusters: 3


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



             latitude  longitude  week_visit_time  week_frequency   tag
cluster_id                                                             
0           19.076539  72.862217           332687              29  Home
1           19.107237  72.837172            79992              20  Work
Number of clusters: 5
1 0
             latitude  longitude  week_visit_time  week_frequency      tag
cluster_id                                                                
0           19.107230  72.837170            76890              17     Work
1           19.076554  72.862218           258753              27     Home
2           19.080661  72.853848             1304               2  Unknown
3           19.103670  72.831329             1678               2     Mall
Number of clusters: 3


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

0 1
             latitude  longitude  week_visit_time  week_frequency   tag
cluster_id                                                             
0           19.076527  72.862210           313276              30  Home
1           19.107232  72.837165            93994              14  Work
Number of clusters: 3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



0 1
             latitude  longitude  week_visit_time  week_frequency   tag
cluster_id                                                             
0           19.076535  72.862225           161094               9  Home
1           19.132676  72.917440            17340               6  Work

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Number of clusters: 3
0 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


1
             latitude  longitude  week_visit_time  week_frequency   tag
cluster_id                                                             
0           19.076660  72.862237           278684              32  Home
1           19.107216  72.837229           115242              27  Work
Number of clusters: 4
0 1
             latitude  longitude  week_visit_time  week_frequency      tag
cluster_id                                                                
0           19.076691  72.862252           288785              29     Home
1           19.107209  72.837233            79987              12     Work
2           19.119024  72.887666            28095               3  Unknown
Number of clusters: 4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0 1
             latitude  longitude  week_visit_time  week_frequency   tag
cluster_id                                                             
0           19.076679  72.862253           268034              40  Home
1           19.107214  72.837233            96152              14  Work
2           19.078164  72.859135              802               2  Park
Number of clusters: 3


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


             latitude  longitude  week_visit_time  week_frequency   tag
cluster_id                                                             
0           19.076681  72.862244           255196              22  Home
1           19.107256  72.837216           105038              25  Work
Number of clusters: 4
0 1
             latitude  longitude  week_visit_time  week_frequency     tag
cluster_id                                                               
0           19.076684  72.862248           251876              34    Home
1           19.107467  72.837214            82848              11    Work
2           19.095292  72.848103             3915               2  School
Number of clusters: 4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0 1
             latitude  longitude  week_visit_time  week_frequency   tag
cluster_id                                                             
0           19.076688  72.862253           184715              31  Home
1           19.107245  72.837247            31874              11  Work
2           19.085486  72.887666              964               2  Mall
Number of clusters: 4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


1 0
             latitude  longitude  week_visit_time  week_frequency       tag
cluster_id                                                                 
0           19.107197  72.837223            54668              27      Work
1           19.076663  72.862269           157075              38      Home
2           19.098746  72.848111             1796               2  Hospital
Number of clusters: 5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0 1
             latitude  longitude  week_visit_time  week_frequency      tag
cluster_id                                                                
0           19.076648  72.862252           225163              36     Home
1           19.107188  72.837243            85180              23     Work
2           19.004103  72.818583             1766               2  Unknown
3           19.030892  73.015333             2556               3  Unknown
Number of clusters: 3


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0 1
             latitude  longitude  week_visit_time  week_frequency   tag
cluster_id                                                             
0           19.076648  72.862257           121954              27  Home
1           19.107269  72.837251            52285              20  Work

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Number of clusters: 3
0 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


1
             latitude  longitude  week_visit_time  week_frequency   tag
cluster_id                                                             
0           19.076656  72.862257            87754              22  Home
1           19.107265  72.837108            81742              15  Work
Number of clusters: 8
0 5
             latitude  longitude  week_visit_time  week_frequency         tag
cluster_id                                                                   
0           19.076648  72.862249           281456              35        Home
1           19.118718  72.938180             3706               2     Unknown
2           19.099702  72.915678             4196               2        Mall
3           19.098843  72.916701             5785               3        Mall
4           19.068547  72.857428             7418               2  University
5           19.107136  72.837192            26422               5        Work
6           19.073054  72.851345             1059           

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0 1
             latitude  longitude  week_visit_time  week_frequency   tag
cluster_id                                                             
0           19.076638  72.862236            55908               7  Home
1           19.107152  72.837269            14927               5  Work


Home          501
Work          314
Unknown        68
Mall           43
University     14
School         12
Hospital       12
Park            6
Name: tag, dtype: int64

### Visualise tags

In [172]:
gmap = gmplot.GoogleMapPlotter(19.05, 72.8, 16)
tags = ['Home', 'Work', 'Mall', 'University', 'Hospital', 'School', 'Park', 'Unknown']
colors = ['green', 'red', 'blue', 'purple', 'yellow', 'pink', 'white', 'black']

for i in range(len(tags)):
    gmap.scatter(df[df.tag==tags[i]]['latitude'], df[df.tag==tags[i]]['longitude'], colors[i] , marker=True)

gmap.draw('Lxtag18.html')
del gmap

In [205]:
frequency_threshold = 0.05
time_threshold = 0.05

temp = df[(df.tag!='Home') & (df.tag!='Work')].copy()

temp['cluster_id'] = 0
temp['frequent'] = False
temp['interested'] = False

form_clusters(temp, temp.index, epsilon=0.05, min_samples=2)

nSP = temp.shape[0]
total_time = temp['visit_time'].sum()
nClusters = temp['cluster_id'].max() + 1

for i in range(nClusters):
    
    nVisits = temp[temp.cluster_id==i].shape[0]
    total_visit_time = temp[temp.cluster_id==i]['visit_time'].sum()
    
    # Normalise
    nVisits /= nSP
    total_visit_time /= total_time
    
    #print(nVisits)
    #print(total_visit_time)
    
    if nVisits >= frequency_threshold: temp.loc[temp.cluster_id==i, 'frequent'] = True
    if total_visit_time >= time_threshold: temp.loc[temp.cluster_id==i, 'interested'] = True
    
print(temp['frequent'].value_counts())
print(temp['interested'].value_counts())

print(temp[(temp.frequent==True) & (temp.interested==True)].shape[0])
print(temp[(temp.frequent==True) & (temp.interested==False)].shape[0])
print(temp[(temp.frequent==False) & (temp.interested==True)].shape[0])
print(temp[(temp.frequent==False) & (temp.interested==False)].shape[0])

Number of clusters: 34
False    133
True      22
Name: frequent, dtype: int64
False    135
True      20
Name: interested, dtype: int64
13
9
7
126


### Function for tagging

###### results-> array of results -> geometry, name, types(tags), etc

###### We extract only the types

In [140]:

# Given lat, long returns a tag
def getTag(lat, long):
    
    # Fetch GMaps API result
    # Returns a list of places along with their details
    results = requests.get('https://maps.googleapis.com/maps/api/place/nearbysearch/json?location='+str(lat)+','+str(long)+'&radius=50&key=AIzaSyBdqlVT2vWz9wu_olDovWYI561da6QtKyk').json()['results']
    tags = []
    
    # Extract the types array from the list of results
    for result in results:
        tags.append(result['types'])
    
    # Cleanse the list of tags
    tags = cleanTags(tags)
    
    if(len(tags)<2): return 'Unknown'
    
    if checkMall(tags): return 'Mall'
    elif checkHospital(tags): return 'Hospital'
    elif checkPark(tags): return 'Park'
    elif checkUniversity(tags): return 'University'
    elif checkSchool(tags): return 'School'
    else: return 'Unknown'

    
def cleanTags(tags):
    unwanted_tags =['point_of_interest', 'establishment', 'route', 'political', 'locality', 'sublocality']
    clean_tags = []
    
    for tag in tags:
        cleaned = [x for x in tag if x not in unwanted_tags]
        if cleaned:
            clean_tags.append(cleaned)
        
    return clean_tags

def checkHospital(tags):
    threshold = 4
    hospital_tags = ['hospital', 'health']
    
    i=0
    for tag in tags:
        filtered_tags = [x for x in tag if x in hospital_tags]
        if filtered_tags:
            i+=len(filtered_tags)
            
    return i>=threshold

def checkMall(tags):
    threshold = 10
    mall_tags = ['store','clothing_store', 'electronics_store', 'home_goods_store', 'shoe_store', 'book_store', 'shopping_mall']
    
    i=0
    for tag in tags:
        filtered_tags = [x for x in tag if x in mall_tags]
        if filtered_tags:
            i+=len(filtered_tags)
            
    return i>=threshold

def checkSchool(tags):
    threshold = 2
    for i in range(threshold):
        if "school" in tags[i]:
            return True    
    return False

def checkUniversity(tags):
    threshold = 2
    for i in range(threshold):
        if "university" in tags[i]:
            return True    
    return False

def checkPark(tags):
    threshold = 2
    for i in range(threshold):
        if ("park" or 'garden') in tags[i]:
            return True    
    return False

In [3]:
requests.get('https://maps.googleapis.com/maps/api/place/nearbysearch/json?location=19.107747,72.836470&radius=50&key=AIzaSyBdqlVT2vWz9wu_olDovWYI561da6QtKyk').json()['results']

[{'geometry': {'location': {'lat': 19.1077226, 'lng': 72.83683719999999},
   'viewport': {'northeast': {'lat': 19.1090715802915,
     'lng': 72.8381861802915},
    'southwest': {'lat': 19.1063736197085, 'lng': 72.8354882197085}}},
  'icon': 'https://maps.gstatic.com/mapfiles/place_api/icons/geocode-71.png',
  'id': '25066b071f729b340839674b1858c6416951e0c5',
  'name': 'Bhaktivedanta Swami Road',
  'place_id': 'ChIJbf77V8bJ5zsRFea91p7eXdI',
  'reference': 'CmRbAAAA3BcgA-uPgC2a_4aXZvO4o_X-LQ3dvnWE49qvIBVb5mjWcMeOtBmENIF9_cD_uHrZQ2B0l6l0QuQgwYfrfKpYMAl2nLlZL076ujz6aDvXrJOnyM6BHoEQ7U7cbCLPeCg0EhBF1sRvS-0Y162Yn4d1heqZGhQ_3q_MrfTx8u28MmrCmxugw6G6fw',
  'scope': 'GOOGLE',
  'types': ['route'],
  'vicinity': 'Suvarna Nagar'},
 {'geometry': {'location': {'lat': 19.1079411, 'lng': 72.8366206},
   'viewport': {'northeast': {'lat': 19.1092900802915,
     'lng': 72.83796958029151},
    'southwest': {'lat': 19.1065921197085, 'lng': 72.8352716197085}}},
  'icon': 'https://maps.gstatic.com/mapfiles/pl