In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import json
from ast import literal_eval
import math
import time
from statistics import geometric_mean
import matplotlib.pyplot as plt
from datetime import datetime
import collections, functools, operator
import re

In [19]:
def occupancy(df_test, df_base, place_types_list=[]):
    """ Slice function for separating places into place types and calculating the occupancy
    :param df_test: the dataframe with the data for the current month
    :param df_base: the dataframe with the data for the baseline month
    :param place_types_list: the list of place types to be sliced. If left empty it returns all
                             placetypes present in df_test AND df_base
    :return dict_df: a dictionary with keys corresponding to all the place types and values corresponding to the dataframes
    """
    dict_df = {}
    if not place_types_list:
        place_types_list_1 = list(df_test['top_category'].unique())
        place_types_list_2 = list(df_base['top_category'].unique())
        place_types = [value for value in place_types_list_1 if value in place_types_list_2 and str(value) != 'nan']
    else: place_types = place_types_list
    
    for placeType in place_types:
        df = df_test[df_test['top_category'].str.contains(placeType, na=False)]
        df = pd.merge(df, df_base[df_base['top_category'].str.contains(placeType, na=False)][['safegraph_place_id', 'popularity_by_day']], on='safegraph_place_id')
        df['popularity_by_day_x'] = df['popularity_by_day_x'].apply(lambda x: np.sum(list(literal_eval(x).values())))
        df['popularity_by_day_y'] = df['popularity_by_day_y'].apply(lambda x: np.sum(list(literal_eval(x).values())))
        df['occupancy'] = df['popularity_by_day_x']/df['popularity_by_day_y']*100
        dict_df[placeType] = df
    
    return dict_df


def distance(lat1, lon1, lat2, lon2):
    r = 6372.8 # radius of earth in km
    dLat = math.radians(lat2 - lat1)
    dLon = math.radians(lon2 - lon1)
    lat1 = math.radians(lat1)
    lat2 = math.radians(lat2)
    a = math.sin(dLat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dLon / 2) ** 2
    c = 2 * math.asin(math.sqrt(a))
    return r * c


def approx_distance(lat1, lon1, lat2, lon2):

    dLat = lat2 - lat1
    dLon = lon2 - lon1
    
    a = (dLat / 2) ** 2 + (1 - lat1 * lat1 /4) * (1 - lat2 * lat2 /4) * ((dLon / 2) ** 2)
    return a


## Load Population Flow Data for NYC

In [3]:
# keys = ['2019_01', '2019_02', '2019_03', '2019_04', '2019_05', '2019_06', '2019_07', '2019_08', '2019_09', '2019_10', '2019_11', '2019_12',
#         '2020_01', '2020_02', '2020_03', '2020_04', '2020_05', '2020_06', '2020_07', '2020_08', '2020_09', '2020_10', '2020_11', '2020_12',]

keys = ['2019_01', '2019_04', '2020_04']

dict_df = {}
dict_df_NYC = {}

for key in keys:
    dict_df[key] = pd.read_csv('../datasets/NYC/NYC_'+key+'.csv.tar.gz', compression='gzip')
    
dfC_2020_04 = pd.read_csv('../datasets/NYC/Core-NYC_2020_04.csv.tar.gz', compression='gzip')

for key in keys:
    dict_df_NYC[key] = pd.merge(dfC_2020_04[dfC_2020_04['city']=='New York'],
                                dict_df[key][dict_df[key]['city']=='New York'], on='safegraph_place_id')
    

dump = False
if dump:
    with open('../datasets/place_types.json', 'w') as f:
        json.dump(pd.DataFrame(np.sort(dict_df_NYC[keys[0]]['top_category'].dropna().unique()), columns=["place_type"]).to_dict(orient='dict'), f, indent=4)
    
with open('../datasets/place_types.json', 'r') as f:
    place_types = json.load(f)
    

## Compute Occupancy for POIs 

In [4]:
# place_types_list = ['Restaurants', 'Grocery', 'Clothing Stores']
dict_df_analysis = {}

for key in keys[1:]:
    # dict_df_analysis[key] = occupancy(dict_df_NYC[key], dict_df_NYC['2019_01'], place_types_list)
    dict_df_analysis[key] = occupancy(dict_df_NYC[key], dict_df_NYC['2019_01'])


  return func(self, *args, **kwargs)


### Concatenate data frame for all place types ('top_category')

In [5]:
dfm = pd.concat([dict_df_analysis['2019_04'][i] for i in dict_df_analysis['2019_04'].keys()])

In [6]:
dfm.columns

Index(['safegraph_place_id', 'state_x', 'countyName_x',
       'parent_safegraph_place_id_x', 'location_name_x', 'brands_x',
       'top_category', 'sub_category', 'naics_code', 'latitude', 'longitude',
       'street_address_x', 'city_x', 'postal_code_x', 'open_hours',
       'category_tags', 'state_y', 'countyName_y', 'placekey',
       'parent_placekey', 'parent_safegraph_place_id_y', 'location_name_y',
       'street_address_y', 'city_y', 'postal_code_y', 'brands_y',
       'date_range_start', 'date_range_end', 'raw_visit_counts',
       'raw_visitor_counts', 'visits_by_day', 'poi_cbg', 'visitor_home_cbgs',
       'visitor_daytime_cbgs', 'visitor_country_of_origin',
       'distance_from_home', 'median_dwell', 'bucketed_dwell_times',
       'related_same_day_brand', 'related_same_month_brand',
       'popularity_by_hour', 'popularity_by_day_x', 'device_type',
       'popularity_by_day_y', 'occupancy'],
      dtype='object')

### Compute Distance between every pair of POIs 

In [7]:
dfm = dfm[['safegraph_place_id', 'latitude', 'longitude', 'occupancy', 
           'top_category', 'sub_category', 'naics_code', 'poi_cbg', 'visitor_home_cbgs', 'distance_from_home']]

In [8]:
# dfm[:10]
# 
# dfm[dfm['safegraph_place_id'] == 'sg:e2e1a315f7e84868a56778e9252c0e05']['latitude'].values[0]
# 
# a = dfm['safegraph_place_id'].tolist()
# len(a)

In [20]:

## compute the distance between every pair of POIs #approximate distance 

rel_prob = dict()
c = 0
s  = time.time()

list_poi = dfm['safegraph_place_id'].tolist()[:10]
for p1 in range(len(list_poi)):
    for p2 in range(len(list_poi)):

        if p1 == p2 or p1 < p2:
            continue

        lat1 = dfm[dfm['safegraph_place_id'] == list_poi[p1]]['latitude'].values[0] 
        lon1 = dfm[dfm['safegraph_place_id'] == list_poi[p1]]['longitude'].values[0] 
        lat2 = dfm[dfm['safegraph_place_id'] == list_poi[p2]]['latitude'].values[0] 
        lon2 = dfm[dfm['safegraph_place_id'] == list_poi[p2]]['longitude'].values[0]

        op1 = dfm[dfm['safegraph_place_id'] == list_poi[p1]]['occupancy'].values[0] 
        op2 = dfm[dfm['safegraph_place_id'] == list_poi[p2]]['occupancy'].values[0]

        rel_prob[(p1,p2)] = geometric_mean([op1, op2]) / approx_distance(lat1, lon1, lat2, lon2)  
        c += rel_prob[(p1,p2)]

e = time.time()
print("time", e-s )

time 0.9169559478759766


In [22]:
### compute the distance between every pair of POIs #exact distance

rel_prob = dict()
c = 0 

s  = time.time()

list_poi = dfm['safegraph_place_id'].tolist()[:100]
for p1 in range(len(list_poi)):
    for p2 in range(len(list_poi)):

        if p1 == p2 or p1 < p2:
            continue
            
        lat1 = dfm[dfm['safegraph_place_id'] == list_poi[p1]]['latitude'].values[0] 
        lon1 = dfm[dfm['safegraph_place_id'] == list_poi[p1]]['longitude'].values[0] 
        lat2 = dfm[dfm['safegraph_place_id'] == list_poi[p2]]['latitude'].values[0] 
        lon2 = dfm[dfm['safegraph_place_id'] == list_poi[p2]]['longitude'].values[0]
        
        op1 = dfm[dfm['safegraph_place_id'] == list_poi[p1]]['occupancy'].values[0] 
        op2 = dfm[dfm['safegraph_place_id'] == list_poi[p2]]['occupancy'].values[0]

        rel_prob[(p1,p2)] = geometric_mean([op1, op2]) / distance(lat1, lon1, lat2, lon2) 
        c += rel_prob[(p1,p2)]
        
e = time.time()
print("time", e-s )



time 65.66853928565979


In [18]:
### compute geometric mean only 

rel_prob = dict()
c = 0 

s  = time.time()

list_poi = dfm['safegraph_place_id'].tolist()[:100]
for p1 in range(len(list_poi)):
    for p2 in range(len(list_poi)):

        if p1 == p2 or p1 < p2:
            continue
        
        op1 = dfm[dfm['safegraph_place_id'] == list_poi[p1]]['occupancy'].values[0] 
        op2 = dfm[dfm['safegraph_place_id'] == list_poi[p2]]['occupancy'].values[0]

        rel_prob[(p1,p2)] = geometric_mean([op1, op2]) 
        c += rel_prob[(p1,p2)]
        
e = time.time()
print("time", e-s )

time 16.475069999694824


In [None]:
 
# linkData = pd.DataFrame({'source' : ['Amy', 'Bob'],
#                   'target' : ['Bob', 'Cindy'],
#                   'weight' : [100, 50]})
# 
# nodeData = pd.DataFrame({'name' : ['Amy', 'Bob', 'Cindy'],
#                   'type' : ['Foo', 'Bar', 'Baz'],
#                   'gender' : ['M', 'F', 'M']})
# 
# G = nx.from_pandas_edgelist(linkData, 'source', 'target', True, nx.DiGraph())
# nx.set_node_attributes(G, nodeData.set_index('name').to_dict('index'))
# 
# 
# G.nodes(data=True)
# 
# 
# nodeData.set_index('name').to_dict('index')