In [12]:
import collections, functools, operator
import json
import re
from ast import literal_eval
from datetime import datetime, timezone

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [13]:
# Load the two JSON lookup tables that getPopularity() reads:
#   place_types['place_type'][elem] -> text matched against 'top_category'
#   translation[place_t]            -> iterable of such `elem` keys
# The encoding is pinned to UTF-8 because open() otherwise falls back to the
# platform's locale encoding, which can mis-decode non-ASCII JSON text.
with open('../datasets/place_types.json', 'r', encoding='utf-8') as f:
    place_types = json.load(f)

with open('translation.json', 'r', encoding='utf-8') as f:
    translation = json.load(f)

In [25]:
def slicePlaceType(df_test, df_base, place_types_list=None):
    """Compare total visits of a test period against a base period per place type.

    For every entry of ``place_types_list`` the rows of ``df_test`` and
    ``df_base`` whose ``top_category`` contains that text are inner-joined on
    ``safegraph_place_id``. The ``popularity_by_day`` strings of both sides are
    parsed and summed, and their ratio is stored as a percentage.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Needs the columns ``safegraph_place_id``, ``top_category``,
        ``sub_category``, ``naics_code`` and ``popularity_by_day``.
    df_base : pandas.DataFrame
        Needs the columns ``safegraph_place_id``, ``top_category`` and
        ``popularity_by_day``.
    place_types_list : iterable of str, optional
        Category texts to look for. The match is a literal, case-sensitive
        substring test. ``None`` (the default) means no place types.

    Returns
    -------
    dict
        Maps each place type to its joined DataFrame. ``popularity_by_day_x``
        holds the summed visits from ``df_test``, ``popularity_by_day_y`` the
        summed visits from ``df_base`` and ``occupancy`` is ``x / y * 100``.
        A base total of 0 yields ``inf``/``NaN`` in ``occupancy``.
    """
    if place_types_list is None:
        place_types_list = []

    def _total_visits(raw):
        # `raw` is the textual form of a dict; only its values are summed.
        return np.sum(list(literal_eval(raw).values()))

    test_columns = ['safegraph_place_id', 'top_category', 'sub_category', 'naics_code', 'popularity_by_day']
    base_columns = ['safegraph_place_id', 'popularity_by_day']

    dict_df = {}
    for placeType in place_types_list:
        # regex=False: category names may contain regex metacharacters such as
        # parentheses, which would otherwise be read as a pattern and not match.
        in_test = df_test['top_category'].str.contains(placeType, na=False, regex=False)
        in_base = df_base['top_category'].str.contains(placeType, na=False, regex=False)

        df = pd.merge(df_test.loc[in_test, test_columns],
                      df_base.loc[in_base, base_columns],
                      on='safegraph_place_id')
        df['popularity_by_day_x'] = df['popularity_by_day_x'].apply(_total_visits)
        df['popularity_by_day_y'] = df['popularity_by_day_y'].apply(_total_visits)
        df['occupancy'] = df['popularity_by_day_x']/df['popularity_by_day_y']*100
        dict_df[placeType] = df

    return dict_df


def getPopularity(place_t, df_array, months):
    """Build an hourly popularity profile per weekday for one place type.

    The category texts for ``place_t`` are looked up through the module-level
    ``translation`` and ``place_types`` tables. Every row whose
    ``top_category`` contains one of those texts is collected from all tables
    in ``df_array`` for all ``months``. A row that matches several texts is
    collected once per matching text.

    The collected ``popularity_by_hour`` lists are summed element-wise,
    shifted so the quietest hour is 0 and scaled so the busiest hour is 100.
    Each weekday found in ``popularity_by_day`` then receives that profile
    multiplied by the weekday's share of the busiest weekday's total.

    Parameters
    ----------
    place_t : str
        Key of ``translation``; also the single key of the returned dict.
    df_array : iterable of dict
        Each dict maps a month key to a DataFrame with the columns
        ``top_category``, ``popularity_by_day`` (text form of a dict) and
        ``popularity_by_hour`` (text form of a list).
    months : iterable
        Month keys to read from every dict in ``df_array``.

    Returns
    -------
    dict
        ``{place_t: {day: {"0": value, "1": value, ...}}}`` with one entry per
        hour index and plain ``float`` values.

    Raises
    ------
    ValueError
        If no row matches, or if every weekday total is 0.
    """
    places = [place_types['place_type'][elem] for elem in translation[place_t]]

    # Matching rows are gathered in a list and concatenated once; growing a
    # DataFrame with concat inside the loops copies all rows on every step.
    frames = []
    for place_name in places:
        # Escape so that metacharacters in the category text match literally.
        pattern = re.escape(place_name)
        found = [
            monthly[month][monthly[month]['top_category'].str.contains(pattern, na=False)]
            for monthly in df_array
            for month in months
        ]
        # Report per category text, independent of what earlier texts found.
        if all(frame.empty for frame in found):
            print("Data does not contain "+place_name)
        frames.extend(found)

    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        raise ValueError("No data found for place type "+str(place_t))
    df_tmp = pd.concat(frames)

    # Counter.update keeps weekdays whose total is 0, whereas adding Counters
    # with `+` drops every non-positive count and would lose those weekdays.
    day_totals = collections.Counter()
    for raw_days in df_tmp['popularity_by_day']:
        day_totals.update(literal_eval(raw_days))

    busiest_total = max(day_totals.values())
    if busiest_total == 0:
        raise ValueError("All weekday totals are zero for place type "+str(place_t))
    factor_days = {day: total/busiest_total for day, total in day_totals.items()}

    # The hourly profile does not depend on the weekday, so compute it once.
    hourly = np.sum([literal_eval(raw_hours) for raw_hours in df_tmp['popularity_by_hour']], axis=0)
    hourly = hourly - hourly.min()
    peak = hourly.max()
    if peak == 0:
        # Flat profile: every hour equals the minimum, so all values are 0
        # (dividing by the zero peak would produce NaN instead).
        percent = np.zeros(len(hourly))
    else:
        percent = hourly/peak*100

    dict_dist = {place_t: {}}
    for day, factor in factor_days.items():
        scaled = (percent*factor).tolist()
        dict_dist[place_t][day] = {str(hour): value for hour, value in enumerate(scaled)}
    return dict_dist

## New York

In [16]:
# Month keys in 'YYYY_MM' form, in chronological order: every month of 2019
# and 2020, followed by January through April of 2021.
keys = [
    year + '_' + str(month_no).zfill(2)
    for year, last_month in (('2019', 12), ('2020', 12), ('2021', 4))
    for month_no in range(1, last_month + 1)
]
            
# dict_df: raw monthly NYC tables, one per entry of `keys`.
# dict_df_NYC: the same months restricted to city == 'New York' and joined
# with the category columns of the April-2020 core file.
dict_df = {}
dict_df_NYC = {}

# NOTE(review): the files are named .tar.gz but are read as plain gzip;
# confirm they are gzip-compressed CSVs rather than tar archives.
for key in keys:
    dict_df[key] = pd.read_csv('../datasets/NYC/NYC_'+key+'.csv.tar.gz', compression='gzip')

dfC_2020_04 = pd.read_csv('../datasets/NYC/Core-NYC_2020_04.csv.tar.gz', compression='gzip')

# The core selection does not depend on the month, so it is built once
# instead of being re-filtered on every loop iteration.
core_places_NYC = dfC_2020_04.loc[dfC_2020_04['city']=='New York',
                                  ['safegraph_place_id', 'top_category', 'sub_category', 'naics_code']]

for key in keys:
    monthly = dict_df[key]
    # Inner join: places missing from the core selection are dropped.
    dict_df_NYC[key] = pd.merge(core_places_NYC, monthly[monthly['city']=='New York'],
                                on='safegraph_place_id')

## Seattle

In [33]:
# dict_df_Seattle: monthly tables restricted to city == 'Seattle' and joined
# with the category columns of the April-2020 core file.
# This cell rebinds dict_df and dfC_2020_04, so whatever those names held
# before is discarded.
dict_df_Seattle = {}
dict_df = {}

# NOTE(review): the files are named .tar.gz but are read as plain gzip;
# confirm they are gzip-compressed CSVs rather than tar archives.
for key in keys:
    dict_df[key] = pd.read_csv('../datasets/SeattleMetro/SeattleMetro_'+key+'.csv.tar.gz', compression='gzip')

dfC_2020_04 = pd.read_csv('../datasets/SeattleMetro/Core-SeattleMetro_2020_04.csv.tar.gz', compression='gzip')

# The core selection does not depend on the month, so it is built once
# instead of being re-filtered on every loop iteration.
core_places_Seattle = dfC_2020_04.loc[dfC_2020_04['city']=='Seattle',
                                      ['safegraph_place_id', 'top_category', 'sub_category', 'naics_code']]

for key in keys:
    monthly = dict_df[key]
    # Inner join: places missing from the core selection are dropped.
    dict_df_Seattle[key] = pd.merge(core_places_Seattle, monthly[monthly['city']=='Seattle'],
                                    on='safegraph_place_id')

## Boston

In [34]:
# dict_df_Boston: monthly tables restricted to city == 'Boston' and joined
# with the category columns of the April-2020 core file.
# This cell rebinds dict_df and dfC_2020_04, so whatever those names held
# before is discarded.
dict_df_Boston = {}
dict_df = {}

# NOTE(review): the files are named .tar.gz but are read as plain gzip;
# confirm they are gzip-compressed CSVs rather than tar archives.
for key in keys:
    dict_df[key] = pd.read_csv('../datasets/Boston/Boston_'+key+'.csv.tar.gz', compression='gzip')

dfC_2020_04 = pd.read_csv('../datasets/Boston/Core-Boston_2020_04.csv.tar.gz', compression='gzip')

# The core selection does not depend on the month, so it is built once
# instead of being re-filtered on every loop iteration.
core_places_Boston = dfC_2020_04.loc[dfC_2020_04['city']=='Boston',
                                     ['safegraph_place_id', 'top_category', 'sub_category', 'naics_code']]

for key in keys:
    monthly = dict_df[key]
    # Inner join: places missing from the core selection are dropped.
    dict_df_Boston[key] = pd.merge(core_places_Boston, monthly[monthly['city']=='Boston'],
                                   on='safegraph_place_id')

In [35]:
# Compute the popularity profile of every place type listed in `translation`
# from the merged tables of all three cities and write them to one JSON file.
dict_times = {}
for key in translation.keys():
    try:
        tmp = getPopularity(key, [dict_df_NYC, dict_df_Seattle, dict_df_Boston], keys)
    except Exception as err:
        # Best effort: a place type that cannot be computed is skipped, but
        # the reason is reported. Catching Exception instead of a bare
        # `except:` lets KeyboardInterrupt/SystemExit stop the cell.
        print('Data missing in '+key+' ('+type(err).__name__+': '+str(err)+')')
        continue
    dict_times.update(tmp)
# datetime.utcnow() is deprecated; an aware UTC "now" formats to the same text.
dict_times['creation_time_utc'] = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

with open('popular_times.json', 'w') as f:
    json.dump(dict_times, f)