import libraries

In [2]:
import glob
import pandas as pd
from h3 import h3
import geohash2
import numpy as np
from multiprocessing import Pool
from multiprocessing import cpu_count
nCores = cpu_count()
import time
import sys

import function projects

In [2]:
sys.path.append("..")
from src.geoIndexFunctions import *

### Load GPS events 

In [7]:
# # tourists
# gps_path = '../../data/geodata/paul/04new'

# # British
# gps_path = '../../data/geodata/paul/04new_GBR'

# mixed
gps_path = '../../data/geodata/paul/mixed'

In [8]:
all_files = glob.glob(gps_path + "/*.parquet")

li = []

for filename in all_files:
    df = pd.read_parquet(filename, columns=['timestamp', 'latitude', 'longitude', 'country']) #
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)

In [9]:
print("{} GPS events within the bounding box of London".format(len(frame)))

8589632 GPS events within the bounding box of London


In [5]:
df.columns

Index(['timestamp', 'latitude', 'longitude', 'country'], dtype='object')

In [5]:
frame.shape

(8589632, 4)

In [9]:
# not showing deviceId for data protection reasons
frame.loc[:,frame.columns != 'deviceId'].sample(5)

Unnamed: 0,timestamp,latitude,longitude
17898592,1556463600000,51.512482,-0.268252
50869900,1555455600000,51.51522,-0.02745
52016817,1555315200000,51.523727,-0.649701
16166201,1555869600000,51.513564,-0.019742
25098304,1555282800000,51.593684,-0.211697


In [8]:
frame.dtypes

timestamp     int64
latitude     object
longitude    object
country      object
dtype: object

In [6]:
frame['latitude'] = frame['latitude'].astype(float)
frame['longitude'] = frame['longitude'].astype(float)

###  Load boundary geoindex list

h3

In [7]:
all_hexagons_df = pd.read_csv('../data output/0 london_boundary_hex9_list.csv')

In [8]:
all_hexagons_df.head()

Unnamed: 0,hex9
0,89195dadc07ffff
1,89194e6d0a7ffff
2,89194ac2e7bffff
3,89194ad5c93ffff
4,89194ad2473ffff


In [9]:
hex_col = all_hexagons_df.columns.tolist()[0]
APERTURE_SIZE = int(hex_col[-1:])

In [10]:
gdf_hex_list = all_hexagons_df[hex_col].tolist()

In [14]:
# visualize_polygon('green',hexagons=gdf_hex_list)

geohash

In [15]:
# all_geohash_df = pd.read_csv('london_boundary_into_hash_list.csv')

In [16]:
# hash_col = all_geohash_df.columns.tolist()[0]
# HASH_SIZE = int(hash_col[-1:])

In [17]:
# gdf_geohash_list = all_geohash_df[hash_col].tolist()

### Store geoindex id, geoindex lat/lon for each event and if within boundary

In [11]:
# split the dataframe into chunks for parallelization
frame_split = np.array_split(frame, nCores)

def process(df):
    
    df[hex_col] = df.apply(lambda x: h3.geo_to_h3(x['latitude'], x['longitude'], APERTURE_SIZE), axis=1)
    df[hex_col+"_lat"] = df.apply(lambda x: h3.h3_to_geo(x[hex_col])[0], axis=1)
    df[hex_col+"_lon"] = df.apply(lambda x: h3.h3_to_geo(x[hex_col])[1], axis=1)
    df['hex_within'] = df.apply(lambda x: True if x[hex_col] in gdf_hex_list else False, axis=1)
    
#     # geohash
#     df[hash_col] = df.apply(lambda x: geohash2.encode(x['latitude'], x['longitude'], HASH_SIZE), axis=1)
#     df[hash_col+"_lat"] = df.apply(lambda x: geohash2.decode_exactly(x[hash_col])[0], axis=1)
#     df[hash_col+"_lon"] = df.apply(lambda x: geohash2.decode_exactly(x[hash_col])[1], axis=1)
#     df['hash_within'] = df.apply(lambda x: True if x[hash_col] in gdf_geohash_list else False, axis=1)
    
    return df

# multi cores with Multiprocessing
with Pool(processes=nCores) as pool1:
    new_frame = pd.concat(pool1.map(process, frame_split)).reset_index(drop=True)

In [None]:
new_frame.loc[:,new_frame.columns != 'deviceId'].head()

In [None]:
len(new_frame)

### Keep only the ones within London

In [12]:
london_new_frame = new_frame[new_frame.hex_within==True]
london_new_frame.drop(columns=["hex_within"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [6]:
print("{} GPS events within London".format(len(london_new_frame)))

7649570 GPS events within London


### Save the df to parquet.gzip file

In [13]:
# london_new_frame.to_parquet('../../data/geodata/paul/mixed/gps_events_with_geoindex_within.parquet.gzip',compression='gzip')

In [3]:
london_new_frame = pd.read_parquet('../../data/geodata/paul/mixed/gps_events_with_geoindex_within.parquet.gzip')

## Appendix 

In [82]:
# # single core
# frame[hex_col] = frame.apply(lambda x: h3.geo_to_h3(x['latitude'], x['longitude'], APERTURE_SIZE), axis=1)

In [83]:
# # multi cores with Dask
# daskdf = dd.from_pandas(frame,npartitions=nCores).\
#    map_partitions(
#       lambda df :df.apply(
#           lambda x: h3.geo_to_h3(x['latitude'], x['longitude'], APERTURE_SIZE), axis=1)).\
#     compute()