In [None]:
import json
import pickle
from pathlib import Path
import geopandas as gp

In [None]:
# Absolute path of the sibling "data" directory, resolved against the current working directory.
data_dir = (Path("..") / "data").resolve()

Load data

In [None]:
# Name and full location of the synthetic input file inside data_dir.
file_name = "synth_data.json"
test_data_location = data_dir / file_name

# Decode explicitly as UTF-8 (the standard JSON interchange encoding) so the
# result does not depend on the platform's default locale encoding.
with test_data_location.open("r", encoding="utf-8") as file:
    data = json.load(file)

Extract time series

In [None]:
# Walk down the nested JSON: the GPS section first, then its list of records.
gps_section = data["gps-coordinates"]
time_series = gps_section["dataset"]

Put time series in GeoDataFrame and tidy 

In [None]:
# Build the frame from the raw records; no geometry column exists yet at this point.
gps_gdf = gp.GeoDataFrame(time_series)
# NOTE(review): the in-place calls below look deliberate - geopandas may hand back a plain
# pandas DataFrame from non-in-place operations on a frame without a geometry column,
# which would break the later set_geometry call. Confirm before refactoring to chaining.
# The raw records store the timestamp under the key "d"; give it a descriptive name.
gps_gdf.rename(columns={"d":"datetime"}, inplace=True)
# Parse the timestamp strings using the explicit "YYYY-MM-DDTHH:MM:SS" layout.
gps_gdf["datetime"] = gp.pd.to_datetime(gps_gdf["datetime"], format="%Y-%m-%dT%H:%M:%S")
# Index by timestamp so the rows can later be grouped into time windows.
gps_gdf.set_index("datetime",inplace=True)


Set geometry from the raw GPS coordinates

In [None]:
# Point geometries from the raw coordinates: x is longitude, y is latitude,
# declared as EPSG:4326.
longitudes = gps_gdf["long"]
latitudes = gps_gdf["lat"]
gps_trace = gp.GeoSeries.from_xy(x=longitudes, y=latitudes, crs="EPSG:4326")

# Attach the points as the frame's active geometry. Done in place so gps_gdf
# remains the same GeoDataFrame object created earlier.
gps_gdf.set_geometry(col=gps_trace, inplace=True)
gps_gdf

Generate distance travelled between subsequent GPS coordinates.  
Uses [EPSG: 27700 Coordinate reference system](https://spatialreference.org/ref/epsg/27700/) following [UK Gov guidance](https://www.gov.uk/guidance/uk-geospatial-data-standards-coordinate-reference-systems)

In [None]:
# Reproject the trace to EPSG:27700 (British National Grid) so that planar
# distances are expressed in that CRS's units (metres). The EPSG code is passed
# as an integer, which is the documented type for the `epsg` argument.
gps_trace_bng = gps_trace.to_crs(epsg=27700)

# Distance from each point to the point recorded immediately before it.
# The first point has no predecessor, so its distance is missing (NaN).
distances = gps_trace_bng.distance(gps_trace_bng.shift())

Add distance travelled into GeoDataFrame, rounded to 3 decimal places (to the mm level, as EPSG:27700 distances are in metres)

In [None]:
# Number of decimal places kept on each distance value.
decimal_places = 3

# Round first, then store the result as a new column; assignment aligns on the
# shared index of `distances` and `gps_gdf`.
rounded_distances = distances.round(decimals=decimal_places)
gps_gdf["dist_travelled"] = rounded_distances
gps_gdf

Load the LSOA reference file

In [None]:
# Location of the pickled LSOA reference data inside data_dir.
alllsoas_path = data_dir.joinpath("all_lsoas.pkl")

# SECURITY: unpickling can execute arbitrary code, so this file must come from
# a trusted source only.
with alllsoas_path.open(mode="rb") as lsoa_file:
    all_lsoas = pickle.load(lsoa_file)

Use a GeoPandas sjoin to left join the GPS geometry and LSOA info based on whether the GPS point is within a particular LSOA

In [None]:
# A spatial join compares raw coordinates, so both layers must share a CRS.
# The CRS of the pickled LSOA layer is not visible in this notebook; when it is
# set and differs from the GPS points' CRS, reproject the points first so a
# mismatch cannot silently produce rows with no LSOA match.
lsoa_crs = all_lsoas.crs
if lsoa_crs is not None and lsoa_crs != gps_gdf.crs:
    gps_for_join = gps_gdf.to_crs(lsoa_crs)
else:
    gps_for_join = gps_gdf

# Left join keeps every GPS row; LSOA attributes are filled in only where the
# point lies within an LSOA polygon.
gps_lsoa_gdf = gps_for_join.sjoin(all_lsoas, how="left", predicate="within")

Drop Columns no longer needed

In [None]:
# Raw coordinates, the point geometry and the join bookkeeping column are not
# needed downstream; keep only the remaining columns.
unneeded_columns = ["lat", "long", "geometry", "index_right"]
pseudo_gdf = gps_lsoa_gdf.drop(columns=unneeded_columns)
pseudo_gdf

Aggregate over 1 minute windows.  
- Number of data points in window  
- Sum of distance travelled
- Mode of LSOA21CD (LSOA21 code)
- Mode of LSOA21NM (LSOA21 name)

In [None]:
def _first_mode(values):
    """Return a single most-frequent non-null value of ``values``, or None.

    ``Series.mode`` returns a Series: it is empty when the window has no
    non-null values and holds several entries when there is a tie. Neither is
    a valid scalar aggregate, so take the first entry (modes come back in
    sorted order) and fall back to None when there is nothing to report.
    """
    modes = values.mode()
    return modes.iat[0] if len(modes) else None


# Keep the aggregated column's second-level label as "mode", which is the label
# the plain ``Series.mode`` aggregation produced.
_first_mode.__name__ = "mode"

# One group per 1-minute window of the datetime index.
grouper = gp.pd.Grouper(freq='1min')

# "size" counts rows in the window. "count" would skip rows whose distance is
# NaN - the first GPS point has no previous point - and so under-report the
# number of data points in the first window.
grouped_pseudo_df = pseudo_gdf.groupby(grouper).agg({
    "dist_travelled": ["size", "sum"],
    "LSOA21CD": _first_mode,
    "LSOA21NM": _first_mode,
})
grouped_pseudo_df

Save the data

In [None]:
# Flat column names written to the CSV, in the same order as the aggregated
# columns of grouped_pseudo_df.
header = (
    "n_data_points",
    "dist_travelled",
    "LSOA21CD_Mode",
    "LSOA21NM_Mode",
)

# Destination file inside data_dir.
out_path = data_dir.joinpath("gps_out.csv")

grouped_pseudo_df.to_csv(path_or_buf=out_path, header=header)