In [1]:
# Import packages
import pandas as pd 
import numpy as np 
import geopandas as gpd 
import matplotlib.pyplot as plt

from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import io
import json

In [10]:
# Load the combined raw Snuffelfiets export from the external data folder.
raw_csv_path = "../data/external/all_snuffelfiets_raw.csv"
df = pd.read_csv(raw_csv_path)

In [11]:
# Display the loaded frame; the rendered output below is elided in the middle.
df

Unnamed: 0,sensor,air_quality_observed_id,lon,lat,recording_time,trip_sequence,humidity,pm2_5,pressure,temperature,geometry,distance,delta_time,speed_ms,speed_kmh,index_right,CBS_CODE,PROV_NAAM,OBJECTID
0,cb8220c6f54804f2bcf623190b29e6d2,30694417,5.260575,52.164562,2020-04-27 12:00:38,0,37,13,1008,21.6,POINT (146336.093114702 464052.2688131503),127.041523,0 days 00:00:12,10.586794,38.112457,4,6.0,Utrecht,5.0
1,1ee241830b9c00d1e767b175817acd3e,30694424,5.012298,51.948456,2020-04-27 12:00:44,0,51,13,1050,23.2,POINT (129225.0303589758 440067.4706495589),28917.275870,-7 days +20:41:56,0.388068,1.397045,4,6.0,Utrecht,5.0
2,5f43aac634776a41fe83ee005cd04839,30694495,5.040284,52.140881,2020-04-27 12:00:48,0,42,15,1008,20.9,POINT (131251.288819419 461466.6183504097),24585.232633,-7 days +18:11:58,0.375244,1.350878,4,6.0,Utrecht,5.0
3,fc38181e2cb2778d47b2519430755669,30694464,5.005867,52.168140,2020-04-27 12:00:48,0,56,16,1018,17.7,POINT (128911.2117550118 464511.3444094701),66997.665578,-2 days +19:01:47,0.977968,3.520685,4,6.0,Utrecht,5.0
4,0073b711d7702ec73c84ab154468293d,30694489,5.156363,52.030231,2020-04-27 12:00:49,0,56,16,1059,25.1,POINT (139158.4286186836 449124.3256652508),1052.576956,-4 days +19:10:19,0.015251,0.054902,4,6.0,Utrecht,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511165,74678958154e0d1a078e5a6f50b73b73,31925640,5.056912,52.022751,2020-05-18 11:59:07,16,55,7,1042,21.5,POINT (132329.8481180518 448318.4055458872),79.860061,0 days 00:00:12,6.655005,23.958018,4,6.0,Utrecht,5.0
511166,c432f4d9cac0cc68e786d054c1ec771f,31925609,4.829355,52.017056,2020-05-18 11:59:08,19,57,4,1049,23.3,POINT (116706.2797367536 447780.2397429558),113.855659,0 days 00:00:11,10.350514,37.261852,4,6.0,Utrecht,5.0
511167,7d320c4527e25b06c3c2cfc68c917019,31925576,5.237413,52.023979,2020-05-18 11:59:16,39,50,8,1054,24.2,POINT (144719.1213778763 448414.1443792214),83.316691,0 days 00:00:11,7.574245,27.267281,4,6.0,Utrecht,5.0
511168,74678958154e0d1a078e5a6f50b73b73,31925642,5.057220,52.023201,2020-05-18 11:59:19,16,55,6,1042,21.5,POINT (132351.1854591865 448368.3903050824),88.336558,0 days 00:00:12,7.361380,26.500968,4,6.0,Utrecht,5.0


In [8]:
# Ask the CKAN API for the package metadata; each entry under
# result -> resources carries the download URL of one CSV file.
url = 'https://ckan.dataplatform.nl/api/3/action/package_show?id=9cc4de28-6d03-4b59-8c66-085b3e8b3956&include_tracking=true'
r = requests.get(url, timeout=30)  # a timeout keeps the kernel from hanging on a stalled connection
r.raise_for_status()  # fail here on an HTTP error instead of with a confusing JSON/KeyError below
data = r.json()

csv_urls = [resource['url'] for resource in data['result']['resources']]

# Only the first few files are used while developing the processing loop.
test_csv_urls = csv_urls[:10]


In [19]:
# File name of the first resource: everything after the last '/' in its URL.
test_csv_urls[0].rsplit('/', 1)[-1]

'resource_2020_05_11_2020_05_18.csv'

In [31]:
# Download each weekly CSV, drop implausible measurements, keep only the points
# that fall inside Utrecht Province and save the result under ../data/external/.

# Open Utrecht Province polygon.
provinces = gpd.read_file("../data/external/B1_Provinciegrenzen_van_Nederland/B1_Provinciegrenzen_van_NederlandPolygon.shp")
utrecht = provinces[provinces["PROV_NAAM"] == "Utrecht"]

# Columns of the raw export that are kept.
raw_columns = ['sensor', 'air_quality_observed_id', 'lon', 'lat', 'recording_time',
               'trip_sequence', 'humidity', 'pm2_5', 'pressure', 'temperature']

# Speed threshold: 12.5 m/s == 45 km/h.
max_avg_speed_ms = 12.5

for csv in tqdm(test_csv_urls):
    response = requests.get(csv, timeout=60)
    response.raise_for_status()  # do not try to parse an HTTP error page as CSV
    file_object = io.StringIO(response.content.decode('utf-8'))
    df = pd.read_csv(file_object, usecols=raw_columns)
    df['recording_time'] = pd.to_datetime(df['recording_time'], format="%Y-%m-%d %H:%M:%S")

    # Remove all measurements with pm2.5 <0.5 or >150 ug/m3.
    # The two conditions must be OR-ed: a value can never be below 0.5 AND
    # above 150 at the same time, so an `&` mask would remove nothing.
    df = df[~((df["pm2_5"] < 0.5) | (df["pm2_5"] > 150))]

    # Remove all measurements with avg. speed >45 km/h.
    geo_df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['lon'], df['lat'], crs="EPSG:4326"))
    geo_df = geo_df.sort_values(by=['sensor', 'recording_time'])

    # Measure distances in EPSG:28992 (metre-based). Web Mercator (EPSG:3857)
    # stretches lengths at this latitude and would inflate the computed speeds.
    # NOTE(review): presumably this is also the CRS of the province shapefile
    # used in the spatial join below - verify.
    geo_df = geo_df.to_crs("EPSG:28992")

    # Differences are taken per sensor so that the last point of one sensor is
    # never paired with the first point of the next one. The first measurement
    # of every sensor therefore gets NaN distance/time/speed and is kept.
    sensor_key = geo_df['sensor']
    step_x = geo_df.geometry.x.groupby(sensor_key).diff()
    step_y = geo_df.geometry.y.groupby(sensor_key).diff()
    geo_df['distance'] = np.hypot(step_x, step_y)  # metres between consecutive measurements

    # total_seconds() gives the full elapsed time; `.dt.seconds` would only
    # return the seconds component and silently ignore whole days.
    geo_df['delta_time'] = geo_df.groupby('sensor')['recording_time'].diff().dt.total_seconds()
    geo_df['avg_speed_ms'] = geo_df['distance'] / geo_df['delta_time']  # avg(v)=x/t

    # Negated mask so that rows with an undefined (NaN) speed are retained.
    geo_df = geo_df[~(geo_df['avg_speed_ms'] > max_avg_speed_ms)]

    # Remove all measurements outside of Utrecht Province.
    # NOTE: newer geopandas releases rename the `op` keyword to `predicate`;
    # `op` is kept here to match the environment this notebook was run in.
    geo_df = gpd.sjoin(geo_df, utrecht, how="inner", op='within')

    # Save as .csv-file, without the geometry and the columns added by the join.
    geo_df = (
        geo_df
        .drop(columns=["geometry", "index_right", "CBS_CODE", "PROV_NAAM", "OBJECTID"])
        .reset_index(drop=True)
    )
    filename = "../data/external/" + csv.split('/')[-1]
    geo_df.to_csv(filename, index=False)


100%|██████████| 3/3 [00:36<00:00, 12.24s/it]


In [17]:
# Output path for the most recently processed URL: data folder + file name.
"../data/external/" + csv.rsplit('/', 1)[-1]

'../data/external/resource_2020_04_27_2020_05_04.csv'