In [1]:
# Import packages
import pandas as pd 
import numpy as np 
import geopandas as gpd 

from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import io
import json

## Downloading all Snuffelfiets data

In [2]:
# CKAN "package_show" endpoint for the Snuffelfiets dataset; the JSON it
# returns lists the dataset's resources (one download URL per resource).
url = 'https://ckan.dataplatform.nl/api/3/action/package_show?id=9cc4de28-6d03-4b59-8c66-085b3e8b3956&include_tracking=true'

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(url, timeout=60)
# Fail here on an HTTP error instead of trying to parse an error page as JSON.
r.raise_for_status()

data = r.json()

In [3]:
# Pull the download URL out of every resource entry in the package metadata.
csv_urls = [resource['url'] for resource in data['result']['resources']]
csv_urls[0]  # preview the first URL

'https://ckan.dataplatform.nl/dataset/9cc4de28-6d03-4b59-8c66-085b3e8b3956/resource/a12b08b4-d0bf-45f9-b17c-20448369754c/download/resource_2020_05_11_2020_05_18.csv'

In [4]:
# Small subset (the first three URLs) for quick trial runs of the cells below.
test_csv_urls = csv_urls[0:3]

### Warning: running the block below for every URL in `csv_urls` (instead of the `test_csv_urls` subset) downloads ~100 CSV files


In [118]:
# Download each CSV, merge them into one frame and store the result on disk.
# The per-file frames go in `frames` rather than `data`, so the metadata dict
# fetched earlier is not overwritten and the URL-extraction cell stays re-runnable.
frames = []

for csv_url in tqdm(test_csv_urls):
    response = requests.get(csv_url, timeout=60)
    response.raise_for_status()  # do not parse an HTTP error page as CSV
    file_object = io.StringIO(response.content.decode('utf-8'))
    frames.append(pd.read_csv(file_object))


# Merge all csv's; ignore_index gives the merged frame a fresh 0..n-1 index.
df = pd.concat(frames, ignore_index=True)
df.to_csv("../data/external/all_snuffelfiets_raw.csv", index=False)
del frames  # release the per-file frames

100%|██████████| 5/5 [00:39<00:00,  7.98s/it]


In [120]:
# Read the merged raw measurements back from the CSV written by the download cell.
snuffel_all = pd.read_csv(
    "../data/external/all_snuffelfiets_raw.csv"
)

In [121]:
# Preview the first five rows of the merged data.
snuffel_all.head(n=5)

Unnamed: 0,sensor,air_quality_observed_id,lon,lat,recording_time,trip_sequence,humidity,pm10,pm1_0,pm2_5,pressure,temperature,voc,voltage,error_code,version_major,version_minor,acc_max,no2
0,c432f4d9cac0cc68e786d054c1ec771f,31925610,4.830358,52.01701,2020-05-18 11:59:19,19,57,4,4,4,1049,23.4,188,3.81,0,1,7,0,0
1,74678958154e0d1a078e5a6f50b73b73,31925642,5.05722,52.023201,2020-05-18 11:59:19,16,55,6,6,6,1042,21.5,190,4.06,0,1,7,0,0
2,7d320c4527e25b06c3c2cfc68c917019,31925576,5.237413,52.023979,2020-05-18 11:59:16,39,50,9,8,8,1054,24.2,201,4.02,0,1,7,0,0
3,c432f4d9cac0cc68e786d054c1ec771f,31925609,4.829355,52.017056,2020-05-18 11:59:08,19,57,4,4,4,1049,23.3,187,3.81,0,1,7,0,0
4,74678958154e0d1a078e5a6f50b73b73,31925640,5.056912,52.022751,2020-05-18 11:59:07,16,55,7,6,7,1042,21.5,189,4.06,0,1,7,1,0


In [10]:
# Boundary polygon(s) used to clip the measurements.
# NOTE(review): the file is called "gem_utrecht" and the results are written to
# ".../city/", so this looks like the municipality rather than the province — confirm.
utrecht = gpd.read_file("../data/external/WijkBuurtkaart_2020_v1/gem_utrecht.shp")

# Only the geometry is needed for clipping; joining against it alone keeps the
# shapefile's attribute columns out of the result, so none have to be dropped.
utrecht_boundary = utrecht[['geometry']]


for csv_url in tqdm(test_csv_urls):
    response = requests.get(csv_url, timeout=60)
    response.raise_for_status()  # do not parse an HTTP error page as CSV
    file_object = io.StringIO(response.content.decode('utf-8'))
    df = pd.read_csv(file_object, usecols=['sensor', 'air_quality_observed_id', 'lon', 'lat',
     'recording_time', 'trip_sequence', 'humidity', 'pm2_5', 'pressure', 'temperature'])
    df['recording_time'] = pd.to_datetime(df['recording_time'], format="%Y-%m-%d %H:%M:%S")


    # Remove all measurements with pm2.5 <0.5 or >150 ug/m3.
    # (.copy() so the filtered frame is independent of the one it was sliced from.)
    df = df[~((df["pm2_5"]<0.5)|(df["pm2_5"]>150))].copy()


    # Remove all measurements with avg. speed >45 km/h (=12.5 m/s)
    geo_df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['lon'],df['lat'], crs="EPSG:4326"))
    geo_df = geo_df.sort_values(by=['sensor', 'recording_time'])
    geo_df = geo_df.to_crs("EPSG:28992") # from WGS84 Geographic to Amersfoort / RD New Projected

    # Differences are taken per sensor, so the first measurement of a sensor is
    # never compared with the last measurement of the previous sensor (it gets NaN
    # and is therefore kept by the speed filter below).
    sensor_ids = geo_df['sensor']
    dx = geo_df.geometry.x.groupby(sensor_ids).diff()
    dy = geo_df.geometry.y.groupby(sensor_ids).diff()
    geo_df['distance'] = np.hypot(dx, dy) # straight-line distance to the previous measurement (RD New is in metres)
    # total_seconds() gives the full gap; `.dt.seconds` only returns the
    # seconds-within-a-day component and silently drops whole days.
    geo_df['delta_time'] = geo_df.groupby('sensor')['recording_time'].diff().dt.total_seconds()
    geo_df['avg_speed_ms'] = geo_df['distance'] / geo_df['delta_time'] # avg(v)=x/t

    geo_df = geo_df[~(geo_df['avg_speed_ms']>12.5)]


    # Keep only the measurements that fall within the Utrecht boundary.
    # NOTE(review): newer geopandas releases renamed `op` to `predicate`; switch the
    # keyword if the installed version rejects `op`.
    geo_df = gpd.sjoin(geo_df, utrecht_boundary, how="inner", op='within')


    # Save as .csv-file, named after the downloaded file
    geo_df = geo_df.drop(columns="geometry").reset_index(drop=True)
    filename = "../data/external/city/" + csv_url.split('/')[-1]
    geo_df.to_csv(filename, index=False)


100%|██████████| 3/3 [00:27<00:00,  9.04s/it]


In [8]:
# List the column names of the boundary shapefile.
utrecht.columns

Index(['GM_CODE', 'JRSTATCODE', 'GM_NAAM', 'H2O', 'OAD', 'STED', 'BEV_DICHTH',
       'AANT_INW', 'AANT_MAN', 'AANT_VROUW', 'P_00_14_JR', 'P_15_24_JR',
       'P_25_44_JR', 'P_45_64_JR', 'P_65_EO_JR', 'P_ONGEHUWD', 'P_GEHUWD',
       'P_GESCHEID', 'P_VERWEDUW', 'AANTAL_HH', 'P_EENP_HH', 'P_HH_Z_K',
       'P_HH_M_K', 'GEM_HH_GR', 'P_WEST_AL', 'P_N_W_AL', 'P_MAROKKO',
       'P_ANT_ARU', 'P_SURINAM', 'P_TURKIJE', 'P_OVER_NW', 'OPP_TOT',
       'OPP_LAND', 'OPP_WATER', 'Shape_Leng', 'Shape_Area', 'geometry'],
      dtype='object')