In [16]:
# Import packages
import glob
import io
import json
import math
import os
import re
from collections import Counter

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


In [None]:
# Read the raw measurements and parse the recording timestamps.
df = pd.read_csv("../data/raw/city_jan_2020/full_city_jan_2020_bbox.csv")
df['recording_time'] = pd.to_datetime(df['recording_time'], format="%Y-%m-%d %H:%M:%S")

# Build point geometries from lon/lat (EPSG:4326), then reproject to EPSG:28992.
points = gpd.points_from_xy(df['lon'], df['lat'], crs="EPSG:4326")
gdf = gpd.GeoDataFrame(df, geometry=points).to_crs("EPSG:28992")


In [None]:
# Derive calendar helper columns from the recording timestamp.
gdf["date"] = gdf['recording_time'].dt.date
gdf["hour"] = gdf["recording_time"].dt.hour
# Build the hour-resolution timestamp with pd.to_timedelta: casting integers via
# .astype('timedelta64[h]') is rejected by pandas 2.x, which only supports
# second-or-finer resolutions in astype.
gdf["datehour"] = pd.to_datetime(gdf["date"]) + pd.to_timedelta(gdf["hour"], unit="h")
gdf['day_of_week'] = gdf['recording_time'].dt.day_name()

# Number of measurements recorded in each hour bucket.
count = Counter(gdf["datehour"])

# One row per hour bucket, sorted chronologically for the line plot.
df = pd.DataFrame.from_dict(count, orient='index', columns=["activity"]).reset_index().sort_values(by=['index'])

fig = px.line(df, x='index', y='activity', title='Snuffelfiets activity per hour in January 2020')
fig.show()

In [None]:
# Number of measurements per weekday.
fig = px.histogram(
    gdf,
    x='day_of_week',
    title='Snuffelfiets activity per day of the week',
)
fig.show()

In [None]:
# Number of measurements per hour of the day.
fig = px.histogram(
    gdf,
    x='hour',
    title='Snuffelfiets activity per hour of the day',
)
fig.show()

In [None]:
# Daily median of PM2.5 (the variable is named "mean_" but holds medians).
mean_pm25_date = (
    gdf[["date", "pm2_5"]]
    .groupby(["date"])
    .median()
)

fig = px.scatter(
    mean_pm25_date,
    x=mean_pm25_date.index,
    y='pm2_5',
    title='Median PM2.5 concentration per day (ug/m3)',
)
fig.show()

In [None]:
gdf['hour'] = gdf['recording_time'].dt.hour

# Median PM2.5 for every (date, hour) combination; relies on the "date" column
# having been added to gdf already.
mean_pm25_date = gdf[["date", "hour", "pm2_5"]].groupby(["date", "hour"]).median().reset_index()

# Combine date and hour into one timestamp. pd.to_timedelta is used because
# .astype('timedelta64[h]') on integers is rejected by pandas 2.x.
mean_pm25_date["datetime"] = pd.to_datetime(mean_pm25_date["date"]) + pd.to_timedelta(mean_pm25_date["hour"], unit="h")

fig = px.scatter(mean_pm25_date, x="datetime", y='pm2_5', title='Median PM2.5 concentration per hour in January (ug/m3)')
fig.show()

In [None]:
# Summary statistics of the measurement table.
gdf.describe()

## Investigating the number of observations per cell

In [None]:
# Load every grid CSV in the "total" folder and tag each row with the name of
# the file it came from.
path = "../data/interim/vms_grid/total" # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in tqdm(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    # os.path.basename copes with whichever separator glob returns on this OS;
    # splitting on "\\" only works on Windows and raises IndexError elsewhere.
    df["file"] = os.path.basename(filename)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Distribution of the "count" column within the grid_vms1000.csv file.
frame.loc[frame["file"] == "grid_vms1000.csv", "count"].value_counts()

In [None]:
# Per file: non-null entries per column among rows whose "unique" value exceeds 1.
frame[frame['unique']>1].groupby("file").count()

In [None]:
# Load the hourly 250 m grid files and tag each row with its source file name.
path = "../data/interim/vms_grid/hourly/250" # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in tqdm(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    # os.path.basename works with either path separator; splitting on "\\"
    # only works on Windows and raises IndexError elsewhere.
    df["file"] = os.path.basename(filename)
    li.append(df)

frame2 = pd.concat(li, axis=0, ignore_index=True)

# Per file, count the rows whose "unique" value exceeds 1.
t=frame2[frame2['unique']>1].groupby("file").count().reset_index()

fig = px.line(t, x='file', y='unique', title='Cells (250m) with 1> unique measurements per hour')
fig.show()

In [None]:
# Load the daily grid files of every resolution folder.
path = "../data/interim/vms_grid/daily" # use your path
all_files = glob.glob(path + "/*/*.csv")

li = []

for filename in tqdm(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    # The resolution is the name of the folder holding the file. os.path handles
    # either path separator; splitting on "\\" only works on Windows.
    df["resolution"] = os.path.basename(os.path.dirname(filename))
    # NOTE(review): the date token is picked by position after splitting the
    # whole path on "_", so it depends on the directory names too - confirm if
    # the data is moved.
    df["date"] = filename.split("_")[3].split(".")[0]
    li.append(df)

frame2 = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Non-null counts per (resolution, date): once over all rows, once over only
# the rows whose "unique" value exceeds 1.
group_keys = ["resolution", "date"]
full = frame2.groupby(group_keys).count().reset_index()
multi_obs = frame2[frame2["unique"] > 1]
g = multi_obs.groupby(group_keys).count().reset_index()

In [None]:
# Match every row of `g` with the total of its own (resolution, date) group.
# Dividing the two columns directly aligns on the positional row index, which
# pairs the wrong groups as soon as some (resolution, date) has no row with
# unique > 1 and is therefore missing from `g`.
totals = full[["resolution", "date", "count"]].rename(columns={"count": "total"})
matched = g[["resolution", "date"]].merge(totals, on=["resolution", "date"], how="left")
g["percent"] = g["count"].to_numpy() / matched["total"].to_numpy() * 100

In [None]:
# Share of cells with more than one unique measurement, one line per resolution.
fig = px.line(
    g,
    x='date',
    y='percent',
    color='resolution',
    title='% of cells with data with 1> unique measurements per day',
)
fig.show()

In [None]:
# For every hourly grid file, count the rows whose "unique" value exceeds 1.
path = "../data/interim/vms_grid/hourly" # use your path
all_files = glob.glob(path + "/*/*.csv")

li = []

for filename in tqdm(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    # The resolution is the name of the folder holding the file. os.path handles
    # either path separator; splitting on "\\" only works on Windows.
    df["resolution"] = os.path.basename(os.path.dirname(filename))
    # NOTE(review): positional token of the whole path split on "_" - depends on
    # the directory names as well as the file name.
    df["date"] = filename.split("_")[3].split(".")[0]
    # Reduce to one row per file before concatenating.
    df = df[df["unique"]>1].groupby(["resolution","date"]).count().reset_index()
    li.append(df)

frame2 = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# For every hourly grid file, count the non-null entries of each column.
path = "../data/interim/vms_grid/hourly" # use your path
all_files = glob.glob(path + "/*/*.csv")

li = []

for filename in tqdm(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    # The resolution is the name of the folder holding the file. os.path handles
    # either path separator; splitting on "\\" only works on Windows.
    df["resolution"] = os.path.basename(os.path.dirname(filename))
    # NOTE(review): positional token of the whole path split on "_" - depends on
    # the directory names as well as the file name.
    df["date"] = filename.split("_")[3].split(".")[0]
    # Reduce to one row per file before concatenating.
    df = df.groupby(["resolution","date"]).count().reset_index()
    li.append(df)

frame1 = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# When the notebook is run top to bottom, frame2 holds one row per hourly file
# at this point, so the title says "per hour" (it previously said "per day").
fig = px.line(frame2, x='date', y='unique', color='resolution', title='# of cells with data with 1> unique measurements per hour')
fig.show()

In [None]:
# Display the table for inspection.
frame2

In [None]:
# Histogram on a logarithmic x axis, split by resolution.
# NOTE(review): the grid tables shown elsewhere in this notebook expose
# pm2_5_med / pm2_5_mean rather than pm2_5, and the title matches the previous
# plot - confirm the intended column and title.
fig = px.histogram(
    frame2,
    x='pm2_5',
    color='resolution',
    title='# of cells with data with 1> unique measurements per day',
    log_x=True,
)
fig.show()

In [None]:
# Load the grid files of the "total" folder; the file name is stored in
# "resolution" and is reduced to the numeric cell size in a later cell.
path = "../data/interim/vms_grid/total" # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in tqdm(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    # os.path.basename works with either path separator; splitting on "\\"
    # only works on Windows and raises IndexError elsewhere.
    df["resolution"] = os.path.basename(filename)
    li.append(df)

frame1 = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Display the concatenated table for inspection.
frame1

In [None]:
# Average every column per value of "resolution", then display the result.
total_mean = frame1.groupby("resolution").mean()
total_mean = total_mean.reset_index()
total_mean

In [None]:
# Reduce the stored file name to the integer between "vms" and the first ".",
# then order the table by that cell size.
after_prefix = total_mean["resolution"].str.split("vms", expand=True)[1]
cell_size = after_prefix.str.split(".", expand=True)[0]
total_mean["resolution"] = cell_size.astype(int)
total_mean = total_mean.sort_values("resolution")

In [None]:
# Mean of the "se" column against grid resolution.
fig = px.line(
    total_mean,
    x='resolution',
    y='se',
    title='Average standard error (SE) per cell',
)
fig.show()

In [None]:
## Same, but now daily

In [None]:
# Average every column of each daily grid file, one row per (resolution, date).
path = "../data/interim/vms_grid/daily" # use your path
all_files = glob.glob(path + "/*/*.csv")

li = []

for filename in tqdm(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    # The resolution is the name of the folder holding the file. os.path handles
    # either path separator; splitting on "\\" only works on Windows.
    df["resolution"] = os.path.basename(os.path.dirname(filename))
    df["resolution"] = df["resolution"].astype(int)
    # NOTE(review): positional token of the whole path split on "_" - depends on
    # the directory names as well as the file name.
    df["date"] = filename.split("_")[3].split(".")[0]
    df = df.groupby(["resolution", "date"]).mean().reset_index()
    li.append(df)

frame1 = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Average the per-day summaries per resolution. numeric_only=True leaves out the
# string "date" column, which makes a plain .mean() raise on pandas 2.x.
daily_mean = frame1.groupby("resolution").mean(numeric_only=True).reset_index()

In [None]:
# Display the per-resolution daily averages.
daily_mean

In [None]:
## Same, but now hourly

In [None]:
# Average every column of each hourly grid file, one row per (resolution, date),
# mirroring the daily summary.
path = "../data/interim/vms_grid/hourly" # use your path
all_files = glob.glob(path + "/*/*.csv")

li = []

for filename in tqdm(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    # The resolution is the name of the folder holding the file. os.path handles
    # either path separator; splitting on "\\" only works on Windows.
    df["resolution"] = os.path.basename(os.path.dirname(filename))
    # Integer resolution, as in the daily and total summaries, so the three
    # tables share one numeric x axis once they are concatenated.
    df["resolution"] = df["resolution"].astype(int)
    # NOTE(review): positional token of the whole path split on "_" - depends on
    # the directory names as well as the file name.
    df["date"] = filename.split("_")[3].split(".")[0]
    # mean(), not count(): the later plots read se / sd / count from this table
    # as per-cell averages, and count() would give numbers of non-null rows.
    df = df.groupby(["resolution","date"]).mean().reset_index()
    li.append(df)

frame1 = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Display the per-file hourly summaries.
frame1

In [None]:
# Average the per-hour summaries per resolution. numeric_only=True leaves out
# the string "date" column, which makes a plain .mean() raise on pandas 2.x.
hourly_mean = frame1.groupby("resolution").mean(numeric_only=True).reset_index()

In [None]:
# Label each summary table with its aggregation window.
for summary, label in ((hourly_mean, "Hourly"), (daily_mean, "Daily"), (total_mean, "Full")):
    summary["time"] = label

In [None]:
# Stack the three summary tables into one frame for plotting.
summaries = [total_mean, daily_mean, hourly_mean]
full = pd.concat(summaries, axis=0, ignore_index=True)

In [None]:
# Display the combined summary table.
full

In [None]:
# "se" against resolution, one line per aggregation window.
fig = px.line(
    full,
    x='resolution',
    y='se',
    color="time",
    title='Average standard error (SE) per cell',
)
fig.show()

In [None]:
# "sd" against resolution, one line per aggregation window.
fig = px.line(
    full,
    x='resolution',
    y='sd',
    color="time",
    title='Average standard deviation (SD) per cell',
)
fig.show()

In [None]:
# "count" against resolution on a logarithmic y axis, one line per aggregation window.
fig = px.line(
    full,
    x='resolution',
    y='count',
    color="time",
    title='Average number of observations per cell',
    log_y=True,
)
fig.show()

In [None]:
# Average every column of the hourly files whose path matches "vms100".
path = "../data/interim/vms_grid/hourly" # use your path
all_files = glob.glob(path + "/*/*.csv")

# The pattern requires at least one character after "vms100", so it also
# matches names containing "vms1000".
vms100_1000 = [url for url in all_files if re.match(".+vms100.+", url)]

li = []

for filename in tqdm(vms100_1000):
    df = pd.read_csv(filename, index_col=None, header=0)
    # The resolution is the name of the folder holding the file. os.path handles
    # either path separator; splitting on "\\" only works on Windows.
    df["resolution"] = os.path.basename(os.path.dirname(filename))
    # NOTE(review): positional token of the whole path split on "_" - depends on
    # the directory names as well as the file name.
    df["date"] = filename.split("_")[3].split(".")[0]
    df = df.groupby(["resolution","date"]).mean().reset_index()
    li.append(df)

frame1 = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Prefix the year so the file-name stamps parse as full timestamps.
stamped = "2020" + frame1["date"]
frame1['date'] = pd.to_datetime(stamped, format="%Y%m%d-%H")


In [None]:
# Display the table with parsed timestamps.
frame1

In [None]:
# "pm2_5_mean" over time, one line per resolution.
fig = px.line(
    frame1,
    x='date',
    y='pm2_5_mean',
    color="resolution",
    title='Mean PM2.5 values per hour',
)
fig.show()

In [None]:
# Pair the 100 m and 1000 m means by timestamp instead of by row position.
# Slicing at row 216 assumed exactly 216 files per resolution with all 100 m
# files first, but glob does not guarantee any order.
is_100m = frame1["resolution"].astype(str) == "100"
mean_1000m_by_date = frame1.loc[~is_100m].set_index("date")["pm2_5_mean"]
# As before, the difference (100 m minus 1000 m) is stored on the 100 m rows
# and the remaining rows are left as NaN.
frame1["diff"] = frame1.loc[is_100m, "pm2_5_mean"] - frame1.loc[is_100m, "date"].map(mean_1000m_by_date)

In [None]:
# Difference between the two resolutions over time.
fig = px.line(
    frame1,
    x='date',
    y='diff',
    title='Difference in mean PM2.5 values per hour (100m vs. 1000m)',
)
fig.show()

In [None]:
# Column averages per resolution.
frame1.groupby('resolution').mean()

In [None]:
# Read the bounding-box selection through the same repo-relative data folder
# as the first load cell, instead of an absolute path that only exists on one
# machine and exposes a local user name.
df = pd.read_csv("../data/raw/city_jan_2020/data_selection_bbox.csv")
df.head()

In [None]:
# Parse the timestamps, then derive the weekday name from them.
recorded_at = pd.to_datetime(df['recording_time'], format="%Y-%m-%d %H:%M:%S")
df['recording_time'] = recorded_at
df['day_of_week'] = recorded_at.dt.day_name()

In [None]:
# Preview the first rows with the added weekday column.
df.head()

## KNMI Features

In [2]:
import pandas as pd

# Read the KNMI hourly file: skip its first 31 lines and keep only the columns used below.
knmi_columns = ['YYYYMMDD', 'HH', 'DD', 'FH', 'T', 'P', 'U']
data = pd.read_csv(
    '../data/external/uurgeg_260_2011-2020/uurgeg_260_2011-2020.txt',
    skiprows=31,
    skipinitialspace=True,
    usecols=knmi_columns,
)

In [3]:
# DD = Wind direction (in degrees) averaged over the last 10 minutes of the past hour (360=north, 90=east, 180=south, 270=west, 0=calm, 990=variable).
# FH = Hourly mean wind speed (in 0.1 m/s).
# T = Temperature (in 0.1 degrees Celsius) at 1.50 m height at the time of observation.
# P = Air pressure (in 0.1 hPa) reduced to sea level, at the time of observation.
# U = Relative humidity (in percent) at 1.50 m height at the time of observation.

In [4]:
# Split the YYYYMMDD stamp into its parts and combine them with the hour column
# into a single timestamp.
data["YYYYMMDD"] = data["YYYYMMDD"].astype(str)
ymd = data["YYYYMMDD"]

data["hour"] = data['HH'].astype(str).str.strip()
data["year"] = ymd.str[0:4]
data["month"] = ymd.str[4:6]
data["day"] = ymd.str[6:8]

data["date"] = pd.to_datetime(data[['year', 'month', 'day', 'hour']])

# The helper columns are no longer needed once "date" exists.
data.drop(columns=['year', 'month', 'day', 'hour', 'YYYYMMDD', 'HH'], inplace=True)

In [14]:
# Changing wind direction from degrees to cardinal directions
def degToCompass(deg):
    """Convert a KNMI wind direction in degrees to an 8-point compass label.

    Args:
        deg: Wind direction in degrees, where 360 is north, 90 east, 180 south
            and 270 west. The KNMI codes 0 (calm) and 990 (variable) are not
            angles and are handled separately.

    Returns:
        One of "N", "NE", "E", "SE", "S", "SW", "W", "NW" for a real
        direction, "CALM" for 0, "VAR" for 990, or the input itself when it
        is a missing value (None or NaN).
    """
    # Missing observations stay missing; math.floor would raise on NaN.
    if deg is None or (isinstance(deg, float) and math.isnan(deg)):
        return deg
    # Sentinel codes: without these 0 would be reported as "N" and 990 as "W".
    if deg == 0:
        return "CALM"
    if deg == 990:
        return "VAR"
    # Round to the nearest 45-degree sector; 360 wraps back to sector 0 (north).
    sector = math.floor((deg / 45) + 0.5)
    dirs = ["N","NE","E", "SE","S","SW","W","NW"]
    return dirs[(sector % 8)]

# Replace the numeric direction by its compass label. Re-running this cell on
# the already converted column fails, because degToCompass divides its input by 45.
data["DD"] = data["DD"].apply(degToCompass)

In [15]:
# Preview the prepared weather table.
data.head()

Unnamed: 0,DD,FH,T,P,U,date
0,SW,30,36,10217,99,2011-01-01 01:00:00
1,W,30,36,10214,99,2011-01-01 02:00:00
2,W,30,39,10211,99,2011-01-01 03:00:00
3,W,30,42,10208,97,2011-01-01 04:00:00
4,W,30,42,10204,97,2011-01-01 05:00:00


In [21]:
# Load the hourly files whose path matches "vms100" without aggregating them,
# so every row of every file is kept.
path = "../data/interim/vms_grid/hourly" # use your path
all_files = glob.glob(path + "/*/*.csv")

# The pattern requires at least one character after "vms100", so it also
# matches names containing "vms1000".
vms100_1000 = [url for url in all_files if re.match(".+vms100.+", url)]

li = []

for filename in tqdm(vms100_1000):
    df = pd.read_csv(filename, index_col=None, header=0)
    # The resolution is the name of the folder holding the file. os.path handles
    # either path separator; splitting on "\\" only works on Windows.
    df["resolution"] = os.path.basename(os.path.dirname(filename))
    # NOTE(review): no "date" column is derived here although later cells parse
    # frame1['date'] - presumably the CSVs provide it; confirm.
    li.append(df)

frame1 = pd.concat(li, axis=0, ignore_index=True)

100%|████████████████████████████████████████████████████████████████████████████████| 432/432 [00:07<00:00, 59.43it/s]


In [26]:
# Parse the "date" column into timestamps so it can serve as the merge key.
frame1['date'] = pd.to_datetime(frame1['date'], format="%Y-%m-%d %H:%M:%S")

In [44]:
# Attach the weather columns to every grid row with the same timestamp
# (pandas' default inner join: rows without a match on either side are dropped).
merge = pd.merge(frame1, data, on="date")
merge['date'] = pd.to_datetime(merge['date'], format="%Y-%m-%d %H:%M:%S")
merge['hour'] = merge['date'].dt.hour

In [52]:
# Preview the merged grid and weather table.
merge.head()

Unnamed: 0,x,y,pm2_5_med,pm2_5_mean,count,unique,se,sd,date,resolution,DD,FH,T,P,U,hour
0,126484.772,461559.608,,,,,,,2020-01-06 06:00:00,100,SW,40,65,10278,92,6
1,126584.772,461559.608,,,,,,,2020-01-06 06:00:00,100,SW,40,65,10278,92,6
2,126684.772,461559.608,,,,,,,2020-01-06 06:00:00,100,SW,40,65,10278,92,6
3,126784.772,461559.608,,,,,,,2020-01-06 06:00:00,100,SW,40,65,10278,92,6
4,126884.772,461559.608,,,,,,,2020-01-06 06:00:00,100,SW,40,65,10278,92,6


In [51]:
# Write the merged table, without the row index, to test.csv in the current working directory.
merge.to_csv("test.csv", index=False)