In [71]:
# Import packages
import pandas as pd 
import numpy as np 
import geopandas as gpd 
import matplotlib.pyplot as plt

from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import io
import json

import plotly.express as px
from collections import Counter
import glob


In [69]:
df = pd.read_csv("../data/external/city_jan_2020/january_2020_city.csv")
df['recording_time'] = pd.to_datetime(df['recording_time'], format="%Y-%m-%d %H:%M:%S")

gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['lon'],df['lat'], crs="EPSG:4326"))
gdf = gdf.to_crs("EPSG:28992")


In [82]:
gdf["date"] = gdf['recording_time'].dt.date


count = Counter([date for date in gdf["date"]])

df = pd.DataFrame.from_dict(count, orient='index', columns=["activity"]).reset_index().sort_values(by=['index'])

fig = px.line(df, x='index', y='activity', title='Snuffelfiets activity per day in January 2020')
fig.show()

In [79]:
mean_pm25_date = gdf[["date", "pm2_5"]].groupby("date").median()

fig = px.scatter(mean_pm25_date, x=mean_pm25_date.index, y='pm2_5', title='Median PM2.5 concentration per day (ug/m3)')
fig.show()

In [78]:
gdf['hour'] = gdf['recording_time'].dt.hour

mean_pm25_date = gdf[["date", "hour", "pm2_5"]].groupby(["date", "hour"]).median().reset_index()

mean_pm25_date["datetime"] = pd.to_datetime(mean_pm25_date["date"]) + mean_pm25_date["hour"].astype('timedelta64[h]')



fig = px.scatter(mean_pm25_date, x="datetime", y='pm2_5', title='Median PM2.5 concentration per hour in January (ug/m3)')
fig.show()

In [80]:
gdf.describe()

Unnamed: 0,air_quality_observed_id,lon,lat,trip_sequence,humidity,pm2_5,pressure,temperature,distance,delta_time,avg_speed_ms,hour
count,216239.0,216239.0,216239.0,216239.0,216239.0,216239.0,216239.0,216239.0,216239.0,216239.0,216239.0,216239.0
mean,18451640.0,5.104288,52.088833,8.008518,68.808379,11.594139,53636.4,11.24653,95.060762,19.787309,5.14874,12.951244
std,7543336.0,0.037193,0.016843,7.342861,13.689889,11.236791,1502583.0,45.138878,1367.935385,354.682894,1.392429,4.585497
min,1385415.0,4.974094,52.028873,0.0,0.0,1.0,431.0,0.0,8.188689,2.0,2.000223,0.0
25%,11030310.0,5.084321,52.078094,3.0,58.0,4.0,988.0,7.9,55.776372,13.0,4.200075,8.0
50%,22845330.0,5.109617,52.089329,6.0,71.0,8.0,1004.0,10.2,68.249257,13.0,5.119263,13.0
75%,23512750.0,5.129708,52.099194,11.0,80.0,16.0,1021.0,13.0,82.23359,14.0,6.144976,17.0
max,24202610.0,5.195133,52.137924,59.0,100.0,150.0,42949500.0,6553.5,134791.032089,62375.0,12.493945,23.0


## "Weighted" K-means clustering of the PM2.5 data

### 1. Full data
### 2. Aggregated day/month medians

In [81]:
# Clustering
from sklearn import cluster
# Scaling
from sklearn.preprocessing import MinMaxScaler

In [90]:
full_data = gdf[["lon", "lat", "pm2_5", "geometry"]]

full_data['x'] = full_data["geometry"].apply(lambda p: p.x)
full_data['y'] = full_data["geometry"].apply(lambda p: p.y)

full_data.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,lon,lat,pm2_5,geometry,x,y
0,5.118134,52.11116,22,POINT (136568.331 458137.319),136568.331468,458137.319
1,5.119091,52.11057,23,POINT (136633.645 458071.434),136633.6449,458071.433727
2,5.119953,52.11046,21,POINT (136692.649 458058.978),136692.648797,458058.977584
3,5.120988,52.1099,20,POINT (136763.320 457996.412),136763.320401,457996.412136
4,5.121899,52.109608,22,POINT (136825.608 457963.696),136825.608202,457963.696134


In [97]:
k_means = cluster.KMeans(n_clusters=20, random_state=42)
k_means.fit(full_data[["x","y","pm2_5"]])

X_cluster = k_means.labels_
full_data["cluster"] = X_cluster



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [95]:
full_data.head(10)

Unnamed: 0,lon,lat,pm2_5,geometry,x,y,cluster
0,5.118134,52.11116,22,POINT (136568.331 458137.319),136568.331468,458137.319,4
1,5.119091,52.11057,23,POINT (136633.645 458071.434),136633.6449,458071.433727,4
2,5.119953,52.11046,21,POINT (136692.649 458058.978),136692.648797,458058.977584,4
3,5.120988,52.1099,20,POINT (136763.320 457996.412),136763.320401,457996.412136,4
4,5.121899,52.109608,22,POINT (136825.608 457963.696),136825.608202,457963.696134,4
5,5.123208,52.10982,22,POINT (136915.338 457986.956),136915.337921,457986.956362,4
6,5.123628,52.110344,23,POINT (136944.348 458045.151),136944.348344,458045.151477,4
7,5.123058,52.10963,24,POINT (136905.033 457965.854),136905.033474,457965.854342,4
8,5.122562,52.108715,23,POINT (136870.664 457864.176),136870.664119,457864.176145,4
9,5.123335,52.1087,22,POINT (136923.640 457862.314),136923.639897,457862.314474,4


In [98]:
full_data.to_csv("test.csv", index=False)

In [99]:
k_means = cluster.KMeans(n_clusters=20, random_state=42)
k_means.fit(full_data[["x","y"]])

X_cluster = k_means.labels_
full_data["cluster"] = X_cluster

full_data.to_csv("test2.csv", index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [113]:
x = full_data["x"].values.reshape(-1, 1)
y = full_data["y"].values.reshape(-1, 1)
pm = full_data["pm2_5"].values.reshape(-1, 1)

In [114]:
scaler = MinMaxScaler()
full_data["x_mm"] = scaler.fit_transform(x)
full_data["y_mm"] = scaler.fit_transform(y)
full_data["pm_mm"] = scaler.fit_transform(pm)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [115]:
k_means = cluster.KMeans(n_clusters=20, random_state=42)
k_means.fit(full_data[["x_mm","y_mm","pm_mm"]])

X_cluster = k_means.labels_
full_data["cluster"] = X_cluster
full_data.to_csv("test3.csv", index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

