# Unevenly spaced timeseries

In [6]:
import pandas
from io import StringIO
from datetime import datetime, timedelta, UTC

In [8]:
# Example weather measurement data from tawes, inlined as CSV text.
# Every timestamp is in ISO 8601 format and carries explicit timezone
# information (the +00:00 offset). The rows are unevenly spaced: the gaps
# between consecutive measurements range from 2 to 94 minutes, and no
# measurement falls between 02:00 and 03:00. The cglo_j column is empty
# in every row.
tawes_utc = """
time,station,cglo_j,rr,tl_mittel,vv_mittel,p_mittel,tlmin,tlmax
1999-04-09T00:00+00:00,5925,,-1,7.4,1.0,962.1,2.2,12.6
1999-04-09T00:02+00:00,5925,,-1,8.3,1.0,957.7,-0.6,17.1
1999-04-09T00:10+00:00,5925,,-1,10.9,1.5,951.9,7.5,14.3
1999-04-09T01:03+00:00,5925,,2.5,8.1,1.0,947.4,4.5,11.7
1999-04-09T01:56+00:00,5925,,7.8,9.1,0.6,939.7,6.2,12.0
1999-04-09T03:30+00:00,5925,,0.4,6.5,1.0,945.4,0.9,12.1
1999-04-09T03:33+00:00,5925,,-1,9.5,1.5,946.4,2.9,16.0
1999-04-09T04:04+00:00,5925,,7.7,11.5,0.6,944.4,9.6,13.3
"""

In [12]:
# Parse the inline CSV, turn the 'time' column into timezone-aware timestamps
# and use it as the index so the frame can be resampled by time.
df = (
    pandas.read_csv(StringIO(tawes_utc))
    .assign(time=lambda frame: pandas.to_datetime(frame['time']))
    .set_index('time')
)
df

Unnamed: 0_level_0,station,cglo_j,rr,tl_mittel,vv_mittel,p_mittel,tlmin,tlmax
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1999-04-09 00:00:00+00:00,13305,,-1.0,7.4,1.0,962.1,2.2,12.6
1999-04-09 00:02:00+00:00,13305,,-1.0,8.3,1.0,957.7,-0.6,17.1
1999-04-09 00:10:00+00:00,13305,,-1.0,10.9,1.5,951.9,7.5,14.3
1999-04-09 01:03:00+00:00,13305,,2.5,8.1,1.0,947.4,4.5,11.7
1999-04-09 01:56:00+00:00,13305,,7.8,9.1,0.6,939.7,6.2,12.0
1999-04-09 03:30:00+00:00,13305,,0.4,6.5,1.0,945.4,0.9,12.1
1999-04-09 03:33:00+00:00,13305,,-1.0,9.5,1.5,946.4,2.9,16.0
1999-04-09 04:04:00+00:00,13305,,7.7,11.5,0.6,944.4,9.6,13.3


## Working with irregularly spaced timeseries is hard
This is because some of the pandas built-in resamplers are not designed for unevenly spaced data.

In [13]:
# The median of each hourly bin still makes sense for unevenly spaced samples.
(
    df
    .resample(rule='1h')
    .median()
)

Unnamed: 0_level_0,station,cglo_j,rr,tl_mittel,vv_mittel,p_mittel,tlmin,tlmax
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1999-04-09 00:00:00+00:00,13305.0,,-1.0,8.3,1.0,957.7,2.2,14.3
1999-04-09 01:00:00+00:00,13305.0,,5.15,8.6,0.8,943.55,5.35,11.85
1999-04-09 02:00:00+00:00,,,,,,,,
1999-04-09 03:00:00+00:00,13305.0,,-0.3,8.0,1.25,945.9,1.9,14.05
1999-04-09 04:00:00+00:00,13305.0,,7.7,11.5,0.6,944.4,9.6,13.3


In [18]:
# Whether the plain mean of each hourly bin is a sensible aggregate, however,
# depends on your context:
(
    df
    .resample(rule='1h')
    .mean()
)

Unnamed: 0_level_0,station,cglo_j,rr,tl_mittel,vv_mittel,p_mittel,tlmin,tlmax
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1999-04-09 00:00:00+00:00,13305.0,,-1.0,8.866667,1.166667,957.233333,3.033333,14.666667
1999-04-09 01:00:00+00:00,13305.0,,5.15,8.6,0.8,943.55,5.35,11.85
1999-04-09 02:00:00+00:00,,,,,,,,
1999-04-09 03:00:00+00:00,13305.0,,-0.3,8.0,1.25,945.9,1.9,14.05
1999-04-09 04:00:00+00:00,13305.0,,7.7,11.5,0.6,944.4,9.6,13.3


In [21]:
# For example, the tlmin values in the original data that fall into the
# first hour are:
# 1999-04-09 00:00:00+00:00       2.2
# 1999-04-09 00:02:00+00:00      -0.6
# 1999-04-09 00:10:00+00:00       7.5

# Hourly means of every column; show only the row for the first hour.
hourly_mean = df.resample('1h').mean()
hourly_mean.loc['1999-04-09 00:00:00+00:00']



station      13305.000000
cglo_j                NaN
rr              -1.000000
tl_mittel        8.866667
vv_mittel        1.166667
p_mittel       957.233333
tlmin            3.033333
tlmax           14.666667
Name: 1999-04-09 00:00:00+00:00, dtype: float64

In [24]:
# According to the pandas mean, tlmin = 3.033333333333333 for this hour, which
# is exactly the unweighted mean of the three values that occurred within it.
#
# However, depending on your context and use case, it may be sensible to take
# the time between these timestamps into account, as the -0.6 was occurring
# only for a short time in between the other measurements.
(2.2 - 0.6 + 7.5) / 3

3.033333333333333

There are multiple ways to deal with such scenarios.
One is to upsample to a really high frequency (e.g. 1 second) with a forward fill, under the assumption that each value stayed the same until a new value arrived, and then downsample again with an hourly mean.
Another is to upsample to a really high frequency (e.g. 1 second) with linear interpolation, under the assumption that the values change continuously between measurements, and then downsample again with a mean.
A further option is to write a custom resampling method or to numerically integrate over the intervals.