# Resampling

In [1]:
import pandas

In [2]:
# Load the daily station measurements and prepare them for resampling:
#   1. parse the 'time' column into real timestamps,
#   2. use it as the (datetime) index, which resample()/asfreq() operate on,
#   3. drop every row that contains a missing value in any column.
df = (
    pandas.read_csv('Messstationen_Tagesdaten_v2_Datensatz_19900101_20250515.csv')
    .assign(time=lambda frame: pandas.to_datetime(frame['time']))
    .set_index('time')
    .dropna()
)
# Show the first five rows as a quick sanity check.
df.head(5)

Unnamed: 0_level_0,station,tlmax,tlmin,tl_mittel,rf_mittel
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1990-01-01 00:00:00+00:00,5925,-0.1,-1.6,-0.9,83.0
1990-01-02 00:00:00+00:00,5925,1.4,-2.3,-0.5,78.0
1990-01-03 00:00:00+00:00,5925,1.4,-0.7,0.4,66.0
1990-01-04 00:00:00+00:00,5925,1.2,-5.5,-2.2,78.0
1990-01-05 00:00:00+00:00,5925,-1.0,-4.6,-2.8,85.0


## Changing the frequency of a timeseries

In [3]:
# Report the first and the last timestamp found in the index.
first_ts = df.index[0]
last_ts = df.index[-1]
print(f'start: {first_ts}')
print(f'end:   {last_ts}')

start: 1990-01-01 00:00:00+00:00
end:   2025-05-14 00:00:00+00:00


In [4]:
# Resample to yearly averages. With the year-end frequency ('YE') each row of
# the result is labelled with the END of its period (e.g. 1990-12-31), see
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.resample.html
df.resample(rule='1YE').mean()

Unnamed: 0_level_0,station,tlmax,tlmin,tl_mittel,rf_mittel
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1990-12-31 00:00:00+00:00,5925.0,16.090411,8.610959,12.371781,68.813699
1991-12-31 00:00:00+00:00,5925.0,14.65726,7.773425,11.234247,71.876712
1992-12-31 00:00:00+00:00,5925.0,16.285246,9.139891,12.737978,68.994536
1993-12-31 00:00:00+00:00,5925.0,15.391233,8.088767,11.75863,71.391781
1994-12-31 00:00:00+00:00,5925.0,16.803836,9.529863,13.190959,68.819178
1995-12-31 00:00:00+00:00,5925.0,15.191233,8.569863,11.899178,69.772603
1996-12-31 00:00:00+00:00,5925.0,13.544262,7.230874,10.403825,72.661202
1997-12-31 00:00:00+00:00,5925.0,15.089863,8.15726,11.643562,70.273973
1998-12-31 00:00:00+00:00,5925.0,15.849315,8.727397,12.309589,69.282192
1999-12-31 00:00:00+00:00,5925.0,15.66411,8.950685,12.332055,73.534247


In [5]:
# Monthly averages (rows are labelled with the month end).
(
    df
    .resample('1ME')
    .mean()
)

Unnamed: 0_level_0,station,tlmax,tlmin,tl_mittel,rf_mittel
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1990-01-31 00:00:00+00:00,5925.0,4.167742,-0.693548,1.745161,79.354839
1990-02-28 00:00:00+00:00,5925.0,11.075000,3.589286,7.360714,70.214286
1990-03-31 00:00:00+00:00,5925.0,15.238710,6.609677,10.954839,59.806452
1990-04-30 00:00:00+00:00,5925.0,14.193333,6.850000,10.543333,67.233333
1990-05-31 00:00:00+00:00,5925.0,22.187097,12.767742,17.503226,58.645161
...,...,...,...,...,...
2025-01-31 00:00:00+00:00,5925.0,5.422581,0.961290,3.206452,73.580645
2025-02-28 00:00:00+00:00,5925.0,6.053571,0.567857,3.328571,58.500000
2025-03-31 00:00:00+00:00,5925.0,14.429032,5.512903,9.993548,55.612903
2025-04-30 00:00:00+00:00,5925.0,19.070000,9.876667,14.496667,51.100000


### Resampling methods

In [6]:
# Monthly maximum of every column.
df.resample(rule='1ME').max()

Unnamed: 0_level_0,station,tlmax,tlmin,tl_mittel,rf_mittel
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1990-01-31 00:00:00+00:00,5925,13.1,8.1,10.6,98.0
1990-02-28 00:00:00+00:00,5925,19.5,8.4,13.2,92.0
1990-03-31 00:00:00+00:00,5925,24.0,14.1,19.1,92.0
1990-04-30 00:00:00+00:00,5925,23.1,10.1,16.2,98.0
1990-05-31 00:00:00+00:00,5925,26.7,16.5,21.6,85.0
...,...,...,...,...,...
2025-01-31 00:00:00+00:00,5925,17.0,6.1,11.6,88.0
2025-02-28 00:00:00+00:00,5925,13.6,6.8,9.2,90.0
2025-03-31 00:00:00+00:00,5925,21.9,11.0,14.8,86.0
2025-04-30 00:00:00+00:00,5925,26.7,17.0,20.9,84.0


In [7]:
# Monthly variance of every column.
(
    df
    .resample('1ME')
    .var()
)

Unnamed: 0_level_0,station,tlmax,tlmin,tl_mittel,rf_mittel
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1990-01-31 00:00:00+00:00,0.0,35.448258,28.303957,30.532559,140.569892
1990-02-28 00:00:00+00:00,0.0,15.250093,4.002474,7.127659,145.063492
1990-03-31 00:00:00+00:00,0.0,20.453118,8.702237,11.835226,191.027957
1990-04-30 00:00:00+00:00,0.0,15.672368,4.390172,7.209437,201.909195
1990-05-31 00:00:00+00:00,0.0,9.402495,4.340925,5.916989,94.036559
...,...,...,...,...,...
2025-01-31 00:00:00+00:00,0.0,24.331806,8.148452,14.483290,121.184946
2025-02-28 00:00:00+00:00,0.0,10.013690,8.707447,8.045820,149.444444
2025-03-31 00:00:00+00:00,0.0,22.566129,10.545828,12.585957,177.711828
2025-04-30 00:00:00+00:00,0.0,21.916655,13.555644,15.230678,140.851724


In [8]:
# Monthly median of every column.
df.resample(rule='1ME').median()

Unnamed: 0_level_0,station,tlmax,tlmin,tl_mittel,rf_mittel
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1990-01-31 00:00:00+00:00,5925.0,6.50,-0.70,2.30,80.0
1990-02-28 00:00:00+00:00,5925.0,9.90,3.20,6.35,69.0
1990-03-31 00:00:00+00:00,5925.0,15.20,5.90,11.00,58.0
1990-04-30 00:00:00+00:00,5925.0,14.45,7.25,10.70,68.0
1990-05-31 00:00:00+00:00,5925.0,22.90,12.80,18.30,56.0
...,...,...,...,...,...
2025-01-31 00:00:00+00:00,5925.0,4.00,0.80,2.30,76.0
2025-02-28 00:00:00+00:00,5925.0,5.65,0.35,3.05,57.0
2025-03-31 00:00:00+00:00,5925.0,14.60,5.90,10.20,57.0
2025-04-30 00:00:00+00:00,5925.0,19.60,9.85,14.55,49.5


...

A full list of existing resampling strategies can be found here: https://pandas.pydata.org/docs/reference/resampling.html

### Other approaches

In [9]:
# asfreq() does not aggregate anything: it only keeps the rows whose timestamps
# lie on the new frequency grid. The values for e.g. 2002-01-31 00:00:00 are
# therefore exactly those of the original dataframe; everything in between is
# simply dropped.
df.asfreq(freq='1ME')

Unnamed: 0_level_0,station,tlmax,tlmin,tl_mittel,rf_mittel
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1990-01-31 00:00:00+00:00,5925,6.5,0.2,3.4,98.0
1990-02-28 00:00:00+00:00,5925,7.7,4.0,5.9,67.0
1990-03-31 00:00:00+00:00,5925,16.9,7.8,12.4,64.0
1990-04-30 00:00:00+00:00,5925,17.3,7.0,12.2,62.0
1990-05-31 00:00:00+00:00,5925,19.5,9.7,14.6,53.0
...,...,...,...,...,...
2024-12-31 00:00:00+00:00,5925,3.2,-3.9,-0.4,88.0
2025-01-31 00:00:00+00:00,5925,9.0,2.4,5.7,59.0
2025-02-28 00:00:00+00:00,5925,12.2,6.1,9.2,53.0
2025-03-31 00:00:00+00:00,5925,12.9,6.9,9.9,64.0


## Upsampling
With the above, we only downsampled the data, i.e. we reduced the number of data points.
However, there are also strategies to upsample data, i.e. to increase the number of data points.

In [10]:
# pandas lets us resample to a HIGHER frequency as well, but aggregating then
# yields mostly NaN (Not a Number): the new hourly slots contain no original
# data points from which a mean could be computed.
(
    df
    .resample('1h')
    .mean()
)

Unnamed: 0_level_0,station,tlmax,tlmin,tl_mittel,rf_mittel
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1990-01-01 00:00:00+00:00,5925.0,-0.1,-1.6,-0.9,83.0
1990-01-01 01:00:00+00:00,,,,,
1990-01-01 02:00:00+00:00,,,,,
1990-01-01 03:00:00+00:00,,,,,
1990-01-01 04:00:00+00:00,,,,,
...,...,...,...,...,...
2025-05-13 20:00:00+00:00,,,,,
2025-05-13 21:00:00+00:00,,,,,
2025-05-13 22:00:00+00:00,,,,,
2025-05-13 23:00:00+00:00,,,,,


In [11]:
# The simplest strategy is probably a forward fill: every new slot receives
# the last known value.
df.resample(rule='1h').ffill()

Unnamed: 0_level_0,station,tlmax,tlmin,tl_mittel,rf_mittel
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1990-01-01 00:00:00+00:00,5925,-0.1,-1.6,-0.9,83.0
1990-01-01 01:00:00+00:00,5925,-0.1,-1.6,-0.9,83.0
1990-01-01 02:00:00+00:00,5925,-0.1,-1.6,-0.9,83.0
1990-01-01 03:00:00+00:00,5925,-0.1,-1.6,-0.9,83.0
1990-01-01 04:00:00+00:00,5925,-0.1,-1.6,-0.9,83.0
...,...,...,...,...,...
2025-05-13 20:00:00+00:00,5925,19.5,9.5,14.5,34.0
2025-05-13 21:00:00+00:00,5925,19.5,9.5,14.5,34.0
2025-05-13 22:00:00+00:00,5925,19.5,9.5,14.5,34.0
2025-05-13 23:00:00+00:00,5925,19.5,9.5,14.5,34.0


In [12]:
# Alternatively we can interpolate between the known points. 'linear' is the
# default interpolation method; it is spelled out here for clarity.
df.resample('1h').interpolate(method='linear')

Unnamed: 0_level_0,station,tlmax,tlmin,tl_mittel,rf_mittel
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1990-01-01 00:00:00+00:00,5925.0,-0.100000,-1.600000,-0.900000,83.000000
1990-01-01 01:00:00+00:00,5925.0,-0.037500,-1.629167,-0.883333,82.791667
1990-01-01 02:00:00+00:00,5925.0,0.025000,-1.658333,-0.866667,82.583333
1990-01-01 03:00:00+00:00,5925.0,0.087500,-1.687500,-0.850000,82.375000
1990-01-01 04:00:00+00:00,5925.0,0.150000,-1.716667,-0.833333,82.166667
...,...,...,...,...,...
2025-05-13 20:00:00+00:00,5925.0,23.166667,8.416667,15.833333,30.666667
2025-05-13 21:00:00+00:00,5925.0,23.350000,8.362500,15.900000,30.500000
2025-05-13 22:00:00+00:00,5925.0,23.533333,8.308333,15.966667,30.333333
2025-05-13 23:00:00+00:00,5925.0,23.716667,8.254167,16.033333,30.166667


In [13]:
# There are several other interpolation methods (here: cubic), see
# https://pandas.pydata.org/docs/reference/api/pandas.core.resample.Resampler.interpolate.html#pandas.core.resample.Resampler.interpolate
(
    df
    .resample('1h')
    .interpolate(method='cubic')
)

Unnamed: 0_level_0,station,tlmax,tlmin,tl_mittel,rf_mittel
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1990-01-01 00:00:00+00:00,5925.0,-0.100000,-1.600000,-0.900000,83.000000
1990-01-01 01:00:00+00:00,5925.0,0.023796,-1.881057,-0.987477,83.548602
1990-01-01 02:00:00+00:00,5925.0,0.140985,-2.131827,-1.062065,84.006262
1990-01-01 03:00:00+00:00,5925.0,0.251743,-2.353453,-1.124287,84.376407
1990-01-01 04:00:00+00:00,5925.0,0.356243,-2.547079,-1.174666,84.662461
...,...,...,...,...,...
2025-05-13 20:00:00+00:00,5925.0,22.669253,8.859427,15.800260,31.752539
2025-05-13 21:00:00+00:00,5925.0,22.953859,8.716985,15.874606,31.372073
2025-05-13 22:00:00+00:00,5925.0,23.253571,8.559969,15.949386,30.954238
2025-05-13 23:00:00+00:00,5925.0,23.568810,8.387825,16.024538,30.497418
