In [1]:
import numpy as np # Linear algebra
import pandas as pd # for working with databases
import os # for reading multiple files

# Files import:

In [2]:
path = '/kaggle/input/tankkoenig-december'
files = os.listdir(path)
print(len(files), "files found")

31 files found


# Creating of **Pandas DataFrame**:

In [3]:
dfs = []

for file in files:
    if file.endswith(".csv"):
        df_temp = pd.read_csv(os.path.join(path, file))
        dfs.append(df_temp)

df = pd.concat(dfs, ignore_index=True)

print("Size of splited DataFrame:", df.shape)
df.head()


Size of splited DataFrame: (13648295, 8)


Unnamed: 0,date,station_uuid,diesel,e5,e10,dieselchange,e5change,e10change
0,2025-12-10 00:00:24+01,3116ea83-358d-4528-a440-6a84f56cde37,1.549,1.689,1.629,1,1,1
1,2025-12-10 00:01:25+01,00060166-0002-4444-8888-acdc00000002,1.624,1.694,1.634,1,1,1
2,2025-12-10 00:01:25+01,00060728-0003-4444-8888-acdc00000003,1.624,1.694,1.634,1,1,1
3,2025-12-10 00:01:25+01,89a464bc-b992-4eb5-8888-aa6a4cf51d98,1.609,1.679,1.619,1,1,1
4,2025-12-10 00:01:25+01,7d9462a3-03f6-4363-ae1c-08341249b929,1.534,1.674,1.614,1,1,1


# DF Cleaning

Removing unnecessary columns

In [4]:
df = df.drop(columns=['dieselchange', 'e5change', 'e10change'])

In [5]:
#Date normalazing

df['date'] = pd.to_datetime(df['date'], errors='coerce', utc=True)  # convert everything to UTC
df['date'] = df['date'].dt.tz_convert('Europe/Berlin')  # convert at the right time
df['date'] = df['date'].dt.tz_localize(None)  # remove UTC info

Missing value check:

In [6]:
df['date'].isna().sum()

0

Let's look at the numerical values:

In [7]:
df.describe()

Unnamed: 0,date,diesel,e5,e10
count,13648295,13648300.0,13648300.0,13648300.0
mean,2025-12-16 09:52:06.058587648,1.586595,1.671054,1.5704
min,2025-12-01 00:00:32,-0.001,-0.001,-0.001
25%,2025-12-08 15:37:58,1.539,1.659,1.599
50%,2025-12-16 12:22:47,1.579,1.689,1.629
75%,2025-12-23 17:42:53,1.619,1.729,1.669
max,2025-12-31 23:59:07,3.33,4.444,3.333
std,,0.07100377,0.2213904,0.3356668


There are illogical prices here (for example, those that are less than 0 or equal to zero). Let's replace them with NULL:

In [8]:
df['diesel'] = df['diesel'].mask(df['diesel'] <= 0, np.nan)
df['e5'] = df['e5'].mask(df['e5'] <= 0, np.nan)
df['e10'] = df['e10'].mask(df['e10'] <= 0, np.nan)

In [9]:
df.describe()

Unnamed: 0,date,diesel,e5,e10
count,13648295,13644580.0,13428970.0,13068350.0
mean,2025-12-16 09:52:06.058587648,1.587027,1.698345,1.64009
min,2025-12-01 00:00:32,1.179,1.109,1.19
25%,2025-12-08 15:37:58,1.539,1.659,1.599
50%,2025-12-16 12:22:47,1.579,1.689,1.629
75%,2025-12-23 17:42:53,1.619,1.729,1.669
max,2025-12-31 23:59:07,3.33,4.444,3.333
std,,0.06601294,0.05885015,0.05808169


Let's estimate the number of empty values

In [10]:
df['diesel'].isna().sum()

3713

In [11]:
df['e5'].isna().sum()

219323

In [12]:
df['e10'].isna().sum()

579942

Perhaps this is the value when there was no certain type of fuel at the gas station.

Let's check the number of duplicates by id and date:

In [13]:
duplicates = df.duplicated(subset=['station_uuid', 'date']).sum()
print(f"Number of dublicates: {duplicates}")

Number of dublicates: 0


# Aggregation

### 1. To be fair, if prices have not changed since the previous hour, they carry over to the next hour.

In [14]:
df_temp = df.copy()
df_temp['hour'] = df_temp['date'].dt.floor('h')

In [15]:
full_hours = pd.date_range(
    start='2025-12-01 00:00:00',
    end='2025-12-31 23:00:00',
    freq='h'
)

# Повна сітка для кожної станції
stations = df['station_uuid'].unique()
full_index = pd.MultiIndex.from_product(
    [stations, full_hours],
    names=['station_uuid', 'hour']
)


In [16]:
hourly = (
    df_temp
    .groupby(['station_uuid', 'hour'], as_index=False)
    .agg({
        'diesel': 'mean',
        'e5': 'mean',
        'e10': 'mean'
    })
)

In [17]:
hourly_prices = (
    hourly
    .set_index(['station_uuid', 'hour'])
    .reindex(full_index)
    .reset_index()
)

In [18]:
hourly_prices[['diesel', 'e5', 'e10']] = (
    hourly_prices
    .groupby('station_uuid')[['diesel', 'e5', 'e10']]
    .ffill()
)

In [19]:
hourly_prices.head()

Unnamed: 0,station_uuid,hour,diesel,e5,e10
0,3116ea83-358d-4528-a440-6a84f56cde37,2025-12-01 00:00:00,1.619,1.759,1.699
1,3116ea83-358d-4528-a440-6a84f56cde37,2025-12-01 01:00:00,1.619,1.759,1.699
2,3116ea83-358d-4528-a440-6a84f56cde37,2025-12-01 02:00:00,1.619,1.759,1.699
3,3116ea83-358d-4528-a440-6a84f56cde37,2025-12-01 03:00:00,1.619,1.759,1.699
4,3116ea83-358d-4528-a440-6a84f56cde37,2025-12-01 04:00:00,1.619,1.759,1.699


### 1.2 logic check

In [20]:
hourly_prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11205384 entries, 0 to 11205383
Data columns (total 5 columns):
 #   Column        Dtype         
---  ------        -----         
 0   station_uuid  object        
 1   hour          datetime64[ns]
 2   diesel        float64       
 3   e5            float64       
 4   e10           float64       
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 427.5+ MB


In [21]:
hourly_prices['diesel'].isna().sum()


186121

In [22]:
percentage = hourly_prices['diesel'].isna().sum() / 11205384 * 100
print(f"Percentage of empty values: {percentage:.2f}%")

Percentage of empty values: 1.66%


In [23]:
hourly_prices['e5'].isna().sum()

439734

In [24]:
percentage = hourly_prices['e5'].isna().sum() / 11205384 * 100
print(f"Percentage of empty values: {percentage:.2f}%")

Percentage of empty values: 3.92%


In [25]:
hourly_prices['e10'].isna().sum()

875044

In [26]:
percentage = hourly_prices['e10'].isna().sum() / 11205384 * 100
print(f"Percentage of empty values: {percentage:.2f}%")

Percentage of empty values: 7.81%


In [27]:
hourly_prices.describe()

Unnamed: 0,hour,diesel,e5,e10
count,11205384,11019260.0,10765650.0,10330340.0
mean,2025-12-16 11:29:59.999999744,1.594763,1.707363,1.65013
min,2025-12-01 00:00:00,1.179,1.109,1.399
25%,2025-12-08 17:45:00,1.539,1.659,1.600667
50%,2025-12-16 11:30:00,1.579,1.689,1.634
75%,2025-12-24 05:15:00,1.619,1.734,1.679
max,2025-12-31 23:00:00,3.33,3.33,3.33
std,,0.09733184,0.09292538,0.09307385


### 2. Calculate the average fuel price per day

In [28]:
df_temp = df.copy()
df_temp['day'] = df_temp['date'].dt.floor('D')

full_days = pd.date_range(
    start='2025-12-01',
    end='2025-12-31',
    freq='D'
)


In [29]:
stations = df['station_uuid'].unique()
full_index = pd.MultiIndex.from_product(
    [stations, full_days],
    names=['station_uuid', 'day']
)

daily = (
    df_temp
    .groupby(['station_uuid', 'day'], as_index=False)
    .agg({
        'diesel': ['mean', 'count'],
        'e5': ['mean', 'count'],
        'e10': ['mean', 'count']
    })
)

daily.columns = ['station_uuid', 'day', 
                 'diesel', 'diesel_changes',
                 'e5', 'e5_changes', 
                 'e10', 'e10_changes']


In [30]:
daily_prices = (
    daily
    .set_index(['station_uuid', 'day'])
    .reindex(full_index)
    .reset_index()
)


daily_prices[['diesel', 'e5', 'e10']] = (
    daily_prices
    .groupby('station_uuid')[['diesel', 'e5', 'e10']]
    .ffill()
)

# For stations with no changes during day, to fill cells with 0.
daily_prices[['diesel_changes', 'e5_changes', 'e10_changes']] = (
    daily_prices[['diesel_changes', 'e5_changes', 'e10_changes']]
    .fillna(0)
)

In [31]:
daily_prices.head()

Unnamed: 0,station_uuid,day,diesel,diesel_changes,e5,e5_changes,e10,e10_changes
0,3116ea83-358d-4528-a440-6a84f56cde37,2025-12-01,1.636234,47.0,1.774957,47.0,1.714957,47.0
1,3116ea83-358d-4528-a440-6a84f56cde37,2025-12-02,1.606083,48.0,1.759417,48.0,1.699417,48.0
2,3116ea83-358d-4528-a440-6a84f56cde37,2025-12-03,1.589566,53.0,1.768057,53.0,1.708057,53.0
3,3116ea83-358d-4528-a440-6a84f56cde37,2025-12-04,1.609536,56.0,1.765429,56.0,1.705429,56.0
4,3116ea83-358d-4528-a440-6a84f56cde37,2025-12-05,1.598649,57.0,1.769877,57.0,1.709877,57.0


### 2.2 logic check

In [32]:
daily_prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466891 entries, 0 to 466890
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   station_uuid    466891 non-null  object        
 1   day             466891 non-null  datetime64[ns]
 2   diesel          462569 non-null  float64       
 3   diesel_changes  466891 non-null  float64       
 4   e5              451917 non-null  float64       
 5   e5_changes      466891 non-null  float64       
 6   e10             433605 non-null  float64       
 7   e10_changes     466891 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 28.5+ MB


In [33]:
daily_prices.describe()

Unnamed: 0,day,diesel,diesel_changes,e5,e5_changes,e10,e10_changes
count,466891,462569.0,466891.0,451917.0,466891.0,433605.0,466891.0
mean,2025-12-16 00:00:00,1.595529,29.224341,1.707998,28.762542,1.650724,27.990158
min,2025-12-01 00:00:00,1.408233,0.0,1.239,0.0,1.399,0.0
25%,2025-12-08 00:00:00,1.556647,22.0,1.671414,21.0,1.614135,20.0
50%,2025-12-16 00:00:00,1.582778,30.0,1.694952,30.0,1.637387,29.0
75%,2025-12-24 00:00:00,1.612333,38.0,1.720944,38.0,1.6634,38.0
max,2025-12-31 00:00:00,3.0,165.0,3.0,165.0,3.0,165.0
std,,0.087955,15.18461,0.087524,15.617165,0.087723,16.140226


In [34]:
daily_prices['diesel'].isna().sum()


4322

In [35]:
percentage = daily_prices['diesel'].isna().sum() / 11205384 * 100
print(f"Percentage of empty values: {percentage:.2f}%")

Percentage of empty values: 0.04%


In [36]:
daily_prices['e5'].isna().sum()


14974

In [37]:
percentage = daily_prices['e5'].isna().sum() / 11205384 * 100
print(f"Percentage of empty values: {percentage:.2f}%")

Percentage of empty values: 0.13%


In [38]:
daily_prices['e10'].isna().sum()


33286

In [39]:
percentage = daily_prices['e10'].isna().sum() / 11205384 * 100
print(f"Percentage of empty values: {percentage:.2f}%")

Percentage of empty values: 0.30%


### 3. Calculate average prices for each station (over the entire period)

In [40]:
stations_avg = (
    df.groupby("station_uuid")[["diesel", "e5", "e10"]]
      .mean()
      .reset_index()
)

In [41]:
stations_avg.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,station_uuid,diesel,e5,e10
0,00006210-0037-4444-8888-acdc00006210,1.614156,1.678973,1.618982
1,00016899-3247-4444-8888-acdc00000007,1.612372,1.709233,1.649814
2,00041414-208c-4444-8888-acdc00000414,1.553211,1.678474,
3,00041450-0002-4444-8888-acdc00000002,1.548,1.640222,1.600222
4,00047369-0001-4444-8888-acdc00000001,1.66305,1.724832,1.665426


This warning appears when pandas tries to format numbers for display and encounters invalid values such as **NaN**, **inf**, or **non‑numeric** data. It does not affect calculations — it only indicates that some values cannot be compared during formatting. No big deal for analysis

In [42]:
stations_avg.describe()

Unnamed: 0,diesel,e5,e10
count,15057.0,14706.0,14092.0
mean,1.596629,1.708682,1.651412
std,0.085159,0.086324,0.086631
min,1.469,1.239,1.399
25%,1.563131,1.675115,1.617501
50%,1.584197,1.695198,1.637603
75%,1.607333,1.717326,1.659954
max,3.0,3.0,3.0


In [43]:
stations_avg['diesel'].isna().sum()

4

In [44]:
stations_avg['e5'].isna().sum()

355

In [45]:
stations_avg['e10'].isna().sum()

969

# Export

In [46]:
hourly_prices.to_csv('tankkoenig_hourly_prices.csv', index=False)

In [47]:
daily_prices.to_csv('tankkoenig_daily_prices.csv', index=False)

In [48]:
stations_avg.to_csv('tankkoenig_station_times_prices.csv', index=False)