# Baseline Solution

 

In [1]:
import warnings
import pandas as pd

warnings.filterwarnings("ignore")

# Preprocessing

In [None]:
pip install tsururu

Collecting tsururu
  Downloading tsururu-1.1.0-py3-none-any.whl.metadata (8.0 kB)
Collecting holidays<0.41,>=0.40 (from tsururu)
  Downloading holidays-0.40-py3-none-any.whl.metadata (21 kB)
Collecting numpy<2.0.0,>=1.26.3 (from tsururu)
  Downloading numpy-1.26.4.tar.gz (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25l/

In [2]:
df = pd.read_csv("frames_errors.csv", header=None)
df.columns = [
    "block_id",
    "frame_idx",
    "E_mu_Z",
    "E_mu_phys_est",
    "E_mu_X",
    "E_nu1_X",
    "E_nu2_X",
    "E_nu1_Z",
    "E_nu2_Z",
    "N_mu_X",
    "M_mu_XX",
    "M_mu_XZ",
    "M_mu_X",
    "N_mu_Z",
    "M_mu_ZZ",
    "M_mu_Z",
    "N_nu1_X",
    "M_nu1_XX",
    "M_nu1_XZ",
    "M_nu1_X",
    "N_nu1_Z",
    "M_nu1_ZZ",
    "M_nu1_Z",
    "N_nu2_X",
    "M_nu2_XX",
    "M_nu2_XZ",
    "M_nu2_X",
    "N_nu2_Z",
    "M_nu2_ZZ",
    "M_nu2_Z",
    "nTot",
    "bayesImVoltage",
    "opticalPower",
    "polarizerVoltages[0]",
    "polarizerVoltages[1]",
    "polarizerVoltages[2]",
    "polarizerVoltages[3]",
    "temp_1",
    "biasVoltage_1",
    "temp_2",
    "biasVoltage_2",
    "synErr",
    "N_EC_rounds",
    "maintenance_flag",
    "estimator_name",
    "f_EC",
    "E_mu_Z_est",
    "R",
    "s",
    "p",
]

df_base = df.drop(
    [
        "E_mu_phys_est",
        "f_EC",
    ],
    axis=1,
)
print(f"NaN amount: {df.isna().sum().sum()}")

NaN amount: 579


In [3]:
df = df_base.copy()

In [4]:
df = df.rename(
    columns={
        "block_id": "id",
        "E_mu_Z": "value",
        "frame_idx": "date",
    }
)

# Looking at the length of the time series by the number of frames
timestamp_counts = df.groupby("id")["date"].nunique()
print("Frame count/Series count")
print(timestamp_counts.value_counts())

df_for_ts = df[["id", "value", "date"]].dropna(subset=["value"], how="any")

Frame count/Series count
date
399    569
400    251
398      2
390      1
Name: count, dtype: int64


In [5]:
df_for_ts = df_for_ts.set_index(["id", "date"]).unstack().ffill().stack().reset_index()
timestamp_counts = df_for_ts.groupby("id")["date"].nunique()
print("Frame count/Series count")
print(timestamp_counts.value_counts())

Frame count/Series count
date
400    815
399      8
Name: count, dtype: int64


In [6]:
df_for_ts = df_for_ts.groupby("id").filter(lambda x: len(x) == 400)
print("Leftout segments:", df_for_ts["id"].nunique())

Leftout segments: 815


# DLinear

`DLinear` is a simple and fast model that extracts the trend using AveragePooling, then applies nn.Linear to both the trend and residual components, and finally combines everything back together. You can learn more about the model in the paper:  https://arxiv.org/abs/2205.13504

In [7]:
import logging

logger = logging.getLogger(__name__)
import sys

c_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(c_handler)
logging.basicConfig(level=logging.INFO, force=True)

import random
import warnings


import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from torch.nn import Module

from tsururu.dataset import Pipeline, TSDataset
from tsururu.model_training.trainer import DLTrainer
from tsururu.model_training.validator import HoldOutValidator
from tsururu.strategies import RecursiveStrategy

warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'tsururu'

In [None]:
class moving_avg(Module):
    """Moving average block for extracting the trend of a time series.

    Args:
        kernel_size: window size of the convolution (kernel).
        stride: step size of the moving average.
    """

    def __init__(self, kernel_size: int, stride: int):
        super(moving_avg, self).__init__()
        self.kernel_size = kernel_size
        self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0)

    def forward(self, x: "torch.Tensor") -> "torch.Tensor":
        """Forward pass for computing the moving average.

        Args:
            x: input tensor.

        Returns:
            Tensor after applying the moving average.
        """
        # add padding (repeat boundary values) on both sides of the time series
        front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1)
        end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1)
        x = torch.cat([front, x, end], dim=1)

        # apply moving average along the time axis
        x = self.avg(x.permute(0, 2, 1))
        x = x.permute(0, 2, 1)

        return x


class series_decomp(Module):
    """Time series decomposition block.

    Args:
        kernel_size: window size for the moving average.
    """

    def __init__(self, kernel_size: int):
        super(series_decomp, self).__init__()
        self.moving_avg = moving_avg(kernel_size, stride=1)

    def forward(self, x: "torch.Tensor") -> tuple["torch.Tensor", "torch.Tensor"]:
        """Forward pass for decomposing the series into trend and residual.

        Args:
            x: input tensor.

        Returns:
            Tuple of tensors (residual, trend).
        """
        moving_mean = self.moving_avg(x)
        res = x - moving_mean

        return res, moving_mean


class DLinear_NN(Module):
    def __init__(self, features_groups, pred_len, seq_len, moving_avg=25, **kwargs):
        super().__init__()

        # Protection against type "wrappers"
        def _to_int(x):
            if isinstance(x, int):
                return x
            if isinstance(x, dict) and "value" in x:
                return int(x["value"])
            try:
                return int(x)
            except Exception:
                raise TypeError(f"Expected int-like, got {type(x)}: {x}")

        # If named arguments are also passed, we'll take them so they don't interfere
        seq_len = _to_int(kwargs.pop("seq_len", seq_len))
        pred_len = _to_int(kwargs.pop("pred_len", pred_len))
        moving_avg = int(kwargs.pop("moving_avg", moving_avg))

        self.seq_len = seq_len
        self.pred_len = pred_len

        self.decompsition = series_decomp(moving_avg)
        self.Linear_Seasonal = nn.Linear(self.seq_len, self.pred_len)
        self.Linear_Trend = nn.Linear(self.seq_len, self.pred_len)

        self.Linear_Seasonal.weight = nn.Parameter(
            (1 / self.seq_len) * torch.ones([self.pred_len, self.seq_len])
        )
        self.Linear_Trend.weight = nn.Parameter(
            (1 / self.seq_len) * torch.ones([self.pred_len, self.seq_len])
        )

    def forward(self, x: "torch.Tensor") -> "torch.Tensor":
        """Forward pass of the model.

        Args:
            x: input tensor of shape (batch_size, seq_len, num_features).

        Returns:
            Output tensor of shape (batch_size, pred_len, num_features).
        """
        # Decompose the time series into trend and residual (seasonality)
        seasonal_init, trend_init = self.decompsition(x)

        # Transpose tensors to the format (batch_size, num_features, seq_len)
        seasonal_init, trend_init = seasonal_init.permute(0, 2, 1), trend_init.permute(
            0, 2, 1
        )

        # Apply linear layers to trend and residuals
        seasonal_output = self.Linear_Seasonal(seasonal_init)
        trend_output = self.Linear_Trend(trend_init)

        # Sum the results of the linear layers
        x = seasonal_output + trend_output

        # Transpose back to the format (batch_size, seq_len, num_features)
        x = x.permute(0, 2, 1)

        return x[:, -self.pred_len :, :]

In [None]:
# We will predict 8 values ahead using a window of 160

HORIZON = 8
HISTORY = 160

In [None]:
train_df = []
val_df = []
test_df = []
test_targets = []
for current_id in df_for_ts["id"].unique():
    current_df = df_for_ts[df_for_ts["id"] == current_id]
    train_df.append(current_df.iloc[: -2 * HORIZON])
    val_df.append(current_df.iloc[-2 * HORIZON - HISTORY : -HORIZON])
    test_df.append(current_df.iloc[-HORIZON - HISTORY : -HORIZON])
    test_targets.append(current_df.iloc[-HORIZON:])
train_df = pd.concat(train_df)
val_df = pd.concat(val_df)
test_df = pd.concat(test_df)
test_targets = pd.concat(test_targets)


print(f"Training set shape: {train_df.shape}")
print(f"Validation set shape: {val_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"Test target shape: {test_targets.shape}")

print(f"Number of series in the training set: {train_df['id'].nunique()}")
print(f"Number of series in the validation set: {val_df['id'].nunique()}")
print(f"Number of series in the test set: {test_df['id'].nunique()}")
print(f"Number of series in the test targets: {test_targets['id'].nunique()}")

In [None]:
# Set the base date (first day)
# This is required for correct operation of the tsururu library and does not affect the core task

base_date = pd.to_datetime("2000-01-01")


def convert_dates(series):
    return base_date + pd.to_timedelta(series.astype(int) - 1, unit="D")


# Apply to every DataFrame

train_df["date"] = convert_dates(train_df["date"])
val_df["date"] = convert_dates(val_df["date"])
test_df["date"] = convert_dates(test_df["date"])
test_targets["date"] = convert_dates(test_targets["date"])

In [None]:
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
seed_everything()
dataset_params = {
    "target": {
        "columns": ["value"],
        "type": "continuous",
    },
    "date": {
        "columns": ["date"],
        "type": "datetime",
    },
    "id": {
        "columns": ["id"],
        "type": "categorical",
    },
}

train_dataset = TSDataset(
    data=train_df,
    columns_params=dataset_params,
    print_freq_period_info=True,
)
val_dataset = TSDataset(
    data=val_df,
    columns_params=dataset_params,
    print_freq_period_info=False,
)
test_dataset = TSDataset(
    data=test_df,
    columns_params=dataset_params,
    print_freq_period_info=False,
)

In [None]:
pipeline_params = {
    "target": {
        "columns": ["value"],
        "features": {
            "DifferenceNormalizer": {
                "regime": "delta",
                "transform_target": True,
                "transform_features": True,
            },
            "MissingValuesImputer": {  # After DifferenceNormalizer, NaNs inevitably appear in the data (at the first value of each segment)
                "constant_value": 0,  # Fill them with zeros
                "transform_target": True,
                "transform_features": True,
            },
            "StandardScalerTransformer": {  # And align the series values before feeding them into the DL model
                "transform_target": True,
                "transform_features": True,
                "agg_by_id": True,
            },
            "LagTransformer": {"lags": HISTORY},
        },
    }
}

In [None]:
def choose_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using GPU")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using MPS")
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

In [None]:
DEVICE = choose_device()

In [None]:
# Train setup

pipeline = Pipeline.from_dict(pipeline_params, multivariate=False)

validation = HoldOutValidator
validation_params = {"validation_data": val_dataset}

trainer_params = {
    "device": DEVICE,
    "num_workers": 4,
    "best_by_metric": True,
    "save_to_dir": False,
    "batch_size": 128,
    "n_epochs": 5,
    "early_stopping_patience": 2,
}


trainer = DLTrainer(
    model=DLinear_NN,
    model_params={"moving_avg": 25},
    validator=validation,
    validation_params=validation_params,
    **trainer_params,
)


strategy = RecursiveStrategy(
    horizon=HORIZON,
    model_horizon=4,
    history=HISTORY,
    pipeline=pipeline,
    trainer=trainer,
)

In [None]:
# Start training

fit_time, metrics = strategy.fit(train_dataset)

In [None]:
# Save the model

import pickle

model_filename = "dlinear_strategy.pkl"
with open(model_filename, "wb") as f:
    pickle.dump(strategy, f)

In [None]:
# Load the model to start predicting

with open(model_filename, "rb") as f:
    loaded_strategy = pickle.load(f)

In [None]:
forecast_time, current_pred = loaded_strategy.predict(test_dataset)

In [None]:
current_pred

In [None]:
current_pred = current_pred.sort_values(["id", "date"]).reset_index(drop=True)

ids = current_pred["id"].unique().tolist()
n_ids = len(ids)

In [None]:
# We need to return 2000 data points
TOTAL = 2000
base = TOTAL // n_ids  # Base number of points per id
rem = TOTAL % n_ids  # For the first rem ids, give 1 extra point

if base == 0:
    # Case when there are too many series (n_ids > 2000): take 1 point for the first 2000 ids
    selected_ids = ids[:TOTAL]
    compressed_values = []
    for i in selected_ids:
        arr = current_pred.loc[current_pred["id"] == i, "value"].to_numpy()
        # Take, for example, the last value of the horizon
        compressed_values.append(float(arr[-1]))
else:
    # Normal case (~815 series): base=2, rem=2000-2*815=370 => 370 series will get 3 points, the rest 2
    compressed_values = []
    for idx, i in enumerate(ids):
        k = base + (1 if idx < rem else 0)  # Target points for this id
        arr = current_pred.loc[current_pred["id"] == i, "value"].to_numpy()

        # Safety check: if horizon < k (shouldn't happen), just repeat the last values
        if len(arr) < k:
            arr = np.pad(arr, (0, k - len(arr)), mode="edge")

        # Split into k ~equal parts and average each
        chunks = np.array_split(arr, k)
        means = [float(np.mean(c)) for c in chunks]
        compressed_values.extend(means)

# Obtain exactly 2000 values in a fixed order
target_df = pd.DataFrame({"value": compressed_values})
assert len(target_df) == 2000, f"Got {len(target_df)} instead of 2000"

In [None]:
from math import ceil

alpha = 0.33
f_ec = 1.15
R_range = [
    round(0.50 + 0.05 * x, 2) for x in range(9)
]  # 0.50..0.90 to match the task requirements
n = 32000
d = 4800

In [None]:
def calculate_ema(prev_ema, current_value, alpha):
    if prev_ema is None:
        return current_value
    return alpha * current_value + (1 - alpha) * prev_ema


def h(x):
    if x > 0:
        return -x * np.log2(x) - (1 - x) * np.log2(1 - x)
    elif x == 0:
        return 0.0
    else:
        raise ValueError("Invalid x for binary entropy")


def select_code_rate(e_mu, f_ec, rates, frame_len, sp_count):
    r_candidate = 1 - h(e_mu) * f_ec
    R_res = 0.50
    s_n = sp_count
    p_n = 0
    for R in rates:
        p_n = int(
            ceil((1 - R) * frame_len - (1 - r_candidate) * (frame_len - sp_count))
        )
        s_n = int(sp_count - p_n)
        if p_n >= 0 and s_n >= 0:
            R_res = R
            return round(R_res, 2), s_n, p_n
    return round(R_res, 2), s_n, p_n

In [None]:
E_series = (
    pd.to_numeric(target_df.iloc[:, 0], errors="coerce").dropna().reset_index(drop=True)
)

prev_ema = None
rows = []
for E_mu_Z in E_series:
    ema_value = calculate_ema(prev_ema, float(E_mu_Z), alpha)
    prev_ema = ema_value
    R, s_n, p_n = select_code_rate(ema_value, f_ec, R_range, n, d)
    rows.append([f"{E_mu_Z:.16f}", R, s_n, p_n])  # 4 columns: E, R, s_n, p_n

# Save the submission
pd.DataFrame(rows).to_csv("submission.csv", header=False, index=False)