# Baseline Model

## Table of Contents
1. [Infrastructure](#infrastructure)
2. [Model Choice](#model-choice)
3. [Feature Selection](#feature-selection)
4. [Implementation](#implementation)
5. [Evaluation](#evaluation)


In [1]:
pip install -q datasets huggingface_hub

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from huggingface_hub import login, list_repo_files, hf_hub_download

# Import your chosen baseline model
# Example: from sklearn.linear_model import LogisticRegression


## Infrastructure

In [3]:
import os

from huggingface_hub import login

# Dataset repository on the Hugging Face Hub that holds the cloud-track CSVs.
REPO_ID = "mttfst/Paulette_Cloud_Tracks"

# Read the access token from the environment instead of pasting it into the
# notebook, so the secret never ends up in the saved file or version control.
token = os.environ.get("HF_TOKEN", "")

# Only authenticate when a token is actually available; an empty string is
# not a usable credential.
if token:
    login(token)
else:
    print("HF_TOKEN is not set - continuing without Hugging Face authentication.")

In [4]:
# from huggingface_hub import login
# from google.colab import userdata

# token = userdata.get('hf')
# login(token)

In [5]:
# 1) List every file in the dataset repository.
files = list_repo_files(REPO_ID, repo_type="dataset")

# 2) Keep only the per-track CSV files of experiment exp_1.1.
#    Sorting pins the starting order, so the seeded shuffle below yields the
#    same split regardless of the order in which the Hub lists the files.
csv_files = sorted(
    f for f in files if f.startswith("exp_1.1/") and f.endswith(".csv")
)
print("Total CSV tracks:", len(csv_files))

# 3) Shuffle reproducibly.
random.seed(42)        # fixed seed so the split is identical on every run
csv_files_shuffled = csv_files.copy()
random.shuffle(csv_files_shuffled)

# 4) 70/15/15 split at track level (whole tracks, never single timesteps).
n = len(csv_files_shuffled)
n_train = int(0.7 * n)
n_val   = int(0.15 * n)
# Whatever is left over goes into the test split.
n_test  = n - n_train - n_val

train_files = csv_files_shuffled[:n_train]
val_files   = csv_files_shuffled[n_train:n_train + n_val]
test_files  = csv_files_shuffled[n_train + n_val:]

print(f"Train tracks: {len(train_files)}")
print(f"Val tracks:   {len(val_files)}")
print(f"Test tracks:  {len(test_files)}")

# Optional: collect the three lists in one dict for convenient access.
split_files = {
    "train": train_files,
    "val": val_files,
    "test": test_files,
}


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


Total CSV tracks: 9227
Train tracks: 6458
Val tracks:   1384
Test tracks:  1385


In [6]:
def add_stats_features_placeholder(
    df: pd.DataFrame,
    timestep_seconds: float = 30.0,
) -> pd.DataFrame:
    """
    Hook for the statistics-based features planned for Task A.2.

    Planned additions (none are implemented yet):
    - running mean / maximum over the previous N timesteps
    - growth rates (e.g. d(area)/dt)
    - integrated column values, etc.

    Parameters
    ----------
    df : pd.DataFrame
        Track table that will later be enriched.
    timestep_seconds : float
        Time between consecutive rows; accepted for the future features,
        currently unused.

    Returns
    -------
    pd.DataFrame
        The very same ``df`` object, untouched.
    """
    # A possible first feature (optional, left disabled on purpose):
    # df["track_length_s"] = df["age_s"].iloc[-1] + timestep_seconds
    enriched = df
    return enriched


In [7]:
def preprocess_track(
    df: pd.DataFrame,
    timestep_seconds: float = 30.0,
) -> pd.DataFrame:
    """
    Turn one raw cloud track into a model-ready table.

    Works on a copy, so the caller's DataFrame is left untouched. Steps:
    - order the rows by the original ``frame`` column (when present) and keep
      that original value as ``frame_global``
    - overwrite ``frame`` with a local index 0..T-1
    - add ``age_s`` and ``remaining_lifetime_s`` (both in seconds, derived
      from the local index and ``timestep_seconds``)
    - drop bookkeeping columns that are present
    - pass the result through the Task A.2 statistics-feature hook

    Parameters
    ----------
    df : pd.DataFrame
        One track, one row per timestep.
    timestep_seconds : float
        Time between two consecutive rows.

    Returns
    -------
    pd.DataFrame
        The preprocessed track.
    """
    track = df.copy()

    # The incoming "frame" is a global counter: sort by it and keep a copy
    # for debugging before it is replaced by the local index.
    if "frame" in track.columns:
        track = track.sort_values("frame").reset_index(drop=True)
        track["frame_global"] = track["frame"]

    n_steps = len(track)

    # Local timeline: index, elapsed time, and time left until the last row
    # (the last row, frame = T-1, gets 0 seconds remaining).
    track["frame"] = np.arange(n_steps, dtype=int)
    track["age_s"] = track["frame"] * timestep_seconds
    track["remaining_lifetime_s"] = (n_steps - 1 - track["frame"]) * timestep_seconds

    # Bookkeeping columns that are removed whenever they exist.
    unwanted = ("time", "feature", "feature_orig", "cell", "latitude", "longitude")
    present = [name for name in unwanted if name in track.columns]
    if present:
        track = track.drop(columns=present)

    # Hook: statistics-based features for Task A.2 (snapshot + stats).
    return add_stats_features_placeholder(track, timestep_seconds=timestep_seconds)


In [8]:

def load_track(csv_path_in_repo: str) -> pd.DataFrame:
    """
    Fetch a single track (one CSV file) from the HF dataset and return it
    as a preprocessed pandas DataFrame.

    Parameters
    ----------
    csv_path_in_repo : str
        Path of the CSV inside the repository,
        e.g. "exp_1.1/track_000001.csv".

    Returns
    -------
    pd.DataFrame
        The track, ordered by 'frame' (or by 'time' when there is no
        'frame' column) and then passed through ``preprocess_track``.

    Raises
    ------
    ValueError
        If the CSV has neither a 'frame' nor a 'time' column.
    """
    # Download the file from the Hub (hf_hub_download keeps a local cache).
    cached_csv = hf_hub_download(
        repo_id=REPO_ID,
        repo_type="dataset",
        filename=csv_path_in_repo,
    )
    raw = pd.read_csv(cached_csv)

    # Prefer 'frame' as the ordering key and fall back to 'time'.
    sort_key = next((col for col in ("frame", "time") if col in raw.columns), None)
    if sort_key is None:
        raise ValueError("Neither 'frame' nor 'time' column found in track CSV.")

    ordered = raw.sort_values(sort_key).reset_index(drop=True)
    return preprocess_track(ordered)

# Smoke test: load the first training track and inspect it.
example_track_file = train_files[0]
print("Example track file:", example_track_file)

track_df = load_track(example_track_file)
print("Track shape:", track_df.shape)
print("Columns:", track_df.columns[:10])  # only the first few columns
print(track_df.head())


Example track file: exp_1.1/cell_08568.csv


Track shape: (4, 413)
Columns: Index(['frame', 'qv_L00', 'qv_L01', 'qv_L02', 'qv_L03', 'qv_L04', 'qv_L05',
       'qv_L06', 'qv_L07', 'qv_L08'],
      dtype='object')
   frame    qv_L00    qv_L01    qv_L02    qv_L03    qv_L04    qv_L05  \
0      0  0.000002  0.000002  0.000002  0.000002  0.000002  0.000002   
1      1  0.000002  0.000002  0.000002  0.000002  0.000002  0.000002   
2      2  0.000002  0.000002  0.000002  0.000002  0.000002  0.000002   
3      3  0.000002  0.000002  0.000002  0.000002  0.000002  0.000002   

     qv_L06    qv_L07    qv_L08  ...    lwp_L00   iwp_L00  cin_ml_L00  \
0  0.000003  0.000007  0.000004  ...  12.609904  0.052129         NaN   
1  0.000003  0.000007  0.000004  ...  12.313472  0.044247         NaN   
2  0.000003  0.000007  0.000004  ...  11.949812  0.036808    0.865642   
3  0.000003  0.000007  0.000004  ...  11.531737  0.029802         NaN   

    tqc_L00  rain_gsp_rate_L00   tqi_L00      area_m2  frame_global  age_s  \
0  2.852148           0.0092

In [9]:
def is_track_long_enough(df, cutoff_steps: int = 5) -> bool:
    """
    Tell whether a track has enough timesteps for Task A
    (remaining lifetime prediction).

    The last ``cutoff_steps`` rows are not used as inputs, so at least one
    usable timestep t in [0, T-1-cutoff_steps] exists only when the track
    has strictly more rows than ``cutoff_steps``.

    Parameters
    ----------
    df : sized object (normally a pd.DataFrame)
        The track; only its length is used.
    cutoff_steps : int
        Number of trailing timesteps excluded from sampling.

    Returns
    -------
    bool
        True if ``len(df) > cutoff_steps``.
    """
    return cutoff_steps < len(df)

In [10]:
# Sanity check of the preprocessing on the first training track.
example_track_file = train_files[0]
print("Example track file:", example_track_file)

track_df = load_track(example_track_file)

print("Track shape:", track_df.shape)
# Local index, original global frame, age and target at the start of the track ...
print(track_df[["frame", "frame_global", "age_s", "remaining_lifetime_s"]].head())
# ... and at its end.
print(track_df[["frame", "age_s", "remaining_lifetime_s"]].tail())


Example track file: exp_1.1/cell_08568.csv
Track shape: (4, 413)
   frame  frame_global  age_s  remaining_lifetime_s
0      0          5288    0.0                  90.0
1      1          5289   30.0                  60.0
2      2          5290   60.0                  30.0
3      3          5291   90.0                   0.0
   frame  age_s  remaining_lifetime_s
0      0    0.0                  90.0
1      1   30.0                  60.0
2      2   60.0                  30.0
3      3   90.0                   0.0


## Model Choice

[Explain why you've chosen a particular model as the baseline. This could be a simple statistical model or a basic machine learning model. Justify your choice.]


## Feature Selection

[Indicate which features from the dataset you will be using for the baseline model, and justify your selection.]


In [11]:
# Scalar (non-profile) columns used as inputs for the snapshot baseline.
# The order here defines the column order of the feature matrix.
SCALAR_BASE_FEATURES = [
    "cape_ml_L00",
    "cin_ml_L00",
    "lwp_L00",
    "iwp_L00",
    "tqc_L00",
    "tqi_L00",
    "rain_gsp_rate_L00",
    "area_m2",
    "age_s",
]


def get_scalar_feature_columns(df: pd.DataFrame):
    """
    Pick the scalar feature columns for the snapshot baseline (Task A.1).

    Only names listed in ``SCALAR_BASE_FEATURES`` are considered, so profile
    variables (qv_Lxx, w_Lxx, qc_Lxx, ...) are never selected.

    Parameters
    ----------
    df : pd.DataFrame
        Track table whose columns are checked.

    Returns
    -------
    list of str
        The entries of ``SCALAR_BASE_FEATURES`` that exist in ``df``,
        in the order of ``SCALAR_BASE_FEATURES``.
    """
    selected = []
    for name in SCALAR_BASE_FEATURES:
        if name in df.columns:
            selected.append(name)
    return selected

In [12]:
def sample_snapshot_from_track(
    df: pd.DataFrame,
    num_samples_per_track: int = 3,
    cutoff_steps: int = 5,
    rng: np.random.Generator | None = None,
):
    if rng is None:
        rng = np.random.default_rng()
    
    T = len(df)
    
    if not is_track_long_enough(df, cutoff_steps=cutoff_steps):
        return None, None, None, None
    
    max_valid_t = T - 1 - cutoff_steps
    valid_indices = np.arange(0, max_valid_t + 1)
    
    if len(valid_indices) == 0:
        return None, None, None, None
    
    n_samples = min(num_samples_per_track, len(valid_indices))
    
    if n_samples == len(valid_indices):
        chosen_t = valid_indices
    else:
        chosen_t = rng.choice(valid_indices, size=n_samples, replace=False)
    
    chosen_t = np.sort(chosen_t)
    
    # üëâ neu: nur skalare Features
    feature_cols = get_scalar_feature_columns(df)
    
    if len(feature_cols) == 0:
        raise ValueError("No scalar feature columns found in track DataFrame.")
    
    X = df.loc[chosen_t, feature_cols].to_numpy(dtype=np.float32)
    y = df.loc[chosen_t, "remaining_lifetime_s"].to_numpy(dtype=np.float32)
    
    return X, y, chosen_t, feature_cols



In [13]:
# Try the snapshot sampler on a single training track.
example_track_file = train_files[2]
track_df = load_track(example_track_file)

# No rng is passed, so the sampler creates an unseeded generator and the
# chosen timesteps can differ between runs.
X, y, t_idx, feat_cols = sample_snapshot_from_track(
    track_df,
    num_samples_per_track=3,
    cutoff_steps=5,
)

print("Chosen timesteps:", t_idx)
print("Feature columns:", feat_cols)
print("X shape:", X.shape)
print("y:", y)
print("First sample (as dict):")
print(dict(zip(feat_cols, X[0])))

Chosen timesteps: [ 2  9 10]
Feature columns: ['cape_ml_L00', 'cin_ml_L00', 'lwp_L00', 'iwp_L00', 'tqc_L00', 'tqi_L00', 'rain_gsp_rate_L00', 'area_m2', 'age_s']
X shape: (3, 9)
y: [570. 360. 330.]
First sample (as dict):
{'cape_ml_L00': np.float32(nan), 'cin_ml_L00': np.float32(nan), 'lwp_L00': np.float32(4.2851415), 'iwp_L00': np.float32(0.6015448), 'tqc_L00': np.float32(2.0657444), 'tqi_L00': np.float32(0.2333252), 'rain_gsp_rate_L00': np.float32(0.0035198503), 'area_m2': np.float32(31360000.0), 'age_s': np.float32(60.0)}


In [14]:
import numpy as np

def build_snapshot_dataset(
    file_list,
    num_samples_per_track: int = 3,
    cutoff_steps: int = 5,
    max_tracks: int | None = None,
    rng_seed: int = 0,
):
    """
    Build a snapshot-based dataset for Task A (remaining lifetime) from many tracks.
    
    Parameters
    ----------
    file_list : list of str
        List of CSV paths inside the HF repo (e.g. split_files["train"]).
    num_samples_per_track : int
        How many timesteps t to sample per track (max).
    cutoff_steps : int
        How many last timesteps to exclude from being chosen as inputs.
    max_tracks : int or None
        If not None, limit the number of tracks to this value (for quick tests).
    rng_seed : int
        Seed for the random generator (for reproducibility).
    
    Returns
    -------
    X : np.ndarray, shape (N_samples, N_features)
    y : np.ndarray, shape (N_samples,)
    feature_cols : list of str
        Names of the feature columns in the same order as in X.
    stats : dict
        Some info: how many tracks used, how many skipped, etc.
    """
    rng = np.random.default_rng(rng_seed)
    
    X_list = []
    y_list = []
    
    n_tracks_total = 0
    n_tracks_used = 0
    n_tracks_too_short = 0
    n_tracks_no_samples = 0
    
    feature_cols_ref = None
    
    for i, csv_path in enumerate(file_list):
        n_tracks_total += 1
        
        if (max_tracks is not None) and (i >= max_tracks):
            break
        
        # Track laden + preprocess
        df = load_track(csv_path)
        
        # L√§nge pr√ºfen
        if not is_track_long_enough(df, cutoff_steps=cutoff_steps):
            n_tracks_too_short += 1
            continue
        
        # Snapshot-Samples aus diesem Track ziehen
        X, y, t_idx, feature_cols = sample_snapshot_from_track(
            df,
            num_samples_per_track=num_samples_per_track,
            cutoff_steps=cutoff_steps,
            rng=rng,
        )
        
        if X is None or len(X) == 0:
            n_tracks_no_samples += 1
            continue
        
        # Feature-Spaltenkonsistenz checken
        if feature_cols_ref is None:
            feature_cols_ref = feature_cols
        else:
            if feature_cols != feature_cols_ref:
                raise ValueError(
                    f"Inconsistent feature columns in track {csv_path}."
                )
        
        X_list.append(X)
        y_list.append(y)
        n_tracks_used += 1
    
    if len(X_list) == 0:
        raise RuntimeError("No samples collected; check cutoff_steps or file_list.")
    
    X_all = np.concatenate(X_list, axis=0)
    y_all = np.concatenate(y_list, axis=0)
    
    stats = {
        "n_tracks_total": n_tracks_total,
        "n_tracks_used": n_tracks_used,
        "n_tracks_too_short": n_tracks_too_short,
        "n_tracks_no_samples": n_tracks_no_samples,
        "n_samples": len(y_all),
        "n_features": X_all.shape[1],
    }
    
    return X_all, y_all, feature_cols_ref, stats


In [15]:
# Example: use only the first 1000 training tracks for a first quick test.
X_train, y_train, feature_cols, train_stats = build_snapshot_dataset(
    train_files,
    num_samples_per_track=3,
    cutoff_steps=5,
    max_tracks=1000,    # for testing; set to None later to use all tracks
    rng_seed=42,
)

print("Train stats:", train_stats)
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("Feature columns:", feature_cols)
print("First sample:", dict(zip(feature_cols, X_train[0])))
print("First label (remaining_lifetime_s):", y_train[0])


Train stats: {'n_tracks_total': 1001, 'n_tracks_used': 737, 'n_tracks_too_short': 263, 'n_tracks_no_samples': 0, 'n_samples': 2101, 'n_features': 9}
X_train shape: (2101, 9)
y_train shape: (2101,)
Feature columns: ['cape_ml_L00', 'cin_ml_L00', 'lwp_L00', 'iwp_L00', 'tqc_L00', 'tqi_L00', 'rain_gsp_rate_L00', 'area_m2', 'age_s']
First sample: {'cape_ml_L00': np.float32(nan), 'cin_ml_L00': np.float32(nan), 'lwp_L00': np.float32(4.123248), 'iwp_L00': np.float32(0.21214005), 'tqc_L00': np.float32(2.1722622), 'tqi_L00': np.float32(0.05314788), 'rain_gsp_rate_L00': np.float32(7.3694406e-05), 'area_m2': np.float32(31360000.0), 'age_s': np.float32(0.0)}
First label (remaining_lifetime_s): 180.0


In [16]:
# Validation set: same sampling settings as for training, limited to the
# first 500 validation tracks, with its own seed.
X_val, y_val, _, val_stats = build_snapshot_dataset(
    val_files,
    num_samples_per_track=3,
    cutoff_steps=5,
    max_tracks=500,
    rng_seed=123,
)

# Test set: built the same way from the first 500 test tracks.
X_test, y_test, _, test_stats = build_snapshot_dataset(
    test_files,
    num_samples_per_track=3,
    cutoff_steps=5,
    max_tracks=500,
    rng_seed=456,
)

print("Val stats:", val_stats)
print("Test stats:", test_stats)

Val stats: {'n_tracks_total': 501, 'n_tracks_used': 370, 'n_tracks_too_short': 130, 'n_tracks_no_samples': 0, 'n_samples': 1050, 'n_features': 9}
Test stats: {'n_tracks_total': 501, 'n_tracks_used': 375, 'n_tracks_too_short': 125, 'n_tracks_no_samples': 0, 'n_samples': 1078, 'n_features': 9}


## Implementation

[Implement your baseline model here.]



In [17]:
# Simple baseline model for Task A (remaining lifetime)
rf = RandomForestRegressor(
    n_estimators=200,      # number of trees
    max_depth=None,       # let trees grow fully (can be limited later)
    n_jobs=-1,            # use all CPUs
    random_state=42,      # reproducible
)

print("Fitting RandomForestRegressor on X_train with shape:", X_train.shape)
rf.fit(X_train, y_train)


Fitting RandomForestRegressor on X_train with shape: (2101, 9)


## Evaluation

[Clearly state what metrics you will use to evaluate the model's performance. These metrics will serve as a starting point for evaluating more complex models later on.]



In [18]:
def evaluate_regression(model, X, y, name: str = "set"):
    """
    Score a fitted regressor on one data split and print the result.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
        The fitted regressor.
    X : array-like
        Feature matrix of the split.
    y : array-like
        True targets (remaining lifetime in seconds).
    name : str
        Label of the split, used as prefix in the printed lines.

    Returns
    -------
    dict
        ``"mae"`` (mean absolute error), ``"rmse"`` (root mean squared
        error) and ``"y_pred"`` (the model's predictions for ``X``).
    """
    predictions = model.predict(X)

    mae = mean_absolute_error(y, predictions)
    # RMSE is derived from the MSE so it is in the same unit as the target.
    rmse = np.sqrt(mean_squared_error(y, predictions))

    print(f"{name} - MAE:  {mae:.2f} s")
    print(f"{name} - RMSE: {rmse:.2f} s")

    return dict(mae=mae, rmse=rmse, y_pred=predictions)

# Score the fitted forest on all three splits; the train numbers are only a
# reference for how much the model overfits.
print("\n=== Evaluation Task A (Snapshot Baseline, scalar features only) ===")
train_metrics = evaluate_regression(rf, X_train, y_train, name="Train")
val_metrics   = evaluate_regression(rf, X_val,   y_val,   name="Val")
test_metrics  = evaluate_regression(rf, X_test,  y_test,  name="Test")




=== Evaluation Task A (Snapshot Baseline, scalar features only) ===
Train - MAE:  273.12 s
Train - RMSE: 495.82 s
Val - MAE:  818.44 s
Val - RMSE: 1451.29 s
Test - MAE:  732.65 s
Test - RMSE: 1227.20 s


In [19]:
import pandas as pd

# Pair each feature name with the fitted forest's importance score and
# rank the features from most to least important.
feature_importances = rf.feature_importances_
fi_df = pd.DataFrame({
    "feature": feature_cols,
    "importance": feature_importances,
}).sort_values("importance", ascending=False)

# head(15) shows at most 15 rows; with fewer features all of them are listed.
print("\nTop 15 features by importance:")
print(fi_df.head(15))



Top 15 features by importance:
             feature  importance
2            lwp_L00    0.226060
8              age_s    0.216965
4            tqc_L00    0.123196
3            iwp_L00    0.114900
6  rain_gsp_rate_L00    0.107071
7            area_m2    0.100798
5            tqi_L00    0.084940
0        cape_ml_L00    0.016229
1         cin_ml_L00    0.009840
