In [1]:
# =========================================
# Imports
# =========================================
import numpy as np
import pandas as pd
import random
import tensorflow as tf

from sklearn.preprocessing import StandardScaler
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from huggingface_hub import login, list_repo_files, hf_hub_download, upload_file

In [None]:
# =========================================
# Konfiguration
# =========================================
REPO_ID = "mttfst/Paulette_Cloud_Tracks"
token = ""

PAD_VALUE = -99.0
TIMESTEP_SECONDS = 30.0
CUTOFF_STEPS = 5

BATCH_SIZE = 8
EPOCHS = 2 #40

In [3]:
# =========================================
# Login HuggingFace
# =========================================
login(token)

In [4]:
# =========================================
# Logging
# =========================================
import os, json, hashlib
from datetime import datetime

# Du kannst hier ein separates Repo für Logs/Configs nutzen (empfohlen),
# oder du lässt es auf dem Dataset-Repo laufen.
CONFIG_REPO_ID = REPO_ID  # z.B. "thorsten789/hurricane_cloud_runs"

def make_run_id(prefix: str, config: dict) -> str:
    ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    cfg_str = json.dumps(config, sort_keys=True)
    h = hashlib.sha1(cfg_str.encode("utf-8")).hexdigest()[:10]
    return f"{prefix}_{ts}_{h}"

def save_json_local(path: str, data: dict) -> str:
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    return path

def upload_json_hf(local_path: str, run_id: str, name: str, base_dir: str = "runs"):
    """Lädt JSON als {base_dir}/{run_id}/{name}.json in CONFIG_REPO_ID hoch."""
    try:
        path_in_repo = f"{base_dir}/{run_id}/{name}.json"
        upload_file(
            path_or_fileobj=local_path,
            path_in_repo=path_in_repo,
            repo_id=CONFIG_REPO_ID,
            repo_type="dataset",
            commit_message=f"Add {name}.json for {run_id}",
        )
        print(f"[HF] uploaded: {path_in_repo}")
    except Exception as e:
        print(f"[HF] upload skipped/failed ({name}): {e}")

# --- zentrale Run-Config (die ID basiert auf config -> sinnvoller Run-Name)
RUN_CONFIG = {
    "model": "LSTM",
    "optimizer": "adam",
    "units1": 64,
    "units2": 32,
    "lr": 1e-3,
    "batch_size": BATCH_SIZE,
    "epochs": EPOCHS,
    "cutoff_steps": CUTOFF_STEPS,
    "pad_value": PAD_VALUE,
}

RUN_ID = make_run_id("simple_rnn", RUN_CONFIG)
print("RUN_ID:", RUN_ID)

# Setup sofort speichern (damit du schon am Anfang einen Run hast)
setup = {
    "run_id": RUN_ID,
    "config": RUN_CONFIG,
    "data": {"repo_id": REPO_ID},
    "meta": {"notebook": "3_Model/lstm_baseline_JS.ipynb"},
}

# +++ Logging: Save Setup +++
local_setup = save_json_local(f"runs_local/{RUN_ID}/setup.json", setup)
upload_json_hf(local_setup, RUN_ID, "setup")

class AutoSaveTrain(tf.keras.callbacks.Callback):
    """Speichert train.json am Ende von model.fit (History + best_val_loss)."""
    def __init__(self, run_id: str):
        super().__init__()
        self.run_id = run_id

    def on_train_end(self, logs=None):
        hist = getattr(self.model, "history", None)
        history_dict = hist.history if hist is not None else {}

        train_data = {
            "run_id": self.run_id,
            "history": history_dict,
            "summary": {
                "best_val_loss": float(min(history_dict["val_loss"])) if "val_loss" in history_dict else None,
                "final_train_loss": float(history_dict["loss"][-1]) if "loss" in history_dict and len(history_dict["loss"]) else None,
            },
        }
        local_train = save_json_local(f"runs_local/{self.run_id}/train.json", train_data)
        upload_json_hf(local_train, self.run_id, "train")


RUN_ID: simple_rnn_2026-02-09_18-18-15_3d1caad569


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


[HF] uploaded: runs/simple_rnn_2026-02-09_18-18-15_3d1caad569/setup.json


In [5]:
PROFILE_PREFIXES = ["qr_", "qc_", "qi_", "qs_", "qg_", "qv_", "roh_", "w_"]

SCALAR_FEATURES = [
    #"cape_ml_L00", "cin_ml_L00",
    "lwp_L00",
    "iwp_L00", "rain_gsp_rate_L00",
    "tqc_L00", "tqi_L00", "area_m2"
]

In [6]:
tracks_120 = None

# Wenn lokal nicht da: von Hugging Face Dataset-Repo herunterladen
if tracks_120 is None:
    from huggingface_hub import hf_hub_download

    filename_in_repo = "track_len/track_len_exp_1.1.csv"

    print(f"⬇️  Downloading from HF: {REPO_ID}/{filename_in_repo}")
    local_file = hf_hub_download(
        repo_id=REPO_ID,
        repo_type="dataset",
        filename=filename_in_repo,
    )

    print(f"✅ Loading track_len from HF-cached file: {local_file}")
    tracks_120 = pd.read_csv(local_file)

print("tracks_120 shape:", tracks_120.shape)
tracks_120.head()

# =========================================
# Dataset-Split
# =========================================
files = list_repo_files(REPO_ID, repo_type="dataset")
csv_files = [f for f in files if f.startswith("exp_1.1/") and f.endswith(".csv")]

# tracks_120 = pd.read_csv("/content/hurricane_cloud_prediction/5_Data_Sample/track_len.csv")

tracks_120 = tracks_120[tracks_120.track_len <= 120]
print("Total CSV tracks with at least 120 timesteps:", len(tracks_120))

tracks_120 = tracks_120.filename.to_list()

csv_files = [
    f for f in csv_files
    if f.split("/")[1] in tracks_120
]

random.seed(42)
random.shuffle(csv_files)

n = len(csv_files)
train_files = csv_files[: int(0.7 * n)]
val_files   = csv_files[int(0.7 * n): int(0.85 * n)]
test_files  = csv_files[int(0.85 * n):]

print(f"Train: {len(train_files)}, Val: {len(val_files)}, Test: {len(test_files)}")

⬇️  Downloading from HF: mttfst/Paulette_Cloud_Tracks/track_len/track_len_exp_1.1.csv
✅ Loading track_len from HF-cached file: /root/.cache/huggingface/hub/datasets--mttfst--Paulette_Cloud_Tracks/snapshots/13153ed2f2d981c249bc7287e95ac63dbb171914/track_len/track_len_exp_1.1.csv
tracks_120 shape: (9227, 2)
Total CSV tracks with at least 120 timesteps: 8119
Train: 5683, Val: 1218, Test: 1218


In [7]:
def compute_remaining_lifetime(df, timestep_minutes=5):
    """
    Berechnet verbleibende Lebensdauer pro Zeitschritt
    """
    n = len(df)
    return [(n - i - 1) * timestep_minutes for i in range(n)]

#def compute_future_rain(df, timestep_minutes=5):
#    rain = df["rain_gsp_rate_L00"].values
#    dt = timestep_minutes * 60  # Sekunden

#    future_rain = []
#    for i in range(len(rain)):
#        future_rain.append(rain[i:].sum() * dt)

#    return future_rain

In [8]:
def extract_profile(df, prefix, n_levels=50):
    """
    Extrahiert ein vertikales Profil mit exakt n_levels.
    Fehlende Level werden mit 0 aufgefüllt.
    """
    data = np.zeros((len(df), n_levels), dtype="float32")
    for i in range(n_levels):
        col = f"{prefix}L{i:02d}"
        if col in df.columns:
            data[:, i] = df[col].values
    return data

In [9]:
def preprocess_cloud(df):
    df = df.sort_values("time")

    # Target
    y_lifetime = compute_remaining_lifetime(df)

    # Profile (T, Z, F)
    profile_features = []
    for prefix in PROFILE_PREFIXES:
        prof = extract_profile(df, prefix, n_levels=50)
        profile_features.append(prof)
    
    profiles = np.stack(profile_features, axis=-1)
    
    # Scalars
    scalars = df[SCALAR_FEATURES].values.astype("float32")
    
    return {
        "profiles": profiles,
        "scalars": scalars,
        "y": np.array(y_lifetime, dtype="float32")[:, None]
    }

In [10]:
def load_and_preprocess(files):
    samples = []
    for f in files:
        local_file = hf_hub_download(
            repo_id=REPO_ID,
            repo_type="dataset",
            filename=f,
        )
        df = pd.read_csv(local_file)
        
        if len(df) <= CUTOFF_STEPS:
            continue
        
        sample = preprocess_cloud(df)
        samples.append(sample)
    
    return samples

In [11]:
print("\n Loading data...")
train_samples = load_and_preprocess(train_files)
val_samples   = load_and_preprocess(val_files)
test_samples  = load_and_preprocess(test_files)

print(f"Loaded: {len(train_samples)} train, {len(val_samples)} val, {len(test_samples)} test")


 Loading data...
Loaded: 4036 train, 869 val, 864 test


## Normalisierung

In [12]:
print("\n Fitting Scalers...")

# Scaler für Scalars
scalar_scaler = StandardScaler()
all_train_scalars = np.concatenate([s["scalars"] for s in train_samples], axis=0)

# Check for NaN
nan_count = np.isnan(all_train_scalars).sum(axis=0)
if nan_count.sum() > 0:
    print(f"Warning: {nan_count.sum()} NaN values in scalars")
    # Ersetze NaN mit 0
    all_train_scalars = np.nan_to_num(all_train_scalars, nan=0.0)

scalar_scaler.fit(all_train_scalars)

# Scaler für Profiles
profile_scaler = StandardScaler()
all_train_profiles = np.concatenate(
    [s["profiles"].reshape(-1, s["profiles"].shape[-1]) for s in train_samples],
    axis=0
)

# Check for constant features
variances = np.var(all_train_profiles, axis=0)
constant_mask = variances < 1e-8
if constant_mask.any():
    print(f"{constant_mask.sum()} constant profile features found")

profile_scaler.fit(all_train_profiles)

# Fix constant features (set scale to 1.0 to avoid NaN)
profile_scaler.scale_[constant_mask] = 1.0

print("✅ Scalers fitted")

# =========================================
# Target (Y) Normalisierung
# =========================================
print("\n Normalizing targets...")

all_train_y = np.concatenate([s["y"] for s in train_samples], axis=0)
y_mean = np.mean(all_train_y)
y_std = np.std(all_train_y)

print(f"Target Stats - Mean: {y_mean:.2f} min, Std: {y_std:.2f} min")

# =========================================
# Apply Normalization to Samples
# =========================================
def normalize_sample(sample, scalar_scaler, profile_scaler, y_mean, y_std):
    sample_norm = sample.copy()
    
    # Normalize scalars
    sample_norm["scalars"] = scalar_scaler.transform(sample["scalars"])
    
    # Normalize profiles (reshape, transform, reshape back)
    T, Z, F = sample["profiles"].shape
    prof_flat = sample["profiles"].reshape(-1, F)
    prof_norm = profile_scaler.transform(prof_flat)
    sample_norm["profiles"] = prof_norm.reshape(T, Z, F)
    
    # Normalize targets
    sample_norm["y"] = (sample["y"] - y_mean) / y_std
    
    return sample_norm


 Fitting Scalers...
3 constant profile features found
✅ Scalers fitted

 Normalizing targets...
Target Stats - Mean: 131.75 min, Std: 122.69 min


In [13]:
print("Applying normalization...")
train_samples_norm = [normalize_sample(s, scalar_scaler, profile_scaler, y_mean, y_std) 
                      for s in train_samples]
val_samples_norm = [normalize_sample(s, scalar_scaler, profile_scaler, y_mean, y_std) 
                    for s in val_samples]
test_samples_norm = [normalize_sample(s, scalar_scaler, profile_scaler, y_mean, y_std) 
                     for s in test_samples]

print("Normalization applied")

Applying normalization...
Normalization applied


## Padding

In [14]:
def pad_sequences(samples):
    max_len = max(s["profiles"].shape[0] for s in samples)
    
    def pad(arr, pad_width):
        return np.pad(arr, pad_width, mode="constant", constant_values=PAD_VALUE)
    
    X_profiles, X_scalars, Y = [], [], []
    
    for s in samples:
        T = s["profiles"].shape[0]
        
        X_profiles.append(pad(s["profiles"], ((0, max_len-T), (0,0), (0,0))))
        X_scalars.append(pad(s["scalars"], ((0, max_len-T), (0,0))))
        Y.append(pad(s["y"], ((0, max_len-T), (0,0))))
    
    return (np.array(X_profiles, dtype=np.float32), 
            np.array(X_scalars, dtype=np.float32), 
            np.array(Y, dtype=np.float32))

In [15]:
print("\n Padding sequences...")
Xprof_train, Xsca_train, Y_train = pad_sequences(train_samples_norm)
Xprof_val, Xsca_val, Y_val = pad_sequences(val_samples_norm)
Xprof_test, Xsca_test, Y_test = pad_sequences(test_samples_norm)

print(f"Padded shapes:")
print(f"   Profiles: {Xprof_train.shape}")
print(f"   Scalars: {Xsca_train.shape}")
print(f"   Targets: {Y_train.shape}")


 Padding sequences...
Padded shapes:
   Profiles: (4036, 120, 50, 8)
   Scalars: (4036, 120, 6)
   Targets: (4036, 120, 1)


# Custom Loss + Metric

In [16]:
def masked_mse_loss(pad_value=PAD_VALUE):
    def loss(y_true, y_pred):
        mask = tf.not_equal(y_true, pad_value)
        mask = tf.cast(mask, tf.float32)
        
        squared_error = tf.square(y_true - y_pred)
        masked_error = squared_error * mask
        
        sum_error = tf.reduce_sum(masked_error)
        count = tf.reduce_sum(mask)
        count = tf.maximum(count, 1.0)
        
        return sum_error / count
    return loss

def masked_mae_metric(pad_value=PAD_VALUE):
    def metric(y_true, y_pred):
        mask = tf.not_equal(y_true, pad_value)
        mask = tf.cast(mask, tf.float32)
        
        abs_error = tf.abs(y_true - y_pred)
        masked_error = abs_error * mask
        
        sum_error = tf.reduce_sum(masked_error)
        count = tf.reduce_sum(mask)
        count = tf.maximum(count, 1.0)
        
        return sum_error / count
    return metric

# Model

## Definition

In [17]:
def build_hybrid_model(z_levels=50, n_profile_features=8, n_scalars=6):
    # Profile Input
    profile_input = layers.Input(shape=(None, z_levels, n_profile_features), 
                                 name='profile_input')
    
    # CNN für vertikale Features
    x = layers.TimeDistributed(
        layers.Conv1D(64, kernel_size=3, activation="relu", padding="same"),
        name='conv1d_1'
    )(profile_input)
    
    x = layers.TimeDistributed(
        layers.Conv1D(32, kernel_size=3, activation="relu", padding="same"),
        name='conv1d_2'
    )(x)
    
    x = layers.TimeDistributed(layers.Flatten(), name='flatten')(x)
    
    # Scalar Input
    scalar_input = layers.Input(shape=(None, n_scalars), name='scalar_input')
    
    # Concatenate
    x = layers.Concatenate(name='concatenate')([x, scalar_input])
    
    # Masking
    x = layers.Masking(mask_value=PAD_VALUE, name='masking')(x)
    
    # LSTM Layers
    x = layers.LSTM(128, return_sequences=True, 
                   dropout=0.3, recurrent_dropout=0.2,
                   name='lstm_1')(x)
    x = layers.BatchNormalization(name='batch_norm_1')(x)
    
    x = layers.LSTM(64, return_sequences=True,
                   dropout=0.3, recurrent_dropout=0.2,
                   name='lstm_2')(x)
    x = layers.BatchNormalization(name='batch_norm_2')(x)
    
    # Dense Layer
    x = layers.TimeDistributed(
        layers.Dense(32, activation='relu'),
        name='dense_hidden'
    )(x)
    
    # Output
    lifetime_out = layers.TimeDistributed(
        layers.Dense(1), 
        name="lifetime_output"
    )(x)
    
    model = models.Model(
        inputs=[profile_input, scalar_input],
        outputs=lifetime_out,
        name='Hybrid_CNN_LSTM'
    )
    
    # Compile with masked loss
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=1e-3,
        clipnorm=1.0  # Gradient clipping
    )
    
    model.compile(
        optimizer=optimizer,
        loss=masked_mse_loss(PAD_VALUE),
        metrics=[masked_mae_metric(PAD_VALUE)]
    )
    
    return model

print("\n Building model...")
model = build_hybrid_model(
    z_levels=50,
    n_profile_features=len(PROFILE_PREFIXES),
    n_scalars=len(SCALAR_FEATURES)
)

model.summary()


 Building model...


## Training

In [18]:
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        verbose=1
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-7,
        verbose=1
    ),
    tf.keras.callbacks.TerminateOnNaN(),
    AutoSaveTrain(RUN_ID)
]

print("\n Train model...")

history = model.fit(
    [Xprof_train, Xsca_train], Y_train,
    validation_data=([Xprof_val, Xsca_val], Y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1
)


 Train model...
Epoch 1/2
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1052s[0m 2s/step - loss: 0.7289 - metric: 0.6324 - val_loss: 0.8261 - val_metric: 0.6670 - learning_rate: 0.0010
Restoring model weights from the end of the best epoch: 2.
[HF] uploaded: runs/simple_rnn_2026-02-09_18-18-15_3d1caad569/train.json


# Evaluation

In [19]:
# Evaluate on test set
test_results = model.evaluate(
    [Xprof_test, Xsca_test], Y_test, 
    verbose=1,
    batch_size=BATCH_SIZE
)

print(f"\nTest Loss (normalized): {test_results[0]:.4f}")
print(f"Test MAE (normalized): {test_results[1]:.4f}")

# Make predictions
y_pred_norm = model.predict(
    [Xprof_test, Xsca_test], 
    verbose=0,
    batch_size=BATCH_SIZE
)

# Denormalize
def denormalize_predictions(y_norm, y_mean, y_std, pad_value=PAD_VALUE):
    y = y_norm.copy()
    mask = y != pad_value
    y[mask] = y[mask] * y_std + y_mean
    return y

y_pred = denormalize_predictions(y_pred_norm, y_mean, y_std)
y_test_denorm = denormalize_predictions(Y_test, y_mean, y_std)

# Real-world metrics
mask = y_test_denorm != PAD_VALUE
mae_minutes = np.mean(np.abs(y_test_denorm[mask] - y_pred[mask]))
rmse_minutes = np.sqrt(np.mean((y_test_denorm[mask] - y_pred[mask])**2))

print(f"\n Real-world Metrics:")
print(f"MAE:  {mae_minutes:.2f} minutes")
print(f"RMSE: {rmse_minutes:.2f} minutes")

# =========================================
# Save Evaluation
# =========================================
eval_data = {
    "run_id": RUN_ID,
    "metrics": {
        "test_loss_norm": float(test_results[0]),
        "test_mae_norm": float(test_results[1]),
        "mae_minutes": float(mae_minutes),
        "rmse_minutes": float(rmse_minutes),
    }
}

local_eval = save_json_local(f"runs_local/{RUN_ID}/eval.json", eval_data)
upload_json_hf(local_eval, RUN_ID, "eval")

print(f"\n Run {RUN_ID} completed successfully!")

[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 257ms/step - loss: 0.8058 - metric: 0.6572

Test Loss (normalized): 0.8399
Test MAE (normalized): 0.6664

 Real-world Metrics:
MAE:  86.97 minutes
RMSE: 120.28 minutes
[HF] uploaded: runs/simple_rnn_2026-02-09_18-18-15_3d1caad569/eval.json

 Run simple_rnn_2026-02-09_18-18-15_3d1caad569 completed successfully!
