In [None]:
import warnings
import pandas as pd
import pytorch_lightning as pl
from tqdm.auto import tqdm
from evaluation.visualization import print_sequence_table

# Blanket "ignore" filter: every warning category is silenced for the session.
warnings.simplefilter("ignore")
# Switch on tqdm's pandas integration.
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the Best Track table, then report row count, number of distinct
# cyclone ids, and the covered analysis-time range in a single print call.
df = pd.read_csv("bst_data/bst_all.csv")
print(
    f"✅ Загружено {len(df):,} записей из Best Track данных",
    f"📊 Уникальных циклонов: {df['intl_id'].nunique()}",
    f"📅 Временной диапазон: {df['analysis_time'].min()} - {df['analysis_time'].max()}",
    sep="\n",
)

✅ Загружено 70,217 записей из Best Track данных
📊 Уникальных циклонов: 1924
📅 Временной диапазон: 1951-02-19 06:00:00 - 2024-12-25 12:00:00


In [3]:
from data_processing.data_processor import DataProcessor
from data_processing.dataset_models import SequenceConfig
from data_processing.dataset_utils import split_data_by_years

# History-length bounds for the generated sequences
# (presumably: at least two past steps, no upper cap — confirm in SequenceConfig).
sequence_config = SequenceConfig(max_history_length=None, min_history_length=2)

# Forecast horizons: every 6 hours from 6 h up to and including 48 h.
forecast_horizons = list(range(6, 49, 6))

processor = DataProcessor(
    sequence_config=sequence_config,
    horizons_hours=forecast_horizons,
    validate_data=True,
    train_max_year=2021,
    val_max_year=2023,
)

# Turn the raw Best Track frame into the dataset object used below
# (its X, y and times attributes are consumed by the year-based split).
dataset = processor.build_dataset(df)

DataProcessor: dropped 1694 rows with missing values
Dataset validation: 68523 rows, 7 columns


Creating training examples: 100%|██████████| 1846/1846 [11:24<00:00,  2.70it/s]


In [13]:
# Year-based train / validation / test partition of the built dataset.
# NOTE(review): the cut-off years used here (2020 / 2022) differ from the ones
# given to DataProcessor earlier in the notebook (2021 / 2023) — confirm which
# pair is intended.
year_splits = split_data_by_years(
    dataset.X,
    dataset.y,
    dataset.times,
    train_max_year=2020,
    val_max_year=2022,
)
X_train, y_train, X_val, y_val, X_test, y_test = year_splits

In [14]:
# Number of training examples (row count of the training feature frame).
X_train.shape[0]

434745

In [15]:
# Show the per-step feature table for the training row labelled 0.
first_sequence = X_train["sequences"][0]
print_sequence_table(first_sequence)

📊 Таблица последовательностей

🔹 Seq_0
------------------------------------------------------------
Размер: 2 шагов × 9 признаков
+-------+-----------+-----------+------------+--------------+----------------+----------------+-----------+--------------------+--------------------+
|       |   lat_deg |   lon_deg |   pressure |   wind_speed |   velocity_lat |   velocity_lon |   bearing |   acceleration_lat |   acceleration_lon |
| Шаг 1 |     8.800 |   137.500 |   1004.000 |        2.000 |          0.000 |          0.000 |     0.000 |              0.000 |              0.000 |
+-------+-----------+-----------+------------+--------------+----------------+----------------+-----------+--------------------+--------------------+
| Шаг 2 |     9.700 |   136.000 |   1004.000 |        2.000 |         32.109 |        301.414 |     0.000 |              0.000 |              0.000 |
+-------+-----------+-----------+------------+--------------+----------------+----------------+-----------+-------------

In [16]:
# Show the per-step feature table for the training row labelled 8.
later_sequence = X_train["sequences"][8]
print_sequence_table(later_sequence)

📊 Таблица последовательностей

🔹 Seq_0
------------------------------------------------------------
Размер: 3 шагов × 9 признаков
+-------+-----------+-----------+------------+--------------+----------------+----------------+-----------+--------------------+--------------------+
|       |   lat_deg |   lon_deg |   pressure |   wind_speed |   velocity_lat |   velocity_lon |   bearing |   acceleration_lat |   acceleration_lon |
| Шаг 1 |     8.800 |   137.500 |   1004.000 |        2.000 |          0.000 |          0.000 |     0.000 |              0.000 |              0.000 |
+-------+-----------+-----------+------------+--------------+----------------+----------------+-----------+--------------------+--------------------+
| Шаг 2 |     9.700 |   136.000 |   1004.000 |        2.000 |         32.109 |        301.414 |     0.000 |              0.000 |              0.000 |
+-------+-----------+-----------+------------+--------------+----------------+----------------+-----------+-------------

In [17]:
# Preview the first eight training feature rows.
X_train.head(8)

Unnamed: 0,sequences,analysis_time,intl_id,storm_name,target_time_hours,day_of_year_sin,day_of_year_cos,month_of_year_sin,month_of_year_cos
0,"[[8.8, 137.5, 1004.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",2000-05-05,1,6 DAMREY,6.0,0.829677,-0.558244,0.5,-0.866025
1,"[[8.8, 137.5, 1004.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",2000-05-05,1,6 DAMREY,12.0,0.829677,-0.558244,0.5,-0.866025
2,"[[8.8, 137.5, 1004.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",2000-05-05,1,6 DAMREY,18.0,0.829677,-0.558244,0.5,-0.866025
3,"[[8.8, 137.5, 1004.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",2000-05-05,1,6 DAMREY,24.0,0.829677,-0.558244,0.5,-0.866025
4,"[[8.8, 137.5, 1004.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",2000-05-05,1,6 DAMREY,30.0,0.829677,-0.558244,0.5,-0.866025
5,"[[8.8, 137.5, 1004.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",2000-05-05,1,6 DAMREY,36.0,0.829677,-0.558244,0.5,-0.866025
6,"[[8.8, 137.5, 1004.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",2000-05-05,1,6 DAMREY,42.0,0.829677,-0.558244,0.5,-0.866025
7,"[[8.8, 137.5, 1004.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",2000-05-05,1,6 DAMREY,48.0,0.829677,-0.558244,0.5,-0.866025


In [18]:
# Preview the first eight training target rows.
y_train.head(8)

Unnamed: 0,dlat_target,dlon_target
0,0.2,-1.0
1,0.5,-1.6
2,1.0,-2.3
3,1.4,-3.0
4,2.1,-3.8
5,2.8,-3.8
6,3.3,-4.2
7,3.5,-4.4


In [38]:
from training import CycloneDataModule, calculate_sample_weights

# LightningCycloneModel is instantiated further down in this cell; with this
# import commented out the cell only worked when the name was left over in the
# kernel from an earlier session (NameError after Restart & Run All).
from models.model import LightningCycloneModel
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
import numpy as np
from core.features import FeatureConfig

feature_cfg = FeatureConfig()
# Uniform weighting: every training example gets weight 1.
# Alternative kept for reference:
#   sample_weight = calculate_sample_weights(X_train, strategy="no")
sample_weight = np.ones(len(X_train))

# Keyword options for the data module, grouped by concern.
datamodule_options = dict(
    sample_weight=sample_weight,
    batch_size=512,
    # shuffling switches
    shuffle_batch=True,
    shuffle_dataset=True,
    shuffle_sequences=False,
    # preprocessing switches
    normalize_sequences=True,
    augment_data=True,
)

# Positional inputs: training features/targets, then validation features/targets.
data = CycloneDataModule(X_train, y_train, X_val, y_val, **datamodule_options)

# Input widths are taken from the feature configuration rather than hard-coded.
n_sequence_features = len(feature_cfg.sequence_features)
n_static_features = len(feature_cfg.static_features)

model = LightningCycloneModel(
    sequence_feature_dim=n_sequence_features,
    static_feature_dim=n_static_features,
    hidden_dim=64,
    learning_rate=1e-3,
    warmup_epochs=3,
    # Other loss names previously tried in this cell:
    # "horizon_aware_sector", "sector", "improved_sector"
    loss_fn="haversine",
)

# Stop once val_loss has not improved for 5 consecutive checks.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5, verbose=True)

# Keep only the single checkpoint with the lowest val_loss.
best_checkpoint = ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=1,
    filename="best-{epoch:02d}-{val_loss:.2f}",
)

# Log the learning rate once per epoch.
lr_monitor = LearningRateMonitor(logging_interval="epoch")

callbacks = [early_stopping, best_checkpoint, lr_monitor]

# Trainer: hardware is auto-selected, at most 30 epochs, metrics logged every 10 steps.
trainer = pl.Trainer(
    accelerator="auto",
    devices="auto",
    max_epochs=30,
    log_every_n_steps=10,
    callbacks=callbacks,
)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
# Announce the start, then train the model on the prepared data module.
print("🚀 Начало обучения...")
trainer.fit(model=model, datamodule=data)

🚀 Начало обучения...



  | Name      | Type          | Params | Mode 
----------------------------------------------------
0 | criterion | HaversineLoss | 0      | train
1 | net       | NNLatLon      | 47.3 K | train
----------------------------------------------------
47.3 K    Trainable params
0         Non-trainable params
47.3 K    Total params
0.189     Total estimated model params size (MB)
21        Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 850/850 [10:59<00:00,  1.29it/s, v_num=30, train_loss_step=409.0, val_loss_step=288.0, val_loss_epoch=352.0, val_p300_48h=20.70, train_loss_epoch=433.0]

Metric val_loss improved. New best score: 351.858


Epoch 1: 100%|██████████| 850/850 [06:17<00:00,  2.25it/s, v_num=30, train_loss_step=287.0, val_loss_step=259.0, val_loss_epoch=297.0, val_p300_48h=26.30, train_loss_epoch=313.0, train_p300_48h=24.00]

Metric val_loss improved by 54.804 >= min_delta = 0.0. New best score: 297.054


Epoch 2: 100%|██████████| 850/850 [05:54<00:00,  2.40it/s, v_num=30, train_loss_step=361.0, val_loss_step=234.0, val_loss_epoch=280.0, val_p300_48h=27.50, train_loss_epoch=288.0, train_p300_48h=31.20]

Metric val_loss improved by 16.875 >= min_delta = 0.0. New best score: 280.179


Epoch 3: 100%|██████████| 850/850 [07:08<00:00,  1.98it/s, v_num=30, train_loss_step=291.0, val_loss_step=204.0, val_loss_epoch=279.0, val_p300_48h=28.90, train_loss_epoch=275.0, train_p300_48h=33.10]

Metric val_loss improved by 1.554 >= min_delta = 0.0. New best score: 278.625


Epoch 6: 100%|██████████| 850/850 [04:38<00:00,  3.06it/s, v_num=30, train_loss_step=282.0, val_loss_step=247.0, val_loss_epoch=265.0, val_p300_48h=31.50, train_loss_epoch=260.0, train_p300_48h=35.00]

Metric val_loss improved by 14.030 >= min_delta = 0.0. New best score: 264.595


Epoch 9:  50%|█████     | 429/850 [02:40<02:37,  2.68it/s, v_num=30, train_loss_step=266.0, val_loss_step=189.0, val_loss_epoch=265.0, val_p300_48h=30.70, train_loss_epoch=256.0, train_p300_48h=34.50]

In [27]:
# Number of entries in the configured list of sequence features.
sequence_feature_count = len(feature_cfg.sequence_features)
sequence_feature_count

9

In [None]:
def horizon_split(X, y, horizon_hours):
    """Select the examples belonging to one forecast horizon.

    Args:
        X: Feature frame with a ``target_time_hours`` column.
        y: Target frame whose rows correspond to the rows of ``X``.
        horizon_hours: Horizon value to keep (compared with ``==``).

    Returns:
        Tuple ``(X_subset, y_subset)`` holding only the matching rows, each
        with a fresh 0..n-1 index.
    """
    horizon_mask = X["target_time_hours"] == horizon_hours
    # The same mask filters both frames so features and targets stay paired.
    X_subset = X[horizon_mask].reset_index(drop=True)
    y_subset = y[horizon_mask].reset_index(drop=True)
    return X_subset, y_subset


# One (features, targets) pair per evaluated horizon: 6, 12, 24 and 48 hours.
(X_val6, y_val6), (X_val12, y_val12), (X_val24, y_val24), (X_val48, y_val48) = (
    horizon_split(X_val, y_val, hours) for hours in (6, 12, 24, 48)
)

In [None]:
# horizon_split and the 6/12/24/48 h validation splits are already created by the
# preceding cell, so they are not redefined here; only the 72 h split is added.
# NOTE(review): 72 h is not among the horizons_hours passed to DataProcessor in
# this notebook, so this split is expected to be empty — confirm it is still needed.
X_val72, y_val72 = horizon_split(X_val, y_val, 72)

In [None]:
# Duplicate cell: horizon_split and the per-horizon validation splits
# (X_val6 ... X_val72 / y_val6 ... y_val72) are already defined by the cells
# above, so nothing is redefined or recomputed here.

In [13]:
import pandas as pd
from evaluation.evaluator import ModelEvaluator

evaluator = ModelEvaluator()

# One (features, targets, display label) triple per evaluated horizon.
horizon_sets = list(
    zip(
        (X_val6, X_val12, X_val24, X_val48),
        (y_val6, y_val12, y_val24, y_val48),
        ("6 ч", "12 ч", "24 ч", "48 ч"),
    )
)

def _labelled_metrics(features_h, targets_h, horizon_label):
    """Evaluate the model on one horizon subset and tag the result with its label."""
    metrics = evaluator.evaluate_horizon(model, features_h, targets_h)
    metrics["Горизонт"] = horizon_label
    return metrics


# One metrics record per horizon, in the order given by horizon_sets.
rows = [_labelled_metrics(*horizon_entry) for horizon_entry in horizon_sets]

# Display names (Russian) for the metric columns of the per-horizon records.
metric_column_names = {
    "samples": "Примеров",
    "mean_km": "Средняя (км)",
    "median_km": "Медиана (км)",
    "max_km": "Макс. ошибка (км)",
    "p50": "P<50 км(%)",
    "p100": "P<100 км(%)",
    "p300": "P<300 км(%)",
}

# Stored under its own name instead of `df`: `df` holds the raw Best Track
# frame loaded at the top of the notebook, and rebinding it here would break
# any re-run of the dataset-building cell after evaluation.
horizon_metrics = (
    pd.DataFrame(rows)
    .rename(columns=metric_column_names)
    .set_index("Горизонт")
    .round(1)
)

print(horizon_metrics.to_string())

          Примеров  Средняя (км)  Медиана (км)  Макс. ошибка (км)  P<50 км(%)  P<100 км(%)  P<300 км(%)
Горизонт                                                                                               
6 ч           1487          52.9          42.7              322.2        59.4         88.7         99.9
12 ч          1425         101.2          81.3              527.8        27.7         60.9         97.1
24 ч          1312         218.7         177.6             1143.1         6.2         22.3         76.7
48 ч          1109         481.9         381.9             2150.1         1.3          4.2         36.3


In [32]:
# from evaluation.visualization import plot_animated_trajectory

# unique_cyclones = X_val["intl_id"].unique()[:5]  # Первые 5 циклонов
# for cyclone_id in unique_cyclones:
#     print(f"Визуализация циклона {cyclone_id}")
#     idx = X_val["target_time_hours"] == 48.0
#     plot_animated_trajectory(model, X_val[idx], y_val[idx], cyclone_id)

In [34]:
# from evaluation.visualization import plot_enhanced_trajectory

# unique_cyclones = X_val["intl_id"].unique()[:5]  # Первые 5 циклонов
# for cyclone_id in unique_cyclones:
#     print(f"Визуализация циклона {cyclone_id}")
#     idx = X_val["target_time_hours"] == 48.0
#     plot_enhanced_trajectory(model, X_val[idx], y_val[idx], cyclone_id)