In [17]:
import warnings
import pandas as pd
import pytorch_lightning as pl
from tqdm.auto import tqdm
from evaluation.visualization import print_sequence_table

# Silence every warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and numerical warnings;
# consider narrowing it to the specific categories that are known to be noisy.
warnings.filterwarnings("ignore")
# Hook tqdm into pandas (registers the progress_* methods on pandas objects).
# No visible cell in this notebook calls them, so this may be a leftover.
tqdm.pandas()

In [18]:
# Raw Best Track table; `df` is what the dataset-building cell passes to the processor.
df = pd.read_csv("bst_data/bst_all.csv")

# Quick sanity summary: row count, number of distinct cyclones, covered time span.
# `analysis_time` is read without date parsing, so min/max compare whatever dtype pandas
# inferred (presumably strings, which sort correctly for ISO-style timestamps).
summary_lines = (
    f"✅ Загружено {len(df):,} записей из Best Track данных",
    f"📊 Уникальных циклонов: {df['intl_id'].nunique()}",
    f"📅 Временной диапазон: {df['analysis_time'].min()} - {df['analysis_time'].max()}",
)
print(*summary_lines, sep="\n")

✅ Загружено 70,217 записей из Best Track данных
📊 Уникальных циклонов: 1924
📅 Временной диапазон: 1951-02-19 06:00:00 - 2024-12-25 12:00:00


In [19]:
import pickle
from pathlib import Path

from data_processing.data_processor import DataProcessor
from data_processing.dataset_models import SequenceConfig
from data_processing.dataset_utils import split_data_by_years


# History-window settings for the input sequences: at least 2 observed steps per sample.
# Presumably max_history_length=None means "no upper bound" — TODO confirm in SequenceConfig.
sequence_config = SequenceConfig(
    min_history_length=2,
    max_history_length=None,
)

# Processor that turns the raw Best Track table into model-ready samples.
# NOTE(review): train_max_year/val_max_year here are 2021/2023, while the later
# split_data_by_years call uses 2020/2022 — confirm which boundaries are intended.
processor = DataProcessor(
    # Forecast horizons in hours: every 6 h up to 48 h.
    horizons_hours=[6, 12, 18, 24, 30, 36, 42, 48],
    # Reduced alternative horizon set (disabled):
    # horizons_hours=[6, 24, 48],
    sequence_config=sequence_config,
    train_max_year=2021,
    val_max_year=2023,
    validate_data=True,
)


# Cache locations: the pickled dataset and the processor settings it was built with.
dataset_path = Path("bst_data/processed_dataset.pkl")
processor_config_path = Path("bst_data/processor_config.pkl")

# Snapshot of the settings that determine the dataset contents. It is stored next to the
# cached dataset so a later run can tell whether that cache is still valid.
processor_config = {
    "horizons_hours": processor.horizons_hours,
    "train_max_year": processor.train_max_year,
    "val_max_year": processor.val_max_year,
    "sequence_config": processor.seq_config,
    "validate_data": processor.validate_data
}


def _comparable(value):
    """Return a form of ``value`` that compares by content.

    Plain config objects without a custom ``__eq__`` compare by identity, which is
    always False after a pickle round-trip, so their attribute dict is compared instead.
    """
    return vars(value) if hasattr(value, "__dict__") else value


def _cache_matches_config(config_path, current_config):
    """Tell whether the settings saved at ``config_path`` equal ``current_config``.

    Returns False when the settings file is missing, unreadable, or differs in any
    key or value; the caller then rebuilds the dataset instead of loading a stale one.
    """
    if not config_path.exists():
        return False
    try:
        # SECURITY: pickle.load can execute arbitrary code. Only cache files written by
        # this notebook should ever be placed at these paths.
        with open(config_path, "rb") as f:
            saved_config = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError, AttributeError, ImportError):
        return False
    if not isinstance(saved_config, dict) or saved_config.keys() != current_config.keys():
        return False
    try:
        return all(
            bool(_comparable(saved_config[key]) == _comparable(current_config[key]))
            for key in current_config
        )
    except (TypeError, ValueError):
        # Values that cannot be compared unambiguously are treated as a mismatch.
        return False


if dataset_path.exists() and _cache_matches_config(processor_config_path, processor_config):
    print("🔄 Загружаем сохраненный датасет...")
    # SECURITY: see the note in _cache_matches_config — never unpickle untrusted files.
    with open(dataset_path, "rb") as f:
        dataset = pickle.load(f)
    print("✅ Датасет загружен из файла")
else:
    if dataset_path.exists():
        # A cache exists but was built with different (or unknown) settings.
        print("⚠️ Кэш не соответствует текущим настройкам процессора — пересобираем датасет")
    print("🛠️ Создаем новый датасет...")
    dataset = processor.build_dataset(df)

    print("💾 Сохраняем датасет...")
    with open(dataset_path, "wb") as f:
        pickle.dump(dataset, f)
    # The settings file is written last: if the run is interrupted in between, the next
    # run finds no matching settings and rebuilds rather than trusting a partial cache.
    with open(processor_config_path, "wb") as f:
        pickle.dump(processor_config, f)
    print(f"✅ Датасет сохранен в {dataset_path.name}")

🔄 Загружаем сохраненный датасет...
✅ Датасет загружен из файла


In [20]:
# Split features/targets into train/val/test using the sample timestamps and year cut-offs.
# NOTE(review): the cut-offs here (2020/2022) differ from the ones given to DataProcessor
# (train_max_year=2021, val_max_year=2023) — confirm which pair is intended, or pass
# processor.train_max_year / processor.val_max_year so the two cannot drift apart.
# X_test / y_test are not used by any cell visible in this notebook.
X_train, y_train, X_val, y_val, X_test, y_test = split_data_by_years(
    dataset.X, dataset.y, dataset.times, train_max_year=2020, val_max_year=2022
)

In [21]:
# Number of training rows (judging by the X_train preview, one row per sequence and horizon).
len(X_train)

434745

In [22]:
# Pretty-print the per-step features of the first training sequence.
print_sequence_table(X_train["sequences"][0])


🔹 Seq_0
------------------------------------------------------------
Размер: 2 шагов × 9 признаков
+-------+-----------+-----------+------------------------+---------+----------------+---------------+---------------------+------------------------+-----------------------+
|       |   lat_deg |   lon_deg |   central_pressure_hpa |   grade |   velocity_kmh |   bearing_deg |   acceleration_kmh2 |   angular_velocity_deg |   pressure_change_hpa |
| Шаг 1 |     8.800 |   137.500 |               1004.000 |   2.000 |          0.000 |         0.000 |               0.000 |                  0.000 |                 0.000 |
+-------+-----------+-----------+------------------------+---------+----------------+---------------+---------------------+------------------------+-----------------------+
| Шаг 2 |     9.700 |   136.000 |               1004.000 |   2.000 |         32.109 |       301.414 |               0.000 |                  0.000 |                 0.000 |
+-------+-----------+-----------+--

In [23]:
# Rows 0-7 of X_train repeat one sequence (one row per forecast horizon), so row 8 is
# presumably the first row of the next, one-step-longer history window — TODO confirm.
print_sequence_table(X_train["sequences"][8])


🔹 Seq_0
------------------------------------------------------------
Размер: 3 шагов × 9 признаков
+-------+-----------+-----------+------------------------+---------+----------------+---------------+---------------------+------------------------+-----------------------+
|       |   lat_deg |   lon_deg |   central_pressure_hpa |   grade |   velocity_kmh |   bearing_deg |   acceleration_kmh2 |   angular_velocity_deg |   pressure_change_hpa |
| Шаг 1 |     8.800 |   137.500 |               1004.000 |   2.000 |          0.000 |         0.000 |               0.000 |                  0.000 |                 0.000 |
+-------+-----------+-----------+------------------------+---------+----------------+---------------+---------------------+------------------------+-----------------------+
| Шаг 2 |     9.700 |   136.000 |               1004.000 |   2.000 |         32.109 |       301.414 |               0.000 |                  0.000 |                 0.000 |
+-------+-----------+-----------+--

In [24]:
# Preview the first 8 feature rows: the same sequence paired with each target horizon.
X_train[:8]

Unnamed: 0,sequences,analysis_time,intl_id,storm_name,target_time_hours,day_of_year_sin,day_of_year_cos,month_of_year_sin,month_of_year_cos
0,"[[8.8, 137.5, 1004.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",2000-05-05,1,6 DAMREY,6.0,0.829677,-0.558244,0.5,-0.866025
1,"[[8.8, 137.5, 1004.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",2000-05-05,1,6 DAMREY,12.0,0.829677,-0.558244,0.5,-0.866025
2,"[[8.8, 137.5, 1004.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",2000-05-05,1,6 DAMREY,18.0,0.829677,-0.558244,0.5,-0.866025
3,"[[8.8, 137.5, 1004.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",2000-05-05,1,6 DAMREY,24.0,0.829677,-0.558244,0.5,-0.866025
4,"[[8.8, 137.5, 1004.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",2000-05-05,1,6 DAMREY,30.0,0.829677,-0.558244,0.5,-0.866025
5,"[[8.8, 137.5, 1004.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",2000-05-05,1,6 DAMREY,36.0,0.829677,-0.558244,0.5,-0.866025
6,"[[8.8, 137.5, 1004.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",2000-05-05,1,6 DAMREY,42.0,0.829677,-0.558244,0.5,-0.866025
7,"[[8.8, 137.5, 1004.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",2000-05-05,1,6 DAMREY,48.0,0.829677,-0.558244,0.5,-0.866025


In [25]:
# Matching targets for the rows above (dlat_target / dlon_target).
# Presumably these are lat/lon offsets in degrees from the last observed position — TODO confirm.
y_train[:8]

Unnamed: 0,dlat_target,dlon_target
0,0.2,-1.0
1,0.5,-1.6
2,1.0,-2.3
3,1.4,-3.0
4,2.1,-3.8
5,2.8,-3.8
6,3.3,-4.2
7,3.5,-4.4


In [None]:
from training import CycloneDataModule

from models.model import LightningCycloneModel
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
import numpy as np
from core.features import FeatureConfig

# Fix the RNG seeds before anything stochastic happens (dataset shuffling, augmentation,
# weight initialisation) so that training runs are repeatable. workers=True extends the
# seeding to dataloader worker processes.
SEED = 42
pl.seed_everything(SEED, workers=True)

# Feature layout shared by the data pipeline and the model constructor.
feature_cfg = FeatureConfig()
# Uniform per-sample weights: every training row contributes equally.
sample_weight = np.ones(len(X_train))

# Lightning data module wrapping the train/validation frames.
data = CycloneDataModule(
    X_train,
    y_train,
    X_val,
    y_val,
    sample_weight=sample_weight,
    batch_size=1024,
    shuffle_batch=True,
    shuffle_dataset=True,
    shuffle_sequences=False,
    normalize_sequences=True,
    augment_data=True,
)

# Model whose input widths are taken from the feature config rather than hard-coded.
# The active loss is "haversine"; the commented lines are alternative loss names that
# were tried (only one loss_fn argument may be active at a time).
model = LightningCycloneModel(
    sequence_feature_dim=len(feature_cfg.sequence_features),
    static_feature_dim=len(feature_cfg.static_features),
    hidden_dim=128,
    learning_rate=1e-3,
    loss_fn="haversine"
    # loss_fn="horizon_aware_sector",
    # loss_fn="sector",
    # loss_fn="improved_sector",
)

# Stop training once val_loss has failed to improve for 5 checks in a row.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5, verbose=True)

# Keep only the single best checkpoint, ranked by the lowest val_loss.
best_checkpoint = ModelCheckpoint(
    filename="best-{epoch:02d}-{val_loss:.2f}",
    monitor="val_loss",
    mode="min",
    save_top_k=1,
)

# Log the learning rate once per epoch.
lr_monitor = LearningRateMonitor(logging_interval="epoch")

callbacks = [early_stopping, best_checkpoint, lr_monitor]

# Trainer: at most 20 epochs (the EarlyStopping entry in `callbacks` may end the run sooner),
# metrics logged every 10 steps, hardware chosen automatically.
trainer = pl.Trainer(
    max_epochs=20,
    callbacks=callbacks,
    log_every_n_steps=10,
    accelerator="auto",
    devices="auto",
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
import torch

# Let PyTorch trade some float32 matmul precision for speed on supporting GPUs.
torch.set_float32_matmul_precision("high")

print("🚀 Начало обучения...")
# NOTE(review): after fit() `model` holds the weights from the last executed epoch, which
# is not necessarily the best checkpoint saved by ModelCheckpoint — confirm which one the
# evaluation cells are meant to use.
trainer.fit(model, datamodule=data)

🚀 Начало обучения...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type          | Params | Mode 
----------------------------------------------------
0 | criterion | HaversineLoss | 0      | train
1 | net       | NNLatLon      | 181 K  | train
----------------------------------------------------
181 K     Trainable params
0         Non-trainable params
181 K     Total params
0.728     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 425/425 [00:28<00:00, 15.02it/s, v_num=8, train_loss_step=316.0, val_loss_step=321.0, val_loss_epoch=328.0, val_p300_48h=22.90, train_loss_epoch=368.0]

Metric val_loss improved. New best score: 327.805


Epoch 1: 100%|██████████| 425/425 [00:31<00:00, 13.30it/s, v_num=8, train_loss_step=288.0, val_loss_step=268.0, val_loss_epoch=287.0, val_p300_48h=27.40, train_loss_epoch=295.0, train_p300_48h=28.30]

Metric val_loss improved by 40.719 >= min_delta = 0.0. New best score: 287.086


Epoch 2: 100%|██████████| 425/425 [00:29<00:00, 14.18it/s, v_num=8, train_loss_step=288.0, val_loss_step=252.0, val_loss_epoch=275.0, val_p300_48h=28.80, train_loss_epoch=276.0, train_p300_48h=30.10]

Metric val_loss improved by 11.961 >= min_delta = 0.0. New best score: 275.125


Epoch 3: 100%|██████████| 425/425 [00:32<00:00, 13.24it/s, v_num=8, train_loss_step=266.0, val_loss_step=244.0, val_loss_epoch=272.0, val_p300_48h=28.80, train_loss_epoch=268.0, train_p300_48h=33.50]

Metric val_loss improved by 2.955 >= min_delta = 0.0. New best score: 272.170


Epoch 4: 100%|██████████| 425/425 [00:28<00:00, 15.10it/s, v_num=8, train_loss_step=258.0, val_loss_step=233.0, val_loss_epoch=267.0, val_p300_48h=31.40, train_loss_epoch=263.0, train_p300_48h=34.30]

Metric val_loss improved by 5.317 >= min_delta = 0.0. New best score: 266.852


Epoch 5: 100%|██████████| 425/425 [00:27<00:00, 15.19it/s, v_num=8, train_loss_step=279.0, val_loss_step=276.0, val_loss_epoch=273.0, val_p300_48h=30.20, train_loss_epoch=260.0, train_p300_48h=33.70]

In [13]:
def horizon_split(X, y, horizon_hours):
    """Select the samples belonging to a single forecast horizon.

    Args:
        X: Feature frame with a ``target_time_hours`` column.
        y: Target frame/series indexed like ``X``.
        horizon_hours: Horizon value to keep (compared with ``==``).

    Returns:
        Tuple ``(X_h, y_h)`` holding only the matching rows, each with a fresh
        0..n-1 index.
    """
    mask = X["target_time_hours"].eq(horizon_hours)
    X_h = X.loc[mask].reset_index(drop=True)
    y_h = y.loc[mask].reset_index(drop=True)
    return X_h, y_h



# Validation subsets for the four horizons reported in the evaluation table.
(X_val6, y_val6), (X_val12, y_val12), (X_val24, y_val24), (X_val48, y_val48) = (
    horizon_split(X_val, y_val, hours) for hours in (6, 12, 24, 48)
)

In [14]:
import pandas as pd
from evaluation.evaluator import ModelEvaluator

evaluator = ModelEvaluator()

# (features, targets, display label) for every horizon shown in the summary table.
horizon_sets = [
    (X_val6, y_val6, "6 ч"),
    (X_val12, y_val12, "12 ч"),
    (X_val24, y_val24, "24 ч"),
    (X_val48, y_val48, "48 ч"),
]

# Put the model into evaluation mode before scoring.
model.eval()
rows = []
for X_h, y_h, label in horizon_sets:
    # evaluate_horizon presumably returns a dict of metrics (samples, mean_km, ...),
    # judging by the column renames below — TODO confirm.
    res = evaluator.evaluate_horizon(model, X_h, y_h)
    res["Горизонт"] = label
    rows.append(res)

# The table gets its own name. Assigning it to `df` would overwrite the raw Best Track
# frame that the dataset-building cell passes to the processor, so re-running that cell
# afterwards would silently operate on this metrics table instead.
horizon_metrics = (
    pd.DataFrame(rows)
    .rename(
        columns={
            "samples": "Примеров",
            "mean_km": "Средняя (км)",
            "median_km": "Медиана (км)",
            "max_km": "Макс. ошибка (км)",
            "p50": "P<50 км(%)",
            "p100": "P<100 км(%)",
            "p300": "P<300 км(%)",
        }
    )
    .set_index("Горизонт")
    .round(1)
)

print(horizon_metrics.to_string())

          Примеров  Средняя (км)  Медиана (км)  Макс. ошибка (км)  P<50 км(%)  P<100 км(%)  P<300 км(%)
Горизонт                                                                                               
6 ч           1487          59.2          47.8              400.6        53.2         85.8         99.7
12 ч          1425         119.2          97.1              696.1        21.4         51.2         95.4
24 ч          1312         245.2         199.9             1224.1         5.2         19.7         70.4
48 ч          1109         532.7         432.0             2259.7         1.3          3.7         29.4


In [15]:
from evaluation.visualization import plot_animated_trajectory

# First five cyclone ids of the validation set, in order of appearance.
unique_cyclones = X_val["intl_id"].unique()[:5]

# The 48-hour subset does not depend on the cyclone, so it is selected once here
# instead of re-filtering the whole validation set on every loop iteration.
idx = X_val["target_time_hours"] == 48.0
X_val_48h, y_val_48h = X_val[idx], y_val[idx]

for cyclone_id in unique_cyclones:
    print(f"Визуализация циклона {cyclone_id}")
    plot_animated_trajectory(model, X_val_48h, y_val_48h, cyclone_id)

Визуализация циклона 2101


Визуализация циклона 2102


Визуализация циклона 2103


Визуализация циклона 2104


Визуализация циклона 2105


In [16]:
# Disabled alternative to the animated plot above: same loop, but drawing with
# plot_enhanced_trajectory. Kept for reference; uncomment every line below to use it.
# from evaluation.visualization import plot_enhanced_trajectory

# unique_cyclones = X_val["intl_id"].unique()[:5]  # first 5 cyclones
# for cyclone_id in unique_cyclones:
#     print(f"Визуализация циклона {cyclone_id}")
#     idx = X_val["target_time_hours"] == 48.0
#     plot_enhanced_trajectory(model, X_val[idx], y_val[idx], cyclone_id)