#  Import

In [1]:
print("hello worlds")

hello worlds


In [2]:
import os
import sys

import numpy as  np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset as TorchDataset, DataLoader, Subset
import time

from PIL import Image
import numpy as np

from deepverse import ParameterManager
from deepverse.scenario import ScenarioManager
from deepverse import Dataset

from deepverse.visualizers import ImageVisualizer, LidarVisualizer

# Settings

In [3]:
# Scenes 100
## Subcarriers 64

scenarios_name = "DT31"
config_path = f"scenarios/{scenarios_name}/param/config.m"
param_manager = ParameterManager(config_path)

params = param_manager.get_params()

param_manager.params["scenes"] =list(range(100))
param_manager.params["comm"]["OFDM"]["selected_subcarriers"] = list(range(64))


# Generate a dataset

In [4]:
dataset = Dataset(param_manager)

Generating camera dataset: ⏳ In progress
[F[KGenerating camera dataset: ✅ Completed (0.02s)
Generating LiDAR dataset: ⏳ In progress
[F[KGenerating LiDAR dataset: ✅ Completed (0.00s)
Generating mobility dataset: ⏳ In progress
[F[KGenerating mobility dataset: ✅ Completed (0.00s)
Generating comm dataset: ⏳ In progress


                                                                    

[F[KGenerating comm dataset: ✅ Completed (4.86s)
Generating radar dataset: ⏳ In progress


                                                                    

[F[KGenerating radar dataset: ✅ Completed (236.69s)




# location dataset 
 지금 실험에서는 안쓰임

In [5]:
# comm = dataset.comm_dataset
# location =  comm

# location = [
#     {
#         "bs_loc": d["bs_loc"],                      # (3,)
#         "ue_loc": np.asarray(d["ue_loc"]).squeeze() # (3,)  (원래 (1,3)이면 squeeze)
#     }
#     for row in comm.data      # row: [dict] 형태
#     for d in row              # d: dict
# ]

# print(ue_location)  #)

# communication  dataset

In [6]:
# UE 정보
comm = dataset.comm_dataset
ch = comm.data[0][0]['ue'][0]
print(ch.coeffs.shape)  

(1, 16, 64)


# preprocessing

In [7]:
def get_coeffs_from_frame(frame, ue_idx=0):
    ue_obj = frame["ue"]

    # 케이스1) list/tuple이면 ue_idx로 선택
    if isinstance(ue_obj, (list, tuple)):
        ch_obj = ue_obj[ue_idx]
    else:
        # 케이스2) 단일 OFDMChannel이면 그대로 사용
        ch_obj = ue_obj

    # coeffs는 dict key가 아니라 attribute일 확률이 매우 큼
    if hasattr(ch_obj, "coeffs"):
        return ch_obj.coeffs

    # 혹시 dict라면 마지막 보험
    if isinstance(ch_obj, dict) and "coeffs" in ch_obj:
        return ch_obj["coeffs"]

    raise TypeError(f"Cannot get coeffs. ue type={type(ue_obj)}, ch type={type(ch_obj)}")


In [8]:
def get_train_min_max_realimag(frames, train_idx, us_idx=0):

    rmin, rmax =  float('inf'), float('-inf')
    imin, imax =  float('inf'), float('-inf')

    print("Calculating min/max over training set...")

    for t  in train_idx:
        frame  = frames[t]
        cooeffs  = get_coeffs_from_frame(frame, us_idx)  # (N_subcarriers, )

        rmin = min(rmin, float(cooeffs.real.min()))
        rmax = max(rmax, float(cooeffs.real.max()))
        imin = min(imin, float(cooeffs.imag.min()))
        imax = max(imax, float(cooeffs.imag.max()))

    print(f"Done. rmin={rmin}, rmax={rmax}, imin={imin}, imax={imax}")
    return (rmin, rmax), (imin, imax)

In [9]:
def preprocess_channel_coeffs_minmax(coeffs_np, r_min, r_max, i_min, i_max, device="cuda", eps=1e-12):
    # Convert Numpy to Tensor
    coeffs = torch.from_numpy(coeffs_np).to(torch.complex64)
    
    r = coeffs.real
    i = coeffs.imag
    
    # Min-Max Scaling [0, 1]
    # Add eps to denominator to prevent division by zero
    r_scaled = (r - r_min) / max(r_max - r_min, eps)
    i_scaled = (i - i_min) / max(i_max - i_min, eps)
    
    # Concat (Maintains shape like (..., 2*subcarriers))
    H = torch.cat([r_scaled, i_scaled], dim=-1).to(device)
    return H

In [10]:
# 사용예시
H = preprocess_channel_coeffs_minmax(ch.coeffs, r_min=-0.5, r_max=0.5, i_min=-0.5, i_max=0.5)
print(H.shape)  # (1, 16, 128) 64 subcar

torch.Size([1, 16, 128])


### image dataset

In [11]:
sensor = dataset.camera_dataset.sensors["unit1_cam1"]
path0 = sensor.files[0]
img = Image.open(path0).convert("RGB")
arr = np.array(img)

print("path:", path0)
print("PIL size (W,H):", img.size)
print("np shape:", arr.shape, "dtype:", arr.dtype)  # 보통 (H,W,3), uint8


path: scenarios/DT31/RGB_images/unit1_cam1/0.png
PIL size (W,H): (1920, 1080)
np shape: (1080, 1920, 3) dtype: uint8


In [12]:
IMG_SIZE = 224

def preprocess_img(path, img_size=IMG_SIZE, device="cuda"):
    # 1) load (H,W,3) uint8
    img = Image.open(path).convert("RGB")
    arr = np.array(img)

    # 2) numpy -> torch, (3,H,W), float32
    x = torch.from_numpy(arr).permute(2, 0, 1).contiguous().float()
    x = x / 255.0  # [0,1]

    # 3) add batch dim -> (1,3,H,W)
    x = x.unsqueeze(0)

    # 4) resize -> (1,3,224,224)
    x = F.interpolate(x, size=(img_size, img_size),
                      mode="bilinear", align_corners=False)

    # 5) normalize (ImageNet)
    mean = torch.tensor([0.485, 0.456, 0.406], dtype=x.dtype).view(1, 3, 1, 1)
    std  = torch.tensor([0.229, 0.224, 0.225], dtype=x.dtype).view(1, 3, 1, 1)
    x = (x - mean) / std

    # 6) move to device (GPU)
    x = x.to(device, non_blocking=True)

    return x  # (1,3,224,224) on device


In [13]:
# 사용예시
cd = dataset.camera_dataset
sensor = cd.sensors['unit1_cam1']
path0 = sensor.files[0]
img = preprocess_img(path0, device="cuda")
print(img.shape, img.device)  # torch.Size([1,3,224,224]) cuda:0
print(path0)

torch.Size([1, 3, 224, 224]) cuda:0
scenarios/DT31/RGB_images/unit1_cam1/0.png


# Dataset 구현

In [14]:
def flatten_comm_frames(comm):
    frames = []
    for row in comm.data:
        for d in row:
            frames.append(d)
    return frames

class MultiModalNextStepDatasetGPU(TorchDataset):
    def __init__(self, comm_frames, cam_files, ue_idx=0, past_len=15, device="cuda",
                 # Arguments for statistical values (initialized with default values)
                 r_min=0.0, r_max=1.0, i_min=0.0, i_max=1.0):
        
        self.comm_frames = comm_frames
        self.cam_files = list(cam_files)
        self.ue_idx = ue_idx
        self.past_len = past_len
        self.device = device
        
        # Save statistical values
        self.r_min, self.r_max = r_min, r_max
        self.i_min, self.i_max = i_min, i_max

        self.N = min(len(self.comm_frames), len(self.cam_files))
        self.valid_start = past_len - 1
        self.valid_end = self.N - 2 

    def __len__(self):
        return self.valid_end - self.valid_start + 1

    def __getitem__(self, idx):
        t = self.valid_start + idx

        # 1. Image Past (Apply Preprocessing)
        img_list = []
        for k in range(t - self.past_len + 1, t + 1):
            img_path = self.cam_files[k]
            img_k = preprocess_img(img_path, device=self.device).squeeze(0)
            img_list.append(img_k)
        img = torch.stack(img_list, dim=0)  # Shape: (past_len

        
        # 2. Channel Past (Apply Scaling)
        ch_list = []
        for k in range(t - self.past_len + 1, t + 1):
            coeffs_np = get_coeffs_from_frame(self.comm_frames[k], ue_idx=self.ue_idx)
            # Use the newly defined Min-Max preprocessing function
            h = preprocess_channel_coeffs_minmax(
                coeffs_np, 
                self.r_min, self.r_max, self.i_min, self.i_max, 
                device=self.device
            ).reshape(-1)
            ch_list.append(h)
        channel_past = torch.stack(ch_list, dim=0)

        # 3. Target (Apply Scaling) - Target must also be scaled for model training!
        coeffs_np_next = get_coeffs_from_frame(self.comm_frames[t + 1], ue_idx=self.ue_idx)
        target = preprocess_channel_coeffs_minmax(
            coeffs_np_next, 
            self.r_min, self.r_max, self.i_min, self.i_max, 
            device=self.device
        ).reshape(-1)

        return channel_past, img, target

# DataLoader 구현

In [15]:
comm_frames = flatten_comm_frames(dataset.comm_dataset)
sensor = dataset.camera_dataset.sensors["unit1_cam1"]

ds = MultiModalNextStepDatasetGPU(
    comm_frames=comm_frames,
    cam_files=sensor.files,
    ue_idx=0,
    past_len=16,
    device="cuda"
)

loader = DataLoader(
    ds,
    batch_size=8,
    shuffle=True,
    num_workers=0,     
    pin_memory=False   # ✅ 의미 없음 (이미 GPU)
)

ch, img, y = next(iter(loader))
print(ch.shape, img.shape, y.shape)
print(ch.device, img.device, y.device)


torch.Size([8, 16, 2048]) torch.Size([8, 16, 3, 224, 224]) torch.Size([8, 2048])
cuda:0 cuda:0 cuda:0


# ConvLSTM

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# -----------------------------
# 1) ConvLSTM building blocks
# -----------------------------
class ConvLSTMCell(nn.Module):
    def __init__(self, input_channels, hidden_channels, kernel_size=3, bias=True):
        super().__init__()
        padding = kernel_size // 2
        self.input_channels = input_channels
        self.hidden_channels = hidden_channels

        self.conv = nn.Conv2d(
            in_channels=input_channels + hidden_channels,
            out_channels=4 * hidden_channels,
            kernel_size=kernel_size,
            padding=padding,
            bias=bias
        )

    def forward(self, x, state):
        # x: (B, Cin, H, W)
        # state: (h, c) each (B, Chid, H, W)
        h, c = state
        combined = torch.cat([x, h], dim=1)  # (B, Cin+Chid, H, W)
        gates = self.conv(combined)          # (B, 4*Chid, H, W)

        i, f, o, g = torch.chunk(gates, 4, dim=1)
        i = torch.sigmoid(i)
        f = torch.sigmoid(f)
        o = torch.sigmoid(o)
        g = torch.tanh(g)

        c_next = f * c + i * g
        h_next = o * torch.tanh(c_next)
        return h_next, c_next


class ConvLSTM(nn.Module):
    """
    Input:  x (B,T,C,H,W)
    Output: last_h (B,hidden,H,W), (last_h,last_c)
    """
    def __init__(self, input_channels, hidden_channels=64, num_layers=2, kernel_size=3, dropout=0.0):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_channels = hidden_channels

        cells = []
        for layer in range(num_layers):
            cin = input_channels if layer == 0 else hidden_channels
            cells.append(ConvLSTMCell(cin, hidden_channels, kernel_size=kernel_size))
        self.cells = nn.ModuleList(cells)

        self.dropout = dropout

    def forward(self, x):
        # x: (B,T,C,H,W)
        B, T, C, H, W = x.shape
        device = x.device

        # init states
        hs, cs = [], []
        for _ in range(self.num_layers):
            hs.append(torch.zeros(B, self.hidden_channels, H, W, device=device, dtype=x.dtype))
            cs.append(torch.zeros(B, self.hidden_channels, H, W, device=device, dtype=x.dtype))

        # time loop
        for t in range(T):
            inp = x[:, t]  # (B,C,H,W)
            for l, cell in enumerate(self.cells):
                h, c = hs[l], cs[l]
                h, c = cell(inp, (h, c))
                hs[l], cs[l] = h, c
                inp = h
                if self.dropout > 0 and l < self.num_layers - 1:
                    inp = F.dropout(inp, p=self.dropout, training=self.training)

        return hs[-1], (hs[-1], cs[-1])


# -----------------------------
# 2) Encoders: image->map, channel->map
# -----------------------------
class ImgFrameEncoderMap(nn.Module):
    """
    (B*T,3,224,224) -> (B*T,Cm,Hm,Wm)  (Hm,Wm fixed by adaptive pooling)
    """
    def __init__(self, out_channels=64, out_hw=14):
        super().__init__()
        self.backbone = nn.Sequential(
            nn.Conv2d(3, 32, 5, stride=2, padding=2), nn.ReLU(),   # 224 -> 112
            nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU(),  # 112 -> 56
            nn.Conv2d(64, out_channels, 3, stride=2, padding=1), nn.ReLU(),  # 56 -> 28
        )
        self.pool = nn.AdaptiveAvgPool2d((out_hw, out_hw))  # 28 -> out_hw

    def forward(self, x):
        z = self.backbone(x)
        z = self.pool(z)
        return z  # (B*T, out_channels, out_hw, out_hw)


class ChannelVecToMap(nn.Module):
    """
    (B,T,F_in) -> (B,T,Cc,Hm,Wm)
    """
    def __init__(self, F_in, Cc=16, out_hw=14, hidden=512):
        super().__init__()
        self.Cc = Cc
        self.out_hw = out_hw
        self.proj = nn.Sequential(
            nn.Linear(F_in, hidden),
            nn.GELU(),
            nn.Linear(hidden, Cc * out_hw * out_hw),
        )

    def forward(self, ch):
        B, T, F = ch.shape
        z = self.proj(ch)  # (B,T,Cc*H*W)
        z = z.view(B, T, self.Cc, self.out_hw, self.out_hw)
        return z


# -----------------------------
# 3) ConvLSTM Early Fusion Forecaster
# -----------------------------
class ConvLSTM_EarlyFusion_Forecaster(nn.Module):
    """
    ch : (B,T,F_in=2048)
    img: (B,T,3,224,224)
    yhat: (B,F_out)
    """
    def __init__(
        self,
        F_in,
        F_out,
        img_map_ch=64,
        ch_map_ch=16,
        map_hw=14,
        convlstm_hidden=64,
        convlstm_layers=2,
        convlstm_kernel=3,
        dropout=0.1,
    ):
        super().__init__()
        self.map_hw = map_hw
        self.img_enc = ImgFrameEncoderMap(out_channels=img_map_ch, out_hw=map_hw)
        self.ch_map = ChannelVecToMap(F_in=F_in, Cc=ch_map_ch, out_hw=map_hw)

        fused_c = img_map_ch + ch_map_ch
        self.convlstm = ConvLSTM(
            input_channels=fused_c,
            hidden_channels=convlstm_hidden,
            num_layers=convlstm_layers,
            kernel_size=convlstm_kernel,
            dropout=dropout,
        )

        self.head = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),  # (B,Ch,1,1)
            nn.Flatten(),                  # (B,Ch)
            nn.Linear(convlstm_hidden, F_out),
        )

    def forward(self, ch, img):
        # ch:  (B,T,F_in)
        # img: (B,T,3,224,224)
        B, T = ch.shape[0], ch.shape[1]

        # image -> map
        img_ = img.view(B * T, *img.shape[2:])      # (B*T,3,224,224)
        img_map = self.img_enc(img_)                # (B*T,Cm,Hm,Wm)
        img_map = img_map.view(B, T, *img_map.shape[1:])  # (B,T,Cm,Hm,Wm)

        # channel -> map
        ch_map = self.ch_map(ch)                    # (B,T,Cc,Hm,Wm)

        # early fusion on maps
        x = torch.cat([img_map, ch_map], dim=2)     # (B,T,Cm+Cc,Hm,Wm)

        # ConvLSTM over time
        last_h, _ = self.convlstm(x)                # (B, hidden, Hm, Wm)

        # head -> (B,F_out)
        yhat = self.head(last_h)
        return yhat


# Fine-tuning
data shape 맞추기 위해

In [17]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# from lwm_multi_model import multi_modal_lwm  # 너가 올린 backbone

# class FinetuneChannelPredictor(nn.Module):
#     """
#     직접 구현한 lwm_multi_model(channel + image)에 맞는 파인튜닝 모델
#     Input:
#       ch:  (B, T, F_in)  e.g., (B,16,2048) 16: past Length, 2048: feature dim
#       img: (B, 3, 224, 224)
#     Output:
#       yhat: (B, F_out)  e.g., (B,2048)
#     """
#     def __init__(
#         self,
#         backbone: nn.Module,
#         F_in: int,
#         F_out: int,
#         pool: str = "last",          # "last" or "mean"
#         freeze_image: bool = False,
#         freeze_backbone: bool = False,
#         element_length: int = 16,    # 채널 벡터 차원 (backbone 기대값)
#         d_model: int = 64            # backbone 내부 feature dim
#     ):
#         super().__init__()
#         self.backbone = backbone
#         self.pool = pool

#         # backbone이 기대하는 channel feature dim = ELEMENT_LENGTH
#         # (backbone 내부 Channel_Embedding: Linear(ELEMENT_LENGTH -> D_MODEL))
#         if element_length is None:
#             element_length = backbone.channel_embedding.element_length
#         if d_model is None:
#             d_model = backbone.channel_embedding.d_model

#         # 입력 차원 정렬: F_in -> ELEMENT_LENGTH
#         self.in_proj = nn.Sequential(
#             nn.Linear(F_in, 512),
#             nn.GELU(),
#             nn.Linear(512, 128),
#             nn.GELU(),
#             nn.Linear(128, element_length)
#         )

#         # 출력 head: D_MODEL -> F_out
#         self.head = nn.Linear(d_model, F_out)

#         if freeze_image:
#             for p in self.backbone.image_embedding.parameters():
#                 p.requires_grad = False

#         if freeze_backbone:
#             for p in self.backbone.parameters():
#                 p.requires_grad = False
#             # 그래도 projection/head는 학습되게 다시 켜기
#             for p in self.in_proj.parameters():
#                 p.requires_grad = True
#             for p in self.head.parameters():
#                 p.requires_grad = True

#     def forward(self, ch, img):
#         # ch: (B,T,F_in) -> (B,T,ELEMENT_LENGTH)
#         ch = self.in_proj(ch)

#         # backbone: (B,T,D_MODEL)/
#         tokens = self.backbone(ch, img)

#         # pooling -> (B,D_MODEL)
#         if self.pool == "last":
#             z = tokens[:, -1, :]
#         elif self.pool == "mean":
#             z = tokens.mean(dim=1)
#         else:
#             raise ValueError(f"Unknown pool={self.pool}")

#         # head -> (B,F_out)
#         yhat = self.head(z)
#         return yhat
    


## NMSE(dB)

In [18]:
@torch.no_grad()
def nmse_db(yhat: torch.Tensor, y: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    # yhat, y: (B,F)
    num = torch.sum((yhat - y) ** 2, dim=1)
    den = torch.sum(y ** 2, dim=1).clamp_min(eps)
    nmse = num / den
    return 10.0 * torch.log10(nmse.clamp_min(eps)).mean()


# Train/Val split

In [19]:
n = len(ds)
n_train = int(0.75 * n)
train_idx = list(range(0, n_train))
val_idx = list(range(n_train, n))

train_ts = [ds.valid_start + i for i in train_idx]

(real_min,  real_max), (imag_min, imag_max) = get_train_min_max_realimag(
    comm_frames, train_ts, us_idx=0
)

ds.r_min = real_min
ds.r_max = real_max
ds.i_min = imag_min
ds.i_max = imag_max

print("Dataset statistical values set in the dataset.")

train_ds = Subset(ds, train_idx)
val_ds   = Subset(ds, val_idx)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,  num_workers=0)
val_loader   = DataLoader(val_ds,   batch_size=32, shuffle=False, num_workers=0)
F_in = 2048  # 

# Verify
ch, img, y = next(iter(train_loader))
F_out = y.shape[-1]
print("\n=== Data Check ===")
print(f"y stats | min: {y.min().item():.4f}, max: {y.max().item():.4f}")
print("If scaling worked correctly, values should be within [0, 1].")

Calculating min/max over training set...
Done. rmin=-1.0646139668415024e-06, rmax=1.091798991441695e-06, imin=-1.0719515918791155e-06, imax=1.0643867764302909e-06
Dataset statistical values set in the dataset.

=== Data Check ===
y stats | min: 0.0000, max: 1.0000
If scaling worked correctly, values should be within [0, 1].


In [20]:
comm_frames = flatten_comm_frames(dataset.comm_dataset)
cam_files = list(dataset.camera_dataset.sensors["unit1_cam1"].files)

print("len(comm_frames):", len(comm_frames))
print("len(cam_files):", len(cam_files))
print("first comm frame keys:", list(comm_frames[0].keys()))
print("first cam file:", cam_files[0])


len(comm_frames): 100
len(cam_files): 7012
first comm frame keys: ['bs_loc', 'ue', 'ue_loc', 'bs']
first cam file: scenarios/DT31/RGB_images/unit1_cam1/0.png


In [21]:
len(val_loader)

1

# Model generate and also check

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print("device:", device)

# # 배치 하나로 F_in/F_out 자동 확정
# ch, img, y = next(iter(train_loader))
# F_in  = ch.shape[-1]
# F_out = y.shape[-1]
# print("Detected:", "F_in=", F_in, "F_out=", F_out)
# print("Batch devices:", ch.device, img.device, y.device)

# # backbone + finetune model
# backbone = multi_modal_lwm().to(device)

# model = FinetuneChannelPredictor(
#     backbone=backbone,
#     F_in=F_in,
#     F_out=F_out,
#     pool="last",            # "mean"으로 바꿔도 됨
#     freeze_image=False,     # 원하면 True (이미지 인코더 고정)
#     freeze_backbone=False,  # 원하면 True (proj/head만 학습)
#     element_length=16,
#     d_model=64
# ).to(device)

# # sanity forward
# model.eval()
# with torch.no_grad():
#     # ds가 이미 cuda 텐서 반환이면 아래 .to(device) 생략 가능
#     yhat = model(ch.to(device), img.to(device))
# print("yhat:", yhat.shape, "y:", y.shape)



# Train/ Eval 함수 (AMP + grad clip)

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def train_one_epoch(model, loader, optimizer, device, use_amp=True, grad_clip=1.0):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    total_loss = 0.0
    total_nmse = 0.0
    n = 0

    for ch, img, y in loader:
        # Dataset이 이미 cuda 텐서를 반환하더라도 안전하게 유지
        ch = ch.to(device, non_blocking=True)
        img = img.to(device, non_blocking=True)
        y  = y.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)

        with torch.cuda.amp.autocast(enabled=use_amp):
            yhat = model(ch, img)
            loss = F.mse_loss(yhat, y)

        scaler.scale(loss).backward()

        if grad_clip is not None and grad_clip > 0:
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        total_nmse += nmse_db(yhat.detach(), y).item()
        n += 1

    return total_loss / max(n, 1), total_nmse / max(n, 1)


@torch.no_grad()
def evaluate(model, loader, device):
    model.eval()

    total_loss = 0.0
    total_nmse = 0.0
    n = 0

    for ch, img, y in loader:
        ch = ch.to(device, non_blocking=True)
        img = img.to(device, non_blocking=True)
        y  = y.to(device, non_blocking=True)

        yhat = model(ch, img)
        loss = F.mse_loss(yhat, y)

        total_loss += loss.item()
        total_nmse += nmse_db(yhat, y).item()
        n += 1

    return total_loss / max(n, 1), total_nmse / max(n, 1)


In [24]:
import time
import torch
import torch.nn as nn

def fit_model(
    model: nn.Module,
    train_loader,
    val_loader,
    device,
    epochs: int,
    ckpt_path: str,
    lr: float = 1e-4,
    weight_decay: float = 1e-4,
    use_amp: bool = True,
    grad_clip: float = 1.0,
):
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    print("trainable params:", sum(p.numel() for p in trainable_params))

    optimizer = torch.optim.AdamW(trainable_params, lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    # best trackers
    best_val_loss = float("inf")
    best_val_nmse = float("inf")  # (dB) lower is better

    total_t0 = time.time()

    for epoch in range(1, epochs + 1):
        t0 = time.time()

        tr_loss, tr_nmse = train_one_epoch(
            model, train_loader, optimizer,
            device=device, use_amp=use_amp, grad_clip=grad_clip
        )
        va_loss, va_nmse = evaluate(model, val_loader, device=device)

        scheduler.step()

        dt = time.time() - t0
        print(
            f"[{epoch:02d}/{epochs}] "
            f"train loss={tr_loss:.6f}, nmse(dB)={tr_nmse:.4f} | "
            f"val loss={va_loss:.6f}, nmse(dB)={va_nmse:.4f} | "
            f"{dt:.1f}s"
        )

        # ✅ checkpoint 기준: val loss (기존 유지)
        if va_loss < best_val_loss:
            best_val_loss = va_loss
            best_val_nmse = va_nmse  # loss 기준으로 best일 때의 nmse 저장

            torch.save(
                {
                    "epoch": epoch,
                    "model_state": model.state_dict(),
                    "optimizer_state": optimizer.state_dict(),
                    "best_val_loss": best_val_loss,
                    "best_val_nmse_db": best_val_nmse,
                    "F_in": F_in,
                    "F_out": F_out,
                },
                ckpt_path
            )
            print(f"  ↳ saved {ckpt_path} (best val loss)")

        # (옵션) 만약 "NMSE(dB) 기준 best"도 따로 저장하고 싶으면 아래처럼 추가 가능
        # if va_nmse < best_val_nmse:
        #     best_val_nmse = va_nmse
        #     torch.save(..., "best_nmse_convlstm.pt")

    total_dt = time.time() - total_t0
    print(f"\nTOTAL TRAIN TIME: {total_dt/60:.2f} min ({total_dt:.1f} sec)")
    print(f"BEST (by val loss) -> val loss={best_val_loss:.6f}, val nmse(dB)={best_val_nmse:.4f}")

    # ✅ best loss, best nmse, total time 모두 리턴
    return best_val_loss, best_val_nmse, total_dt


#  Optiimizer  / Scheduler 설정

In [25]:
# # requires_grad=True인 파라미터만 학습
# trainable_params = [p for p in model.parameters() if p.requires_grad]
# print("trainable params:", sum(p.numel() for p in trainable_params))

# optimizer = torch.optim.AdamW(trainable_params, lr=1e-4, weight_decay=1e-4)

# # (선택) cosine scheduler
# epochs = 1000
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)


# 학습 루프 + checkpoint 저장

In [26]:
epochs = 500

model_convlstm = ConvLSTM_EarlyFusion_Forecaster(
    F_in=F_in,
    F_out=F_out,
    img_map_ch=64,
    ch_map_ch=16,
    map_hw=14,
    convlstm_hidden=64,
    convlstm_layers=2,
    convlstm_kernel=3,
    dropout=0.1,
).to(device)

print("\n=== Training: ConvLSTM Early Fusion ===")
best_loss, best_nmse_db, total_time_sec = fit_model(
    model=model_convlstm,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    epochs=epochs,
    ckpt_path="best_convlstm_earlyfusion.pt",
)

print("\n=== Done ===")
print(f"Best val loss      : {best_loss:.6f}")
print(f"Best val NMSE (dB) : {best_nmse_db:.4f}")
print(f"Total train time   : {total_time_sec/60:.2f} min ({total_time_sec:.1f} sec)")



=== Training: ConvLSTM Early Fusion ===
trainable params: 3476032


  scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
  with torch.cuda.amp.autocast(enabled=use_amp):


[01/500] train loss=0.289630, nmse(dB)=0.0518 | val loss=0.261238, nmse(dB)=0.0482 | 85.3s
  ↳ saved best_convlstm_earlyfusion.pt (best val loss)
[02/500] train loss=0.289253, nmse(dB)=0.0446 | val loss=0.260723, nmse(dB)=0.0396 | 85.5s
  ↳ saved best_convlstm_earlyfusion.pt (best val loss)
[03/500] train loss=0.288630, nmse(dB)=0.0364 | val loss=0.260090, nmse(dB)=0.0290 | 81.8s
  ↳ saved best_convlstm_earlyfusion.pt (best val loss)
[04/500] train loss=0.287991, nmse(dB)=0.0260 | val loss=0.259203, nmse(dB)=0.0142 | 85.1s
  ↳ saved best_convlstm_earlyfusion.pt (best val loss)
[05/500] train loss=0.287020, nmse(dB)=0.0108 | val loss=0.257797, nmse(dB)=-0.0095 | 81.9s
  ↳ saved best_convlstm_earlyfusion.pt (best val loss)
[06/500] train loss=0.285249, nmse(dB)=-0.0141 | val loss=0.255410, nmse(dB)=-0.0499 | 82.8s
  ↳ saved best_convlstm_earlyfusion.pt (best val loss)
[07/500] train loss=0.282736, nmse(dB)=-0.0565 | val loss=0.251406, nmse(dB)=-0.1186 | 85.8s
  ↳ saved best_convlstm_earl

In [27]:
ch, img, y = next(iter(train_loader))

# 안전하게 device 이동 (dataset이 이미 cuda를 주더라도 문제 없음)
ch = ch.to(device, non_blocking=True)
img = img.to(device, non_blocking=True)
y  = y.to(device, non_blocking=True)

print("y abs mean:", y.abs().mean().item())
print("y abs max :", y.abs().max().item())
print("y power   :", (y**2).mean().item())

with torch.no_grad():
    model_convlstm.eval()
    yhat = model_convlstm(ch, img)

print("yhat abs mean:", yhat.abs().mean().item())
print("yhat abs max :", yhat.abs().max().item())
print("yhat power   :", (yhat**2).mean().item())


y abs mean: 0.479672908782959
y abs max : 1.0
y power   : 0.2711747884750366
yhat abs mean: 0.48492735624313354
yhat abs max : 0.6587302088737488
yhat power   : 0.23690596222877502


# 데이터 입력 및 형태

In [28]:
@torch.no_grad()
def debug_batch_and_forward(loader, device, model=None, name="ConvLSTM", force_img_seq=True):
    """
    - loader에서 배치 하나 뽑아서 shape/device/dtype 확인
    - model이 있으면 forward까지 해서 yhat shape + 간단 통계 확인
    - ConvLSTM은 기본적으로 img=(B,T,3,H,W)를 기대
      -> img가 (B,3,H,W)면 force_img_seq=True일 때 (B,T,3,H,W)로 자동 확장
    """
    ch, img, y = next(iter(loader))

    def info(x, label):
        print(f"{label:>4}: shape={tuple(x.shape)} dtype={x.dtype} device={x.device}")

    print("\n=== one batch tensor info ===")
    info(ch, "ch")
    info(img, "img")
    info(y,  "y")

    # img 차원 설명 + ConvLSTM용 자동 확장
    if img.dim() == 5:
        print("img format: (B,T,3,H,W) ✅")
    elif img.dim() == 4:
        print("img format: (B,3,H,W)")
        if force_img_seq and model is not None:
            # ch에서 T를 가져와 img를 (B,T,3,H,W)로 맞춤
            T = ch.shape[1]
            img = img.unsqueeze(1).repeat(1, T, 1, 1, 1)
            print(f" -> expanded img to (B,T,3,H,W) with T={T}")
            info(img, "img*")
    else:
        print(f"img format: unexpected dim={img.dim()}")

    if model is not None:
        model.eval()
        ch = ch.to(device, non_blocking=True)
        img = img.to(device, non_blocking=True)
        y  = y.to(device, non_blocking=True)

        yhat = model(ch, img)
        info(yhat, f"{name}_yhat")

        # 간단 통계
        print(f"{name} yhat abs mean:", yhat.abs().mean().item())
        print(f"{name} yhat abs max :", yhat.abs().max().item())
        print(f"{name} yhat power   :", (yhat**2).mean().item())

        return (ch, img, y, yhat)

    return (ch, img, y)


In [29]:
print("=== dataset sizes ===")
print("N(comm_frames):", len(comm_frames))
print("N(cam_files)  :", len(cam_files))
print("N(min)        :", min(len(comm_frames), len(cam_files)))
print("past_len      :", ds.past_len)
print("len(ds)       :", len(ds))
print("len(train_ds) :", len(train_ds))
print("len(val_ds)   :", len(val_ds))
print("len(train_loader):", len(train_loader))
print("len(val_loader)  :", len(val_loader))

# ConvLSTM
debug_batch_and_forward(train_loader, device, model_convlstm, name="ConvLSTM", force_img_seq=True)


=== dataset sizes ===
N(comm_frames): 100
N(cam_files)  : 7012
N(min)        : 100
past_len      : 16
len(ds)       : 84
len(train_ds) : 63
len(val_ds)   : 21
len(train_loader): 2
len(val_loader)  : 1

=== one batch tensor info ===
  ch: shape=(32, 16, 2048) dtype=torch.float32 device=cuda:0
 img: shape=(32, 16, 3, 224, 224) dtype=torch.float32 device=cuda:0
   y: shape=(32, 2048) dtype=torch.float32 device=cuda:0
img format: (B,T,3,H,W) ✅
ConvLSTM_yhat: shape=(32, 2048) dtype=torch.float32 device=cuda:0
ConvLSTM yhat abs mean: 0.4949188828468323
ConvLSTM yhat abs max : 0.7043417096138
ConvLSTM yhat power   : 0.24614623188972473


(tensor([[[0.7071, 0.7267, 0.7437,  ..., 0.3161, 0.2963, 0.2776],
          [0.3173, 0.2997, 0.2824,  ..., 0.9147, 0.9237, 0.9295],
          [0.9300, 0.9405, 0.9473,  ..., 0.3734, 0.3941, 0.4135],
          ...,
          [0.1897, 0.1977, 0.2072,  ..., 0.2411, 0.2543, 0.2662],
          [0.6055, 0.6269, 0.6474,  ..., 0.7570, 0.7697, 0.7793],
          [0.6364, 0.6241, 0.6110,  ..., 0.3712, 0.3637, 0.3543]],
 
         [[0.5360, 0.5395, 0.5451,  ..., 0.4707, 0.4819, 0.4974],
          [0.6890, 0.6798, 0.6731,  ..., 0.6881, 0.6868, 0.6869],
          [0.5834, 0.5985, 0.6148,  ..., 0.7316, 0.7507, 0.7668],
          ...,
          [0.3330, 0.3232, 0.3142,  ..., 0.7128, 0.6953, 0.6739],
          [0.6469, 0.6474, 0.6486,  ..., 0.5114, 0.5193, 0.5300],
          [0.6921, 0.6864, 0.6767,  ..., 0.5602, 0.5750, 0.5904]],
 
         [[0.6039, 0.5822, 0.5604,  ..., 0.2329, 0.2328, 0.2335],
          [0.7136, 0.7239, 0.7306,  ..., 0.4165, 0.4095, 0.4035],
          [0.7090, 0.7159, 0.7231,  ...,