#  Import

In [1]:
import os
import sys

import numpy as  np
from lwm_multi_model import multi_modal_lwm  # 클래스 import
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset as TorchDataset, DataLoader, Subset
import time

from PIL import Image
import numpy as np

from deepverse import ParameterManager
from deepverse.scenario import ScenarioManager
from deepverse import Dataset

from deepverse.visualizers import ImageVisualizer, LidarVisualizer

# Settings

In [2]:
# Scenes 2000
## Subcarriers 64

scenarios_name = "DT31"
config_path = f"scenarios/{scenarios_name}/param/config.m"
param_manager = ParameterManager(config_path)

params = param_manager.get_params()

param_manager.params["scenes"] =list(range(100))
param_manager.params["comm"]["OFDM"]["selected_subcarriers"] = list(range(64))


# Generate a dataset

In [3]:
dataset = Dataset(param_manager)

Generating camera dataset: ⏳ In progress
[F[KGenerating camera dataset: ✅ Completed (0.00s)
Generating LiDAR dataset: ⏳ In progress
[F[KGenerating LiDAR dataset: ✅ Completed (0.00s)
Generating mobility dataset: ⏳ In progress
[F[KGenerating mobility dataset: ✅ Completed (0.00s)
Generating comm dataset: ⏳ In progress


                                                                   

[F[KGenerating comm dataset: ✅ Completed (1.50s)
Generating radar dataset: ⏳ In progress


                                                                    

[F[KGenerating radar dataset: ✅ Completed (216.90s)




# location dataset 
 지금 실험에서는 안쓰임

In [4]:
# comm = dataset.comm_dataset
# location =  comm

# location = [
#     {
#         "bs_loc": d["bs_loc"],                      # (3,)
#         "ue_loc": np.asarray(d["ue_loc"]).squeeze() # (3,)  (원래 (1,3)이면 squeeze)
#     }
#     for row in comm.data      # row: [dict] 형태
#     for d in row              # d: dict
# ]

# print(ue_location)  #)

# communication  dataset

In [5]:
# UE 정보
comm = dataset.comm_dataset
ch = comm.data[0][0]['ue'][0]
print(ch.coeffs.shape)  

(1, 16, 64)


# preprocessing

In [6]:
def get_coeffs_from_frame(frame, ue_idx=0):
    ue_obj = frame["ue"]

    # 케이스1) list/tuple이면 ue_idx로 선택
    if isinstance(ue_obj, (list, tuple)):
        ch_obj = ue_obj[ue_idx]
    else:
        # 케이스2) 단일 OFDMChannel이면 그대로 사용
        ch_obj = ue_obj

    # coeffs는 dict key가 아니라 attribute일 확률이 매우 큼
    if hasattr(ch_obj, "coeffs"):
        return ch_obj.coeffs

    # 혹시 dict라면 마지막 보험
    if isinstance(ch_obj, dict) and "coeffs" in ch_obj:
        return ch_obj["coeffs"]

    raise TypeError(f"Cannot get coeffs. ue type={type(ue_obj)}, ch type={type(ch_obj)}")


In [7]:
def get_train_min_max_realimag(frames, train_idx, us_idx=0):

    rmin, rmax =  float('inf'), float('-inf')
    imin, imax =  float('inf'), float('-inf')

    print("Calculating min/max over training set...")

    for t  in train_idx:
        frame  = frames[t]
        cooeffs  = get_coeffs_from_frame(frame, us_idx)  # (N_subcarriers, )

        rmin = min(rmin, float(cooeffs.real.min()))
        rmax = max(rmax, float(cooeffs.real.max()))
        imin = min(imin, float(cooeffs.imag.min()))
        imax = max(imax, float(cooeffs.imag.max()))

    print(f"Done. rmin={rmin}, rmax={rmax}, imin={imin}, imax={imax}")
    return (rmin, rmax), (imin, imax)

In [8]:
def preprocess_channel_coeffs_minmax(coeffs_np, r_min, r_max, i_min, i_max, device="cuda", eps=1e-12):
    # Convert Numpy to Tensor
    coeffs = torch.from_numpy(coeffs_np).to(torch.complex64)
    
    r = coeffs.real
    i = coeffs.imag
    
    # Min-Max Scaling [0, 1]
    # Add eps to denominator to prevent division by zero
    r_scaled = (r - r_min) / max(r_max - r_min, eps)
    i_scaled = (i - i_min) / max(i_max - i_min, eps)
    
    # Concat (Maintains shape like (..., 2*subcarriers))
    H = torch.cat([r_scaled, i_scaled], dim=-1).to(device)
    return H

In [None]:
def preprocess_channel_coeffs_minmax(coeffs_np, r_min, r_max, i_min, i_max, device="cuda", eps=1e-12):
    # Convert Numpy to Tensor
    coeffs = torch.from_numpy(coeffs_np).to(torch.complex64)
    
    r = coeffs.real
    i = coeffs.imag
    
    # Min-Max Scaling [0, 1]
    # Add eps to denominator to prevent division by zero
    r_scaled = (r - r_min) / max(r_max - r_min, eps)
    i_scaled = (i - i_min) / max(i_max - i_min, eps)
    
    # Concat (Maintains shape like (..., 2*subcarriers))
    H = torch.cat([r_scaled, i_scaled], dim=-1).to(device)
    return H

In [9]:
# 사용예시
H = preprocess_channel_coeffs_minmax(ch.coeffs, r_min=-0.5, r_max=0.5, i_min=-0.5, i_max=0.5)
print(H.shape)  # (1, 16, 128) 64 subcar

torch.Size([1, 16, 128])


### image dataset

In [10]:
sensor = dataset.camera_dataset.sensors["unit1_cam1"]
path0 = sensor.files[0]
img = Image.open(path0).convert("RGB")
arr = np.array(img)

print("path:", path0)
print("PIL size (W,H):", img.size)
print("np shape:", arr.shape, "dtype:", arr.dtype)  # 보통 (H,W,3), uint8


path: scenarios/DT31/RGB_images/unit1_cam1/0.png
PIL size (W,H): (1920, 1080)
np shape: (1080, 1920, 3) dtype: uint8


In [11]:
IMG_SIZE = 224

def preprocess_img(path, img_size=IMG_SIZE, device="cuda"):
    # 1) load (H,W,3) uint8
    img = Image.open(path).convert("RGB")
    arr = np.array(img)

    # 2) numpy -> torch, (3,H,W), float32
    x = torch.from_numpy(arr).permute(2, 0, 1).contiguous().float()
    x = x / 255.0  # [0,1]

    # 3) add batch dim -> (1,3,H,W)
    x = x.unsqueeze(0)

    # 4) resize -> (1,3,224,224)
    x = F.interpolate(x, size=(img_size, img_size),
                      mode="bilinear", align_corners=False)

    # 5) normalize (ImageNet)
    mean = torch.tensor([0.485, 0.456, 0.406], dtype=x.dtype).view(1, 3, 1, 1)
    std  = torch.tensor([0.229, 0.224, 0.225], dtype=x.dtype).view(1, 3, 1, 1)
    x = (x - mean) / std

    # 6) move to device (GPU)
    x = x.to(device, non_blocking=True)

    return x  # (1,3,224,224) on device


In [12]:
# 사용예시
cd = dataset.camera_dataset
sensor = cd.sensors['unit1_cam1']
path0 = sensor.files[0]
img = preprocess_img(path0, device="cuda")
print(img.shape, img.device)  # torch.Size([1,3,224,224]) cuda:0
print(path0)

torch.Size([1, 3, 224, 224]) cuda:0
scenarios/DT31/RGB_images/unit1_cam1/0.png


# Dataset 구현

In [13]:
def flatten_comm_frames(comm):
    frames = []
    for row in comm.data:
        for d in row:
            frames.append(d)
    return frames

class MultiModalNextStepDatasetGPU(TorchDataset):
    def __init__(self, comm_frames, cam_files, ue_idx=0, past_len=15, device="cuda",
                 # Arguments for statistical values (initialized with default values)
                 r_min=0.0, r_max=1.0, i_min=0.0, i_max=1.0):
        
        self.comm_frames = comm_frames
        self.cam_files = list(cam_files)
        self.ue_idx = ue_idx
        self.past_len = past_len
        self.device = device
        
        # Save statistical values
        self.r_min, self.r_max = r_min, r_max
        self.i_min, self.i_max = i_min, i_max

        self.N = min(len(self.comm_frames), len(self.cam_files))
        self.valid_start = past_len - 1
        self.valid_end = self.N - 2 

    def __len__(self):
        return self.valid_end - self.valid_start + 1

    def __getitem__(self, idx):
        t = self.valid_start + idx

        # 1. Image Past (Apply Preprocessing)
        img_list = []
        for k in range(t - self.past_len + 1, t + 1):
            img_path = self.cam_files[k]
            img_k = preprocess_img(img_path, device=self.device).squeeze(0)
            img_list.append(img_k)
        img = torch.stack(img_list, dim=0)  # Shape: (past_len

        
        # 2. Channel Past (Apply Scaling)
        ch_list = []
        for k in range(t - self.past_len + 1, t + 1):
            coeffs_np = get_coeffs_from_frame(self.comm_frames[k], ue_idx=self.ue_idx)
            # Use the newly defined Min-Max preprocessing function
            h = preprocess_channel_coeffs_minmax(
                coeffs_np, 
                self.r_min, self.r_max, self.i_min, self.i_max, 
                device=self.device
            ).reshape(-1)
            ch_list.append(h)
        channel_past = torch.stack(ch_list, dim=0)

        # 3. Target (Apply Scaling) - Target must also be scaled for model training!
        coeffs_np_next = get_coeffs_from_frame(self.comm_frames[t + 1], ue_idx=self.ue_idx)
        target = preprocess_channel_coeffs_minmax(
            coeffs_np_next, 
            self.r_min, self.r_max, self.i_min, self.i_max, 
            device=self.device
        ).reshape(-1)

        return channel_past, img, target

# DataLoader 구현

In [14]:
comm_frames = flatten_comm_frames(dataset.comm_dataset)
sensor = dataset.camera_dataset.sensors["unit1_cam1"]

ds = MultiModalNextStepDatasetGPU(
    comm_frames=comm_frames,
    cam_files=sensor.files,
    ue_idx=0,
    past_len=16,
    device="cuda"
)

loader = DataLoader(
    ds,
    batch_size=8,
    shuffle=True,
    num_workers=0,     
    pin_memory=False   # ✅ 의미 없음 (이미 GPU)
)

ch, img, y = next(iter(loader))
print(ch.shape, img.shape, y.shape)
print(ch.device, img.device, y.device)


torch.Size([8, 16, 2048]) torch.Size([8, 16, 3, 224, 224]) torch.Size([8, 2048])
cuda:0 cuda:0 cuda:0


# Fine-tuning
data shape 맞추기 위해

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from lwm_multi_model import multi_modal_lwm  # 너가 올린 backbone

class FinetuneChannelPredictor(nn.Module):
    """
    직접 구현한 lwm_multi_model(channel + image)에 맞는 파인튜닝 모델
    Input:
      ch:  (B, T, F_in)  e.g., (B,16,2048) 16: past Length, 2048: feature dim
      img: (B, 3, 224, 224)
    Output:
      yhat: (B, F_out)  e.g., (B,2048)
    """
    def __init__(
        self,
        backbone: nn.Module,
        F_in: int,
        F_out: int,
        pool: str = "last",          # "last" or "mean"
        freeze_image: bool = False,
        freeze_backbone: bool = False,
        element_length: int = 16,    # 채널 벡터 차원 (backbone 기대값)
        d_model: int = 64            # backbone 내부 feature dim
    ):
        super().__init__()
        self.backbone = backbone
        self.pool = pool

        # backbone이 기대하는 channel feature dim = ELEMENT_LENGTH
        # (backbone 내부 Channel_Embedding: Linear(ELEMENT_LENGTH -> D_MODEL))
        if element_length is None:
            element_length = backbone.channel_embedding.element_length
        if d_model is None:
            d_model = backbone.channel_embedding.d_model

        # 입력 차원 정렬: F_in -> ELEMENT_LENGTH
        self.in_proj = nn.Sequential(
            nn.Linear(F_in, 512),
            nn.GELU(),
            nn.Linear(512, 128),
            nn.GELU(),
            nn.Linear(128, element_length)
        )

        # 출력 head: D_MODEL -> F_out
        self.head = nn.Linear(d_model, F_out)

        if freeze_image:
            for p in self.backbone.image_embedding.parameters():
                p.requires_grad = False

        if freeze_backbone:
            for p in self.backbone.parameters():
                p.requires_grad = False
            # 그래도 projection/head는 학습되게 다시 켜기
            for p in self.in_proj.parameters():
                p.requires_grad = True
            for p in self.head.parameters():
                p.requires_grad = True

    def forward(self, ch, img):
        # ch: (B,T,F_in) -> (B,T,ELEMENT_LENGTH)
        ch = self.in_proj(ch)

        # backbone: (B,T,D_MODEL)
        tokens = self.backbone(ch, img)

        # pooling -> (B,D_MODEL)
        if self.pool == "last":
            z = tokens[:, -1, :]
        elif self.pool == "mean":
            z = tokens.mean(dim=1)
        else:
            raise ValueError(f"Unknown pool={self.pool}")

        # head -> (B,F_out)
        yhat = self.head(z)
        return yhat


## NMSE(dB)

In [16]:
@torch.no_grad()
def nmse_db(yhat: torch.Tensor, y: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    # yhat, y: (B,F)
    num = torch.sum((yhat - y) ** 2, dim=1)
    den = torch.sum(y ** 2, dim=1).clamp_min(eps)
    nmse = num / den
    return 10.0 * torch.log10(nmse.clamp_min(eps)).mean()


# Train/Val split

In [17]:
n = len(ds)
n_train = int(0.75 * n)
train_idx = list(range(0, n_train))
val_idx = list(range(n_train, n))

train_ts = [ds.valid_start + i for i in train_idx]

(real_min,  real_max), (imag_min, imag_max) = get_train_min_max_realimag(
    comm_frames, train_ts, us_idx=0
)

ds.r_min = real_min
ds.r_max = real_max
ds.i_min = imag_min
ds.i_max = imag_max

print("Dataset statistical values set in the dataset.")

train_ds = Subset(ds, train_idx)
val_ds   = Subset(ds, val_idx)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,  num_workers=0)
val_loader   = DataLoader(val_ds,   batch_size=32, shuffle=False, num_workers=0)

# Verify
ch, img, y = next(iter(train_loader))
print("\n=== Data Check ===")
print(f"y stats | min: {y.min().item():.4f}, max: {y.max().item():.4f}")
print("If scaling worked correctly, values should be within [0, 1].")

Calculating min/max over training set...
Done. rmin=-1.0646139668415024e-06, rmax=1.091798991441695e-06, imin=-1.0719515918791155e-06, imax=1.0643867764302909e-06
Dataset statistical values set in the dataset.

=== Data Check ===
y stats | min: 0.0007, max: 0.9746
If scaling worked correctly, values should be within [0, 1].


In [18]:
comm_frames = flatten_comm_frames(dataset.comm_dataset)
cam_files = list(dataset.camera_dataset.sensors["unit1_cam1"].files)

print("len(comm_frames):", len(comm_frames))
print("len(cam_files):", len(cam_files))
print("first comm frame keys:", list(comm_frames[0].keys()))
print("first cam file:", cam_files[0])


len(comm_frames): 100
len(cam_files): 7012
first comm frame keys: ['bs_loc', 'ue', 'ue_loc', 'bs']
first cam file: scenarios/DT31/RGB_images/unit1_cam1/0.png


In [19]:
len(val_loader)

1

# Model generate and also check

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

# 배치 하나로 F_in/F_out 자동 확정
ch, img, y = next(iter(train_loader))
F_in  = ch.shape[-1]
F_out = y.shape[-1]
print("Detected:", "F_in=", F_in, "F_out=", F_out)
print("Batch devices:", ch.device, img.device, y.device)

# backbone + finetune model
backbone = multi_modal_lwm().to(device)

model = FinetuneChannelPredictor(
    backbone=backbone,
    F_in=F_in,
    F_out=F_out,
    pool="last",            # "mean"으로 바꿔도 됨
    freeze_image=False,     # 원하면 True (이미지 인코더 고정)
    freeze_backbone=False,  # 원하면 True (proj/head만 학습)
    element_length=16,
    d_model=64
).to(device)

# sanity forward
model.eval()
with torch.no_grad():
    # ds가 이미 cuda 텐서 반환이면 아래 .to(device) 생략 가능
    yhat = model(ch.to(device), img.to(device))
print("yhat:", yhat.shape, "y:", y.shape)



device: cuda
Detected: F_in= 2048 F_out= 2048
Batch devices: cuda:0 cuda:0 cuda:0
yhat: torch.Size([32, 2048]) y: torch.Size([32, 2048])


# Train/ Eval 함수 (AMP + grad clip)

In [21]:
def train_one_epoch(model, loader, optimizer, device, use_amp=True, grad_clip=1.0):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    total_loss = 0.0
    total_nmse = 0.0
    n = 0

    for ch, img, y in loader:
        # Dataset이 이미 cuda 텐서를 반환하더라도 안전하게 유지
        ch = ch.to(device, non_blocking=True)
        img = img.to(device, non_blocking=True)
        y  = y.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)

        with torch.cuda.amp.autocast(enabled=use_amp):
            yhat = model(ch, img)
            loss = F.mse_loss(yhat, y)

        scaler.scale(loss).backward()

        if grad_clip is not None and grad_clip > 0:
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        total_nmse += nmse_db(yhat.detach(), y).item()
        n += 1

    return total_loss / max(n, 1), total_nmse / max(n, 1)


@torch.no_grad()
def evaluate(model, loader, device):
    model.eval()

    total_loss = 0.0
    total_nmse = 0.0
    n = 0

    for ch, img, y in loader:
        ch = ch.to(device, non_blocking=True)
        img = img.to(device, non_blocking=True)
        y  = y.to(device, non_blocking=True)

        yhat = model(ch, img)
        loss = F.mse_loss(yhat, y)

        total_loss += loss.item()
        total_nmse += nmse_db(yhat, y).item()
        n += 1

    return total_loss / max(n, 1), total_nmse / max(n, 1)


#  Optiimizer  / Scheduler 설정

In [22]:
# requires_grad=True인 파라미터만 학습
trainable_params = [p for p in model.parameters() if p.requires_grad]
print("trainable params:", sum(p.numel() for p in trainable_params))

optimizer = torch.optim.AdamW(trainable_params, lr=1e-4, weight_decay=1e-4)

# (선택) cosine scheduler
epochs = 1000
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)


trainable params: 2499120


# 학습 루프 + checkpoint 저장

In [23]:
best_val = float("inf")

for epoch in range(1, epochs + 1):
    t0 = time.time()

    tr_loss, tr_nmse = train_one_epoch(model, train_loader, optimizer, device=device, use_amp=True, grad_clip=1.0)
    va_loss, va_nmse = evaluate(model, val_loader, device=device)

    scheduler.step()

    dt = time.time() - t0
    print(
        f"[{epoch:02d}/{epochs}] "
        f"train loss={tr_loss:.6f}, nmse(dB)={tr_nmse:.4f} | "
        f"val loss={va_loss:.6f}, nmse(dB)={va_nmse:.4f} | "
        f"{dt:.1f}s"
    )

    if va_loss < best_val:
        best_val = va_loss
        torch.save(
            {
                "epoch": epoch,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
                "F_in": F_in,
                "F_out": F_out,
            },
            "best_finetune.pt"
        )
        print("  ↳ saved best_finetune.pt")


  scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
  with torch.cuda.amp.autocast(enabled=use_amp):


[01/1000] train loss=0.592901, nmse(dB)=3.2082 | val loss=0.477058, nmse(dB)=2.6647 | 84.6s
  ↳ saved best_finetune.pt
[02/1000] train loss=0.515475, nmse(dB)=2.5984 | val loss=0.436902, nmse(dB)=2.2827 | 83.9s
  ↳ saved best_finetune.pt
[03/1000] train loss=0.484834, nmse(dB)=2.3308 | val loss=0.416878, nmse(dB)=2.0789 | 81.5s
  ↳ saved best_finetune.pt
[04/1000] train loss=0.463062, nmse(dB)=2.1342 | val loss=0.398489, nmse(dB)=1.8830 | 81.3s
  ↳ saved best_finetune.pt
[05/1000] train loss=0.445451, nmse(dB)=1.9630 | val loss=0.381698, nmse(dB)=1.6960 | 83.4s
  ↳ saved best_finetune.pt
[06/1000] train loss=0.429579, nmse(dB)=1.8043 | val loss=0.366875, nmse(dB)=1.5239 | 81.0s
  ↳ saved best_finetune.pt
[07/1000] train loss=0.412460, nmse(dB)=1.6261 | val loss=0.352936, nmse(dB)=1.3556 | 83.2s
  ↳ saved best_finetune.pt
[08/1000] train loss=0.398698, nmse(dB)=1.4735 | val loss=0.339825, nmse(dB)=1.1912 | 80.7s
  ↳ saved best_finetune.pt
[09/1000] train loss=0.384732, nmse(dB)=1.3220 |

In [None]:
ch, img, y = next(iter(train_loader))
print("y abs mean:", y.abs().mean().item())
print("y abs max :", y.abs().max().item())
print("y power   :", (y**2).mean().item())

with torch.no_grad():
    yhat = model(ch.to(device), img.to(device))
print("yhat abs mean:", yhat.abs().mean().item())
print("yhat abs max :", yhat.abs().max().item())
print("yhat power   :", (yhat**2).mean().item())


y abs mean: 0.49508270621299744
y abs max : 1.0
y power   : 0.2848026752471924
yhat abs mean: 0.495068222284317
yhat abs max : 0.5914323925971985
yhat power   : 0.24561259150505066


# 데이터 입력 및 형태

In [25]:
print("=== dataset sizes ===")
print("N(comm_frames):", len(comm_frames))
print("N(cam_files)  :", len(cam_files))
print("N(min)        :", min(len(comm_frames), len(cam_files)))
print("past_len      :", ds.past_len)
print("len(ds)       :", len(ds))
print("len(train_ds) :", len(train_ds))
print("len(val_ds)   :", len(val_ds))
print("len(train_loader):", len(train_loader))
print("len(val_loader)  :", len(val_loader))

print("\n=== one batch shapes ===")
ch, img, y = next(iter(train_loader))
print("ch :", tuple(ch.shape), " -> (B,T,F_in)")
print("img:", tuple(img.shape), " -> (B,3,224,224)")
print("y  :", tuple(y.shape), " -> (B,F_out)")
with torch.no_grad():
    yhat = model(ch.to(device), img.to(device))
print("yhat:", tuple(yhat.shape), " -> (B,F_out)")
print("this forward predicted vectors:", yhat.shape[0], "(=B)")
print("each vector predicts elements:", yhat.shape[1], "(=F_out)")


=== dataset sizes ===
N(comm_frames): 100
N(cam_files)  : 7012
N(min)        : 100
past_len      : 16
len(ds)       : 84
len(train_ds) : 63
len(val_ds)   : 21
len(train_loader): 2
len(val_loader)  : 1

=== one batch shapes ===
ch : (32, 16, 2048)  -> (B,T,F_in)
img: (32, 16, 3, 224, 224)  -> (B,3,224,224)
y  : (32, 2048)  -> (B,F_out)
yhat: (32, 2048)  -> (B,F_out)
this forward predicted vectors: 32 (=B)
each vector predicts elements: 2048 (=F_out)
