# Proposed Model: Company Data Generation

## Configuration Parameters

In [1]:
# --- Model dimensions ---
YEAR_DIM_PARAM = 13      # width of the sine/cosine year embedding
NUM_YEARS_PARAM = 13     # number of yearly rows required per company
FT_OUT_DIM_PARAM = 16    # FT-Transformer output width (d_out)
STOCK_DIM_PARAM = 32     # width of the learned per-stock embedding
DENOISER_D_MODEL = 64    # hidden width of the Transformer denoiser

# --- Training schedule ---
# Reduced for quick testing, original was 200 (both models).
EPOCHS_COMPANY_MODEL = EPOCHS_DENOISER_MODEL = 5
BATCH_SIZE = 64
LEARNING_RATE_COMPANY = LEARNING_RATE_DENOISER = 1e-3

# --- TensorBoard log directory prefixes (a timestamp is appended per run) ---
TENSORBOARD_LOG_DIR_COMPANY = 'runs/company_model_experiment'
TENSORBOARD_LOG_DIR_DENOISER = 'runs/denoiser_model_experiment'

## Imports

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import rtdl
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
import math # Added for PositionalEncoding
from torch.utils.tensorboard.writer import SummaryWriter # TensorBoard import
import time # For unique log directory names
#from torchsort import soft_sort # Added for CDF loss

## 0. 실행 환경에 따라 디바이스 결정 및 시각화 활성화

In [3]:
# Load the TensorBoard notebook extension, then open the dashboard inline,
# pointing it at the "runs" directory used as the log-dir prefix in this notebook.
%load_ext tensorboard
%tensorboard --logdir runs
# Shell alternative: tensorboard --logdir runs --bind_all --port 6006

Reusing TensorBoard on port 6006 (pid 509062), started 0:32:48 ago. (Use '!kill 509062' to kill it.)

In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


## 1. 데이터 로드 및 전처리

In [5]:
file_path_csv = "data/Table_Data.csv"  # path to the raw panel data (set to the real file location)

# Try UTF-8 first and fall back to EUC-KR if the file cannot be decoded.
try:
    df = pd.read_csv(file_path_csv, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(file_path_csv, encoding='euc-kr')

# Drop columns that are not needed (e.g. the 'Name' column).
if 'Name' in df.columns:
    df = df.drop(columns=['Name'], errors='ignore')

# Keep only companies whose records start in 2011 and end in 2023.
stock_min_year = df.groupby("Stock")["YEAR"].min()
stock_max_year = df.groupby("Stock")["YEAR"].max()

valid_stocks_initial = stock_min_year[(stock_min_year == 2011) & (stock_max_year == 2023)].index
df_filtered = df[df["Stock"].isin(valid_stocks_initial)].copy() # Use .copy() to avoid SettingWithCopyWarning

if df_filtered.empty:
    # Nothing is raised here: later cells check `df_filtered.empty` and skip their work.
    print("No companies found that existed continuously from 2011 to 2023. Exiting.")
else:
    # Keep only companies with exactly NUM_YEARS_PARAM rows *and* NUM_YEARS_PARAM
    # distinct years. Counting rows alone would accept a company that has one
    # year duplicated and another year missing.
    years_per_stock = df_filtered.groupby("Stock")["YEAR"]
    year_counts = years_per_stock.count()
    year_nunique = years_per_stock.nunique()
    valid_stocks_final = year_counts[
        (year_counts == NUM_YEARS_PARAM) & (year_nunique == NUM_YEARS_PARAM)
    ].index
    df_filtered = df_filtered[df_filtered["Stock"].isin(valid_stocks_final)].copy()
    df_filtered = df_filtered.sort_values(by=["Stock", "YEAR"])

    print("Data before preprocessing:")
    print(f"Original df shape: {df.shape}")
    print(f"Number of unique stocks before filtering: {df['Stock'].nunique()}")
    display(df.head())

    print("\nData after preprocessing (filtering):")
    print(f"Filtered df shape: {df_filtered.shape}")
    print(f"Number of unique stocks after filtering: {df_filtered['Stock'].nunique()}")
    display(df_filtered.head())

Data before preprocessing:
Original df shape: (16692, 18)
Number of unique stocks before filtering: 1284


Unnamed: 0,Stock,YEAR,OWN,FORN,BIG4,SIZE,LEV,CUR,GRW,ROA,ROE,CFO,PPE,AGE,INVREC,MB,TQ,LOSS
0,0,2011,0.3286,0.0256,1,26.532442,0.34856,1.793309,0.08957,0.053036,0.081413,0.138727,0.392632,4.744932,0.284906,0.496292,0.659244,0
1,0,2012,0.3157,0.0798,1,26.550538,0.323504,1.506044,-0.047706,0.003863,0.00571,0.074321,0.362544,4.75359,0.255722,0.743216,0.820255,0
2,0,2013,0.3114,0.0613,1,26.504888,0.300014,1.467747,-0.014019,0.00312,0.004458,0.025302,0.350281,4.762174,0.262648,0.539623,0.669123,0
3,0,2014,0.3151,0.0502,1,26.479532,0.281291,1.591329,-0.030732,0.015616,0.021728,0.042746,0.327404,4.770685,0.267463,0.677235,0.763162,0
4,0,2015,0.3235,0.0749,1,26.469703,0.266223,1.74922,0.045576,0.017915,0.024415,0.06335,0.298817,4.779123,0.267951,0.988025,0.991312,0



Data after preprocessing (filtering):
Filtered df shape: (16692, 18)
Number of unique stocks after filtering: 1284


Unnamed: 0,Stock,YEAR,OWN,FORN,BIG4,SIZE,LEV,CUR,GRW,ROA,ROE,CFO,PPE,AGE,INVREC,MB,TQ,LOSS
0,0,2011,0.3286,0.0256,1,26.532442,0.34856,1.793309,0.08957,0.053036,0.081413,0.138727,0.392632,4.744932,0.284906,0.496292,0.659244,0
1,0,2012,0.3157,0.0798,1,26.550538,0.323504,1.506044,-0.047706,0.003863,0.00571,0.074321,0.362544,4.75359,0.255722,0.743216,0.820255,0
2,0,2013,0.3114,0.0613,1,26.504888,0.300014,1.467747,-0.014019,0.00312,0.004458,0.025302,0.350281,4.762174,0.262648,0.539623,0.669123,0
3,0,2014,0.3151,0.0502,1,26.479532,0.281291,1.591329,-0.030732,0.015616,0.021728,0.042746,0.327404,4.770685,0.267463,0.677235,0.763162,0
4,0,2015,0.3235,0.0749,1,26.469703,0.266223,1.74922,0.045576,0.017915,0.024415,0.06335,0.298817,4.779123,0.267951,0.988025,0.991312,0


## 2. 연속형 & 이진 변수 분리, 정규화

In [6]:
# Column groups: continuous columns get MinMax + logit, binary columns stay 0/1.
continuous_features = ["OWN", "FORN", "SIZE", "LEV", "CUR", "GRW", "ROA", "ROE", "CFO", "PPE", "AGE", "INVREC", "MB", "TQ"]
binary_features = ["BIG4", "LOSS"]

if df_filtered.empty:
    print("Skipping normalization and transformation as df_filtered is empty.")
else:
    # Encode every stock as an integer category code.
    df_filtered.loc[:, "Stock_ID"] = df_filtered["Stock"].astype('category').cat.codes

    # Step 1: MinMax-scale the continuous columns.
    minmax_scaler = MinMaxScaler()
    scaled_cont = minmax_scaler.fit_transform(df_filtered[continuous_features])

    # Step 2: logit (inverse sigmoid) transform.
    # Clip to [EPS, 1 - EPS] first so the log never sees exactly 0 or 1.
    EPS = 1e-6
    scaled_cont = np.clip(scaled_cont, EPS, 1.0-EPS)
    logit_cont = np.log(scaled_cont / (1.0 - scaled_cont))
    df_filtered.loc[:, continuous_features] = logit_cont

    # Binary columns are stored as 0/1 integers.
    df_filtered.loc[:, binary_features] = df_filtered[binary_features].astype(int)

    # Full feature list: continuous columns first, then binary columns.
    features = continuous_features + binary_features

    print("Data after normalization and transformation:")
    display(df_filtered[features + ['Stock_ID']].head())

Data after normalization and transformation:


Unnamed: 0,OWN,FORN,SIZE,LEV,CUR,GRW,ROA,ROE,CFO,PPE,AGE,INVREC,MB,TQ,BIG4,LOSS,Stock_ID
0,-0.714524,-3.63923,-0.332483,-0.492675,-2.954608,-0.937985,0.754126,0.930243,0.878894,0.647758,3.709221,-0.579711,-3.800607,-3.387102,1,0,0
1,-0.773604,-2.445068,-0.32185,-0.620203,-3.162043,-1.194294,0.46993,0.719498,0.394065,0.431259,3.802627,-0.744595,-3.148134,-2.870179,1,0,0
2,-0.793582,-2.728716,-0.348709,-0.744359,-3.192924,-1.128321,0.465802,0.716146,0.056219,0.346058,3.90428,-0.704655,-3.653171,-3.347222,1,0,0
3,-0.776383,-2.940236,-0.363681,-0.847365,-3.096265,-1.160768,0.535794,0.762696,0.175092,0.190144,4.01593,-0.677205,-3.287325,-3.027016,1,0,0
4,-0.737733,-2.513748,-0.369496,-0.933433,-2.984015,-1.016739,0.548808,0.770009,0.317183,-0.001703,4.139951,-0.674435,-2.746297,-2.499389,1,0,0


## 3. 기업 단위 시퀀스 데이터 생성 (각 기업 13년치)

In [7]:
if not df_filtered.empty:
    stocks = df_filtered["Stock"].unique()
    grouped_cont = []
    grouped_bin = []
    grouped_year = []
    grouped_stock = []

    # A single groupby pass replaces re-filtering the whole frame once per stock.
    # sort=False yields groups in first-appearance order, i.e. the same order as `stocks`.
    for stock_val, df_stock in df_filtered.groupby("Stock", sort=False):
        df_stock = df_stock.sort_values(by="YEAR")
        grouped_cont.append(df_stock[continuous_features].values)
        grouped_bin.append(df_stock[binary_features].values)
        grouped_year.append(df_stock["YEAR"].values)
        grouped_stock.append(df_stock["Stock_ID"].iloc[0])

    # Per-company entries collected above:
    # grouped_cont[i]:  (num_years, num_continuous_features)
    # grouped_bin[i]:   (num_years, num_binary_features)
    # grouped_year[i]:  (num_years,)
    # grouped_stock[i]: a single Stock_ID scalar

    X_cont_seq = np.stack(grouped_cont, axis=0)
    X_bin_seq = np.stack(grouped_bin, axis=0)
    year_seq = np.stack(grouped_year, axis=0)
    stock_seq = np.array(grouped_stock)

    # X_cont_seq: (num_stocks, num_years, num_continuous_features)
    # X_bin_seq: (num_stocks, num_years, num_binary_features)
    # year_seq: (num_stocks, num_years)
    # stock_seq: (num_stocks,)

    # Reconstruction target: continuous columns followed by binary columns.
    target_seq = np.concatenate([X_cont_seq, X_bin_seq], axis=-1)

    X_cont_tensor = torch.tensor(X_cont_seq, dtype=torch.float32)
    X_bin_tensor = torch.tensor(X_bin_seq, dtype=torch.float32)
    year_tensor_seq = torch.tensor(year_seq, dtype=torch.float32)
    stock_tensor_seq = torch.tensor(stock_seq, dtype=torch.long)
    target_tensor_seq = torch.tensor(target_seq, dtype=torch.float32)

    # Sequence dataset; each item is one company:
    # X_cont_tensor: (num_stocks, num_years, num_continuous_features)
    # X_bin_tensor: (num_stocks, num_years, num_binary_features)
    # year_tensor_seq: (num_stocks, num_years)
    # stock_tensor_seq: (num_stocks,)
    # target_tensor_seq: (num_stocks, num_years, num_continuous_features + num_binary_features)
    dataset_seq = TensorDataset(X_cont_tensor, X_bin_tensor, year_tensor_seq, stock_tensor_seq, target_tensor_seq)
    dataloader_seq = DataLoader(dataset_seq, batch_size=BATCH_SIZE, shuffle=True)

    print(f"X_cont_tensor shape: {X_cont_tensor.shape}")
    print(f"X_bin_tensor shape: {X_bin_tensor.shape}")
    print(f"year_tensor_seq shape: {year_tensor_seq.shape}")
    print(f"stock_tensor_seq shape: {stock_tensor_seq.shape}")
    print(f"target_tensor_seq shape: {target_tensor_seq.shape}")
    NUM_STOCK_EMBEDDINGS = df_filtered["Stock_ID"].nunique()
else:
    print("Skipping sequence data generation as df_filtered is empty.")
    # Placeholders so that later cells can test for "no data" instead of failing on undefined names.
    X_cont_tensor = torch.empty(0)
    X_bin_tensor = torch.empty(0)
    year_tensor_seq = torch.empty(0)
    stock_tensor_seq = torch.empty(0)
    target_tensor_seq = torch.empty(0)
    dataloader_seq = [] # Falsy stand-in for an empty DataLoader
    NUM_STOCK_EMBEDDINGS = 0 # Default if no stocks

X_cont_tensor shape: torch.Size([1284, 13, 14])
X_bin_tensor shape: torch.Size([1284, 13, 2])
year_tensor_seq shape: torch.Size([1284, 13])
stock_tensor_seq shape: torch.Size([1284])
target_tensor_seq shape: torch.Size([1284, 13, 16])


## 4. sine-cosine 기반 연도 임베딩 함수 (sequence 지원)

In [8]:
def get_sine_cosine_year_embedding(years, dim=YEAR_DIM_PARAM):
    """
    Build a sinusoidal embedding from raw year values.

    - years: tensor of year values, e.g. (batch, num_years) or (num_samples,).
      Inputs with fewer than three dimensions get a trailing axis of size 1
      so they broadcast against the frequency vector.
    - returns: tensor of shape (..., dim). The first dim // 2 entries are sines,
      the next dim // 2 are cosines; when dim is odd the rest is zero padding.
    """
    if years.dim() < 3:
        years = years.unsqueeze(-1)

    n_freq = dim // 2
    exponent_scale = -np.log(10000.0) / n_freq
    freqs = torch.exp(torch.arange(0, n_freq, dtype=torch.float32) * exponent_scale)
    angles = years * freqs.to(years.device)

    year_embedding = torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)

    # Zero-pad the last axis when sin + cos do not fill `dim` (odd dim).
    missing = dim - year_embedding.shape[-1]
    if missing > 0:
        zeros = torch.zeros(year_embedding.shape[:-1] + (missing,), device=year_embedding.device)
        year_embedding = torch.cat([year_embedding, zeros], dim=-1)

    return year_embedding

## 5. Positional Encoding (Sinusoidal)

In [None]:
class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal position table to a (batch, seq, d_model) input.

    The table is stored as the buffer ``pe`` with shape (1, max_len, d_model):
    even columns hold sines, odd columns hold cosines of position * div_term.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * div_term

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so trim the cosine block to the number of odd columns available.
        n_odd_cols = table[:, 1::2].shape[1]
        table[:, 1::2] = torch.cos(angles)[:, :n_odd_cols]

        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the first ``x.size(1)`` rows of the position table."""
        return x + self.pe[:, :x.size(1)]

## 6. CompanySequenceModel: FT-Transformer + TST (Sequence 모델)

In [None]:
class CompanySequenceModel(nn.Module):
    """Encode each company's yearly feature sequence into per-year vectors.

    Per (company, year) row: continuous features, binary features, a sinusoidal
    year embedding and a learned stock embedding are concatenated, projected to
    128 dims, batch-normalised and passed through an FT-Transformer with
    ``d_out=ft_out_dim``. The per-row outputs are reshaped back into sequences
    and refined by a Conv1d over the year axis, positional encoding and a
    Transformer encoder.

    Args:
        cont_input_dim: number of continuous input features per year.
        bin_input_dim: number of binary input features per year.
        year_dim: width of the sinusoidal year embedding.
        num_stock_embeddings: number of distinct stock IDs (embedding table size).
        stock_dim: width of the stock embedding.
        ft_out_dim: output width of the FT-Transformer and of the final sequence.
        num_years: sequence length; also the positional-encoding table length.
    """

    def __init__(self, cont_input_dim, bin_input_dim, year_dim, num_stock_embeddings, stock_dim=STOCK_DIM_PARAM, ft_out_dim=FT_OUT_DIM_PARAM, num_years=NUM_YEARS_PARAM):
        super(CompanySequenceModel, self).__init__()
        self.num_years = num_years
        # Keep the year-embedding width so forward() builds embeddings that match
        # the input size of self.embedding, instead of relying on a global constant.
        self.year_dim = year_dim
        
        self.cont_embedding = nn.Linear(cont_input_dim, 32)
        self.bin_embedding = nn.Linear(bin_input_dim, 16)
        self.stock_embedding = nn.Embedding(num_embeddings=num_stock_embeddings, embedding_dim=stock_dim)
        
        # 32 (continuous) + 16 (binary) + year embedding + stock embedding
        total_input_dim = 32 + 16 + year_dim + stock_dim
        self.embedding = nn.Linear(total_input_dim, 128)
        self.bn = nn.BatchNorm1d(128) 
        
        # The 128 projected values are fed to the FT-Transformer as numerical features.
        self.ft_transformer = rtdl.FTTransformer.make_default(
            n_num_features=128, 
            cat_cardinalities=None, 
            d_out=ft_out_dim
        )
        
        # kernel_size=3 with padding=1 keeps the sequence length unchanged.
        self.conv1d = nn.Conv1d(in_channels=ft_out_dim, out_channels=ft_out_dim, kernel_size=3, padding=1)
        self.pos_encoder = PositionalEncoding(ft_out_dim, max_len=num_years)
        encoder_layer = nn.TransformerEncoderLayer(d_model=ft_out_dim, nhead=2, dropout=0.1, batch_first=True)
        self.tst_encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
    
    def forward(self, x_cont, x_bin, x_year_values, x_stock_id):
        """Encode a batch of company sequences.

        Args:
            x_cont: (batch, num_years, cont_input_dim) continuous features.
            x_bin: (batch, num_years, bin_input_dim) binary features.
            x_year_values: (batch, num_years) raw year values.
            x_stock_id: (batch,) integer stock IDs.

        Returns:
            Tensor of shape (batch, num_years, ft_out_dim).
        """
        batch, num_years, _ = x_cont.shape
        # Use the width given to the constructor (previously the global YEAR_DIM_PARAM).
        year_embed = get_sine_cosine_year_embedding(x_year_values, dim=self.year_dim) 
        
        # Flatten (batch, num_years) into rows so every year is embedded independently.
        cont_emb = self.cont_embedding(x_cont.reshape(-1, x_cont.shape[-1]))
        bin_emb = self.bin_embedding(x_bin.reshape(-1, x_bin.shape[-1]))
        
        # One embedding per company, repeated for each of its years.
        stock_emb = self.stock_embedding(x_stock_id) 
        stock_emb = stock_emb.unsqueeze(1).repeat(1, num_years, 1) 
        
        x_all = torch.cat([
            cont_emb, 
            bin_emb,  
            year_embed.reshape(-1, year_embed.shape[-1]), 
            stock_emb.reshape(-1, stock_emb.shape[-1])   
        ], dim=-1)
        
        x_all = self.embedding(x_all)  
        x_all = self.bn(x_all) 
        
        ft_out = self.ft_transformer(x_num=x_all, x_cat=None)
        # Back from flat rows to (batch, num_years, ft_out_dim).
        ft_out = ft_out.view(batch, num_years, -1)  
        
        # Conv1d expects channels first: (batch, ft_out_dim, num_years).
        conv_in = ft_out.transpose(1, 2)       
        conv_out = self.conv1d(conv_in)          
        conv_out = conv_out.transpose(1, 2)      
        
        tst_input = self.pos_encoder(conv_out)   
        tst_output = self.tst_encoder(tst_input)
        return tst_output

## 7. CompanySequenceModel 학습 (FT-Transformer + TST)

### 7.1 모델 파라미터 및 초기화

In [None]:
# Feature counts; they size the embedding layers and the split of the model output in the loss.
cont_input_dim = len(continuous_features)
bin_input_dim = len(binary_features)

# Positive-class weights for the two binary targets: sqrt((1 - p) / p), where p is
# the observed share of ones, clipped away from 0 and 1 to keep the ratio finite.
if df_filtered.empty:
    pos_w = torch.ones(2, device=device)
else:
    p_big4 = np.clip((df_filtered['BIG4'] == 1).mean(), EPS, 1.0 - EPS)
    p_loss = np.clip((df_filtered['LOSS'] == 1).mean(), EPS, 1.0 - EPS)
    pos_w = torch.tensor(
        [((1-p_big4)/p_big4)**0.5, ((1-p_loss)/p_loss)**0.5],
        device=device,
    )

# Losses: weighted BCE on the binary logits, MSE on the continuous part.
bce_bin = nn.BCEWithLogitsLoss(pos_weight=pos_w)
mse_cont = nn.MSELoss()
λ_bin_enc = 10.0  # multiplier applied to the binary loss in the total loss

if NUM_STOCK_EMBEDDINGS <= 0:
    # No stocks survived filtering, so there is nothing to build embeddings for.
    print("Skipping CompanySequenceModel initialization as there are no stocks after filtering.")
    company_model = None
    optimizer_company = None
    writer_company = None
else:
    company_model = CompanySequenceModel(
        cont_input_dim,
        bin_input_dim,
        YEAR_DIM_PARAM,
        num_stock_embeddings=NUM_STOCK_EMBEDDINGS,
        stock_dim=STOCK_DIM_PARAM,
        ft_out_dim=FT_OUT_DIM_PARAM,
        num_years=NUM_YEARS_PARAM,
    ).to(device)
    optimizer_company = optim.Adam(company_model.parameters(), lr=LEARNING_RATE_COMPANY)
    # A timestamp suffix gives every run its own TensorBoard log directory.
    current_time_str = time.strftime("%Y%m%d-%H%M%S")
    writer_company = SummaryWriter(f'{TENSORBOARD_LOG_DIR_COMPANY}_{current_time_str}')

### 7.2 학습 루프

In [None]:
if company_model and dataloader_seq: # Check if model and dataloader are initialized
    # Log the model graph once before training, using the first batch (best effort).
    if writer_company: # Check if writer is initialized
        try:
            data_iter_company = iter(dataloader_seq) 
            sample_cont_company, sample_bin_company, sample_year_company, sample_stock_company, _ = next(data_iter_company)
            writer_company.add_graph(company_model, [sample_cont_company.to(device), sample_bin_company.to(device), sample_year_company.to(device), sample_stock_company.to(device)])
            del data_iter_company 
        except Exception as e:
            # Graph tracing is optional; report the failure and keep training.
            print(f"Error adding CompanySequenceModel graph to TensorBoard: {e}")

    # Log roughly five times over the run, plus always on the final epoch.
    log_every_company = EPOCHS_COMPANY_MODEL // 5 if EPOCHS_COMPANY_MODEL >= 5 else 1

    # Put the model in training mode explicitly: the output-collection cell calls
    # eval(), so re-running this cell would otherwise train with dropout and
    # BatchNorm in inference mode.
    company_model.train()

    for epoch in range(EPOCHS_COMPANY_MODEL):
        is_log_epoch = (epoch % log_every_company == 0) or (epoch == EPOCHS_COMPANY_MODEL - 1)
        epoch_loss_cont = 0.0
        epoch_loss_bin = 0.0
        num_batches = 0
        for batch_cont, batch_bin, batch_year, batch_stock, batch_target in dataloader_seq:
            batch_cont  = batch_cont.to(device)
            batch_bin   = batch_bin.to(device)
            batch_year  = batch_year.to(device)
            batch_stock = batch_stock.to(device)
            batch_target= batch_target.to(device)        

            optimizer_company.zero_grad()
            pred = company_model(batch_cont, batch_bin, batch_year, batch_stock)

            # The model output is split as [continuous | binary logits], mirroring the target layout.
            pred_cont, pred_bin = pred[:, :, :cont_input_dim], pred[:, :, cont_input_dim:]
            tgt_cont , tgt_bin  = batch_target[:, :, :cont_input_dim], batch_target[:, :, cont_input_dim:]

            loss_cont = mse_cont(pred_cont, tgt_cont)
            loss_bin  = bce_bin(pred_bin, tgt_bin)
            loss      = loss_cont + λ_bin_enc * loss_bin

            loss.backward()
            
            # Gradient histograms must be written after backward() and before step().
            if writer_company and is_log_epoch: 
                for name, param in company_model.named_parameters():
                    if param.grad is not None:
                        writer_company.add_histogram(f'Gradients_Company/{name.replace(".", "/")}', param.grad, epoch)
            
            optimizer_company.step()
            
            epoch_loss_cont += loss_cont.item()
            epoch_loss_bin += loss_bin.item()
            num_batches += 1

        avg_loss_cont = epoch_loss_cont / num_batches if num_batches > 0 else 0
        avg_loss_bin = epoch_loss_bin / num_batches if num_batches > 0 else 0
        total_avg_loss = avg_loss_cont + λ_bin_enc * avg_loss_bin 

        if writer_company:
            writer_company.add_scalar('Loss_Company/Continuous_Train', avg_loss_cont, epoch)
            writer_company.add_scalar('Loss_Company/Binary_Train', avg_loss_bin, epoch)
            writer_company.add_scalar('Loss_Company/Total_Train', total_avg_loss, epoch)

        if is_log_epoch:
            print(f"[CompanySequenceModel] Epoch {epoch:03d} | "
                  f"loss_cont={avg_loss_cont:.4f}  "
                  f"loss_bin={avg_loss_bin:.4f}")
            
            if writer_company:
                for name, param in company_model.named_parameters():
                    if param.requires_grad:
                        writer_company.add_histogram(f'Weights_Company/{name.replace(".", "/")}', param.data, epoch)
    
    if writer_company: writer_company.close() 
else:
    print("Skipping CompanySequenceModel training as model or dataloader is not initialized.")

### 7.3 학습 완료 후 결과 수집 및 평탄화

In [None]:
all_outputs = []
all_year_outputs = [] 
all_stock_outputs = [] 

if company_model and dataloader_seq: # Check if model and dataloader are initialized
    company_model.eval()

    # Iterate the SAME dataset without shuffling. The training loader shuffles, so
    # collecting through it would put the outputs in a random company order, while
    # the diffusion dataset pairs them positionally with stock_seq / X_bin_seq,
    # which are in dataset order.
    eval_dataloader_seq = DataLoader(dataloader_seq.dataset, batch_size=BATCH_SIZE, shuffle=False)

    with torch.no_grad():
        for batch_cont, batch_bin, batch_year, batch_stock, _ in eval_dataloader_seq:
            batch_cont = batch_cont.to(device)
            batch_bin = batch_bin.to(device)
            batch_year = batch_year.to(device)
            batch_stock = batch_stock.to(device)
            out = company_model(batch_cont, batch_bin, batch_year, batch_stock)  
            all_outputs.append(out.cpu())
            all_year_outputs.append(batch_year.cpu())
            # Repeat each company's stock ID once per year so it can be flattened row-wise.
            batch_stock_expanded = batch_stock.unsqueeze(1).repeat(1, NUM_YEARS_PARAM)
            all_stock_outputs.append(batch_stock_expanded.cpu())

    # Sequence-shaped results, one entry per company in dataset order.
    output_tensor_seq = torch.cat(all_outputs, dim=0)    
    year_tensor_seq_output = torch.cat(all_year_outputs, dim=0)           
    stock_tensor_seq_expanded = torch.cat(all_stock_outputs, dim=0) 

    # Flattened results, one row per (company, year).
    output_tensor_flat = output_tensor_seq.reshape(-1, FT_OUT_DIM_PARAM)           
    year_tensor_flat = year_tensor_seq_output.reshape(-1)  
    year_embed_flat = get_sine_cosine_year_embedding(year_tensor_flat, dim=YEAR_DIM_PARAM) 
    stock_tensor_flat = stock_tensor_seq_expanded.reshape(-1)                  

    print(f"output_tensor_seq shape: {output_tensor_seq.shape}")
    print(f"output_tensor_flat shape: {output_tensor_flat.shape}")
else:
    print("Skipping CompanySequenceModel output collection as model or dataloader is not initialized.")
    # Empty placeholders so later cells can detect "no data" through numel().
    output_tensor_seq = torch.empty(0, NUM_YEARS_PARAM, FT_OUT_DIM_PARAM) 
    year_tensor_seq_output = torch.empty(0, NUM_YEARS_PARAM)
    stock_tensor_seq_expanded = torch.empty(0, NUM_YEARS_PARAM)

## 8'. Transformer-Denoiser 기반 Diffusion (시계열 컨텍스트 활용)

### 8.1 Diffusion 하이퍼파라미터

In [None]:
# Linear beta schedule over T_diff diffusion steps.
T_diff = 10
beta_start, beta_end = 1e-4, 2e-2
betas = torch.linspace(beta_start, beta_end, T_diff, device=device)

# alpha_t = 1 - beta_t; alpha_bars[t] is the running product of alphas[0..t].
alphas = 1.0 - betas
alpha_bars = alphas.cumprod(dim=0)

### 8.2 학습/평가용 Dataset (Diffusion)

In [None]:
# Per-company stock IDs and binary labels that accompany the encoder outputs.
if df_filtered.empty or NUM_STOCK_EMBEDDINGS <= 0:
    # No source data was processed: use empty tensors.
    stock_scalar_seq_diff = torch.empty(0, dtype=torch.long)
    bin_label_tensor_seq_diff = torch.empty(0, NUM_YEARS_PARAM, bin_input_dim)
else:
    stock_scalar_seq_diff = torch.tensor(stock_seq, dtype=torch.long)
    bin_label_tensor_seq_diff = torch.tensor(X_bin_seq, dtype=torch.float32)

if output_tensor_seq.numel() == 0:
    print("Skipping Diffusion DataLoader creation as CompanySequenceModel output is empty.")
    diff_dataloader = []  # falsy stand-in for an empty DataLoader
else:
    # Each item: (encoder output sequence, year sequence, stock ID, binary label sequence).
    diff_dataset = TensorDataset(
        output_tensor_seq,
        year_tensor_seq_output,
        stock_scalar_seq_diff,
        bin_label_tensor_seq_diff,
    )
    diff_dataloader = DataLoader(diff_dataset, batch_size=BATCH_SIZE, shuffle=True)
    print(f"Diffusion dataset size: {len(diff_dataset)}")

### 8.3 Sinusoidal 시간-스텝 임베딩 (`TimeEmbedding`)

In [None]:
class TimeEmbedding(nn.Module):
    """Sinusoidal embedding of a diffusion time-step value.

    ``forward`` multiplies ``t`` by ``d_model // 2`` inverse frequencies and
    concatenates the sines and cosines on the last axis. For odd ``d_model``
    one zero column is appended so the output width equals ``d_model``.
    """

    def __init__(self, d_model: int):
        super().__init__()
        n_freq = d_model // 2
        exponents = torch.arange(0, n_freq, dtype=torch.float32) / n_freq
        self.register_buffer("inv_freq", 1. / (10000 ** exponents))
        self.d_model = d_model

    def forward(self, t: torch.Tensor) -> torch.Tensor:
        """Embed ``t``; callers in this notebook pass a (batch, 1) tensor."""
        angles = t * self.inv_freq
        emb = torch.cat((angles.sin(), angles.cos()), dim=-1)
        if self.d_model % 2 == 1:
            zero_col = torch.zeros_like(emb[:, :1])
            emb = torch.cat((emb, zero_col), dim=-1)
        return emb

### 8.4 Transformer-Denoiser

In [None]:
class TransformerDenoiser(nn.Module):
    """Transformer that predicts the clean sequence from a noised one.

    The noised features are concatenated with a per-stock embedding and
    projected to ``d_model``; positional, year and diffusion-step embeddings
    are added before a Transformer encoder. Two linear heads produce the
    continuous and binary outputs; their widths come from the module-level
    ``cont_input_dim`` and ``bin_input_dim``.
    """

    def __init__(
        self,
        num_stock_embeddings_denoiser,
        feat_dim=FT_OUT_DIM_PARAM,
        d_model=DENOISER_D_MODEL,
        nhead=4, num_layers=4,
        stock_emb_dim=STOCK_DIM_PARAM,
        year_pos_dim=YEAR_DIM_PARAM,
    ):
        super().__init__()
        self.feat_dim, self.d_model = feat_dim, d_model

        # Conditioning inputs: year embedding projection and stock embedding table.
        self.year_proj = nn.Linear(year_pos_dim, d_model)
        self.stock_emb = nn.Embedding(num_stock_embeddings_denoiser, stock_emb_dim)
        self.in_proj = nn.Linear(feat_dim + stock_emb_dim, d_model)

        # Diffusion-step embedding followed by a small learned projection.
        self.t_embed = nn.Sequential(
            TimeEmbedding(d_model),
            nn.Linear(d_model, d_model),
            nn.SiLU(),
        )
        self.pos_enc = PositionalEncoding(d_model, max_len=NUM_YEARS_PARAM)

        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=d_model * 4,
                dropout=0.1,
                batch_first=True,
            ),
            num_layers=num_layers,
        )

        # Output heads for the continuous part and the binary logits.
        self.out_cont = nn.Linear(d_model, cont_input_dim)
        self.out_bin = nn.Linear(d_model, bin_input_dim)

    def forward(self, x_t, years, stock_id, t_norm):
        """Return ``(continuous prediction, binary logits)`` for a noised batch.

        ``x_t`` is a 3-D (batch, seq, feat) tensor; ``years`` holds the year
        values per position, ``stock_id`` one ID per sequence, and ``t_norm``
        the diffusion step value per sequence.
        """
        _, seq_len, _ = x_t.shape

        # The same stock embedding is repeated at every position of the sequence.
        stock_ctx = self.stock_emb(stock_id).unsqueeze(1).repeat(1, seq_len, 1)
        year_ctx = self.year_proj(
            get_sine_cosine_year_embedding(years, dim=self.year_proj.in_features)
        )
        # One step embedding per sequence, broadcast over positions.
        step_ctx = self.t_embed(t_norm).unsqueeze(1)

        hidden = self.in_proj(torch.cat([x_t, stock_ctx], dim=-1))
        hidden = self.pos_enc(hidden) + year_ctx + step_ctx
        hidden = self.encoder(hidden)
        return self.out_cont(hidden), self.out_bin(hidden)

### 8.5 Forward Diffusion (`q_sample`)

In [None]:
def q_sample(x0_seq, t_int):
    """Noise ``x0_seq`` to diffusion step ``t_int`` (1-based, one step per sequence).

    Returns ``(x_t, noise)`` with
    ``x_t = sqrt(alpha_bar) * x0 + sqrt(1 - alpha_bar) * noise``, where
    ``alpha_bar`` is read from the module-level ``alpha_bars`` at ``t_int - 1``.
    """
    alpha_bar_t = alpha_bars[t_int.long() - 1]
    # Shape the per-sequence scales as (batch, 1, 1) so they broadcast over the sequence.
    signal_scale = torch.sqrt(alpha_bar_t).view(-1, 1, 1)
    noise_scale = torch.sqrt(1 - alpha_bar_t).view(-1, 1, 1)
    eps = torch.randn_like(x0_seq)
    x_t = signal_scale * x0_seq + noise_scale * eps
    return x_t, eps

### 8.6 Denoiser 학습 설정 및 `snr_weight`

In [None]:
if NUM_STOCK_EMBEDDINGS <= 0:
    # Without stocks there is nothing to size the embedding table with.
    print("Skipping Denoiser model initialization as there are no stocks.")
    denoiser = None
    opt_denoiser = None
    writer_denoiser = None
else:
    denoiser = TransformerDenoiser(
        num_stock_embeddings_denoiser=NUM_STOCK_EMBEDDINGS,  # number of unique stocks
        feat_dim=FT_OUT_DIM_PARAM,
        d_model=DENOISER_D_MODEL,
        year_pos_dim=YEAR_DIM_PARAM,
        stock_emb_dim=STOCK_DIM_PARAM,
    ).to(device)
    opt_denoiser = optim.AdamW(denoiser.parameters(), lr=LEARNING_RATE_DENOISER)
    # A timestamp suffix gives every run its own TensorBoard log directory.
    current_time_str_denoiser = time.strftime("%Y%m%d-%H%M%S")
    writer_denoiser = SummaryWriter(f'{TENSORBOARD_LOG_DIR_DENOISER}_{current_time_str_denoiser}')

def snr_weight(t_idx: torch.Tensor, 
               alpha_bars_local: torch.Tensor, 
               strategy: str = "karras",
               rho: float = 1.2) -> torch.Tensor:
    """Per-sample loss weight derived from the signal-to-noise ratio.

    With ``snr = alpha_bar / (1 - alpha_bar)`` taken at ``t_idx``:
    - "karras": ``(snr + 1) ** -rho``
    - "simple": ``1 / (snr + 1)``

    Raises:
        ValueError: if ``strategy`` is neither "karras" nor "simple".
    """
    alpha_bar = alpha_bars_local[t_idx]
    snr_plus_one = alpha_bar / (1.0 - alpha_bar) + 1.0

    if strategy == "simple":
        return 1.0 / snr_plus_one
    if strategy == "karras":
        return snr_plus_one.pow(-rho)
    raise ValueError(f"unknown strategy {strategy}")

### 8.7 Denoiser 학습 루프

In [None]:
# Element-wise losses (reduction='none') so they can be averaged per sample and
# the continuous part re-weighted by the diffusion step.
criterion_c = nn.MSELoss(reduction='none')                   
bce_fn      = nn.BCEWithLogitsLoss(pos_weight=pos_w,         
                                   reduction='none')

λ_bin_denoiser = 10.0   # multiplier applied to the binary loss

if denoiser and diff_dataloader: # Check if model and dataloader are initialized
    # Log the model graph once before training, using the first batch (best effort).
    if writer_denoiser: # Check if writer is initialized
        try:
            data_iter_denoiser = iter(diff_dataloader) 
            sample_x0_diff, sample_yrs_diff, sample_st_diff, _ = next(data_iter_denoiser)
            sample_t_norm = torch.rand(sample_x0_diff.size(0), 1, device=device) 
            writer_denoiser.add_graph(denoiser, [sample_x0_diff.to(device), sample_yrs_diff.to(device), sample_st_diff.to(device), sample_t_norm])
            del data_iter_denoiser 
        except Exception as e:
            # Graph tracing is optional; report the failure and keep training.
            print(f"Error adding Denoiser graph to TensorBoard: {e}")

    # Log roughly five times over the run, plus always on the final epoch.
    log_every_denoiser = EPOCHS_DENOISER_MODEL // 5 if EPOCHS_DENOISER_MODEL >= 5 else 1

    # Put the model in training mode explicitly: p_sample_loop() calls eval(), so
    # re-running this cell after sampling would otherwise train with dropout disabled.
    denoiser.train()

    for ep in range(EPOCHS_DENOISER_MODEL):
        is_log_ep = (ep % log_every_denoiser == 0) or (ep == EPOCHS_DENOISER_MODEL - 1)
        epoch_loss_denoiser = 0.0
        num_batches_denoiser = 0
        for x0_diff, yrs_diff, st_diff, bin_true_diff in diff_dataloader:
            x0_diff, yrs_diff, st_diff = x0_diff.to(device), yrs_diff.to(device), st_diff.to(device)
            bin_true_diff = bin_true_diff.to(device)
            B = x0_diff.size(0)

            # One random step in [1, T_diff] per sequence; noise x0 to that step.
            t_int_rand, _ = torch.sort(torch.randint(1, T_diff + 1, (B,), device=device))
            x_t, _   = q_sample(x0_diff, t_int_rand)                 
            t_norm   = t_int_rand.float().unsqueeze(1) / T_diff 

            cont_hat, bin_hat = denoiser(x_t, yrs_diff, st_diff, t_norm)   
            # Bound the logits before the BCE loss.
            bin_hat = bin_hat.clamp(-15, 15) 

            # Targets: the first cont_input_dim columns of the clean sequence, and the true binary labels.
            cont_tgt = x0_diff[:, :, :cont_input_dim] 
            bin_tgt  = bin_true_diff 

            # Per-sample means; only the continuous term is weighted by the SNR weight.
            mse = criterion_c(cont_hat, cont_tgt).mean(dim=(1,2))          
            bce = bce_fn(bin_hat, bin_tgt).mean(dim=(1, 2))              
            w   = snr_weight(t_int_rand - 1, alpha_bars, "karras", rho=1.2)       
            loss = (w * mse + λ_bin_denoiser * bce).mean() 

            opt_denoiser.zero_grad()
            loss.backward()
            
            # Gradient histograms must be written after backward() and before step().
            if writer_denoiser and is_log_ep: 
                for name, param in denoiser.named_parameters():
                    if param.grad is not None:
                        writer_denoiser.add_histogram(f'Gradients_Denoiser/{name.replace(".", "/")}', param.grad, ep)
            
            opt_denoiser.step()
            
            epoch_loss_denoiser += loss.item()
            num_batches_denoiser +=1

        avg_loss_denoiser = epoch_loss_denoiser / num_batches_denoiser if num_batches_denoiser > 0 else 0
        if writer_denoiser:
            writer_denoiser.add_scalar('Loss_Denoiser/Total_Train', avg_loss_denoiser, ep)

        if is_log_ep:
            print(f"[Denoiser] ep {ep:03d} | loss {avg_loss_denoiser:.5f}")
            
            if writer_denoiser:
                for name, param in denoiser.named_parameters():
                    if param.requires_grad:
                        writer_denoiser.add_histogram(f'Weights_Denoiser/{name.replace(".", "/")}', param.data, ep)
    
    if writer_denoiser: writer_denoiser.close() 
else:
    print("Skipping Denoiser training as model or dataloader is not initialized.")

### 8.8 Reverse Diffusion Sampler (`p_sample_loop`)

In [None]:
@torch.no_grad()
def p_sample_loop(model, years_vec, stock_id, seq_len=NUM_YEARS_PARAM):
    """Generate one sequence by running the reverse diffusion from pure noise.

    Starting from Gaussian noise of shape (1, seq_len, model.feat_dim), each
    step from T_diff down to 1 asks ``model`` for an estimate of the clean
    sequence, converts it to a noise estimate and moves ``x`` to the previous
    step. The last step adds no noise. The result's first ``cont_input_dim``
    columns are returned as-is; the remaining columns are thresholded at a
    sigmoid probability of 0.5.

    Note: leaves ``model`` in eval mode.

    Returns:
        Tensor of shape (seq_len, model.feat_dim).
    """
    model.eval()
    x = torch.randn(1, seq_len, model.feat_dim, device=device)
    years = years_vec.unsqueeze(0).to(device)
    stock = torch.tensor([stock_id], device=device)

    for t_val in reversed(range(1, T_diff + 1)):
        step = t_val - 1  # 0-based index into the schedule tensors
        alpha_bar_t, alpha_t, beta_t = alpha_bars[step], alphas[step], betas[step]
        sqrt_one_minus_ab = torch.sqrt(1 - alpha_bar_t)

        t_norm_sample = torch.full((1, 1), t_val / T_diff, device=device)
        cont_hat, bin_hat = model(x, years, stock, t_norm_sample)
        x0_hat = torch.cat([cont_hat, bin_hat], dim=-1)

        # Noise implied by the current x and the predicted clean sequence.
        eps_hat = (x - alpha_bar_t.sqrt() * x0_hat) / sqrt_one_minus_ab
        mean = (1 / alpha_t.sqrt()) * (x - beta_t * eps_hat / sqrt_one_minus_ab)

        # Fresh noise is added on every step except the final one.
        x = mean if t_val == 1 else mean + beta_t.sqrt() * torch.randn_like(x)

    cont_final = x[:, :, :cont_input_dim]
    bin_prob = torch.sigmoid(x[:, :, cont_input_dim:])
    bin_final = (bin_prob > 0.5).float()

    return torch.cat([cont_final, bin_final], dim=-1).squeeze(0)

### 8.9 Inverse Transform

In [None]:
def inverse_transform(data_np: np.ndarray, scaler_cont, n_cont=None) -> np.ndarray:
    """Map a generated 2-D sample back to the original feature scale.

    The first ``n_cont`` columns are passed through a logistic sigmoid and then
    through ``scaler_cont.inverse_transform``. The remaining columns are
    truncated with ``astype(int)`` and written back, so the returned array
    keeps the dtype of ``data_np``.

    Args:
        data_np: 2-D array laid out as ``[continuous columns | other columns]``.
            It is not modified; a copy is returned.
        scaler_cont: Object exposing ``inverse_transform(array)`` for the
            continuous columns (the notebook passes its fitted scaler).
        n_cont: Number of leading continuous columns. Defaults to the
            notebook-level ``cont_input_dim`` when omitted.

    Returns:
        A new array with the same shape as ``data_np``.
    """
    if n_cont is None:
        n_cont = cont_input_dim

    restored = data_np.copy()
    cont_block = restored[:, :n_cont]
    if cont_block.size > 0:
        # sigmoid(x) == 0.5 * (1 + tanh(x / 2)). Unlike 1 / (1 + exp(-x)), the
        # tanh form cannot overflow for strongly negative inputs.
        squashed = 0.5 * (1.0 + np.tanh(0.5 * cont_block))
        restored[:, :n_cont] = scaler_cont.inverse_transform(squashed)
    restored[:, n_cont:] = restored[:, n_cont:].astype(int)
    return restored

## 9. 대량 기업 생성 & CSV 저장

### 9.1 `generate_synthetic_companies` 함수

In [None]:
def generate_synthetic_companies(model, scaler_cont_features, num_companies_to_gen=500, 
                                 seq_len=NUM_YEARS_PARAM, start_vid=10000):
    """Sample synthetic company sequences and stack them into one 2-D array.

    Each company gets a virtual id ``start_vid + k``. The id is reduced modulo
    ``NUM_STOCK_EMBEDDINGS`` (or 1 when that is not positive) to pick the stock
    index handed to ``p_sample_loop``, and the sampled sequence is passed
    through ``inverse_transform``. Progress is printed roughly every tenth of
    the run and after the final company.

    Args:
        model: Denoiser passed to ``p_sample_loop``; ``None`` skips generation.
        scaler_cont_features: Scaler forwarded to ``inverse_transform``.
        num_companies_to_gen: Number of companies to sample.
        seq_len: Number of yearly rows per company; years start at 2011.
        start_vid: Virtual id assigned to the first generated company.

    Returns:
        Array whose rows are ``[virtual id, year, features...]``, or an empty
        array with ``2 + cont_input_dim + bin_input_dim`` columns when nothing
        was generated.
    """
    def _no_rows():
        return np.empty((0, 2 + cont_input_dim + bin_input_dim))

    if model is None:
        print("Denoiser model is not initialized. Skipping synthetic data generation.")
        return _no_rows()

    model.eval()
    stock_modulus = 1 if NUM_STOCK_EMBEDDINGS <= 0 else NUM_STOCK_EMBEDDINGS

    years_vec_gen = torch.arange(2011, 2011 + seq_len, dtype=torch.float32)
    year_column = years_vec_gen.reshape(-1, 1).numpy()
    report_every = num_companies_to_gen // 10 if num_companies_to_gen >= 10 else 1

    company_blocks = []
    virtual_ids = range(start_vid, start_vid + num_companies_to_gen)
    for n_done, virt_id in enumerate(virtual_ids, start=1):
        sampled = p_sample_loop(model, years_vec_gen, virt_id % stock_modulus, seq_len)
        restored = inverse_transform(sampled.cpu().numpy(), scaler_cont_features)
        id_column = np.full((seq_len, 1), virt_id)
        company_blocks.append(np.hstack([id_column, year_column, restored]))

        if n_done % report_every == 0 or n_done == num_companies_to_gen:
            print(f"  • {n_done}/{num_companies_to_gen} synthetic companies generated")

    if company_blocks:
        return np.vstack(company_blocks)
    return _no_rows()

### 9.2 평가: 이진 변수 확률 히스토그램

In [None]:
# Histogram of the denoiser's predicted binary-feature probabilities.
# Every batch is noised with integer timestep 1 via q_sample and the model is
# queried at the matching normalised time 1 / T_diff.
if denoiser and diff_dataloader and len(diff_dataloader) > 0:
    denoiser.eval()
    prob_chunks = []
    with torch.no_grad():
        for x0_eval, yrs_eval, st_eval, _ in diff_dataloader:
            batch_len = len(x0_eval)
            t_step_eval = torch.ones(batch_len, device=device, dtype=torch.long)
            x_t_eval, _ = q_sample(x0_eval.to(device), t_step_eval)

            t_norm_eval = torch.ones(batch_len, 1, device=device) / T_diff
            _, bin_hat_eval = denoiser(x_t_eval, yrs_eval.to(device), st_eval.to(device), t_norm_eval)

            # Flatten to 1-D so all batches can be concatenated.
            prob_chunks.append(torch.sigmoid(bin_hat_eval).cpu().numpy().ravel())

    all_prob = np.concatenate(prob_chunks)
    hist, bin_edges = np.histogram(all_prob, bins=10, range=(0.0, 1.0))

    print("\nProbabilities histogram for binary features (0~1):")
    for edge_lo, edge_hi, count in zip(bin_edges[:-1], bin_edges[1:], hist):
        print(f"{edge_lo:.1f}–{edge_hi:.1f}: {count}")
else:
    print("Skipping probability histogram generation as Denoiser model or diff_dataloader is not initialized or empty.")

### 9.3 평가: `positive_ratio` 함수 및 호출

In [None]:
@torch.no_grad()
def positive_ratio(model, dataloader, threshold=0.5):
    """Print predicted vs. true positive rates for every binary feature.

    Each batch is noised with integer timestep 1 through ``q_sample`` and the
    model is evaluated at normalised time ``1 / T_diff``. A prediction counts
    as positive when the sigmoid of its logit exceeds ``threshold``; a target
    counts as positive when it is greater than 0.5. Counts are summed over the
    batch and sequence axes and divided by the number of (sample, step) cells.

    Args:
        model: Denoiser called as ``model(x_t, years, stock, t_norm)``; it is
            switched to eval mode. ``None`` skips the check.
        dataloader: Iterable yielding ``(x0, years, stock, bin_true)`` batches;
            a falsy loader skips the check.
        threshold: Probability cut-off for a predicted positive.

    Returns:
        None. The comparison is printed, one line per ``binary_features`` entry.
    """
    if model is None or not dataloader:
        print("Skipping positive_ratio calculation as model or dataloader is not initialized or empty.")
        return

    model.eval()
    pred_counts = torch.zeros(bin_input_dim, device=device)
    true_counts = torch.zeros(bin_input_dim, device=device)
    n_cells = 0

    for batch in dataloader:
        x0_b, years_b, stock_b, bin_true_b = (part.to(device) for part in batch)
        n_batch, n_steps, _ = x0_b.shape

        t_int = torch.ones(n_batch, device=device, dtype=torch.long)
        x_t, _ = q_sample(x0_b, t_int)
        t_norm = t_int.float().unsqueeze(1) / T_diff

        _, bin_logits = model(x_t, years_b, stock_b, t_norm)

        # Reduce over batch and sequence axes, leaving one count per feature.
        pred_counts += (torch.sigmoid(bin_logits) > threshold).sum(dim=(0, 1))
        true_counts += (bin_true_b > 0.5).sum(dim=(0, 1))
        n_cells += n_batch * n_steps

    if n_cells > 0:
        ratio_pred = (pred_counts.cpu() / n_cells).numpy()
        ratio_true = (true_counts.cpu() / n_cells).numpy()
    else:
        ratio_pred = np.zeros(bin_input_dim)
        ratio_true = np.zeros(bin_input_dim)
    diff = ratio_pred - ratio_true

    print(f"\n★ Positive-ratio check (thr={threshold})")
    for idx in range(len(binary_features)):
        name = binary_features[idx]
        print(f" {name:5s}  pred={ratio_pred[idx]:.3%}  "
              f"true={ratio_true[idx]:.3%}  diff={diff[idx]:+.2%}")
    print()

# Run the positive-ratio check on the denoiser with a 0.5 probability threshold.
positive_ratio(denoiser, diff_dataloader, threshold=0.5)

### 9.4 최종 데이터 생성 및 CSV 저장

In [None]:
# Sample synthetic companies from the trained denoiser and write them to CSV.
print("\n▶  가짜 기업 생성 시작 …")
if not df_filtered.empty and minmax_scaler is not None and denoiser is not None: 
    fake_data = generate_synthetic_companies(denoiser, minmax_scaler, num_companies_to_gen=500)
    
    if fake_data.shape[0] > 0:  # the generator returns a 0-row array when nothing was produced
        raw_cols = ["Stock_ID", "YEAR"] + continuous_features + binary_features
        final_cols_order = ["OWN", "FORN", "BIG4","SIZE", "LEV", "CUR", "GRW", "ROA",
                      "ROE","CFO", "PPE", "AGE", "INVREC", "MB", "TQ", "LOSS"]

        # Reorder columns; .copy() gives an independent frame rather than a
        # slice of the intermediate DataFrame.
        df_fake = pd.DataFrame(fake_data, columns=raw_cols)
        df_fake = df_fake[["Stock_ID", "YEAR"] + final_cols_order].copy()

        # fake_data is a single float array, so ids, years and the 0/1 binary
        # indicators all arrive as floats. Cast them to int so the CSV holds
        # 0/1 (not 0.0/1.0) for binary features, matching Stock_ID and YEAR.
        int_cols = ["Stock_ID", "YEAR"] + [c for c in binary_features if c in df_fake.columns]
        df_fake = df_fake.astype({col: int for col in int_cols})

        # Round only the remaining floating-point feature columns.
        float_cols = [col for col in final_cols_order
                      if pd.api.types.is_float_dtype(df_fake[col])]
        df_fake = df_fake.round({col: 8 for col in float_cols})

        output_csv_path = "generated_synthetic_companies.csv"
        df_fake.to_csv(output_csv_path, index=False)
        print(f"✅ {len(df_fake)//NUM_YEARS_PARAM}개 기업 × {NUM_YEARS_PARAM}년 시계열 저장 완료: {output_csv_path}")
        display(df_fake.head()) # Use display for better notebook output
    else:
        print("No fake data was generated.")
else:
    print("Skipping fake data generation and saving as original data was empty, scaler not fitted, or denoiser model not initialized.")