In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import haversine_distances
import networkx as nx

from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error

In [2]:
# 1. ‡πÇ‡∏´‡∏•‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•
all_data = pd.read_parquet('/project/ai901504-ai0004/507a/week6/train_data.parquet')
# all_data = all_data.rename(columns={'Station':'station_id'})
coor_df = pd.read_csv('/project/ai901504-ai0004/kaggle_competition/week6/Coor_HII_495sta.csv')

# --- ‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô‡∏ó‡∏µ‡πà 1: ‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç‡∏Å‡∏≤‡∏£‡∏™‡∏£‡πâ‡∏≤‡∏á datetime ‡πÉ‡∏´‡πâ‡∏£‡∏ß‡∏° 'Hour' ‡πÄ‡∏Ç‡πâ‡∏≤‡πÑ‡∏õ‡∏î‡πâ‡∏ß‡∏¢ ---
all_data['datetime'] = pd.to_datetime(all_data[['Year', 'Month', 'Day', 'Hour']])

# --- ‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô‡∏ó‡∏µ‡πà 2: ‡∏•‡∏ö‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà‡∏ã‡πâ‡∏≥‡∏ã‡πâ‡∏≠‡∏ô (‡πÄ‡∏ú‡∏∑‡πà‡∏≠‡πÑ‡∏ß‡πâ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ú‡∏¥‡∏î‡∏û‡∏•‡∏≤‡∏î‡∏≠‡∏∑‡πà‡∏ô‡πÜ ‡πÉ‡∏ô‡πÑ‡∏ü‡∏•‡πå) ---
if all_data.duplicated(subset=['station_id', 'datetime']).any():
    print(f"‡∏Ç‡∏ô‡∏≤‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÄ‡∏î‡∏¥‡∏°: {len(all_data)}")
    all_data = all_data.drop_duplicates(subset=['station_id', 'datetime'], keep='first')
    print(f"‡∏Ç‡∏ô‡∏≤‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏´‡∏•‡∏±‡∏á‡∏•‡∏ö‡πÅ‡∏ñ‡∏ß‡∏ã‡πâ‡∏≥: {len(all_data)}")

# --- ‡πÇ‡∏Ñ‡πâ‡∏î‡∏ó‡∏µ‡πà‡πÄ‡∏´‡∏•‡∏∑‡∏≠‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡πÄ‡∏î‡∏¥‡∏° ---
all_data = all_data.sort_values(['station_id', 'datetime']).reset_index(drop=True)

features_to_lag = ['GSMaP']

lag_periods = [1, 5, 7] # days 

for feature in features_to_lag:
    for lag in lag_periods:
        new_col_name = f'{feature}_lag_{lag}h'
        # ‡∏Å‡∏≤‡∏£ lag ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏£‡∏≤‡∏¢‡∏ä‡∏±‡πà‡∏ß‡πÇ‡∏°‡∏á ‡∏ï‡πâ‡∏≠‡∏á shift 24 * lag
        all_data[new_col_name] = all_data.groupby('station_id')[feature].shift(lag)

all_data.fillna(0, inplace=True) 

original_features = ['GSMaP', 'humidity', 'pressure', 'temperature', 'Hour', 'Day']
new_lagged_features = [f'{f}_lag_{l}h' for f in features_to_lag for l in lag_periods]
features = original_features + new_lagged_features

# ‡πÅ‡∏ô‡∏∞‡∏ô‡∏≥‡πÉ‡∏´‡πâ‡πÄ‡∏õ‡∏¥‡∏î‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô Scaler ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÉ‡∏´‡πâ‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÄ‡∏£‡∏µ‡∏¢‡∏ô‡∏£‡∏π‡πâ‡πÑ‡∏î‡πâ‡∏î‡∏µ‡∏Ç‡∏∂‡πâ‡∏ô
scaler = StandardScaler()
all_data[features] = scaler.fit_transform(all_data[features])

all_data['y_class'] = (all_data['Groundtruth'] > 0).astype(int)
all_data['y_reg'] = np.log1p(all_data['Groundtruth'])

print("Features ‡∏ó‡∏µ‡πà‡πÉ‡∏ä‡πâ:", features)
print(f"‡∏à‡∏≥‡∏ô‡∏ß‡∏ô Features: {len(features)}")
display(all_data.head())

Features ‡∏ó‡∏µ‡πà‡πÉ‡∏ä‡πâ: ['GSMaP', 'humidity', 'pressure', 'temperature', 'Hour', 'Day', 'GSMaP_lag_1h', 'GSMaP_lag_5h', 'GSMaP_lag_7h']
‡∏à‡∏≥‡∏ô‡∏ß‡∏ô Features: 9


Unnamed: 0,Day,Month,Year,Hour,station_id,GSMaP,humidity,pressure,temperature,long,lat,Groundtruth,datetime,GSMaP_lag_1h,GSMaP_lag_5h,GSMaP_lag_7h,y_class,y_reg
0,-1.673906,1,2015,-1.661325,ACRU,-0.130052,0.112909,0.321055,-1.732527,104.64,15.79,0.0,2015-01-01 00:00:00,-0.130052,-0.130051,-0.130051,0,0.0
1,-1.673906,1,2015,-1.516862,ACRU,-0.130052,0.006538,0.321055,-1.82498,104.64,15.79,0.0,2015-01-01 01:00:00,-0.130052,-0.130051,-0.130051,0,0.0
2,-1.673906,1,2015,-1.372399,ACRU,-0.130052,-0.046648,0.264036,-1.917433,104.64,15.79,0.0,2015-01-01 02:00:00,-0.130052,-0.130051,-0.130051,0,0.0
3,-1.673906,1,2015,-1.227936,ACRU,-0.130052,-0.25939,0.264036,-1.917433,104.64,15.79,0.0,2015-01-01 03:00:00,-0.130052,-0.130051,-0.130051,0,0.0
4,-1.673906,1,2015,-1.083473,ACRU,-0.130052,-0.153019,0.264036,-2.083849,104.64,15.79,0.0,2015-01-01 04:00:00,-0.130052,-0.130051,-0.130051,0,0.0


In [3]:
coor_rad = np.radians(coor_df[['lat', 'long']].values)
distance_matrix_km = haversine_distances(coor_rad) * 6371 # ‡∏£‡∏±‡∏®‡∏°‡∏µ‡πÇ‡∏•‡∏Å ~6371 km

distance_threshold = 30 # km (‡∏õ‡∏£‡∏±‡∏ö‡∏Ñ‡πà‡∏≤‡∏ô‡∏µ‡πâ‡πÑ‡∏î‡πâ‡∏ï‡∏≤‡∏°‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏´‡∏°‡∏≤‡∏∞‡∏™‡∏°)
adjacency_matrix = (distance_matrix_km > 0) & (distance_matrix_km < distance_threshold)
np.fill_diagonal(adjacency_matrix, 0) 
G = nx.from_numpy_array(adjacency_matrix)
edge_index = torch.tensor(list(G.edges), dtype=torch.long).t().contiguous()

station_id_to_idx = {sid: i for i, sid in enumerate(coor_df['Station'])}
all_data['station_idx'] = all_data['station_id'].map(station_id_to_idx)


print(f"‡∏™‡∏£‡πâ‡∏≤‡∏á‡∏Å‡∏£‡∏≤‡∏ü‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î {adjacency_matrix.shape[0]} ‡πÇ‡∏´‡∏ô‡∏î ‡πÅ‡∏•‡∏∞ {edge_index.shape[1]} ‡πÄ‡∏™‡πâ‡∏ô‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏°")

‡∏™‡∏£‡πâ‡∏≤‡∏á‡∏Å‡∏£‡∏≤‡∏ü‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î 495 ‡πÇ‡∏´‡∏ô‡∏î ‡πÅ‡∏•‡∏∞ 717 ‡πÄ‡∏™‡πâ‡∏ô‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏°


In [4]:
num_edges = edge_index.size(1)
num_edges

717

In [5]:
all_datetimes = all_data['datetime'].unique()
split_ratio = 0.8
split_point = int(len(all_datetimes) * split_ratio)

train_datetimes = all_datetimes[:split_point]
val_datetimes = all_datetimes[split_point:]

# 2. ‡πÅ‡∏ö‡πà‡∏á DataFrame
train_df = all_data[all_data['datetime'].isin(train_datetimes)]
val_df = all_data[all_data['datetime'].isin(val_datetimes)]

print(f"Total timesteps: {len(all_datetimes)}")
print(f"Training timesteps: {len(train_datetimes)}")
print(f"Validation timesteps: {len(val_datetimes)}")

Total timesteps: 52608
Training timesteps: 42086
Validation timesteps: 10522


In [6]:
class RainfallDataset(Dataset):
    def __init__(self, df, features, seq_len=24, pred_len=1):
        super().__init__()
        self.features = features
        self.seq_len = seq_len
        self.pred_len = pred_len


        all_stations = df['station_id'].unique()
        all_datetimes = df['datetime'].unique()
        
        self.num_stations = len(all_stations)
        self.num_timesteps = len(all_datetimes)

        # 2. ‡∏™‡∏£‡πâ‡∏≤‡∏á MultiIndex ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏ö‡∏±‡∏á‡∏Ñ‡∏±‡∏ö‡πÉ‡∏´‡πâ DataFrame ‡πÄ‡∏õ‡πá‡∏ô‡∏ï‡∏≤‡∏£‡∏≤‡∏á‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ó‡∏µ‡πà‡∏™‡∏°‡∏ö‡∏π‡∏£‡∏ì‡πå (Dense Grid)
        multi_index = pd.MultiIndex.from_product(
            [all_stations, all_datetimes],
            names=['station_id', 'datetime']
        )

        # 3. ‡∏ï‡∏±‡πâ‡∏á‡∏Ñ‡πà‡∏≤ index ‡πÅ‡∏•‡∏∞‡∏ó‡∏≥‡∏Å‡∏≤‡∏£ reindex ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÄ‡∏ï‡∏¥‡∏°‡πÄ‡∏ï‡πá‡∏°‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ó‡∏µ‡πà‡∏Ç‡∏≤‡∏î‡∏´‡∏≤‡∏¢‡πÑ‡∏õ (‡∏î‡πâ‡∏ß‡∏¢ 0)
        df_indexed = df.set_index(['station_id', 'datetime'])
        df_pivoted = df_indexed.reindex(multi_index).fillna(0.0)

        # 4. ‡∏™‡∏£‡πâ‡∏≤‡∏á Tensors ‡∏à‡∏≤‡∏Å DataFrame ‡∏ó‡∏µ‡πà‡∏à‡∏±‡∏î‡πÄ‡∏£‡∏µ‡∏¢‡∏á‡πÄ‡∏£‡∏µ‡∏¢‡∏ö‡∏£‡πâ‡∏≠‡∏¢‡πÅ‡∏•‡πâ‡∏ß
        # .values ‡∏à‡∏∞‡∏î‡∏∂‡∏á‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ï‡∏≤‡∏°‡∏•‡∏≥‡∏î‡∏±‡∏ö‡∏Ç‡∏≠‡∏á index ‡∏ó‡∏µ‡πà‡∏à‡∏±‡∏î‡πÑ‡∏ß‡πâ ‡∏ó‡∏≥‡πÉ‡∏´‡πâ shape ‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á‡πÄ‡∏™‡∏°‡∏≠
        feature_values = df_pivoted[self.features].values
        self.feature_tensor = torch.tensor(feature_values, dtype=torch.float).reshape(
            self.num_stations, self.num_timesteps, -1
        )
        
        y_class_values = df_pivoted['y_class'].values
        self.y_class_tensor = torch.tensor(y_class_values, dtype=torch.float).reshape(
            self.num_stations, self.num_timesteps
        )
        
        y_reg_values = df_pivoted['y_reg'].values
        self.y_reg_tensor = torch.tensor(y_reg_values, dtype=torch.float).reshape(
            self.num_stations, self.num_timesteps
        )
        # --- ‡∏™‡∏¥‡πâ‡∏ô‡∏™‡∏∏‡∏î‡∏™‡πà‡∏ß‡∏ô‡∏ó‡∏µ‡πà‡∏õ‡∏£‡∏±‡∏ö‡∏õ‡∏£‡∏∏‡∏á ---

    def __len__(self):
        return self.num_timesteps - self.seq_len - self.pred_len + 1

    def __getitem__(self, index):
        start = index
        end = start + self.seq_len
        target_end = end + self.pred_len

        x_seq = self.feature_tensor[:, start:end, :]
        y_class = self.y_class_tensor[:, end:target_end].squeeze()
        y_reg = self.y_reg_tensor[:, end:target_end].squeeze()
        
        return x_seq, y_class, y_reg

In [7]:
class GNNTransformer(nn.Module):
    def __init__(self, num_features, gcn_hidden_dim, transformer_hidden_dim, nhead=4, num_layers=2):
        super(GNNTransformer, self).__init__()
        self.num_features = num_features
        self.gcn_hidden_dim = gcn_hidden_dim

        # GNN Layer
        self.gcn = GCNConv(num_features, gcn_hidden_dim)
        
        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=gcn_hidden_dim, 
            nhead=nhead, 
            dim_feedforward=transformer_hidden_dim,
            batch_first=True # ‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç‡∏°‡∏≤‡∏Å! ‡∏ó‡∏≥‡πÉ‡∏´‡πâ input ‡πÄ‡∏õ‡πá‡∏ô (batch, seq, feature)
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        # Output Heads
        self.classification_head = nn.Linear(gcn_hidden_dim, 1)
        self.regression_head = nn.Linear(gcn_hidden_dim, 1)
        
        self.relu = nn.ReLU()

    def forward(self, x, edge_index):
        # x shape: [num_stations, seq_len, num_features]
        seq_len = x.shape[1]
        
        # 1. Process each timestep with GCN
        gcn_outputs = []
        for t in range(seq_len):
            x_t = x[:, t, :] # Features at timestep t for all stations
            gcn_out = self.gcn(x_t, edge_index)
            gcn_out = self.relu(gcn_out)
            gcn_outputs.append(gcn_out)
        
        # Stack to form a sequence: [num_stations, seq_len, gcn_hidden_dim]
        x_seq_emb = torch.stack(gcn_outputs, dim=1)
        
        transformer_out = self.transformer_encoder(x_seq_emb)
        
        last_timestep_out = transformer_out[:, -1, :] # Shape: [num_stations, gcn_hidden_dim]
        
        # 4. Get predictions from both heads
        class_pred = self.classification_head(last_timestep_out).squeeze() # Shape: [num_stations]
        reg_pred = self.regression_head(last_timestep_out).squeeze()     # Shape: [num_stations]
        reg_pred = self.relu(reg_pred)
        
        return class_pred, reg_pred

In [8]:

SEQ_LEN = 24
PRED_LEN = 1
GCN_HIDDEN = 128
TRANS_HIDDEN = 256
N_HEADS = 4
N_LAYERS = 3
LEARNING_RATE = 0.00001
EPOCHS = 1
REG_LOSS_WEIGHT = 0.5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ‡∏™‡∏°‡∏°‡∏ï‡∏¥‡∏ß‡πà‡∏≤ train_df, val_df, features, edge_index ‡∏ñ‡∏π‡∏Å‡∏™‡∏£‡πâ‡∏≤‡∏á‡πÑ‡∏ß‡πâ‡πÄ‡∏£‡∏µ‡∏¢‡∏ö‡∏£‡πâ‡∏≠‡∏¢‡πÅ‡∏•‡πâ‡∏ß
train_dataset = RainfallDataset(train_df, features, seq_len=SEQ_LEN, pred_len=PRED_LEN)
val_dataset = RainfallDataset(val_df, features, seq_len=SEQ_LEN, pred_len=PRED_LEN)
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)
model = GNNTransformer(len(features), GCN_HIDDEN, TRANS_HIDDEN, N_HEADS, N_LAYERS).to(device)
model.regression_head.bias.data.fill_(0.5)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
y_class_counts = train_df['y_class'].value_counts()
pos_weight_value = (y_class_counts[0] / y_class_counts[1]) * 0.5
pos_weight = torch.tensor([pos_weight_value], device=device)
print(f"Positive weight for BCE loss: {pos_weight_value:.2f}")
# criterion_class = nn.BCEWithLogitsLoss(pos_weight=pos_weight) # <--- ‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç‡∏ö‡∏£‡∏£‡∏ó‡∏±‡∏î‡∏ô‡∏µ‡πâ
criterion_class = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(3.0, device=device))
criterion_reg = nn.L1Loss()
edge_index = edge_index.to(device)
all_stations = all_data['station_id'].unique()


# --- Training & Validation Loop with TQDM ---

# 2. ‡∏Ñ‡∏£‡∏≠‡∏ö range(EPOCHS) ‡∏î‡πâ‡∏ß‡∏¢ tqdm ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÅ‡∏™‡∏î‡∏á progress bar ‡∏Ç‡∏≠‡∏á epoch ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î
for epoch in tqdm(range(EPOCHS), desc="Overall Progress"):
    
    # --- TRAINING PHASE ---
    model.train()
    total_train_loss = 0
    
    train_iterator = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{EPOCHS} [Training]", leave=False)
    for i, (x_seq, y_class, y_reg) in enumerate(train_iterator):
        x_seq, y_class, y_reg = x_seq.squeeze(0).to(device), y_class.squeeze(0).to(device), y_reg.squeeze(0).to(device)
        
        optimizer.zero_grad()
        class_pred_logits, reg_pred = model(x_seq, edge_index)
        
        loss_class = criterion_class(class_pred_logits, y_class)
        rain_mask = y_class > 0
        loss_reg = torch.tensor(0.0).to(device)

        if rain_mask.sum() > 0:
            loss_reg = criterion_reg(reg_pred[rain_mask], y_reg[rain_mask])
        else:
            loss_reg = torch.tensor(0., device=device)
        loss = loss_class + REG_LOSS_WEIGHT * loss_reg
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item()
        
    avg_train_loss = total_train_loss / len(train_dataloader)

    # --- VALIDATION PHASE ---
    model.eval()
    val_results_list = []
    all_y_true_class = []
    all_y_pred_class = []
    all_y_true_reg_rainy = []
    all_y_pred_reg_rainy = []

    with torch.no_grad():
        # 4. ‡∏Ñ‡∏£‡∏≠‡∏ö val_dataloader ‡∏î‡πâ‡∏ß‡∏¢ tqdm
        val_iterator = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{EPOCHS} [Validation]", leave=False)
        for i, (x_seq, y_class, y_reg) in enumerate(val_iterator):
            x_seq, y_class, y_reg = x_seq.squeeze(0).to(device), y_class.squeeze(0).to(device), y_reg.squeeze(0).to(device)
            class_pred_logits, reg_pred = model(x_seq, edge_index)
            class_pred_prob = torch.sigmoid(class_pred_logits)
            class_pred_labels = (class_pred_prob > 0.8).int()

            y_class_cpu = y_class.cpu().numpy()
            y_true_reg_original = np.expm1(y_reg.cpu().numpy())
            class_pred_labels_cpu = class_pred_labels.cpu().numpy()
            y_pred_reg_original = np.expm1(reg_pred.cpu().numpy())
            y_pred_reg_original[y_pred_reg_original < 0] = 0 # ‡∏õ‡πâ‡∏≠‡∏á‡∏Å‡∏±‡∏ô‡∏Ñ‡πà‡∏≤‡∏ï‡∏¥‡∏î‡∏•‡∏ö
            all_y_true_class.extend(y_class_cpu)
            all_y_pred_class.extend(class_pred_labels_cpu)
            rain_mask_cpu = y_class_cpu > 0
            if rain_mask_cpu.sum() > 0:
              all_y_true_reg_rainy.extend(y_true_reg_original[rain_mask_cpu])
              all_y_pred_reg_rainy.extend(y_pred_reg_original[rain_mask_cpu])

            current_datetime = val_datetimes[i + SEQ_LEN]
            for station_idx in range(len(all_stations)):
              val_results_list.append({
                 'datetime': current_datetime,
                 'station_id': all_stations[station_idx],
                 'y_true_class': y_class_cpu[station_idx],
                 'y_pred_class': class_pred_labels_cpu[station_idx],
                 'y_true_reg': y_true_reg_original[station_idx],   # <--- ‡πÉ‡∏ä‡πâ‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡πÅ‡∏õ‡∏•‡∏á‡∏Å‡∏•‡∏±‡∏ö
                 'y_pred_reg': y_pred_reg_original[station_idx]    # <--- ‡πÉ‡∏ä‡πâ‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡πÅ‡∏õ‡∏•‡∏á‡∏Å‡∏•‡∏±‡∏ö
    })

    # ‡∏Ñ‡∏≥‡∏ô‡∏ß‡∏ì Metrics (‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡πÄ‡∏î‡∏¥‡∏°)
    val_accuracy = accuracy_score(all_y_true_class, all_y_pred_class) * 100
    val_precision = precision_score(all_y_true_class, all_y_pred_class, zero_division=0)
    val_recall = recall_score(all_y_true_class, all_y_pred_class, zero_division=0)
    val_f1 = f1_score(all_y_true_class, all_y_pred_class, zero_division=0)
    val_mae = mean_absolute_error(all_y_true_reg_rainy, all_y_pred_reg_rainy) if len(all_y_true_reg_rainy) > 0 else 0

    val_results_df = pd.DataFrame(val_results_list)

    val_results_df['y_pred_reg_adjusted'] = val_results_df.apply(
                lambda row: 0 if row['y_pred_class'] == 0 else row['y_pred_reg'],
                axis=1
             )
    val_results_df['y_pred_reg_adjusted'] = val_results_df['y_pred_reg_adjusted'].clip(lower=0)

    # ‡∏Å‡∏≤‡∏£‡πÅ‡∏™‡∏î‡∏á‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå (‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡πÄ‡∏î‡∏¥‡∏°)
    print(f"--- Epoch [{epoch+1}/{EPOCHS}] ---")
    print(f"Train Loss: {avg_train_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.2f}%")
    print(f"Validation F1-Score: {val_f1:.4f} (Precision: {val_precision:.4f}, Recall: {val_recall:.4f})")
    print(f"Validation MAE (on rainy days): {val_mae:.4f}")

val_results_df.to_csv('/project/ai901504-ai0004/507a/week6/validation_predictions_adjusted.csv', index=False)
print("\nValidation predictions (adjusted) saved to validation_predictions_adjusted.csv üéâ")


Using device: cuda
Positive weight for BCE loss: 9.23


Overall Progress: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [20:18<00:00, 1218.35s/it]

--- Epoch [1/1] ---
Train Loss: 0.5748
Validation Accuracy: 94.89%
Validation F1-Score: 0.0046 (Precision: 0.6332, Recall: 0.0023)
Validation MAE (on rainy days): 1.7735






Validation predictions (adjusted) saved to validation_predictions_adjusted.csv üéâ


In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error

def analyze_model_performance(csv_filepath='/project/ai901504-ai0004/507a/week6/validation_predictions_adjusted.csv'):
    """
    ‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡∏õ‡∏£‡∏∞‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡∏†‡∏≤‡∏û‡∏Ç‡∏≠‡∏á‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏û‡∏¢‡∏≤‡∏Å‡∏£‡∏ì‡πå‡∏ù‡∏ô‡∏à‡∏≤‡∏Å‡πÑ‡∏ü‡∏•‡πå CSV ‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏Ñ‡πà‡∏≤‡∏à‡∏£‡∏¥‡∏á‡πÅ‡∏•‡∏∞‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢
    ‡πÇ‡∏î‡∏¢‡∏°‡∏µ‡∏Å‡∏≤‡∏£‡πÅ‡∏™‡∏î‡∏á‡∏†‡∏≤‡∏û‡∏£‡∏ß‡∏°‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•, ‡∏õ‡∏£‡∏∞‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡∏†‡∏≤‡∏û‡∏Å‡∏≤‡∏£‡∏à‡∏≥‡πÅ‡∏ô‡∏Å, ‡∏õ‡∏£‡∏∞‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡∏†‡∏≤‡∏û‡∏Å‡∏≤‡∏£‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢‡∏õ‡∏£‡∏¥‡∏°‡∏≤‡∏ì,
    ‡πÅ‡∏•‡∏∞‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏≠‡∏ô‡πÄ‡∏≠‡∏µ‡∏¢‡∏á‡∏Ç‡∏≠‡∏á‡πÇ‡∏°‡πÄ‡∏î‡∏•
    """
    try:
        df = pd.read_csv(csv_filepath)
    except FileNotFoundError:
        print(f"‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå '{csv_filepath}'! ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏ß‡πà‡∏≤‡πÑ‡∏ü‡∏•‡πå‡∏≠‡∏¢‡∏π‡πà‡πÉ‡∏ô‡∏ï‡∏≥‡πÅ‡∏´‡∏ô‡πà‡∏á‡∏ó‡∏µ‡πà‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á")
        return None

    print("--- ‡∏™‡∏£‡∏∏‡∏õ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡∏õ‡∏£‡∏∞‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡∏†‡∏≤‡∏û‡πÇ‡∏°‡πÄ‡∏î‡∏• ---")
    print("-" * 40)

    # ==================
    # 0. Data Overview (‡∏™‡πà‡∏ß‡∏ô‡∏ó‡∏µ‡πà‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÄ‡∏Ç‡πâ‡∏≤‡∏°‡∏≤‡πÉ‡∏´‡∏°‡πà)
    # ==================
    print("[0] ‡∏†‡∏≤‡∏û‡∏£‡∏ß‡∏°‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÄ‡∏ö‡∏∑‡πâ‡∏≠‡∏á‡∏ï‡πâ‡∏ô (Data Overview)")
    print(f"  - y_true_class (‡∏Ñ‡πà‡∏≤‡∏à‡∏£‡∏¥‡∏á):   Min={df['y_true_class'].min()}, Max={df['y_true_class'].max()}")
    print(f"  - y_pred_class (‡∏Ñ‡πà‡∏≤‡∏ó‡∏≤‡∏¢):   Min={df['y_pred_class'].min()}, Max={df['y_pred_class'].max()}")
    print(f"  - y_true_reg (‡∏Ñ‡πà‡∏≤‡∏à‡∏£‡∏¥‡∏á):   Min={df['y_true_reg'].min():.2f}, Max={df['y_true_reg'].max():.2f} mm")
    print(f"  - y_pred_reg (‡∏Ñ‡πà‡∏≤‡∏ó‡∏≤‡∏¢):   Min={df['y_pred_reg'].min():.2f}, Max={df['y_pred_reg'].max():.2f} mm")

    # ==================
    # 1. Classification Performance
    # ==================
    print("\n[1] ‡∏õ‡∏£‡∏∞‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡∏†‡∏≤‡∏û‡∏Å‡∏≤‡∏£‡∏à‡∏≥‡πÅ‡∏ô‡∏Å (‡∏ù‡∏ô‡∏ï‡∏Å / ‡πÑ‡∏°‡πà‡∏ï‡∏Å)")
    y_true_class = df['y_true_class']
    y_pred_class = df['y_pred_class']
    
    report_dict = classification_report(y_true_class, y_pred_class, target_names=['No Rain', 'Rain'], output_dict=True, zero_division=0)
    
    accuracy = report_dict['accuracy'] * 100
    rain_metrics = report_dict.get('Rain', {})
    
    print(f"‡∏ó‡∏≤‡∏¢‡∏ñ‡∏π‡∏Å‡πÇ‡∏î‡∏¢‡∏£‡∏ß‡∏° (Overall Accuracy): {accuracy:.2f}%")
    
    # ‡πÄ‡∏à‡∏≤‡∏∞‡∏à‡∏á‡πÄ‡∏â‡∏û‡∏≤‡∏∞ Class 'Rain'
    recall_rain = rain_metrics.get('recall', 0) * 100
    precision_rain = rain_metrics.get('precision', 0) * 100
    f1_rain = rain_metrics.get('f1-score', 0)
    
    print(f"  - ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡∏ï‡∏£‡∏ß‡∏à‡∏à‡∏±‡∏ö‡∏ù‡∏ô (Recall 'Rain'): {recall_rain:.2f}%")
    print(f"  - ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏ó‡∏≤‡∏¢‡∏ß‡πà‡∏≤‡∏ù‡∏ô‡∏ï‡∏Å (Precision 'Rain'): {precision_rain:.2f}%")
    print(f"  - F1-Score 'Rain' (‡∏Ñ‡πà‡∏≤‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢ Recall & Precision): {f1_rain:.4f}")

    # --- ‡πÄ‡∏õ‡∏£‡∏µ‡∏¢‡∏ö‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö‡∏Å‡∏≤‡∏£‡∏ó‡∏≤‡∏¢ '‡πÑ‡∏°‡πà‡∏ï‡∏Å' (‡∏Ñ‡πà‡∏≤ 0) (‡∏™‡πà‡∏ß‡∏ô‡∏ó‡∏µ‡πà‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÄ‡∏Ç‡πâ‡∏≤‡∏°‡∏≤‡πÉ‡∏´‡∏°‡πà) ---
    true_no_rain_count = (y_true_class == 0).sum()
    pred_no_rain_count = (y_pred_class == 0).sum()
    print("\n  [‡πÄ‡∏õ‡∏£‡∏µ‡∏¢‡∏ö‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö‡∏Å‡∏≤‡∏£‡∏ó‡∏≤‡∏¢ '‡πÑ‡∏°‡πà‡∏ï‡∏Å']")
    print(f"    - ‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà‡∏ù‡∏ô '‡πÑ‡∏°‡πà‡∏ï‡∏Å' ‡∏à‡∏£‡∏¥‡∏á:    {true_no_rain_count} ‡πÅ‡∏ñ‡∏ß")
    print(f"    - ‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏ó‡∏≤‡∏¢‡∏ß‡πà‡∏≤ '‡πÑ‡∏°‡πà‡∏ï‡∏Å': {pred_no_rain_count} ‡πÅ‡∏ñ‡∏ß")
    if pred_no_rain_count > true_no_rain_count:
        print("    -> ‡∏Ç‡πâ‡∏≠‡∏™‡∏±‡∏á‡πÄ‡∏Å‡∏ï: ‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏°‡∏µ‡πÅ‡∏ô‡∏ß‡πÇ‡∏ô‡πâ‡∏°‡∏ó‡∏≤‡∏¢‡∏ß‡πà‡∏≤ '‡∏ù‡∏ô‡πÑ‡∏°‡πà‡∏ï‡∏Å' ‡∏ö‡πà‡∏≠‡∏¢‡∏Å‡∏ß‡πà‡∏≤‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏õ‡πá‡∏ô‡∏à‡∏£‡∏¥‡∏á")

    # ==================
    # 2. Regression Performance
    # ==================
    print("\n[2] ‡∏õ‡∏£‡∏∞‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡∏†‡∏≤‡∏û‡∏Å‡∏≤‡∏£‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢ '‡∏õ‡∏£‡∏¥‡∏°‡∏≤‡∏ì' ‡∏ô‡πâ‡∏≥‡∏ù‡∏ô (‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà‡∏ù‡∏ô‡∏ï‡∏Å‡∏à‡∏£‡∏¥‡∏á)")
    rainy_df = df[df['y_true_class'] == 1].copy()

    if not rainy_df.empty:
        mae = mean_absolute_error(rainy_df['y_true_reg'], rainy_df['y_pred_reg'])
        rmse = np.sqrt(mean_squared_error(rainy_df['y_true_reg'], rainy_df['y_pred_reg']))
        max_true_rain = rainy_df['y_true_reg'].max()
        max_pred_rain = rainy_df['y_pred_reg'].max()

        print(f"‡∏Ñ‡πà‡∏≤‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ú‡∏¥‡∏î‡∏û‡∏•‡∏≤‡∏î (MAE): {mae:.4f} mm")
        print(f"‡∏Ñ‡πà‡∏≤‡∏ù‡∏ô‡∏™‡∏π‡∏á‡∏™‡∏∏‡∏î‡∏ó‡∏µ‡πà‡∏ó‡∏≤‡∏¢‡πÑ‡∏î‡πâ ‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö‡∏Å‡∏±‡∏ö‡∏Ñ‡πà‡∏≤‡∏à‡∏£‡∏¥‡∏á: {max_pred_rain:.2f} mm vs {max_true_rain:.2f} mm")
    else:
        print("‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà‡∏ù‡∏ô‡∏ï‡∏Å‡πÉ‡∏ô‡∏ä‡∏∏‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏• Validation")

    # ==================
    # 3. Bias Analysis
    # ==================
    print("\n[3] ‡∏Å‡∏≤‡∏£‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏≠‡∏ô‡πÄ‡∏≠‡∏µ‡∏¢‡∏á (Bias)")
    
    # Classification Bias (‡∏î‡∏π‡∏à‡∏≤‡∏Å Recall)
    if recall_rain < 20: # ‡∏ñ‡πâ‡∏≤‡∏ï‡∏£‡∏ß‡∏à‡∏à‡∏±‡∏ö‡∏ù‡∏ô‡∏ï‡∏Å‡πÑ‡∏î‡πâ‡∏ï‡πà‡∏≥‡∏Å‡∏ß‡πà‡∏≤ 20%
        print("  - Classification Bias: '‡∏°‡∏µ‡πÅ‡∏ô‡∏ß‡πÇ‡∏ô‡πâ‡∏°‡∏™‡∏π‡∏á' ‡∏ó‡∏µ‡πà‡∏à‡∏∞‡πÄ‡∏≠‡∏ô‡πÄ‡∏≠‡∏µ‡∏¢‡∏á‡πÑ‡∏õ‡∏ó‡∏≤‡∏á‡∏Å‡∏≤‡∏£‡∏ó‡∏≤‡∏¢‡∏ß‡πà‡∏≤ '‡∏ù‡∏ô‡πÑ‡∏°‡πà‡∏ï‡∏Å' (Recall ‡∏ï‡πà‡∏≥‡∏°‡∏≤‡∏Å)")
    else:
        print("  - Classification Bias: '‡∏î‡∏π‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡πÑ‡∏°‡πà‡πÄ‡∏≠‡∏ô‡πÄ‡∏≠‡∏µ‡∏¢‡∏á' ‡∏ó‡∏µ‡πà‡∏à‡∏∞‡∏ó‡∏≤‡∏¢‡∏ß‡πà‡∏≤ '‡∏ù‡∏ô‡πÑ‡∏°‡πà‡∏ï‡∏Å' ‡∏°‡∏≤‡∏Å‡πÄ‡∏Å‡∏¥‡∏ô‡πÑ‡∏õ")

    # Regression Bias (‡∏î‡∏π‡∏à‡∏≤‡∏Å‡∏Ñ‡πà‡∏≤‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢‡∏Ç‡∏≠‡∏á error)
    if not rainy_df.empty:
        mean_error = (rainy_df['y_pred_reg'] - rainy_df['y_true_reg']).mean()
        if mean_error > 0.5:
            print(f"  - Regression Bias: '‡∏°‡∏µ‡πÅ‡∏ô‡∏ß‡πÇ‡∏ô‡πâ‡∏°‡∏ó‡∏≤‡∏¢‡∏õ‡∏£‡∏¥‡∏°‡∏≤‡∏ì‡∏ù‡∏ô‡∏™‡∏π‡∏á‡∏Å‡∏ß‡πà‡∏≤‡∏à‡∏£‡∏¥‡∏á' (Over-prediction) ‡πÇ‡∏î‡∏¢‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢ {mean_error:.2f} mm")
        elif mean_error < -0.5:
            print(f"  - Regression Bias: '‡∏°‡∏µ‡πÅ‡∏ô‡∏ß‡πÇ‡∏ô‡πâ‡∏°‡∏ó‡∏≤‡∏¢‡∏õ‡∏£‡∏¥‡∏°‡∏≤‡∏ì‡∏ù‡∏ô‡∏ï‡πà‡∏≥‡∏Å‡∏ß‡πà‡∏≤‡∏à‡∏£‡∏¥‡∏á' (Under-prediction) ‡πÇ‡∏î‡∏¢‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢ {abs(mean_error):.2f} mm")
        else:
            print("  - Regression Bias: '‡πÑ‡∏°‡πà‡∏°‡∏µ‡πÅ‡∏ô‡∏ß‡πÇ‡∏ô‡πâ‡∏°‡∏ó‡∏≤‡∏¢‡∏™‡∏π‡∏á‡∏´‡∏£‡∏∑‡∏≠‡∏ï‡πà‡∏≥‡∏Å‡∏ß‡πà‡∏≤‡∏à‡∏£‡∏¥‡∏á‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏ä‡∏±‡∏î‡πÄ‡∏à‡∏ô'")
            
    print("-" * 40)
    
    return report_dict, df # ‡∏Ñ‡∏∑‡∏ô‡∏Ñ‡πà‡∏≤‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡πÄ‡∏ú‡∏∑‡πà‡∏≠‡πÉ‡∏ä‡πâ‡∏ï‡πà‡∏≠

# --- ‡∏£‡∏±‡∏ô‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå ---
# ‡∏´‡∏≤‡∏Å‡∏ï‡πâ‡∏≠‡∏á‡∏Å‡∏≤‡∏£‡∏£‡∏±‡∏ô ‡πÉ‡∏´‡πâ‡πÉ‡∏™‡πà path ‡∏Ç‡∏≠‡∏á‡πÑ‡∏ü‡∏•‡πå‡πÉ‡∏´‡πâ‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á‡πÅ‡∏•‡πâ‡∏ß‡πÄ‡∏≠‡∏≤ comment ‡∏≠‡∏≠‡∏Å
results = analyze_model_performance(csv_filepath='/project/ai901504-ai0004/507a/week6/validation_predictions_adjusted.csv')

--- ‡∏™‡∏£‡∏∏‡∏õ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡∏õ‡∏£‡∏∞‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡∏†‡∏≤‡∏û‡πÇ‡∏°‡πÄ‡∏î‡∏• ---
----------------------------------------
[0] ‡∏†‡∏≤‡∏û‡∏£‡∏ß‡∏°‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÄ‡∏ö‡∏∑‡πâ‡∏≠‡∏á‡∏ï‡πâ‡∏ô (Data Overview)
  - y_true_class (‡∏Ñ‡πà‡∏≤‡∏à‡∏£‡∏¥‡∏á):   Min=0.0, Max=1.0
  - y_pred_class (‡∏Ñ‡πà‡∏≤‡∏ó‡∏≤‡∏¢):   Min=0, Max=1
  - y_true_reg (‡∏Ñ‡πà‡∏≤‡∏à‡∏£‡∏¥‡∏á):   Min=0.00, Max=128.40 mm
  - y_pred_reg (‡∏Ñ‡πà‡∏≤‡∏ó‡∏≤‡∏¢):   Min=0.05, Max=1.55 mm

[1] ‡∏õ‡∏£‡∏∞‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡∏†‡∏≤‡∏û‡∏Å‡∏≤‡∏£‡∏à‡∏≥‡πÅ‡∏ô‡∏Å (‡∏ù‡∏ô‡∏ï‡∏Å / ‡πÑ‡∏°‡πà‡∏ï‡∏Å)
‡∏ó‡∏≤‡∏¢‡∏ñ‡∏π‡∏Å‡πÇ‡∏î‡∏¢‡∏£‡∏ß‡∏° (Overall Accuracy): 94.89%
  - ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡∏ï‡∏£‡∏ß‡∏à‡∏à‡∏±‡∏ö‡∏ù‡∏ô (Recall 'Rain'): 0.23%
  - ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏ó‡∏≤‡∏¢‡∏ß‡πà‡∏≤‡∏ù‡∏ô‡∏ï‡∏Å (Precision 'Rain'): 63.32%
  - F1-Score 'Rain' (‡∏Ñ‡πà‡∏≤‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢ Recall & Precision): 0.0046

  [‡πÄ‡∏õ‡∏£‡∏µ‡∏¢‡∏ö‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö‡∏Å‡∏≤‡∏£‡∏ó‡∏≤‡∏¢ '‡πÑ‡∏°‡πà‡∏ï‡∏Å']
   

In [10]:
# --- Load and preprocess test dataset ---
test_df = pd.read_csv('/project/ai901504-ai0004/507a/week6/test_data.csv')
test_df['datetime'] = pd.to_datetime(test_df[['Year', 'Month', 'Day', 'Hour']])
test_df = test_df.sort_values(['station_id', 'datetime']).reset_index(drop=True)

# Apply lag features (same as train)
for feature in features_to_lag:
    for lag in lag_periods:
        new_col_name = f'{feature}_lag_{lag}h'
        test_df[new_col_name] = test_df.groupby('station_id')[feature].shift(lag)

test_df.fillna(0, inplace=True)
test_df[features] = scaler.transform(test_df[features])  # Apply same scaler

# Add dummy target columns for compatibility
test_df['y_class'] = 0
test_df['y_reg'] = 0
test_df['station_idx'] = test_df['station_id'].map(station_id_to_idx)

# --- Create test dataset and dataloader ---
test_dataset = RainfallDataset(test_df, features, seq_len=SEQ_LEN, pred_len=PRED_LEN)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# --- Predict using the trained model ---
model.eval()
all_preds = []

with torch.no_grad():
    for x_seq, _, _ in tqdm(test_dataloader, desc="Predicting on test set"):
        x_seq = x_seq.squeeze(0).to(device)  # Shape: [num_stations, seq_len, num_features]
        class_logits, reg_pred = model(x_seq, edge_index)
        rain_pred = (torch.sigmoid(class_logits) > 0.9).float()
        rainfall_mm = (torch.expm1(reg_pred) * rain_pred).cpu().numpy()  # Only predict if rain
        all_preds.append(rainfall_mm)

# --- Reconstruct submission dataframe ---
# Flatten predictions: [num_samples, num_stations] ‚Üí long-form dataframe
preds_array = np.stack(all_preds)  # Shape: [T, S]
preds_flat = preds_array.reshape(-1)  # Flatten

# Reconstruct datetime and station grid
unique_datetimes = test_df['datetime'].unique()
num_stations = len(all_stations)
timesteps = len(test_dataset)

datetime_repeat = np.repeat(unique_datetimes[SEQ_LEN:SEQ_LEN+timesteps], num_stations)
station_tile = np.tile(all_stations, timesteps)

submission_df = pd.DataFrame({
    'datetime': datetime_repeat,
    'station_id': station_tile,
    'Predicted_Groundtruth': preds_flat
})

# Optional: sort and save
submission_df = submission_df.sort_values(['datetime', 'station_id'])
submission_df.to_csv('/project/ai901504-ai0004/507a/week6/satu_submission_f.csv', index=False)
print("‚úÖ Saved predictions to predicted_submission.csv")


Predicting on test set: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 35040/35040 [08:29<00:00, 68.78it/s]


‚úÖ Saved predictions to predicted_submission.csv


In [11]:
submission_df

Unnamed: 0,datetime,station_id,Predicted_Groundtruth
0,2021-01-02 00:00:00,ACRU,0.0
1,2021-01-02 00:00:00,BAKI,0.0
2,2021-01-02 00:00:00,BARI,0.0
3,2021-01-02 00:00:00,BBHN,0.0
4,2021-01-02 00:00:00,BBON,0.0
...,...,...,...
17344795,2024-12-31 23:00:00,YOM006,0.0
17344796,2024-12-31 23:00:00,YOM007,0.0
17344797,2024-12-31 23:00:00,YOM008,0.0
17344798,2024-12-31 23:00:00,YOM009,0.0


In [12]:
# Load your prediction result CSV
df = submission_df  # Replace with your filename

# Parse datetime into components
df['datetime'] = pd.to_datetime(df['datetime'])
df['Year'] = df['datetime'].dt.year
df['Month'] = df['datetime'].dt.month
df['Day'] = df['datetime'].dt.day
df['Hour'] = df['datetime'].dt.hour

# Create a string-based index column like "2021-1-1-0"
df['index'] = df['Year'].astype(str) + '-' + df['Month'].astype(str) + '-' + df['Day'].astype(str) + '-' + df['Hour'].astype(str)

# Pivot the table to wide format
df_pivot = df.pivot_table(index='index',
                          columns='station_id',
                          values='Predicted_Groundtruth',
                          aggfunc='first').reset_index()

# Merge back the datetime components for Year/Month/Day/Hour
datetime_parts = df[['index', 'Year', 'Month', 'Day', 'Hour']].drop_duplicates()
final_df = datetime_parts.merge(df_pivot, on='index')

# Sort by date
final_df = final_df.sort_values(by=['Year', 'Month', 'Day', 'Hour']).reset_index(drop=True)

# Save to CSV
final_df.to_csv("satu_formatted_submission_f.csv", index=False)

In [13]:
final_df

Unnamed: 0,index,Year,Month,Day,Hour,ACRU,BAKI,BARI,BBHN,BBON,...,WTSG,YGHM,YOM001,YOM003,YOM005,YOM006,YOM007,YOM008,YOM009,YOM010
0,2021-1-2-0,2021,1,2,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2021-1-2-1,2021,1,2,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2021-1-2-2,2021,1,2,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2021-1-2-3,2021,1,2,3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2021-1-2-4,2021,1,2,4,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35035,2024-12-31-19,2024,12,31,19,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35036,2024-12-31-20,2024,12,31,20,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35037,2024-12-31-21,2024,12,31,21,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35038,2024-12-31-22,2024,12,31,22,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
