# In [None]:
import os
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, Subset
from PIL import Image
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import OneCycleLR
from sklearn.preprocessing import StandardScaler
import math

# Select the compute device: CUDA when torch reports one as available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Reproducibility
def seed_everything(seed=42):
    """Seed every random number generator this script can draw from.

    Seeds Python's ``random`` module, NumPy's global generator, the torch CPU
    generator and the generators of all CUDA devices, then asks cuDNN for
    deterministic kernels with autotuning disabled.

    Args:
        seed: Integer seed shared by all generators.
    """
    # Function-scope import: `random` is not part of the file's import block.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU, not just the current one,
    # and is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different kernels from run to run.
    torch.backends.cudnn.benchmark = False


seed_everything()

# Timestamp parser
def parse_timestamp(timestamp_str):
    """Convert a colon-separated timestamp into a number of minutes.

    Two fields are read as ``first * 60 + second``; three fields additionally
    add ``third / 60``. Missing values, non-string inputs, non-integer fields
    and any other field count all yield ``0``.

    Args:
        timestamp_str: Timestamp text such as ``"10:30"`` or ``"10:30:15"``,
            or a missing value (``None`` / NaN).

    Returns:
        Minutes as an ``int`` (two fields) or ``float`` (three fields), or
        ``0`` when the value cannot be parsed.
    """
    if pd.isna(timestamp_str):
        return 0
    try:
        fields = [int(part) for part in timestamp_str.split(':')]
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: value is not a str; ValueError: a field is
        # not an integer. Anything else (e.g. KeyboardInterrupt) propagates.
        return 0

    if len(fields) == 2:
        first, second = fields
        return first * 60 + second
    if len(fields) == 3:
        first, second, third = fields
        return first * 60 + second + third / 60
    return 0

class MultimodalAngleDataset(Dataset):
    """Image + metadata dataset for angle regression.

    The label frame must provide the columns ``timestamp``, ``latitude``,
    ``longitude``, ``Region_ID``, ``filename`` and ``angle``. On construction
    the frame is copied, ``timestamp`` is converted to ``time_minutes`` via
    ``parse_timestamp``, missing numeric values are replaced by the column
    median, and the three numeric features are standardised.

    Args:
        df: Label table (not modified; a re-indexed copy is kept in ``self.df``).
        img_dir: Directory that ``filename`` entries are relative to.
        transform: Optional callable applied to the loaded RGB PIL image.
        scaler: Optional already-fitted scaler exposing ``transform``. When
            given it is applied as-is (e.g. the training set's scaler for
            validation data). When ``None`` (default) a fresh
            ``StandardScaler`` is fitted on this frame.
    """

    def __init__(self, df, img_dir, transform=None, scaler=None):
        self.df = df.copy().reset_index(drop=True)
        self.img_dir = img_dir
        self.transform = transform

        # parse timestamps
        self.df['time_minutes'] = self.df['timestamp'].apply(parse_timestamp)

        # fill and scale numeric
        num_feats = ['latitude', 'longitude', 'time_minutes']
        for c in num_feats:
            if self.df[c].isna().any():
                # Assign the filled column back: `self.df[c].fillna(..., inplace=True)`
                # acts on a temporary and does not update the frame under
                # pandas Copy-on-Write.
                self.df[c] = self.df[c].fillna(self.df[c].median())

        if scaler is None:
            self.scaler = StandardScaler()
            self.df[num_feats] = self.scaler.fit_transform(self.df[num_feats])
        else:
            self.scaler = scaler
            self.df[num_feats] = self.scaler.transform(self.df[num_feats])

        # region IDs are kept as a plain (unscaled) numeric feature
        self.df['Region_ID'] = self.df['Region_ID'].astype(float)

    def __len__(self):
        """Return the number of rows in the label frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            Tuple ``(img, meta, sincos, angle, filename)`` where ``img`` is the
            (optionally transformed) RGB image, ``meta`` is a float32 tensor
            ``[time_minutes, latitude, longitude, Region_ID]``, ``sincos`` is a
            float32 tensor ``[sin(angle), cos(angle)]`` with ``angle`` read as
            degrees, ``angle`` is the raw label as ``float`` and ``filename``
            is the row's file name.
        """
        row = self.df.iloc[idx]
        img_path = os.path.join(self.img_dir, row['filename'])
        img = Image.open(img_path).convert('RGB')
        if self.transform:
            img = self.transform(img)

        angle = float(row['angle'])
        angle_rad = math.radians(angle)
        target = torch.tensor([math.sin(angle_rad), math.cos(angle_rad)], dtype=torch.float32)

        meta = torch.tensor([
            row['time_minutes'], row['latitude'], row['longitude'], row['Region_ID']
        ], dtype=torch.float32)
        return img, meta, target, angle, row['filename']

# Transforms
# Training pipeline: resize, light random augmentation, then tensor conversion
# and per-channel normalisation.
train_transform = transforms.Compose([
    transforms.Resize(size=(256, 256)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=10),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Validation pipeline: same resize and normalisation, no random augmentation.
val_transform = transforms.Compose([
    transforms.Resize(size=(256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load train and validation data
# Label CSVs and image directories are hard-coded paths under /kaggle/input.
train_csv = "/kaggle/input/smainewdataset/Phase_2_data/labels_train.csv"
val_csv   = "/kaggle/input/smainewdataset/Phase_2_data/labels_val.csv"
train_img_dir = "/kaggle/input/smainewdataset/Phase_2_data/images_train/images_train"
val_img_dir   = "/kaggle/input/smainewdataset/Phase_2_data/images_val/images_val"

# Read both label tables into memory; images are opened lazily by the dataset.
train_df = pd.read_csv(train_csv)
val_df   = pd.read_csv(val_csv)

# Model definition
class MultimodalAnglePredictor(nn.Module):
    """Two-branch network fusing image features with a metadata vector.

    The image branch is a pretrained torchvision backbone with its last child
    module removed; the metadata branch is a two-layer MLP producing 128
    features. Both are concatenated and mapped to two outputs, which the rest
    of this file interprets as ``(sin, cos)`` of the angle.

    Args:
        backbone: One of ``'resnet34'``, ``'efficientnet_b0'``,
            ``'convnext_tiny'``.
        metadata_dim: Length of the metadata vector.

    Raises:
        ValueError: If ``backbone`` is not one of the supported names.
    """

    def __init__(self, backbone='convnext_tiny', metadata_dim=4):
        super().__init__()

        # Build the pretrained network and read the input width of its final
        # linear layer, which is the size of the pooled image feature.
        if backbone == 'convnext_tiny':
            net = models.convnext_tiny(pretrained=True)
            feat_dim = net.classifier[2].in_features
        elif backbone == 'efficientnet_b0':
            net = models.efficientnet_b0(pretrained=True)
            feat_dim = net.classifier[1].in_features
        elif backbone == 'resnet34':
            net = models.resnet34(pretrained=True)
            feat_dim = net.fc.in_features
        else:
            raise ValueError(backbone)

        # Keep every child except the last one (the classification head).
        trunk = list(net.children())[:-1]
        self.img_enc = nn.Sequential(*trunk)

        self.meta_enc = nn.Sequential(
            nn.Linear(metadata_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
        )

        fused_dim = feat_dim + 128
        self.head = nn.Sequential(
            nn.Linear(fused_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 2),
        )

    def forward(self, x_img, x_meta):
        """Encode both inputs, concatenate along dim 1 and apply the head."""
        img_features = self.img_enc(x_img)
        img_features = img_features.flatten(1)
        meta_features = self.meta_enc(x_meta)
        fused = torch.cat([img_features, meta_features], dim=1)
        return self.head(fused)

# Region Predictor Model
class RegionPredictor:
    """Inference wrapper around a saved region-classification checkpoint.

    The checkpoint is loaded into a ``MultimodalAnglePredictor`` with an
    ``efficientnet_b0`` backbone, moved to the module-level ``device`` and put
    in eval mode.

    NOTE(review): that network has two outputs, so ``predict`` can only return
    0 or 1 — confirm the checkpoint really matches this architecture.

    Args:
        model_path: Path to a state dict saved with ``torch.save``.
    """

    def __init__(self, model_path):
        self.model_path = model_path
        self.model = MultimodalAnglePredictor(backbone='efficientnet_b0')  # Assuming similar architecture
        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        state_dict = torch.load(model_path, map_location=device)
        self.model.load_state_dict(state_dict)
        self.model.to(device)
        self.model.eval()

    def predict(self, img, meta):
        """Return the argmax over the model outputs for each sample.

        Args:
            img: Batched image tensor accepted by the model.
            meta: Batched metadata tensor accepted by the model.

        Returns:
            Tensor with the index of the largest output along dim 1.
        """
        with torch.no_grad():
            # Inputs are moved to the model's device (a no-op if already there).
            output = self.model(img.to(device), meta.to(device))
            # Assuming output structure is compatible with region prediction
            # Adjust as necessary based on the actual structure
            return torch.argmax(output, dim=1)

# Loss and metrics
def sin_cos_to_angle(sin_vals, cos_vals):
    """Recover angles in degrees, wrapped into [0, 360), from sine/cosine values.

    Accepts scalars or NumPy arrays; the inputs need not be normalised because
    only their ratio and signs matter to ``arctan2``.
    """
    radians = np.arctan2(sin_vals, cos_vals)
    shifted = np.degrees(radians) + 360
    return np.mod(shifted, 360)

def circular_distance(a, b):
    """Return the shortest angular distance in degrees between ``a`` and ``b``.

    Works on scalars or NumPy arrays. The raw difference is reduced modulo 360
    first, so inputs outside [0, 360) still give a result in [0, 180] instead
    of a negative value.

    Args:
        a: Angle(s) in degrees.
        b: Angle(s) in degrees, broadcastable against ``a``.

    Returns:
        Element-wise distance in degrees.
    """
    gap = np.abs(a - b) % 360
    return np.minimum(gap, 360 - gap)

def circular_mae(a, b):
    """Return the mean of the circular distances between ``a`` and ``b``."""
    distances = circular_distance(a, b)
    return np.mean(distances)

class CircularMSELoss(nn.Module):
    """Mean squared error between predicted and target tensors.

    Used in this file on ``(sin, cos)`` pairs.
    """

    def forward(self, out, tgt):
        """Return the mean of the element-wise squared differences."""
        residual = out - tgt
        return torch.mean(residual ** 2)

class CircularMAELoss(nn.Module):
    """Mean circular distance in degrees between predicted and target angles.

    Both inputs hold ``(sin, cos)`` pairs in columns 0 and 1.
    """

    @staticmethod
    def _to_degrees(sincos):
        """Map ``(sin, cos)`` rows to angles in degrees wrapped into [0, 360)."""
        degrees = torch.atan2(sincos[:, 0], sincos[:, 1]) * 180 / math.pi
        return (degrees + 360) % 360

    def forward(self, out, tgt):
        """Return the batch mean of the shortest angular difference."""
        gap = torch.abs(self._to_degrees(out) - self._to_degrees(tgt))
        # The shorter way around the circle is either the gap or its complement.
        shortest = torch.min(gap, 360 - gap)
        return shortest.mean()

# Extract image ID from filename
def extract_image_id(filename):
    """Return the integer that follows the last underscore of the file stem.

    For example ``"image_123.jpg"`` gives ``123``. A stem without any
    underscore is converted as a whole.
    """
    stem, _extension = os.path.splitext(filename)
    last_token = stem.rsplit('_', 1)[-1]
    return int(last_token)

# Create datasets (without creating loaders yet - we'll create region-specific loaders)
# Wrap both label tables in datasets; loaders are built later, per region.
train_ds = MultimodalAngleDataset(df=train_df, img_dir=train_img_dir, transform=train_transform)
val_ds = MultimodalAngleDataset(df=val_df, img_dir=val_img_dir, transform=val_transform)

# Distinct region IDs present in the training labels, in ascending order
all_region_ids = sorted(train_df['Region_ID'].unique())
print(f"Found {len(all_region_ids)} unique regions: {all_region_ids}")

# Create region-specific datasets
def get_region_indices(dataset, region_id):
    """Return the positions of all samples whose ``Region_ID`` equals ``region_id``.

    Args:
        dataset: Object exposing a ``df`` DataFrame with a ``Region_ID`` column.
        region_id: Region value to match.

    Returns:
        List of 0-based row positions (Python ints) in ascending order.
    """
    # One vectorised comparison instead of materialising a row Series per sample.
    matches = (dataset.df['Region_ID'] == region_id).tolist()
    return [position for position, is_match in enumerate(matches) if is_match]

# # Function to train a single region model
# def train_region_model(region_id, train_ds, val_ds, backbone='efficientnet_b0', epochs=100, patience=100):
#     print(f"\n{'='*50}")
#     print(f"Training model for Region {region_id}")
#     print(f"{'='*50}")
    
#     # Get indices for this region
#     train_indices = get_region_indices(train_ds, region_id)
#     val_indices = get_region_indices(val_ds, region_id)
    
#     print(f"Region {region_id} has {len(train_indices)} training samples and {len(val_indices)} validation samples")
    
#     if len(train_indices) == 0 or len(val_indices) == 0:
#         print(f"Skipping Region {region_id} due to insufficient data")
#         return None, float('inf'), None
    
#     # Create region-specific datasets
#     region_train_ds = Subset(train_ds, train_indices)
#     region_val_ds = Subset(val_ds, val_indices)
    
#     # Create data loaders
#     batch_size = min(64, len(region_train_ds))  # Adjust batch size based on dataset size
#     train_loader = DataLoader(region_train_ds, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
#     val_loader = DataLoader(region_val_ds, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)
    
#     # Initialize model
#     model = MultimodalAnglePredictor(backbone=backbone)
#     model.to(device)
    
#     # Initialize optimizer, scheduler, and criterion
#     opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
#     sched = OneCycleLR(opt, max_lr=1e-3, epochs=epochs, steps_per_epoch=len(train_loader))
#     crit = CircularMSELoss()
#     mae_crit = CircularMAELoss()  # For monitoring MAE directly
    
#     # Training loop with early stopping
#     best_mae = float('inf')
#     best_predictions = None
#     no_improve = 0
#     out_dir = f"region_{region_id}_models"
#     os.makedirs(out_dir, exist_ok=True)
    
#     for ep in range(epochs):
#         # Training
#         model.train()
#         running_loss = 0.0
#         running_mae = 0.0
#         for imgs, metas, sincos, angles, _ in tqdm(train_loader, desc=f"Train Epoch {ep+1}/{epochs}"):
#             imgs, metas, sincos = imgs.to(device), metas.to(device), sincos.to(device)
            
#             opt.zero_grad()
#             out = model(imgs, metas)
            
#             # Calculate losses
#             loss = crit(out, sincos)
#             mae_loss = mae_crit(out, sincos)
            
#             # Update model
#             loss.backward()
#             opt.step()
#             sched.step()
            
#             # Track metrics
#             running_loss += loss.item() * imgs.size(0)
#             running_mae += mae_loss.item() * imgs.size(0)
            
#         train_loss = running_loss / len(region_train_ds)
#         train_mae = running_mae / len(region_train_ds)
        
#         # Validation
#         model.eval()
#         running_loss = 0.0
#         true_ang, pred_ang = [], []
#         img_ids, predictions = [], []
        
#         with torch.no_grad():
#             for imgs, metas, sincos, orig_ang, filenames in tqdm(val_loader, desc=f"Val Epoch {ep+1}/{epochs}"):
#                 imgs, metas = imgs.to(device), metas.to(device)
#                 sincos = sincos.to(device)
                
#                 out = model(imgs, metas)
#                 loss = crit(out, sincos)
#                 running_loss += loss.item() * imgs.size(0)
                
#                 # Convert to angles for MAE calculation
#                 ps = out[:, 0].cpu().numpy()
#                 pc = out[:, 1].cpu().numpy()
#                 pa = sin_cos_to_angle(ps, pc)
                
#                 # Store predictions
#                 for i in range(len(imgs)):
#                     img_id = extract_image_id(filenames[i])
#                     img_ids.append(img_id)
#                     predictions.append(pa[i])
                
#                 true_ang.extend(orig_ang.numpy())
#                 pred_ang.extend(pa)
        
#         # Calculate metrics
#         val_loss = running_loss / len(region_val_ds)
#         val_mae = circular_mae(np.array(true_ang), np.array(pred_ang))
#         val_score = 1.0 / (1.0 + val_mae)
        
#         print(f"Epoch {ep+1}: Train Loss={train_loss:.4f}, Train MAE={train_mae:.4f}, Val Loss={val_loss:.4f}, Val MAE={val_mae:.4f}, Score={val_score:.4f}")
        
#         # Save best model and predictions
#         if val_mae < best_mae:
#             best_mae = val_mae
#             no_improve = 0
#             # Save model
#             torch.save(model.state_dict(), os.path.join(out_dir, f"best_model_region_{region_id}.pth"))
            
#             # Save predictions
#             best_predictions = pd.DataFrame({
#                 'id': img_ids,
#                 'angle': predictions
#             })
#             best_predictions.to_csv(os.path.join(out_dir, f"best_predictions_region_{region_id}.csv"), index=False)
#             print(f"Saved new best model with MAE={best_mae:.4f} and predictions")
#         else:
#             no_improve += 1
#             if no_improve >= patience:
#                 print(f"No improvement for {patience} epochs. Early stopping.")
#                 break
    
#     # Load best model
#     model.load_state_dict(torch.load(os.path.join(out_dir, f"best_model_region_{region_id}.pth")))
#     print(f"Region {region_id} training complete. Best MAE: {best_mae:.4f}")
    
#     return model, best_mae, best_predictions

# Function to train a single region model with improved performance
def train_region_model(region_id, train_ds, val_ds, backbone='efficientnet_b0', epochs=150, patience=20):
    """Train, validate and checkpoint an angle model on one region's samples.

    Samples are selected with ``get_region_indices``. Every epoch the model is
    validated with the circular MAE; the best state dict and its predictions
    are written to ``region_{region_id}_models/``. Training stops early after
    ``patience`` epochs without improvement, and the best checkpoint is
    reloaded before returning.

    Region 1 uses its own recipe: ``convnext_tiny`` backbone, stronger
    augmentation, AdamW with ``CosineAnnealingWarmRestarts`` stepped once per
    epoch plus a manual 5-epoch learning-rate warm-up, a 0.7/0.3 blend of the
    MSE and circular-MAE losses, and gradient clipping. If its best MAE is
    still above 40, a ``resnet34`` model is trained for 10 epochs and the
    averaged sin/cos outputs of both models are evaluated as an ensemble.
    All other regions use ``backbone`` with AdamW and ``OneCycleLR`` stepped
    once per batch.

    Args:
        region_id: Region whose samples are used.
        train_ds: Training dataset; must expose ``df`` with ``Region_ID`` and
            yield ``(img, meta, sincos, angle, filename)``.
        val_ds: Validation dataset with the same layout.
        backbone: Backbone name for every region except region 1.
        epochs: Maximum number of epochs.
        patience: Epochs without improvement tolerated before stopping.

    Returns:
        ``(model, best_mae, best_predictions)``, where ``best_predictions`` is
        a DataFrame with columns ``id`` and ``angle``; or
        ``(None, float('inf'), None)`` when the region has no training or no
        validation samples. When the ensemble wins, ``best_mae`` and
        ``best_predictions`` describe the ensemble while ``model`` is still
        the primary model.
    """
    # Function-scope import: `copy` is not part of the file's import block.
    import copy

    print(f"\n{'='*50}")
    print(f"Training model for Region {region_id}")
    print(f"{'='*50}")

    # Get indices for this region
    train_indices = get_region_indices(train_ds, region_id)
    val_indices = get_region_indices(val_ds, region_id)

    print(f"Region {region_id} has {len(train_indices)} training samples and {len(val_indices)} validation samples")

    if len(train_indices) == 0 or len(val_indices) == 0:
        print(f"Skipping Region {region_id} due to insufficient data")
        return None, float('inf'), None

    is_region_one = region_id == 1

    # Region 1 trains with more aggressive augmentation. The transform is put
    # on a shallow copy of the dataset so the caller's `train_ds` (shared with
    # every other region) keeps its own transform.
    base_train_ds = train_ds
    if is_region_one:
        region_train_transform = transforms.Compose([
            transforms.Resize((256,256)),
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomVerticalFlip(p=0.1),  # Additional augmentation
            transforms.RandomRotation(20),  # More rotation
            transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.9, 1.1)),  # Additional transform
            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # Stronger color jitter
            transforms.ToTensor(),
            transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]),
            transforms.RandomErasing(p=0.2)  # Random erasing for robustness
        ])
        base_train_ds = copy.copy(train_ds)
        base_train_ds.transform = region_train_transform

    # Create region-specific datasets
    region_train_ds = Subset(base_train_ds, train_indices)
    region_val_ds = Subset(val_ds, val_indices)

    # Create data loaders; the batch size never exceeds the training subset size
    batch_size = min(32, len(region_train_ds))
    train_loader = DataLoader(region_train_ds, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(region_val_ds, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

    # Region 1 always uses ConvNeXt; other regions use the requested backbone
    current_backbone = 'convnext_tiny' if is_region_one else backbone
    print(f"Using {current_backbone} backbone for region {region_id}")

    # Initialize model with the selected backbone
    model = MultimodalAnglePredictor(backbone=current_backbone)
    model.to(device)

    # Optimizer and scheduler: region 1 gets a lower learning rate, stronger
    # weight decay and a per-epoch cosine schedule with warm restarts.
    if is_region_one:
        opt = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.02)
        sched = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            opt, T_0=10, T_mult=2, eta_min=1e-6
        )
    else:
        opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
        sched = OneCycleLR(opt, max_lr=1e-3, epochs=epochs, steps_per_epoch=len(train_loader))

    # Loss functions (blended for region 1, MSE alone otherwise)
    crit = CircularMSELoss()
    mae_crit = CircularMAELoss()

    # Early-stopping bookkeeping and output location
    best_mae = float('inf')
    best_epoch = 0
    best_predictions = None
    no_improve = 0
    out_dir = f"region_{region_id}_models"
    os.makedirs(out_dir, exist_ok=True)
    checkpoint_path = os.path.join(out_dir, f"best_model_region_{region_id}.pth")

    for ep in range(epochs):
        # Training
        model.train()
        running_loss = 0.0
        running_mae = 0.0

        # Linear learning-rate warm-up over the first five epochs for region 1.
        # The scheduler step at the end of the epoch overwrites this value again.
        if is_region_one and ep < 5:
            for param_group in opt.param_groups:
                param_group['lr'] = (ep + 1) * 5e-4 / 5

        for imgs, metas, sincos, _, _ in tqdm(train_loader, desc=f"Train Epoch {ep+1}/{epochs}"):
            imgs, metas, sincos = imgs.to(device), metas.to(device), sincos.to(device)

            opt.zero_grad()
            out = model(imgs, metas)

            # Calculate losses
            loss = crit(out, sincos)
            mae_loss = mae_crit(out, sincos)

            if is_region_one:
                # Weighted combination of MSE and MAE, with gradient clipping
                combined_loss = 0.7 * loss + 0.3 * mae_loss
                combined_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            else:
                loss.backward()

            opt.step()

            # OneCycleLR advances once per batch
            if not is_region_one:
                sched.step()

            # Track metrics (sample-weighted sums)
            running_loss += loss.item() * imgs.size(0)
            running_mae += mae_loss.item() * imgs.size(0)

        # CosineAnnealingWarmRestarts advances once per epoch
        if is_region_one:
            sched.step()

        train_loss = running_loss / len(region_train_ds)
        train_mae = running_mae / len(region_train_ds)

        # Validation
        model.eval()
        running_loss = 0.0
        true_ang, pred_ang = [], []
        img_ids = []

        with torch.no_grad():
            for imgs, metas, sincos, orig_ang, filenames in tqdm(val_loader, desc=f"Val Epoch {ep+1}/{epochs}"):
                imgs, metas = imgs.to(device), metas.to(device)
                sincos = sincos.to(device)

                out = model(imgs, metas)
                loss = crit(out, sincos)
                running_loss += loss.item() * imgs.size(0)

                # Convert to angles for MAE calculation
                ps = out[:, 0].cpu().numpy()
                pc = out[:, 1].cpu().numpy()
                pa = sin_cos_to_angle(ps, pc)

                # Store image IDs, predictions and ground truth in loader order
                img_ids.extend(extract_image_id(name) for name in filenames)
                true_ang.extend(orig_ang.numpy())
                pred_ang.extend(pa)

        # Calculate metrics
        val_loss = running_loss / len(region_val_ds)
        val_mae = circular_mae(np.array(true_ang), np.array(pred_ang))
        val_score = 1.0 / (1.0 + val_mae)

        # Print current learning rate
        current_lr = opt.param_groups[0]['lr']
        print(f"Epoch {ep+1}: Train Loss={train_loss:.4f}, Train MAE={train_mae:.4f}, Val Loss={val_loss:.4f}, Val MAE={val_mae:.4f}, Score={val_score:.4f}, LR={current_lr:.6f}")

        # Save best model and predictions
        if val_mae < best_mae:
            best_mae = val_mae
            best_epoch = ep + 1
            no_improve = 0
            torch.save(model.state_dict(), checkpoint_path)

            best_predictions = pd.DataFrame({
                'id': img_ids,
                'angle': list(pred_ang)
            })
            best_predictions.to_csv(os.path.join(out_dir, f"best_predictions_region_{region_id}.csv"), index=False)
            print(f"Saved new best model with MAE={best_mae:.4f} and predictions")
        else:
            no_improve += 1
            if no_improve >= patience:
                print(f"No improvement for {patience} epochs. Early stopping.")
                break

    # Load best model; map_location keeps the reload valid on any device
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    print(f"Region {region_id} training complete. Best MAE: {best_mae:.4f} at epoch {best_epoch}")

    # If region 1 is still struggling, try a two-model ensemble as a final step
    if is_region_one and best_mae > 40:
        print("Performing additional model ensemble for region 1...")
        # Train a secondary model with different architecture
        second_model = MultimodalAnglePredictor(backbone='resnet34')
        second_model.to(device)
        second_opt = torch.optim.AdamW(second_model.parameters(), lr=3e-4, weight_decay=0.02)

        # Simplified training of second model (10 epochs, MSE only)
        for _ in range(10):
            second_model.train()
            for imgs, metas, sincos, _, _ in train_loader:
                imgs, metas, sincos = imgs.to(device), metas.to(device), sincos.to(device)
                second_opt.zero_grad()
                out = second_model(imgs, metas)
                loss = crit(out, sincos)
                loss.backward()
                second_opt.step()

        # Ensemble prediction
        model.eval()
        second_model.eval()
        ensemble_img_ids, ensemble_predictions = [], []
        with torch.no_grad():
            for imgs, metas, _, _, filenames in val_loader:
                imgs, metas = imgs.to(device), metas.to(device)

                # Predictions from both models
                out1 = model(imgs, metas)
                out2 = second_model(imgs, metas)

                # Simple average of the sin/cos components
                ps_avg = (out1[:, 0].cpu().numpy() + out2[:, 0].cpu().numpy()) / 2
                pc_avg = (out1[:, 1].cpu().numpy() + out2[:, 1].cpu().numpy()) / 2

                # Convert to angles
                pa_avg = sin_cos_to_angle(ps_avg, pc_avg)

                ensemble_img_ids.extend(extract_image_id(name) for name in filenames)
                ensemble_predictions.extend(pa_avg)

        ensemble_predictions_df = pd.DataFrame({
            'id': ensemble_img_ids,
            'angle': ensemble_predictions
        })

        # `true_ang` comes from the last validation pass; val_loader is not
        # shuffled, so its order matches the ensemble predictions.
        true_angles = np.array(true_ang)
        ensemble_mae = circular_mae(true_angles, np.array(ensemble_predictions))

        # Keep the ensemble predictions only if they beat the single model
        if ensemble_mae < best_mae:
            print(f"Ensemble model improved MAE from {best_mae:.4f} to {ensemble_mae:.4f}")
            best_mae = ensemble_mae
            best_predictions = ensemble_predictions_df
            best_predictions.to_csv(os.path.join(out_dir, f"best_predictions_region_{region_id}_ensemble.csv"), index=False)

    return model, best_mae, best_predictions
# Train one model per selected region and collect the results by region ID
region_models, region_maes, region_predictions = {}, {}, {}
backbone = 'efficientnet_b0'
# Overrides the region list computed from the training labels: only region 1 is trained.
all_region_ids = {1}
for region_id in all_region_ids:
    model, best_mae, best_preds = train_region_model(
        region_id,
        train_ds,
        val_ds,
        backbone=backbone,
        epochs=50,
        patience=10,
    )
    # A skipped region returns no model and is left out of all three dicts.
    if model is None:
        continue
    region_models[region_id] = model
    region_maes[region_id] = best_mae
    region_predictions[region_id] = best_preds

# Evaluate overall performance
# Create a validation loader with all data
full_val_loader = DataLoader(val_ds, batch_size=128, shuffle=False, num_workers=4, pin_memory=True)

# Load region predictor
# NOTE(review): the predictor is loaded here but the loop below routes samples
# by the Region_ID stored in the metadata, not by this model's output.
region_predictor = RegionPredictor("/kaggle/input/regionpred/tensorflow2/default/1/region_pred_97_83.pth")

# Predictions using region-specific models.
# The per-sample lists below are appended together so row i of every list
# describes the same sample.
all_true_angles = []
all_pred_angles = []
all_image_ids = []
all_sample_regions = []
all_sample_errors = []
region_wise_errors = {region_id: [] for region_id in all_region_ids}

for imgs, metas, _, orig_angles, filenames in tqdm(full_val_loader, desc="Evaluating with region-specific models"):
    imgs, metas = imgs.to(device), metas.to(device)

    # Get actual region IDs from metadata (fourth metadata column)
    region_ids = metas[:, 3].cpu().numpy()

    # For each sample in batch
    for i in range(len(imgs)):
        region_id = int(region_ids[i])

        # Samples from regions without a trained model are skipped
        if region_id not in region_models:
            continue

        img = imgs[i:i+1]
        meta = metas[i:i+1]
        true_angle = orig_angles[i].item()
        filename = filenames[i]

        # Extract image ID
        img_id = extract_image_id(filename)

        # Get region-specific model
        model = region_models[region_id]
        model.eval()

        with torch.no_grad():
            out = model(img, meta)
            sin_val = out[0, 0].item()
            cos_val = out[0, 1].item()
            pred_angle = sin_cos_to_angle(sin_val, cos_val)

        # Calculate error
        error = circular_distance(true_angle, pred_angle)

        # Store results
        all_true_angles.append(true_angle)
        all_pred_angles.append(pred_angle)
        all_image_ids.append(img_id)
        all_sample_regions.append(region_id)
        all_sample_errors.append(error)
        region_wise_errors[region_id].append(error)

# Calculate overall MAE
overall_mae = circular_mae(np.array(all_true_angles), np.array(all_pred_angles))
overall_score = 1.0 / (1.0 + overall_mae)

print("\n=== FINAL EVALUATION ===")
print(f"Overall MAE: {overall_mae:.4f}")
print(f"Overall Score: {overall_score:.4f}")

# Print region-wise MAEs
print("\n=== REGION-WISE MAE ===")
for region_id in sorted(region_wise_errors.keys()):
    if len(region_wise_errors[region_id]) > 0:
        region_mae = np.mean(region_wise_errors[region_id])
        region_score = 1.0 / (1.0 + region_mae)
        print(f"Region {region_id}: MAE={region_mae:.4f}, Score={region_score:.4f}, Samples={len(region_wise_errors[region_id])}")
    else:
        print(f"Region {region_id}: No samples")

# Save all results.
# region_id and error come from the per-sample lists: flattening
# region_wise_errors would group rows by region and misalign them with the
# id / angle columns whenever more than one region is evaluated.
results_df = pd.DataFrame({
    'id': all_image_ids,
    'region_id': all_sample_regions,
    'true_angle': all_true_angles,
    'pred_angle': all_pred_angles,
    'error': all_sample_errors
})

results_df.to_csv("region_specific_predictions.csv", index=False)
print("Results saved to region_specific_predictions.csv")

# Create merged best predictions file
if region_predictions:
    # Stack the per-region prediction tables and order the rows by image ID
    all_best_predictions = pd.concat(list(region_predictions.values()))
    all_best_predictions = all_best_predictions.sort_values('id')
    all_best_predictions.to_csv("best_predictions.csv", index=False)
    print("Best predictions saved to best_predictions.csv")
else:
    # pd.concat raises ValueError on an empty sequence, which happens when
    # every region was skipped during training.
    print("No region predictions available; best_predictions.csv was not written")