In [None]:
import os
import gc
import random
import warnings
import numpy as np
import pandas as pd
import cv2
import torch
from PIL import Image
from tqdm.auto import tqdm
from pathlib import Path
from copy import deepcopy
from dataclasses import dataclass

# Sklearn & Models
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from transformers import AutoModel, AutoImageProcessor, AutoTokenizer

# Suppress warnings
warnings.filterwarnings('ignore')
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# =========================================================================================
# 1. CONFIGURATION & SEEDING
# =========================================================================================
@dataclass
class Config:
    DATA_PATH: Path = Path("/kaggle/input/csiro-biomass/")
    SPLIT_PATH: Path = Path("/kaggle/input/csiro-datasplit/csiro_data_split.csv")
    SIGLIP_PATH: str = "/kaggle/input/google-siglip-so400m-patch14-384/transformers/default/1"
    
    SEED: int = 42
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
    PATCH_SIZE: int = 520
    OVERLAP: int = 16
    
    # Target definitions
    TARGET_NAMES = ['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g']
    TARGET_MAX = {
        "Dry_Clover_g": 71.7865,
        "Dry_Dead_g": 83.8407,
        "Dry_Green_g": 157.9836,
        "Dry_Total_g": 185.70,
        "GDM_g": 157.9836,
    }

cfg = Config()

def seeding(SEED):
    np.random.seed(SEED)
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    torch.manual_seed(SEED)
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(SEED)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

seeding(cfg.SEED)

# =========================================================================================
# 2. DATA LOADING & PRE-PROCESSING
# =========================================================================================
def pivot_table(df: pd.DataFrame) -> pd.DataFrame:
    if 'target' in df.columns.tolist():
        # Train data
        df_pt = pd.pivot_table(
            df, 
            values='target', 
            index=['image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm'], 
            columns='target_name', 
            aggfunc='mean'
        ).reset_index()
    else:
        # Test data
        df['target'] = 0
        df_pt = pd.pivot_table(
            df, 
            values='target', 
            index='image_path', 
            columns='target_name', 
            aggfunc='mean'
        ).reset_index()
    return df_pt

def melt_table(df: pd.DataFrame) -> pd.DataFrame:
    melted = df.melt(
        id_vars='image_path',
        value_vars=cfg.TARGET_NAMES,
        var_name='target_name',
        value_name='target'
    )
    # Create sample_id matching submission format
    melted['sample_id'] = (
        melted['image_path']
        .str.replace(r'^.*/', '', regex=True)
        .str.replace('.jpg', '', regex=False)
        + '__' + melted['target_name']
    )
    return melted[['sample_id', 'target']]

def post_process_biomass(df_preds):
    """Enforce mass balance: Green+Clover=GDM, GDM+Dead=Total using projection."""
    ordered_cols = ["Dry_Green_g", "Dry_Clover_g", "Dry_Dead_g", "GDM_g", "Dry_Total_g"]
    
    # Ensure cols exist
    for c in ordered_cols:
        if c not in df_preds.columns:
            df_preds[c] = 0.0

    Y = df_preds[ordered_cols].values.T
    
    # Constraint Matrix: Cx = 0
    C = np.array([
        [1, 1, 0, -1,  0],
        [0, 0, 1,  1, -1]
    ])
    
    # Projection Matrix P = I - C^T(CC^T)^-1 C
    C_T = C.T
    try:
        inv_CCt = np.linalg.inv(C @ C_T)
        P = np.eye(5) - C_T @ inv_CCt @ C
    except:
        return df_preds # Fallback
    
    Y_reconciled = (P @ Y).T
    Y_reconciled = Y_reconciled.clip(min=0) # Non-negative constraint
    
    df_out = df_preds.copy()
    df_out[ordered_cols] = Y_reconciled
    return df_out

print("Loading Data...")
# Load Train (Metadata Split)
train_df = pd.read_csv(cfg.SPLIT_PATH) 

# --- FIX: Remove pre-existing embedding columns to prevent duplication ---
cols_to_keep = [c for c in train_df.columns if not c.startswith('emb')]
train_df = train_df[cols_to_keep]
# -----------------------------------------------------------------------

# Ensure train paths match local environment
if not str(train_df['image_path'].iloc[0]).startswith('/'):
     train_df['image_path'] = train_df['image_path'].apply(lambda p: str(cfg.DATA_PATH / 'train' / os.path.basename(p)))

# Load Test
test_df_raw = pd.read_csv(cfg.DATA_PATH / 'test.csv')
test_df = pivot_table(test_df_raw)
test_df['image_path'] = test_df['image_path'].apply(lambda p: str(cfg.DATA_PATH / p))

# =========================================================================================
# 3. FEATURE EXTRACTION: SIGLIP IMAGE EMBEDDINGS
# =========================================================================================
def split_image(image, patch_size=520, overlap=16):
    h, w, c = image.shape
    stride = patch_size - overlap
    patches = []
    for y in range(0, h, stride):
        for x in range(0, w, stride):
            y2 = min(y + patch_size, h)
            x2 = min(x + patch_size, w)
            y1 = max(0, y2 - patch_size) # Ensure fixed size
            x1 = max(0, x2 - patch_size)
            patch = image[y1:y2, x1:x2, :]
            patches.append(patch)
    return patches

def compute_embeddings(model_path, df):
    print(f"Computing Embeddings for {len(df)} images...")
    model = AutoModel.from_pretrained(model_path, local_files_only=True).eval().to(cfg.DEVICE)
    processor = AutoImageProcessor.from_pretrained(model_path)
    
    EMBEDDINGS = []
    
    for _, row in tqdm(df.iterrows(), total=len(df)):
        try:
            img = cv2.imread(row['image_path'])
            if img is None: raise ValueError("Image not found")
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            patches = split_image(img, patch_size=cfg.PATCH_SIZE, overlap=cfg.OVERLAP)
            images = [Image.fromarray(p) for p in patches]
            
            # Batch process patches
            inputs = processor(images=images, return_tensors="pt").to(cfg.DEVICE)
            with torch.no_grad():
                features = model.get_image_features(**inputs)
            
            # Average pooling of patches
            avg_embed = features.mean(dim=0).cpu().numpy()
            EMBEDDINGS.append(avg_embed)
        except Exception as e:
            print(f"Error processing {row['image_path']}: {e}")
            # Fallback zero embedding
            EMBEDDINGS.append(np.zeros(1152))
        
    torch.cuda.empty_cache()
    return np.stack(EMBEDDINGS)

# Compute Features
train_embeddings = compute_embeddings(cfg.SIGLIP_PATH, train_df)
test_embeddings = compute_embeddings(cfg.SIGLIP_PATH, test_df)

# Create Feature DataFrames
emb_cols = [f"emb{i}" for i in range(train_embeddings.shape[1])]
train_feat_df = pd.concat([train_df, pd.DataFrame(train_embeddings, columns=emb_cols)], axis=1)
test_feat_df = pd.concat([test_df, pd.DataFrame(test_embeddings, columns=emb_cols)], axis=1)

# Double check column counts
print(f"Train Features Shape: {train_feat_df.shape}")
print(f"Test Features Shape: {test_feat_df.shape}")

# =========================================================================================
# 4. FEATURE EXTRACTION: SEMANTIC FEATURES (TEXT PROBING)
# =========================================================================================
def generate_semantic_features(image_embeddings_np, model_path):
    print("Generating Semantic Features...")
    model = AutoModel.from_pretrained(model_path).to(cfg.DEVICE)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    # Anchors
    concept_groups = {
        "bare": ["bare soil", "dirt ground", "sparse vegetation", "exposed earth"],
        "sparse": ["low density pasture", "thin grass", "short clipped grass"],
        "medium": ["average pasture cover", "medium height grass", "grazed pasture"],
        "dense": ["dense tall pasture", "thick grassy volume", "high biomass", "overgrown vegetation"],
        "green": ["lush green vibrant pasture", "photosynthesizing leaves", "fresh growth"],
        "dead": ["dry brown dead grass", "yellow straw", "senesced material", "standing hay"],
        "clover": ["white clover", "trifolium repens", "broadleaf legume", "clover flowers"],
        "grass": ["ryegrass", "blade-like leaves", "fescue", "grassy sward"]
    }
    
    # Encode Concepts
    concept_vectors = {}
    with torch.no_grad():
        for name, prompts in concept_groups.items():
            inputs = tokenizer(prompts, padding="max_length", return_tensors="pt").to(cfg.DEVICE)
            emb = model.get_text_features(**inputs)
            emb = emb / emb.norm(p=2, dim=-1, keepdim=True)
            concept_vectors[name] = emb.mean(dim=0, keepdim=True)
            
    # Compute Scores
    img_tensor = torch.tensor(image_embeddings_np, dtype=torch.float32).to(cfg.DEVICE)
    img_tensor = img_tensor / img_tensor.norm(p=2, dim=-1, keepdim=True)
    
    scores = {}
    for name, vec in concept_vectors.items():
        scores[name] = torch.matmul(img_tensor, vec.T).cpu().numpy().flatten()
        
    df_scores = pd.DataFrame(scores)
    # Ratios
    df_scores['ratio_greenness'] = df_scores['green'] / (df_scores['green'] + df_scores['dead'] + 1e-6)
    df_scores['ratio_clover'] = df_scores['clover'] / (df_scores['clover'] + df_scores['grass'] + 1e-6)
    df_scores['ratio_cover'] = (df_scores['dense'] + df_scores['medium']) / (df_scores['bare'] + df_scores['sparse'] + 1e-6)
    
    torch.cuda.empty_cache()
    return df_scores.values

# Combine for semantic generation to ensure consistency
all_emb = np.vstack([train_embeddings, test_embeddings])
all_semantic = generate_semantic_features(all_emb, cfg.SIGLIP_PATH)

sem_train = all_semantic[:len(train_df)]
sem_test = all_semantic[len(train_df):]

# =========================================================================================
# 5. SUPERVISED EMBEDDING ENGINE
# =========================================================================================
class SupervisedEmbeddingEngine:
    def __init__(self, n_pca=0.80, n_pls=8, n_gmm=6, random_state=42):
        self.scaler = StandardScaler()
        self.pca = PCA(n_components=n_pca, random_state=random_state)
        self.pls = PLSRegression(n_components=n_pls, scale=False)
        self.gmm = GaussianMixture(n_components=n_gmm, covariance_type='diag', random_state=random_state)
        self.pls_fitted_ = False

    def fit(self, X, y=None, X_semantic=None):
        # Scale
        X_scaled = self.scaler.fit_transform(X)
        
        # Unsupervised
        self.pca.fit(X_scaled)
        self.gmm.fit(X_scaled)
        
        # Supervised
        if y is not None:
            self.pls.fit(X_scaled, y)
            self.pls_fitted_ = True
        return self

    def transform(self, X, X_semantic=None):
        X_scaled = self.scaler.transform(X)
        
        features = [self.pca.transform(X_scaled)]
        
        if self.pls_fitted_:
            features.append(self.pls.transform(X_scaled))
            
        features.append(self.gmm.predict_proba(X_scaled))
        
        if X_semantic is not None:
            # Normalize semantic
            sem_norm = (X_semantic - np.mean(X_semantic, axis=0)) / (np.std(X_semantic, axis=0) + 1e-6)
            features.append(sem_norm)
            
        return np.hstack(features)

# =========================================================================================
# 6. TRAINING & INFERENCE (5-FOLD CV)
# =========================================================================================
def cross_validate_predict(model_cls, model_params, train_data, test_data, sem_tr, sem_te, feature_engine):
    target_max_arr = np.array([cfg.TARGET_MAX[t] for t in cfg.TARGET_NAMES], dtype=float)
    y_pred_test_accum = np.zeros([len(test_data), len(cfg.TARGET_NAMES)], dtype=float)
    
    # Ensure n_splits is integer
    n_splits = int(train_data['fold'].nunique())
    
    # Pre-extract raw columns to avoid indexing overhead
    # Force float32 to save memory and ensure compatibility
    X_train_full = train_data[emb_cols].values.astype(np.float32)
    X_test_raw = test_data[emb_cols].values.astype(np.float32)
    y_train_full = train_data[cfg.TARGET_NAMES].values.astype(np.float32)
    
    for fold in range(n_splits):
        print(f"Processing Fold {fold}...")
        # Split
        train_mask = train_data['fold'] != fold
        
        X_tr = X_train_full[train_mask]
        y_tr = y_train_full[train_mask] / target_max_arr # Max Scaling
        
        sem_tr_fold = sem_tr[train_mask]
        
        # Feature Engineering (Fit on fold train)
        engine = deepcopy(feature_engine)
        engine.fit(X_tr, y=y_tr, X_semantic=sem_tr_fold)
        
        x_tr_eng = engine.transform(X_tr, X_semantic=sem_tr_fold)
        x_te_eng = engine.transform(X_test_raw, X_semantic=sem_te)
        
        # Train & Predict per target
        fold_test_pred = np.zeros([len(test_data), len(cfg.TARGET_NAMES)])
        
        for k in range(len(cfg.TARGET_NAMES)):
            model = model_cls(**model_params)
            model.fit(x_tr_eng, y_tr[:, k])
            pred_raw = model.predict(x_te_eng)
            fold_test_pred[:, k] = pred_raw * target_max_arr[k] # Inverse Scale
            
        y_pred_test_accum += fold_test_pred
        
    return y_pred_test_accum / n_splits

# Model Parameters (Optimized)
params_cat = {
    'iterations': 1900, 'learning_rate': 0.045, 'depth': 4, 'l2_leaf_reg': 0.56, 
    'random_strength': 0.045, 'bagging_temperature': 0.98, 'verbose': 0, 'random_state': 42,
    'allow_writing_files': False
}
params_xgb = { # Using GradientBoostingRegressor as proxy
    'n_estimators': 1354, 'learning_rate': 0.010, 'max_depth': 3, 'subsample': 0.60, 
    'random_state': 42
}
params_lgbm = {
    'n_estimators': 807, 'learning_rate': 0.014, 'num_leaves': 48, 'min_child_samples': 19, 
    'subsample': 0.745, 'colsample_bytree': 0.745, 'reg_alpha': 0.21, 'reg_lambda': 3.78,
    'verbose': -1, 'random_state': 42
}
params_hist = {
    'max_iter': 300, 'learning_rate': 0.05, 'max_depth': None, 'l2_regularization': 0.44,
    'random_state': 42
}

feat_engine = SupervisedEmbeddingEngine(n_pca=0.80, n_pls=8, n_gmm=6)

print("Training & Inferring Models...")

# 1. HistGradientBoosting
print("Model: HistGradientBoosting")
pred_hist = cross_validate_predict(
    HistGradientBoostingRegressor, params_hist, 
    train_feat_df, test_feat_df, sem_train, sem_test, feat_engine
)

# 2. GradientBoosting
print("Model: GradientBoosting")
pred_gb = cross_validate_predict(
    GradientBoostingRegressor, params_xgb, 
    train_feat_df, test_feat_df, sem_train, sem_test, feat_engine
)

# 3. CatBoost
print("Model: CatBoost")
pred_cat = cross_validate_predict(
    CatBoostRegressor, params_cat, 
    train_feat_df, test_feat_df, sem_train, sem_test, feat_engine
)

# 4. LightGBM
print("Model: LightGBM")
pred_lgbm = cross_validate_predict(
    LGBMRegressor, params_lgbm, 
    train_feat_df, test_feat_df, sem_train, sem_test, feat_engine
)

# =========================================================================================
# 7. ENSEMBLING & SUBMISSION
# =========================================================================================
print("Ensembling and Post-processing...")
# Simple Average Ensemble
final_pred = (pred_hist + pred_gb + pred_cat + pred_lgbm) / 4.0

# Assign to dataframe
test_feat_df[cfg.TARGET_NAMES] = final_pred

# Post-process (Mass Balance)
test_processed = post_process_biomass(test_feat_df)

# Create Submission File
sub_df = melt_table(test_processed)
output_path = "submission_siglip.csv"
sub_df.to_csv(output_path, index=False)

print(f"✓ Siglip submission generated: {output_path}")
print(sub_df.head())

  if entities is not ():
2026-01-15 06:29:22.281828: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768458562.466829      31 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768458562.523006      31 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768458562.962990      31 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768458562.963026      31 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768458562.963029      31 computation_placer.cc:17

Loading Data...
Computing Embeddings for 357 images...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


  0%|          | 0/357 [00:00<?, ?it/s]

Computing Embeddings for 1 images...


  0%|          | 0/1 [00:00<?, ?it/s]

Train Features Shape: (357, 1165)
Test Features Shape: (1, 1158)
Generating Semantic Features...
Training & Inferring Models...
Model: HistGradientBoosting
Processing Fold 0...
Processing Fold 1...
Processing Fold 2...
Processing Fold 3...
Processing Fold 4...
Model: GradientBoosting
Processing Fold 0...
Processing Fold 1...
Processing Fold 2...
Processing Fold 3...
Processing Fold 4...
Model: CatBoost
Processing Fold 0...
Processing Fold 1...
Processing Fold 2...
Processing Fold 3...
Processing Fold 4...
Model: LightGBM
Processing Fold 0...
Processing Fold 1...
Processing Fold 2...
Processing Fold 3...
Processing Fold 4...
Ensembling and Post-processing...
✓ Siglip submission generated: submission_siglip.csv
                    sample_id     target
0  ID1001187975__Dry_Clover_g   3.730718
1    ID1001187975__Dry_Dead_g  21.481835
2   ID1001187975__Dry_Green_g  28.551182
3   ID1001187975__Dry_Total_g  53.763735
4         ID1001187975__GDM_g  32.281900


In [2]:
%%writefile csiro_infer.py

from PIL import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, GroupKFold, StratifiedGroupKFold
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms as T

import os
from pathlib import Path
import timm
import warnings 

warnings.filterwarnings('ignore')
tqdm.pandas()

class RegressionDataset(Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform
    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        image = item.image
        targets = [item['Dry_Green_g'], item['Dry_Clover_g'], item['Dry_Dead_g']]
        width, height = image.size
        mid_point = width // 2
        left_image = image.crop((0, 0, mid_point, height))
        right_image = image.crop((mid_point, 0, width, height))

        if self.transform is not None:
            left_image = self.transform(left_image)
            right_image = self.transform(right_image)

        return left_image, right_image, targets

def get_test_dataloaders(data, image_size, batch_size):
    res = []
    for trans in [None, T.RandomHorizontalFlip(p=1.0), T.RandomVerticalFlip(p=1.0)]:
        transform = T.Compose([
            T.Resize(image_size),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        if trans:
            transform = T.Compose([
                T.Resize(image_size),
                trans,
                T.ToTensor(),
                T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            ])
        dataset = RegressionDataset(data, transform=transform)
        res.append(DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4))
    return res


class FiLM(nn.Module):
    def __init__(self, feat_dim):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(feat_dim, feat_dim // 2), 
            nn.ReLU(inplace=True), 
            nn.Linear(feat_dim // 2, feat_dim * 2)
        )

    def forward(self, context):
        gamma_beta = self.mlp(context)
        return torch.chunk(gamma_beta, 2, dim=1)

class CSIROModelRegressor(nn.Module):
    def __init__(self, model_name, pretrained=True, num_classes=3, dropout=0.0, freeze_backbone=False):
        super().__init__()
        self.backbone = timm.create_model(model_name, pretrained=pretrained, num_classes=0, global_pool='avg')

        self.film = FiLM(self.backbone.num_features)

        self.dropout = nn.Dropout(dropout)

        def make_head():
            return nn.Sequential(
                nn.Linear(self.backbone.num_features * 2, 8),
                nn.ReLU(inplace=True),
                nn.Dropout(dropout),
                nn.Linear(8, 1),
            )

        self.head_green = make_head()
        self.head_clover = make_head()
        self.head_dead = make_head()

        self.softplus = nn.Softplus(beta=1.0)

        if freeze_backbone:
            for p in self.backbone.parameters():
                p.requires_grad = False


    def forward(self, left_img, right_img):
        left_feat = self.backbone(left_img)
        right_feat = self.backbone(right_img)

        context = (left_feat + right_feat) / 2
        gamma, beta = self.film(context)

        left_feat_modulated = left_feat * (1 + gamma) + beta
        right_feat_modulated = right_feat * (1 + gamma) + beta

        combined = torch.cat([left_feat_modulated, right_feat_modulated], dim=1)

        green = self.softplus(self.head_green(combined))   
        clover = self.softplus(self.head_clover(combined))
        dead = self.softplus(self.head_dead(combined)) 

        logits = torch.cat([green, clover, dead], dim=1)

        return logits

def predict(model, dataloader, device):
    model.to(device)
    model.eval()

    all_outputs = []
    with torch.no_grad():
        for left_images, right_images, targets in dataloader:
            left_images = left_images.to(device)
            right_images = right_images.to(device)

            outputs = model(left_images, right_images)
            all_outputs.append(outputs.detach().cpu())

    outputs = torch.cat(all_outputs).numpy()
    return outputs

def predict_loaders(model, dataloaders, device):
    all_outputs = []
    for dataloader in dataloaders:
        outputs = predict(model, dataloader, device)
        all_outputs.append(outputs)
    avg_outputs = np.mean(all_outputs, axis=0)
    return avg_outputs

def predict_folds(dataloaders,models_dir):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    all_preds = []
    for model_file in Path(models_dir).glob('*.pth'):
        model = CSIROModelRegressor(CFG.MODEL_NAME, pretrained=False, num_classes=3)
        model.load_state_dict(torch.load(model_file))
        preds = predict_loaders(model, dataloaders, device)
        all_preds.append(preds)

    avg_preds = np.mean(all_preds, axis=0)
    return avg_preds


class CFG:
    DATA_PATH="/kaggle/input/csiro-biomass/"
    TEST_DATA_PATH="/kaggle/input/csiro-biomass/test.csv"
    MODEL_NAME="vit_large_patch16_dinov3_qkvb"
    MODELS_DIR ='/kaggle/input/csiro-vit-large-dinov3/pytorch/default/1/csiro-dinov3-models'
    IMG_SIZE=(512,512)


test_df = pd.read_csv(CFG.TEST_DATA_PATH)

test_df['target'] = 0.0
test_df[['sample_id_prefix', 'sample_id_suffix']] = test_df.sample_id.str.split('__', expand=True)

test_data_df = test_df.groupby(['sample_id_prefix', 'image_path']).apply(lambda df: df.set_index('target_name').target)
test_data_df.reset_index(inplace=True)
test_data_df.columns.name = None

test_data_df['image'] = test_data_df.image_path.progress_apply(lambda path: Image.open(CFG.DATA_PATH + path).convert('RGB'))

test_loaders = get_test_dataloaders(test_data_df, CFG.IMG_SIZE, 32)
preds = predict_folds(test_loaders,models_dir=CFG.MODELS_DIR)

test_data_df[['Dry_Green_g', 'Dry_Clover_g', 'Dry_Dead_g']] = preds
test_data_df['GDM_g'] = test_data_df.Dry_Green_g + test_data_df.Dry_Clover_g
test_data_df['Dry_Total_g'] = test_data_df.GDM_g + test_data_df.Dry_Dead_g

cols = [ 'Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g' , 'Dry_Total_g']
sub_df = test_data_df.set_index('sample_id_prefix')[cols].stack()
sub_df = sub_df.reset_index()
sub_df.columns = ['sample_id_prefix', 'target_name', 'target']

sub_df['sample_id'] = sub_df.sample_id_prefix + '__' + sub_df.target_name

cols = ['sample_id', 'target']
sub_df[cols].to_csv('submission_dinov2026.csv', index=False)

print(sub_df[cols])

Writing csiro_infer.py


In [3]:
!python csiro_infer.py

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 11.19it/s]
                    sample_id     target
0   ID1001187975__Dry_Green_g  28.325821
1    ID1001187975__Dry_Dead_g  34.191360
2  ID1001187975__Dry_Clover_g   0.020916
3         ID1001187975__GDM_g  28.346737
4   ID1001187975__Dry_Total_g  62.538097


In [4]:
import pandas as pd
import numpy as np
import os

# =============================================================================
# CONFIGURATION
# =============================================================================
# Weights for the ensemble (Must sum to 1.0)
# Adjust based on which model had better local CV or Public LB score.
# Example: If Siglip was better, give it 0.6.
W_SIGLIP = 0.35
W_DINO   = 0.65

FILES = {
    'siglip': 'submission_siglip.csv',
    'dino':   'submission_dinov2026.csv'
}

OUTPUT_FILE = 'submission.csv'

# Target definitions required for Mass Balance
ALL_TARGETS = ['Dry_Green_g', 'Dry_Clover_g', 'Dry_Dead_g', 'GDM_g', 'Dry_Total_g']

# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def enforce_mass_balance(df_wide):
    """
    Applies Orthogonal Projection to enforce biological constraints:
    1. Dry_Green_g + Dry_Clover_g = GDM_g
    2. GDM_g + Dry_Dead_g = Dry_Total_g
    
    This finds the closest set of values to the predictions that satisfy 
    the constraints (minimizing Euclidean distance modification).
    """
    # 1. Ensure columns are in the specific order for the matrix math
    # Vector x = [Green, Clover, Dead, GDM, Total]
    ordered_cols = ['Dry_Green_g', 'Dry_Clover_g', 'Dry_Dead_g', 'GDM_g', 'Dry_Total_g']
    
    # Extract values: Shape (5, N_samples)
    Y = df_wide[ordered_cols].values.T
    
    # 2. Define Constraint Matrix C where Cx = 0
    # Eq 1: 1*Gr + 1*Cl + 0*De - 1*GDM + 0*Tot = 0
    # Eq 2: 0*Gr + 0*Cl + 1*De + 1*GDM - 1*Tot = 0
    C = np.array([
        [1, 1, 0, -1,  0],
        [0, 0, 1,  1, -1]
    ])
    
    # 3. Calculate Projection Matrix P = I - C^T * (C * C^T)^-1 * C
    C_T = C.T
    try:
        inv_CCt = np.linalg.inv(C @ C_T)
        P = np.eye(5) - C_T @ inv_CCt @ C
    except np.linalg.LinAlgError:
        # Fallback if singular (unlikely with this specific matrix)
        print("Warning: Singular matrix in projection. Skipping constraint enforcement.")
        return df_wide

    # 4. Apply Projection
    Y_reconciled = P @ Y
    
    # 5. Transpose back to (N_samples, 5) and clip negatives
    Y_reconciled = Y_reconciled.T
    Y_reconciled = np.maximum(0, Y_reconciled) 
    
    # 6. Update DataFrame
    df_out = df_wide.copy()
    df_out[ordered_cols] = Y_reconciled
    
    return df_out

def robust_ensemble(file_paths, weights):
    print(f"--- Starting Ensemble ---")
    print(f"Weights: {weights}")
    
    dfs = []
    for name, path in file_paths.items():
        if not os.path.exists(path):
            raise FileNotFoundError(f"Missing file: {path}")
        
        # Read and sort by sample_id to ensure alignment
        df = pd.read_csv(path).sort_values('sample_id').reset_index(drop=True)
        dfs.append(df)
        print(f"Loaded {name}: {len(df)} rows")

    # 1. Check alignment
    base_ids = dfs[0]['sample_id']
    if not all(df['sample_id'].equals(base_ids) for df in dfs[1:]):
        raise ValueError("Sample IDs do not match between submission files!")

    # 2. Weighted Average
    # Stack targets: (N_samples, N_models)
    all_preds = np.column_stack([df['target'].values for df in dfs])
    w_vec = np.array(list(weights.values()))
    
    # Normalize weights just in case
    w_vec = w_vec / w_vec.sum()
    
    # Compute average
    avg_preds = np.sum(all_preds * w_vec, axis=1)
    
    # Create intermediate dataframe
    ensemble_df = pd.DataFrame({
        'sample_id': base_ids,
        'target': avg_preds
    })
    
    print("Weighted average complete.")

    # 3. Prepare for Mass Balance (Convert Long -> Wide)
    # Split sample_id into image_id and target_name
    # Format: ID1001187975__Dry_Green_g
    ensemble_df[['image_id', 'target_name']] = ensemble_df['sample_id'].str.rsplit('__', n=1, expand=True)
    
    # Pivot
    wide_df = ensemble_df.pivot(index='image_id', columns='target_name', values='target').reset_index()
    
    # 4. Apply Robust Constraints
    print("Applying Mass Balance Constraints...")
    wide_balanced = enforce_mass_balance(wide_df)
    
    # 5. Convert back (Wide -> Long)
    long_balanced = wide_balanced.melt(
        id_vars='image_id', 
        value_vars=ALL_TARGETS,
        var_name='target_name',
        value_name='target'
    )
    
    # Reconstruct sample_id
    long_balanced['sample_id'] = long_balanced['image_id'] + '__' + long_balanced['target_name']
    
    # 6. Final Formatting
    final_submission = long_balanced[['sample_id', 'target']].sort_values('sample_id').reset_index(drop=True)
    
    return final_submission

# =============================================================================
# EXECUTION
# =============================================================================
if __name__ == "__main__":
    
    # Define weights
    ensemble_weights = {
        'siglip': W_SIGLIP,
        'dino': W_DINO
    }
    
    try:
        # Run Ensemble
        submission = robust_ensemble(FILES, ensemble_weights)
        
        # Save
        submission.to_csv(OUTPUT_FILE, index=False)
        print(f"\nSuccess! Saved to {OUTPUT_FILE}")
        print(submission.head())
        
        # Sanity Check Stats
        print("\nStats:")
        print(submission['target'].describe())
        
    except Exception as e:
        print(f"\nError during ensembling: {e}")

--- Starting Ensemble ---
Weights: {'siglip': 0.35, 'dino': 0.65}
Loaded siglip: 5 rows
Loaded dino: 5 rows
Weighted average complete.
Applying Mass Balance Constraints...

Success! Saved to submission.csv
                    sample_id     target
0  ID1001187975__Dry_Clover_g   1.319347
1    ID1001187975__Dry_Dead_g  29.743026
2   ID1001187975__Dry_Green_g  28.404697
3   ID1001187975__Dry_Total_g  59.467070
4         ID1001187975__GDM_g  29.724044

Stats:
count     5.000000
mean     29.731637
std      20.574348
min       1.319347
25%      28.404697
50%      29.724044
75%      29.743026
max      59.467070
Name: target, dtype: float64
