# Copy-Move Segmentation Pipeline

This notebook walks through the entire workflow end-to-end:
1. Prepare CSV metadata from the folder layout.
2. Train the five required models (two ConvNeXt/UNet++ variants, one Swin-DeepLab, and two CMSeg-Lite variants).
3. Optionally run single-model inference to inspect outputs.
4. Blend the five checkpoints for the final ensemble submission.

> **Note**: When running on Kaggle, the notebook auto-detects datasets mounted under `/kaggle/input` that contain a `train_images` folder.

## 0. Environment Setup
Install dependencies once per environment.

In [None]:
!pip install -r requirements.txt

## 0.1 Notebook Imports
Add the repository to `sys.path` so we can import modules without shell commands.

In [None]:
import sys
from pathlib import Path

# Make the notebook's working directory importable: register it on sys.path
# unless an identical entry is already present.
project_root = Path().resolve()
if not any(entry == str(project_root) for entry in sys.path):
    sys.path.append(str(project_root))
print(f'Project root: {project_root}')

## 1. Data Preparation
Scan the dataset folders (auto-detected under `/kaggle/input` when running on Kaggle, otherwise the local `data/` directory) and generate CSV metadata for training/inference.

In [None]:
import pandas as pd
from prepare_data import build_train_df, build_test_csv

# Locate the dataset root: prefer a dataset mounted under /kaggle/input that
# contains a `train_images` folder, otherwise fall back to the local `data`
# directory.
input_root = Path('/kaggle/input')
detected_roots = []
if input_root.is_dir():
    # Path.iterdir() yields entries in arbitrary order, so sort the matches to
    # select the same dataset on every run when several mounts qualify.
    detected_roots = sorted(
        candidate
        for candidate in input_root.iterdir()
        if candidate.is_dir() and (candidate / 'train_images').exists()
    )

data_root = detected_roots[0] if detected_roots else Path('data')
print(f'Detected data root: {data_root}')

# Expected dataset layout beneath the detected data root.
train_images = data_root.joinpath('train_images')
train_masks = data_root.joinpath('train_masks')
supp_images = data_root.joinpath('supplemental_images')
supp_masks = data_root.joinpath('supplemental_masks')
test_images = data_root.joinpath('test_images')
sample_sub = data_root.joinpath('sample_submission.csv')

# Generated CSVs are written to the local `data` directory, regardless of
# where the input dataset lives.
output_data_dir = Path('data')
output_data_dir.mkdir(parents=True, exist_ok=True)
train_csv = output_data_dir.joinpath('train.csv')
test_csv = output_data_dir.joinpath('test.csv')

# The primary images and masks are mandatory; abort early when either is absent.
if not (train_images.exists() and train_masks.exists()):
    raise FileNotFoundError('train_images/train_masks directories not found. Update data_root or mount dataset properly.')

print('Building primary train dataframe...')
primary_df = build_train_df(train_images, train_masks, source_label='primary', allow_missing_mask=False)
frames = [primary_df]

# Supplemental data is optional and only used when both of its folders exist.
if all(folder.exists() for folder in (supp_images, supp_masks)):
    print('Including supplemental data...')
    supp_df = build_train_df(supp_images, supp_masks, source_label='supplemental', allow_missing_mask=False)
    frames.append(supp_df)

# Combine all sources into one table with a fresh 0..n-1 index and save it.
train_df = pd.concat(frames, ignore_index=True)
train_df.to_csv(train_csv, index=False)
print(f'Saved {len(train_df)} training rows to {train_csv}')

# Test metadata is produced only when a test folder is present; the sample
# submission is forwarded to build_test_csv when it exists, otherwise None.
if not test_images.exists():
    print('Test images folder not found; skipping test CSV generation.')
else:
    build_test_csv(test_images, test_csv, sample_sub if sample_sub.exists() else None)
    print(f'Saved test metadata to {test_csv}')

## 2. Train the Five Models
Loop over the five configuration files and run the training routine directly within the notebook.

In [None]:
import src.train as train_module

def run_training(config_path: str):
    """Invoke ``train_module.main()`` as though launched from the command line.

    ``sys.argv`` is temporarily replaced with
    ``['train.py', '--config', config_path]`` and restored afterwards, even
    when training raises.

    Args:
        config_path: Value passed after the ``--config`` flag.
    """
    # Swap in the synthetic command line while remembering the real one.
    saved_argv, sys.argv = sys.argv, ['train.py', '--config', config_path]
    try:
        train_module.main()
    finally:
        sys.argv = saved_argv

# The five training configurations, run one after another in the order listed.
training_configs = [
    'config/convnext_unetpp_512_base.yaml',
    'config/swin_deeplab_512_base.yaml',
    'config/cmseg_lite_512_base.yaml',
    'config/convnext_unetpp_768_heavyaug.yaml',
    'config/cmseg_lite_512_synCM.yaml',
]

for cfg in training_configs:
    # Banner separating the output of consecutive training runs.
    print(f'\n=== Training with {cfg} ===')
    run_training(config_path=cfg)

## 3. Single-Model Inference (Optional)
Run each trained checkpoint individually, exporting per-model predictions and probability maps.

In [None]:
import src.infer as infer_module

def run_inference(image_dir: str, checkpoint: str, output_csv: str, prob_dir: str):
    """Invoke ``infer_module.main()`` as though launched from the command line.

    ``sys.argv`` is temporarily replaced with an ``infer.py`` argument list
    built from the parameters and restored afterwards, even when inference
    raises.

    Args:
        image_dir: Value passed to ``--image-dir``.
        checkpoint: Value passed to ``--checkpoint``.
        output_csv: Value passed to ``--output``.
        prob_dir: Value passed to ``--prob-dir``; the ``--save-prob`` flag is
            always included.
    """
    cli_args = ['infer.py']
    cli_args += ['--image-dir', image_dir]
    cli_args += ['--checkpoint', checkpoint]
    cli_args += ['--output', output_csv]
    cli_args += ['--save-prob']
    cli_args += ['--prob-dir', prob_dir]

    saved_argv = sys.argv
    sys.argv = cli_args
    try:
        infer_module.main()
    finally:
        sys.argv = saved_argv

# Use the detected test folder when it exists, else the local default path.
if test_images.exists():
    image_dir = str(test_images)
else:
    image_dir = str(Path('data/test_images'))

# One entry per checkpoint: (checkpoint path, output CSV, probability-map dir).
inference_jobs = [
    ('outputs/convnext_unetpp_512_base/best.ckpt', 'predictions_convnext_base.csv', 'probs_convnext_base'),
    ('outputs/swin_deeplab_512_base/best.ckpt', 'predictions_swin_base.csv', 'probs_swin_base'),
    ('outputs/cmseg_lite_512_base/best.ckpt', 'predictions_cmseg_base.csv', 'probs_cmseg_base'),
    ('outputs/convnext_unetpp_768_heavyaug/best.ckpt', 'predictions_convnext_768.csv', 'probs_convnext_768'),
    ('outputs/cmseg_lite_512_synCM/best.ckpt', 'predictions_cmseg_syn.csv', 'probs_cmseg_syn'),
]

for ckpt, out_csv, prob_dir in inference_jobs:
    print(f'Running inference for {ckpt}')
    run_inference(image_dir, checkpoint=ckpt, output_csv=out_csv, prob_dir=prob_dir)

## 4. Ensemble Submission
Average the five probability maps on-the-fly and export the final Kaggle submission.

In [None]:
import src.ensemble as ensemble_module

def run_ensemble(image_dir: str, checkpoints, output_csv: str, prob_dir: str | None = None):
    backup = sys.argv
    argv = [
        'ensemble.py',
        '--image-dir', image_dir,
        '--checkpoints',
    ] + list(checkpoints) + ['--output', output_csv]
    if prob_dir:
        argv += ['--save-prob-dir', prob_dir]
    sys.argv = argv
    try:
        ensemble_module.main()
    finally:
        sys.argv = backup

# Checkpoints whose predictions are combined for the final submission.
ensemble_checkpoints = [
    'outputs/convnext_unetpp_512_base/best.ckpt',
    'outputs/swin_deeplab_512_base/best.ckpt',
    'outputs/cmseg_lite_512_base/best.ckpt',
    'outputs/convnext_unetpp_768_heavyaug/best.ckpt',
    'outputs/cmseg_lite_512_synCM/best.ckpt',
]

# Use the detected test folder when it exists, else the local default path.
if test_images.exists():
    ensemble_image_dir = str(test_images)
else:
    ensemble_image_dir = str(Path('data/test_images'))

run_ensemble(
    ensemble_image_dir,
    ensemble_checkpoints,
    output_csv='submission.csv',
    prob_dir='ensemble_probs',
)
print('Ensemble submission saved to submission.csv')