# ==========================================
# Sanvia - 1 - Data Preprocessing Pipline
# ==========================================

In [1]:
import os
import sys
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import json
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import cv2
import albumentations as A

# ==========================================
# 1. SEED AND CONFIGURATION
# ==========================================

In [None]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['PYTHONHASHSEED'] = str(SEED)

print(f"Python: {sys.version}")
print(f"TensorFlow: {tf.__version__}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")

# GPU
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPU memory growth enabled for {len(gpus)} GPU(s)")
    except RuntimeError as e:
        print(f"GPU configuration error: {e}")

DATA_DIR = Path('/content/drive/MyDrive/VnDir_Mammo')

plt.rcParams['figure.dpi'] = 150
plt.rcParams['savefig.dpi'] = 300

CONFIG = {
    # Paths
    'data_dir': DATA_DIR,
    'metadata_csv': DATA_DIR / 'metadata.csv',
    'breast_annotations_csv': DATA_DIR / 'breast-level_annotations.csv',
    'finding_annotations_csv': DATA_DIR / 'finding_annotations.csv',
    'images_dir': DATA_DIR / 'images' / 'images_png',

    # Output directories
    'output_dir': DATA_DIR / 'sanvia_outputs',
    'scaler_path': DATA_DIR / 'sanvia_outputs' / 'age_scaler.pkl',
    'class_weights_path': DATA_DIR / 'sanvia_outputs' / 'class_weights.json',
    'config_path': DATA_DIR / 'sanvia_outputs' / 'config.json',
    'image_cache_path': DATA_DIR / 'sanvia_outputs' / 'image_paths_cache.json',

    # Data splits
    'valid_fraction': 0.5,
    'test_split_name': 'test',

    # Image parameters
    'img_size': (512, 512),
    'num_channels': 3,
    'dtype': tf.float32,

    # Model parameters
    'backbone': 'EfficientNetB4',
    'tab_embed_dim': 128,

    # Training parameters
    'batch_size': 16,
    'buffer_size': 5000,
    'prefetch_buffer': tf.data.AUTOTUNE,
    'num_parallel_calls': tf.data.AUTOTUNE,
    'cache_dataset': True,

    # Augmentation parameters
    'augment_prob': 0.7,
    'rotation_limit': 7,
    'brightness_limit': 0.15,
    'contrast_limit': 0.15,
    'shift_scale_limit': 0.1,

    # Oversampling
    'oversample_factor': 2.0,

    # Labels
    'birads_classes': 5,
    'density_classes': 4,
    'finding_classes': ['mass', 'calcification', 'architectural_distortion'],

    # Class weights
    'focal_gamma': 2.0,
    'focal_alpha': 0.25,

    'seed': 42,
}

CONFIG['output_dir'].mkdir(parents=True, exist_ok=True)

with open(CONFIG['config_path'], 'w') as f:
    json.dump(CONFIG, f, indent=2, default=str, sort_keys=True)

print("Configuration:")
print(json.dumps(CONFIG, indent=2, default=str))

Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
TensorFlow: 2.19.0
GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU memory growth enabled for 1 GPU(s)
Configuration:
{
  "data_dir": "/content/drive/MyDrive/VnDir_Mammo",
  "metadata_csv": "/content/drive/MyDrive/VnDir_Mammo/metadata.csv",
  "breast_annotations_csv": "/content/drive/MyDrive/VnDir_Mammo/breast-level_annotations.csv",
  "finding_annotations_csv": "/content/drive/MyDrive/VnDir_Mammo/finding_annotations.csv",
  "images_dir": "/content/drive/MyDrive/VnDir_Mammo/images/images_png",
  "output_dir": "/content/drive/MyDrive/VnDir_Mammo/sanvia_outputs",
  "scaler_path": "/content/drive/MyDrive/VnDir_Mammo/sanvia_outputs/age_scaler.pkl",
  "class_weights_path": "/content/drive/MyDrive/VnDir_Mammo/sanvia_outputs/class_weights.json",
  "config_path": "/content/drive/MyDrive/VnDir_Mammo/sanvia_outputs/config.json",
  "image_cache_path": "/content/drive/MyDrive/VnDir_Mammo/sanvia_outputs/

# ==========================================
# 2. DATA LOADING
# ==========================================

In [3]:
def load_vindr_data(config: Dict) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    if not config['metadata_csv'].exists():
        raise FileNotFoundError(f"Metadata CSV not found: {config['metadata_csv']}")
    metadata_df = pd.read_csv(config['metadata_csv'])
    print(f"Metadata shape: {metadata_df.shape}")

    if not config['breast_annotations_csv'].exists():
        raise FileNotFoundError(f"Breast annotations CSV not found: {config['breast_annotations_csv']}")
    breast_df = pd.read_csv(config['breast_annotations_csv'])
    print(f"Breast annotations shape: {breast_df.shape}")

    finding_df = None
    if config['finding_annotations_csv'].exists():
        finding_df = pd.read_csv(config['finding_annotations_csv'])
        print(f"Finding annotations shape: {finding_df.shape}")
    else:
        print("Warning: Finding annotations CSV not found.")

    return metadata_df, breast_df, finding_df

metadata_df, breast_df, finding_df = load_vindr_data(CONFIG)

Metadata shape: (20000, 21)
Breast annotations shape: (20000, 10)
Finding annotations shape: (20486, 16)


# ============================================
# 3. DATA CLEANING
# ============================================

In [None]:
def clean_and_validate_data(metadata_df: pd.DataFrame,
                           breast_df: pd.DataFrame,
                           finding_df: Optional[pd.DataFrame]) -> Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]]:

    df = breast_df.copy()

    required_breast_cols = ['study_id', 'series_id', 'laterality', 'view_position',
                           'breast_birads', 'breast_density', 'split']
    missing_cols = set(required_breast_cols) - set(df.columns)
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Clean BI-RADS - Extract digit, then convert to float to allow NaNs, then subtract 1
    df.loc[:, 'breast_birads'] = (
        df['breast_birads']
        .astype(str)
        .str.extract(r'(\d)')[0] # Returns Series values not found are NaN
        .astype(float) - 1
    )
    birads_invalid = ~df['breast_birads'].isin([0.0, 1.0, 2.0, 3.0, 4.0]) # Check float values
    if birads_invalid.any():
        print(f"Warning: {birads_invalid.sum()} invalid BI-RADS values. Dropping.")
        df = df.loc[~birads_invalid].copy() # Ensure new DataFrame is also a copy
    df.loc[:, 'breast_birads'] = df['breast_birads'].astype(int) # Now convert to int after dropping NaNs

    # Clean density - Extract letter, map to int, convert to float to allow NaNs
    df.loc[:, 'breast_density'] = (
        df['breast_density']
        .astype(str)
        .str.extract(r'DENSITY\s+([A-D])')[0] # Returns Series, values not found are NaN
        .map({'A': 0, 'B': 1, 'C': 2, 'D': 3})
        .astype(float) # Convert to float to allow NaN values
    )
    density_invalid = df['breast_density'].isna()
    if density_invalid.any():
        print(f"Warning: {density_invalid.sum()} invalid density values. Dropping.")
        df = df.loc[~density_invalid].copy() # Ensure new DataFrame is also a copy
    df.loc[:, 'breast_density'] = df['breast_density'].astype(int) # Now convert to int after dropping NaNs

    # Detect duplicates but DO NOT drop
    duplicates = df.groupby(['study_id', 'laterality', 'view_position']).size()
    duplicates = duplicates[duplicates > 1]
    if len(duplicates) > 0:
        print(f"Warning: {len(duplicates)} duplicate groups found. No rows will be dropped.")

    # Age column
    if "Patient's Age" not in metadata_df.columns:
        print("Warning: 'age' column not found. Creating placeholder.")
        metadata_df['age'] = np.nan

    return metadata_df, df, finding_df

metadata_df, breast_df, finding_df = clean_and_validate_data(metadata_df, breast_df, finding_df)



# ============================================
# 4. DATA DIAGNOSTICS REPORT
# ============================================

In [5]:
print("\n 1. GENERAL INFORMATION")
print(f"   - Dataset shape: {breast_df.shape}")
print(f"   - Number of studies: {breast_df['study_id'].nunique()}")
print(f"   - Number of unique images: {breast_df['image_id'].nunique()}")
print(f"   - Memory usage: {breast_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print("\n 2. COLUMN DETAILS")
column_info = pd.DataFrame({
    'DataType': breast_df.dtypes,
    'NonNullCount': breast_df.count(),
    'NullCount': breast_df.isnull().sum(),
    'UniqueValues': breast_df.nunique()
})
print(column_info)

print("\n 3. BI-RADS CLASS DISTRIBUTION")
birads_counts = breast_df['breast_birads'].value_counts().sort_index()
for cls, count in birads_counts.items():
    percentage = (count / len(breast_df)) * 100
    birads_name = f"BI-RADS {cls + 1}"
    print(f"   {birads_name:12} | Count: {count:5} | {percentage:5.1f}%")
print(f"   {'Total':12} | Count: {len(breast_df):5} | {'100.0%':8}")

print("\n 4. DENSITY CLASS DISTRIBUTION")
density_map = {0: 'A', 1: 'B', 2: 'C', 3: 'D'}
density_counts = breast_df['breast_density'].value_counts().sort_index()
for cls, count in density_counts.items():
    percentage = (count / len(breast_df)) * 100
    density_name = f"DENSITY {density_map[cls]}"
    print(f"   {density_name:12} | Count: {count:5} | {percentage:5.1f}%")

print("\n 5. VIEW POSITION DISTRIBUTION")
view_counts = breast_df['view_position'].value_counts()
for view, count in view_counts.items():
    percentage = (count / len(breast_df)) * 100
    print(f"   {view:6} | Count: {count:5} | {percentage:5.1f}%")

print("\n 6. LATERALITY DISTRIBUTION")
lat_counts = breast_df['laterality'].value_counts()
for lat, count in lat_counts.items():
    percentage = (count / len(breast_df)) * 100
    print(f"   {lat:6} | Count: {count:5} | {percentage:5.1f}%")

print("\n 7. SPLIT DISTRIBUTION")
split_counts = breast_df['split'].value_counts()
for split, count in split_counts.items():
    percentage = (count / len(breast_df)) * 100
    print(f"   {split:8} | Count: {count:5} | {percentage:5.1f}%")

print("\n 8. MISSING VALUES CHECK")
missing_info = breast_df.isnull().sum()
if missing_info.sum() == 0:
    print("No missing values found!")
else:
    print("Missing values detected:")
    for col, count in missing_info[missing_info > 0].items():
        print(f"   - {col}: {count} missing values")

print("\n 9. STUDY STATISTICS")
studies_stats = {
    'Total studies': breast_df['study_id'].nunique(),
    'Avg images per study': len(breast_df) / breast_df['study_id'].nunique(),
    'Min images per study': breast_df.groupby('study_id').size().min(),
    'Max images per study': breast_df.groupby('study_id').size().max()
}
for stat, value in studies_stats.items():
    print(f"   {stat:20}: {value:.2f}" if isinstance(value, float) else f"   {stat:20}: {value}")

print("\n 10. DATA SAMPLE (Cleaned)")
print(breast_df.head(10).to_string())

print("\n 11. KEY OBSERVATIONS")
print(f"    BI-RADS classes: {sorted(breast_df['breast_birads'].unique())}")
print(f"    Density classes: {sorted(breast_df['breast_density'].unique())}")
print(f"    Unique studies: {breast_df['study_id'].nunique()}")

diagnostic_report = {
    'dataset_shape': breast_df.shape,
    'birads_distribution': breast_df['breast_birads'].value_counts().sort_index().to_dict(),
    'density_distribution': breast_df['breast_density'].value_counts().sort_index().to_dict(),
    'view_distribution': breast_df['view_position'].value_counts().to_dict(),
    'laterality_distribution': breast_df['laterality'].value_counts().to_dict(),
    'split_distribution': breast_df.get('split_final', breast_df['split']).value_counts().to_dict(),
    'missing_values': breast_df.isnull().sum().to_dict(),
    'memory_usage_mb': breast_df.memory_usage(deep=True).sum() / 1024**2,
    'unique_studies': int(breast_df['study_id'].nunique())
}

report_path = Path(CONFIG['output_dir']) / 'diagnostic_report.json'
with open(report_path, 'w') as f:
    json.dump(diagnostic_report, f, indent=2, default=str)

print(f"\n Diagnostic report saved to: {report_path}")
print("="*80)
print(" Data diagnostics completed successfully!")
print("="*80)



 1. GENERAL INFORMATION
   - Dataset shape: (20000, 10)
   - Number of studies: 5000
   - Number of unique images: 20000
   - Memory usage: 9.32 MB

 2. COLUMN DETAILS
               DataType  NonNullCount  NullCount  UniqueValues
study_id         object         20000          0          5000
series_id        object         20000          0          5036
image_id         object         20000          0         20000
laterality       object         20000          0             2
view_position    object         20000          0             2
height            int64         20000          0             3
width             int64         20000          0            58
breast_birads    object         20000          0             5
breast_density   object         20000          0             4
split            object         20000          0             2

 3. BI-RADS CLASS DISTRIBUTION
   BI-RADS 1    | Count: 13406 |  67.0%
   BI-RADS 2    | Count:  4676 |  23.4%
   BI-RADS 3    | Count:  

# ============================================
# 5. PATIENT SPLITS
# ============================================

In [6]:
def create_patient_splits(breast_df: pd.DataFrame, config: Dict) -> pd.DataFrame:

    studies = breast_df['study_id'].unique()
    test_studies = breast_df[breast_df['split'] == config['test_split_name']]['study_id'].unique()
    train_studies = np.setdiff1d(studies, test_studies)

    valid_studies, final_test_studies = train_test_split(
        test_studies,
        test_size=1-config['valid_fraction'],
        random_state=config['seed']
    )

    breast_df['split_final'] = 'train'
    breast_df.loc[breast_df['study_id'].isin(valid_studies), 'split_final'] = 'val'
    breast_df.loc[breast_df['study_id'].isin(final_test_studies), 'split_final'] = 'test'

    print(f"Train: {len(train_studies)}, Val: {len(valid_studies)}, Test: {len(final_test_studies)}")

    return breast_df

breast_df = create_patient_splits(breast_df, CONFIG)

Train: 4000, Val: 500, Test: 500


# ============================================
# 6. MERGE TABULAR FEATURES
# ============================================

In [7]:
def merge_tabular_features(breast_df: pd.DataFrame,
                           metadata_df: pd.DataFrame,
                           config: Dict) -> pd.DataFrame:

    if "Patient's Age" in metadata_df.columns:
        metadata_df = metadata_df.rename(columns={"Patient's Age": "age"})
    elif "age" not in metadata_df.columns:
        print("Warning: No age column found. Creating placeholder.")
        metadata_df["age"] = np.nan

    metadata_df["age"] = (
        metadata_df["age"]
        .astype(str)
        .str.extract(r'(\d+)')
        .astype(float)
    )

    metadata_subset = (
        metadata_df[["SOP Instance UID", "age"]]
        .rename(columns={"SOP Instance UID": "image_name_no_ext"})
        .groupby("image_name_no_ext", as_index=False)
        .first()
    )

    breast_df["image_name_no_ext"] = breast_df["image_id"].str.replace(".png", "", regex=False)

    age_median = metadata_subset["age"].median(skipna=True)
    print(f"Median age: {age_median}")

    metadata_subset["age_missing_flag"] = metadata_subset["age"].isna().astype(int)
    metadata_subset["age"] = metadata_subset["age"].fillna(age_median)

    merged_df = breast_df.merge(
        metadata_subset,
        on="image_name_no_ext",
        how="left",
        validate="many_to_one"
    )

    if merged_df["age"].isna().any():
        merged_df["age"] = merged_df["age"].fillna(age_median)
        merged_df["age_missing_flag"] = merged_df["age_missing_flag"].fillna(1).astype(int)

    return merged_df

merged_df = merge_tabular_features(breast_df, metadata_df, CONFIG)
print(f"Merged dataframe: {merged_df.shape}")

Median age: 45.0
Merged dataframe: (20000, 14)


# ============================================
# 7. TABULAR SCALER
# ============================================

In [8]:
class TabularScaler:
    def __init__(self, features: List[str]):
        self.features = features
        self.scalers = {feat: MinMaxScaler() for feat in features}
        self.fitted = False

    def fit(self, df: pd.DataFrame):
        for feat in self.features:
            self.scalers[feat].fit(df[[feat]])
        self.fitted = True

    def transform(self, df: pd.DataFrame) -> np.ndarray:
        if not self.fitted:
            raise ValueError("Scaler must be fitted first!")
        scaled_features = []
        for feat in self.features:
            scaled = self.scalers[feat].transform(df[[feat]])
            scaled_features.append(scaled)
        return np.concatenate(scaled_features, axis=1)

    def save(self, path: Path):
        scaler_params = {}
        for feat, scaler in self.scalers.items():
            scaler_params[feat] = {
                'min_': scaler.min_.tolist(),
                'scale_': scaler.scale_.tolist(),
                'data_min_': scaler.data_min_.tolist(),
                'data_max_': scaler.data_max_.tolist(),
                'data_range_': scaler.data_range_.tolist()
            }
        with open(path, 'wb') as f:
            pickle.dump(scaler_params, f)

tabular_features = ['age']
tabular_scaler = TabularScaler(tabular_features)

train_df = merged_df[merged_df['split_final'] == 'train']
tabular_scaler.fit(train_df)

merged_df['age_norm'] = tabular_scaler.transform(merged_df)[:, 0]

tabular_scaler.save(CONFIG['scaler_path'])
print(f"Scaler saved to {CONFIG['scaler_path']}")

Scaler saved to /content/drive/MyDrive/VnDir_Mammo/sanvia_outputs/age_scaler.pkl


# ============================================
# 8. CREATE VIEW MAPPING WITH ALL CLASSES
# ============================================

In [9]:
def create_view_mapping(df: pd.DataFrame) -> pd.DataFrame:

    result = []

    for study_id, group in df.groupby('study_id'):
        study_data = {'study_id': study_id}

        for _, row in group.iterrows():
            view = f"{row['laterality']}_{row['view_position']}"
            study_data[f'image_id_{view}'] = row['image_id']
            study_data[f'breast_birads_{view}'] = row['breast_birads']
            study_data[f'breast_density_{view}'] = row['breast_density']
            study_data['age_norm'] = row.get('age_norm', 0.5)
            study_data['age_missing_flag'] = row.get('age_missing_flag', 0)
            study_data['split_final'] = row.get('split_final', 'train')

        result.append(study_data)

    view_mapping = pd.DataFrame(result)

    required_views = ['L_CC', 'L_MLO', 'R_CC', 'R_MLO']
    for view in required_views:
        if f'image_id_{view}' not in view_mapping.columns:
            view_mapping[f'image_id_{view}'] = 'missing.png'
            view_mapping[f'breast_birads_{view}'] = 0
            view_mapping[f'breast_density_{view}'] = 2

    return view_mapping

print(f" merged_df contains {len(merged_df)} images with all classes")
view_mapping = create_view_mapping(merged_df)

print(f"\n Corrected view mapping distribution:")
print("BI-RADS L_CC:", view_mapping['breast_birads_L_CC'].value_counts().sort_index())
print("Density L_CC:", view_mapping['breast_density_L_CC'].value_counts().sort_index())

view_mapping.to_csv(CONFIG['output_dir'] / 'view_mapping_corrected.csv', index=False)
print(f"\n Saved corrected mapping: {CONFIG['output_dir'] / 'view_mapping_corrected.csv'}")


 merged_df contains 20000 images with all classes

 Corrected view mapping distribution:
BI-RADS L_CC: breast_birads_L_CC
0    3328
1    1173
2     241
3     204
4      54
Name: count, dtype: int64
Density L_CC: breast_density_L_CC
0      24
1     477
2    3821
3     678
Name: count, dtype: int64

 Saved corrected mapping: /content/drive/MyDrive/VnDir_Mammo/sanvia_outputs/view_mapping_corrected.csv


# =========================================
# 9. UNIFY AGE FIELDS
# =========================================

In [10]:
def unify_age_fields(df: pd.DataFrame) -> pd.DataFrame:
    df_copy = df.copy()

    age_cols = [c for c in df.columns if 'age_norm_' in c]
    flag_cols = [c for c in df.columns if 'age_missing_flag_' in c]

    if age_cols:
        df_copy['age_norm'] = df_copy[age_cols].mean(axis=1, skipna=True)
    if flag_cols:
        df_copy['age_missing_flag'] = df_copy[flag_cols].max(axis=1)

    return df_copy

view_mapping = unify_age_fields(view_mapping)

# ============================================
# 10. COMPUTE ADVANCED CLASS WEIGHTS
# ============================================

In [None]:
def compute_class_weights_advanced(df, config, method='effective'):

    weights = {}

    # BI-RADS
    birads_col = 'breast_birads_L_CC'
    birads_counts = df[birads_col].value_counts().sort_index()

    print("\n BI-RADS distribution:")
    print(birads_counts)

    if method == 'effective':
        beta = 0.9999
        effective_num = 1.0 - np.power(beta, birads_counts)
        weights_birads = (1.0 - beta) / effective_num
        weights_birads = weights_birads / np.min(weights_birads[birads_counts > 0])
        weights_birads[4] *= 2.0

    elif method == 'balanced':
        total_samples = len(df)
        weights_birads = total_samples / (len(birads_counts) * birads_counts)
        weights_birads = weights_birads / np.mean(weights_birads)

    weights['birads'] = {str(i): float(weights_birads[i]) for i in birads_counts.index}

    # Density
    density_col = 'breast_density_L_CC'
    density_counts = df[density_col].value_counts().sort_index()

    print("\n Density distribution:")
    print(density_counts)

    if method == 'effective':
        effective_num = 1.0 - np.power(beta, density_counts)
        weights_density = (1.0 - beta) / effective_num
        weights_density = weights_density / np.min(weights_density[density_counts > 0])
        weights_density[0] *= 3.0

    elif method == 'balanced':
        total_samples = len(df)
        weights_density = total_samples / (len(density_counts) * density_counts)
        weights_density = weights_density / np.mean(weights_density)

    weights['density'] = {str(i): float(weights_density[i]) for i in density_counts.index}

    weights_path = Path(config['output_dir']) / 'class_weights_advanced.json'
    with open(weights_path, 'w') as f:
        json.dump(weights, f, indent=2)

    print("\n Class weights computed:")
    for task, w in weights.items():
        print(f"   {task}: {w}")

    return weights

class_weights = compute_class_weights_advanced(view_mapping, CONFIG, method='effective')


 BI-RADS distribution:
breast_birads_L_CC
0    3328
1    1173
2     241
3     204
4      54
Name: count, dtype: int64

ðŸ“Š Density distribution:
breast_density_L_CC
0      24
1     477
2    3821
3     678
Name: count, dtype: int64

 Class weights computed:
   birads: {'0': 1.0, '1': 2.5576511187579705, '2': 11.888350644466636, '3': 14.01870755276353, '4': 105.12935778082031}
   density: {'0': 397.43950968356467, '1': 6.817714903514889, '2': 1.0, '3': 4.844515923073796}


# ============================================
# 11. BUILD IMAGE PATH CACHE
# ============================================

In [12]:
def build_image_path_cache(images_dir: Path, cache_path: Path) -> Dict[str, str]:

    if cache_path.exists():
        with open(cache_path, 'r') as f:
            return json.load(f)

    print("Building image path cache (first time only)...")
    all_pngs = list(images_dir.rglob("*.png"))
    image_map = {p.name: str(p) for p in all_pngs}

    with open(cache_path, 'w') as f:
        json.dump(image_map, f)

    print(f" Cached {len(image_map)} image paths to {cache_path}")
    return image_map

IMAGE_PATH_CACHE = build_image_path_cache(CONFIG['images_dir'], CONFIG['image_cache_path'])

# ============================================
# 12. FINAL SUMMARY
# ============================================

In [13]:
print("\n" + "="*60)
print(" OPTIMIZED SANVIA PIPELINE COMPLETE")
print("="*60)
print(f" Image path cache: {len(IMAGE_PATH_CACHE)} images")
print(f" View mapping: {len(view_mapping)} studies")
print(f" Class weights computed")
print("="*60)

view_mapping.to_csv(CONFIG['output_dir'] / 'view_mapping_final.csv', index=False)
print(f" Final view mapping saved to: {CONFIG['output_dir'] / 'view_mapping_final.csv'}")


 OPTIMIZED SANVIA PIPELINE COMPLETE
 Image path cache: 20001 images
 View mapping: 5000 studies
 Class weights computed
 Final view mapping saved to: /content/drive/MyDrive/VnDir_Mammo/sanvia_outputs/view_mapping_final.csv
