In [None]:
import pandas as pd
from pathlib import Path

# ===== Paths (edit if needed) =====
# Inputs are read from absolute locations; outputs are anchored to an absolute
# directory as well so they do not depend on the current working directory.
METADATA_CSV = Path("/kaggle/input/MABe-mouse-behavior-detection/train.csv")
TRACKING_ROOT = Path("/kaggle/input/MABe-mouse-behavior-detection/train_tracking")

# A relative "kaggle/working" would be resolved against the current working
# directory and nest the outputs somewhere unexpected, hence the leading slash.
WORKING_DIR = Path("/kaggle/working")   # change to your desired folder

# Output folder (keeps same lab subfolders)
OUTPUT_ROOT = WORKING_DIR / "train_tracking_normalized"

# ===== Save a copy of the metadata table into the working folder =====
# `path` is kept as an alias of METADATA_CSV instead of repeating the literal,
# so the two can never drift apart.
path = METADATA_CSV
df = pd.read_csv(path)

# target folder + filename
folder = WORKING_DIR
filename = "my_dataframe.csv"

# ensure folder exists
folder.mkdir(parents=True, exist_ok=True)

# full path
out_path = folder / filename

# save without the positional index, so no extra index column is written
df.to_csv(out_path, index=False)

print(f"Saved to: {out_path.resolve()}")

# ===== Column names (with fallbacks) =====
LAB_COL = "lab_id"
VIDEO_COL = "video_id"

# Candidate names for the pixels-per-centimetre column, in priority order:
# the first one present in the metadata CSV is the one that gets used.
PPCM_COL_CANDIDATES = [
    "pix_per_cm_approx",
    "pix_per_cm",
    "pix per cm (approx)",  # fallback if original naming survived to CSV
]

# Tracking coordinate columns
X_COL = "x"
Y_COL = "y"

print('DONE')

In [None]:
# Load the per-video metadata table.
df_meta = pd.read_csv(METADATA_CSV)

# Resolve pix-per-cm column: the first candidate present in the CSV wins.
ppcm_col = next((c for c in PPCM_COL_CANDIDATES if c in df_meta.columns), None)

if ppcm_col is None:
    raise KeyError(
        f"Could not find a pix-per-cm column. Tried: {PPCM_COL_CANDIDATES}\n"
        f"Available columns: {list(df_meta.columns)}"
    )

# Ensure required columns exist
missing = [c for c in [LAB_COL, VIDEO_COL, ppcm_col] if c not in df_meta.columns]
if missing:
    raise KeyError(f"Missing required metadata columns: {missing}")

# Clean + normalize types in one non-mutating step:
#   * ids become strings so they can be used directly as path components,
#   * the scale becomes numeric (unparseable values turn into NaN),
#   * rows left without a usable scale are dropped.
df_meta = df_meta.assign(
    **{
        LAB_COL: df_meta[LAB_COL].astype(str),
        VIDEO_COL: df_meta[VIDEO_COL].astype(str),
        ppcm_col: pd.to_numeric(df_meta[ppcm_col], errors="coerce"),
    }
).dropna(subset=[ppcm_col])

# If the metadata has multiple rows per (lab_id, video_id), keep the first one
# (NaN scales were already removed above, so the kept value is non-null).
df_unique = (
    df_meta[[LAB_COL, VIDEO_COL, ppcm_col]]
    .drop_duplicates(subset=[LAB_COL, VIDEO_COL])
    .reset_index(drop=True)
)

# Build lookup dict: (lab_id, video_id) -> pix_per_cm.
# Zipping the three columns avoids DataFrame.iterrows(), which materialises a
# Series per row and is much slower for a plain key/value extraction.
ppcm_lookup = {
    (lab_id, video_id): pix_per_cm
    for lab_id, video_id, pix_per_cm in zip(
        df_unique[LAB_COL], df_unique[VIDEO_COL], df_unique[ppcm_col]
    )
}

print(f"Loaded metadata rows: {len(df_meta)}")
print(f"Unique (lab_id, video_id) pairs with pix_per_cm: {len(df_unique)}")
print(f"Using pix-per-cm column: {ppcm_col}")


In [None]:
def normalize_one_tracking_file(parquet_path: Path, pix_per_cm: float) -> pd.DataFrame:
    """
    Load a tracking parquet file and add scale-normalized coordinate columns.

    The returned frame contains every column of the input file plus
    ``x_normalized`` and ``y_normalized``, which are the ``X_COL`` / ``Y_COL``
    columns divided by ``pix_per_cm`` (presumably yielding centimetres, going
    by the parameter name).

    Parameters
    ----------
    parquet_path : Path
        Location of the tracking file to read.
    pix_per_cm : float
        Scale factor for this video. Must be a positive, finite number.

    Returns
    -------
    pd.DataFrame
        The loaded data with the two normalized columns appended.

    Raises
    ------
    ValueError
        If ``pix_per_cm`` is missing (NaN/None), zero, negative or infinite.
    KeyError
        If the file lacks the ``X_COL`` or ``Y_COL`` column.
    """
    # Validate the scale before touching the disk: it is a cheap check, and a
    # non-positive or infinite scale would otherwise silently produce mirrored
    # or all-zero coordinates instead of failing.
    if pd.isna(pix_per_cm) or not (0 < pix_per_cm < float("inf")):
        raise ValueError(f"Invalid pix_per_cm={pix_per_cm} for {parquet_path}")

    tracking = pd.read_parquet(parquet_path)

    # Basic column checks
    if X_COL not in tracking.columns or Y_COL not in tracking.columns:
        raise KeyError(
            f"Expected columns '{X_COL}' and '{Y_COL}' in {parquet_path.name}. "
            f"Found: {list(tracking.columns)}"
        )

    # assign() returns a new frame, so no defensive copy is needed.
    return tracking.assign(
        x_normalized=tracking[X_COL] / pix_per_cm,
        y_normalized=tracking[Y_COL] / pix_per_cm,
    )

In [None]:
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

processed = 0
missing_files = 0
errors = 0

# Tracking files present on disk that have no entry in the lookup (for example
# because their metadata row had no usable pix-per-cm and was dropped). These
# files are never visited by the loop below, so they are counted here to keep
# the summary honest. The glob mirrors the <lab_id>/<video_id>.parquet layout
# used to build `in_path`.
missing_meta = sum(
    (p.parent.name, p.stem) not in ppcm_lookup
    for p in TRACKING_ROOT.glob("*/*.parquet")
)

# Iterate over unique metadata pairs
for (lab_id, video_id), pix_per_cm in ppcm_lookup.items():
    in_path = TRACKING_ROOT / lab_id / f"{video_id}.parquet"
    out_dir = OUTPUT_ROOT / lab_id
    out_path = out_dir / f"{video_id}.parquet"

    # Metadata exists but the tracking file does not: count and move on.
    if not in_path.exists():
        missing_files += 1
        continue

    out_dir.mkdir(parents=True, exist_ok=True)

    # Best-effort batch: a failure on one file is logged and counted rather
    # than aborting the whole run.
    try:
        df_norm = normalize_one_tracking_file(in_path, pix_per_cm)
        df_norm.to_parquet(out_path, index=False)
        processed += 1
    except Exception as e:
        errors += 1
        print(f"[ERROR] {lab_id=} {video_id=} file={in_path} -> {e}")

print("==== Spatial normalization summary ====")
print(f"Processed files:     {processed}")
print(f"Missing tracking:    {missing_files}")
print(f"Metadata missing:    {missing_meta}")
print(f"Errors:              {errors}")
print(f"Output root:         {OUTPUT_ROOT.resolve()}")
