In [None]:
# 0.45895
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# ---------------------------------------------------------------------
# Paths (update if needed)
# ---------------------------------------------------------------------
BASE_DIR = "data/Question4/baseball-pitch-tracking-cs-gy-6643/baseball_kaggle_dataset_trimmed_only"

TRAIN_CSV = os.path.join(BASE_DIR, "data", "train_ground_truth.csv")
TEST_FEATURES_CSV = os.path.join(BASE_DIR, "data", "test_features.csv")

TEST_TEMPLATE_CSV = "data/Question4/baseball-pitch-tracking-cs-gy-6643/test_submission_template.csv"

# (Videos are not used in this baseline, but paths are here for reference)
TRAIN_VIDEO_DIR = os.path.join(BASE_DIR, "train_trimmed")
TEST_VIDEO_DIR = os.path.join(BASE_DIR, "test")

# ---------------------------------------------------------------------
# Load data
# ---------------------------------------------------------------------
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_FEATURES_CSV)
template_df = pd.read_csv(TEST_TEMPLATE_CSV)

# Quick sanity checks
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Template shape:", template_df.shape)

# ---------------------------------------------------------------------
# Features and targets
# ---------------------------------------------------------------------
numeric_features = [
    "sz_top",
    "sz_bot",
    "release_speed",
    "effective_speed",
    "release_spin_rate",
    "release_pos_x",
    "release_pos_y",
    "release_pos_z",
    "release_extension",
    "pfx_x",
    "pfx_z",
]

categorical_features = [
    "stand",
    "p_throws",
]

# X and y for train
X_train_full = train_df[numeric_features + categorical_features].copy()
y_class = train_df["pitch_class"].copy()  # "strike" or "ball"
y_zone = train_df["zone"].copy().astype(int)  # 1 to 14

# X for test
X_test_full = test_df[numeric_features + categorical_features].copy()

# Basic cleaning: fill missing numeric with median, categorical with "Unknown"
for col in numeric_features:
    median_val = X_train_full[col].median()
    X_train_full[col].fillna(median_val, inplace=True)
    X_test_full[col].fillna(median_val, inplace=True)

for col in categorical_features:
    X_train_full[col].fillna("Unknown", inplace=True)
    X_test_full[col].fillna("Unknown", inplace=True)

# ---------------------------------------------------------------------
# Preprocessing and model definitions
# ---------------------------------------------------------------------
numeric_transformer = Pipeline(
    steps=[
        ("scaler", StandardScaler())
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

def make_pipeline(random_state=42, n_estimators=300, max_depth=None):
    """Return a fresh pipeline (preprocess + RandomForest)."""
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        n_jobs=-1,
        random_state=random_state,
        class_weight=None,
    )
    pipe = Pipeline(
        steps=[
            ("preprocess", preprocess),
            ("clf", clf),
        ]
    )
    return pipe

# ---------------------------------------------------------------------
# Train validation split to get a sense of performance
# ---------------------------------------------------------------------
X_tr, X_val, y_class_tr, y_class_val, y_zone_tr, y_zone_val = train_test_split(
    X_train_full,
    y_class,
    y_zone,
    test_size=0.2,
    random_state=42,
    stratify=y_class,  # stratify on pitch class to keep class balance
)

# Pipelines for each target (fresh preprocess for each)
pipe_class = make_pipeline(random_state=42, n_estimators=300, max_depth=None)
pipe_zone = make_pipeline(random_state=43, n_estimators=400, max_depth=None)

# ---------------------------------------------------------------------
# Train models
# ---------------------------------------------------------------------
print("\nTraining pitch_class model...")
pipe_class.fit(X_tr, y_class_tr)

print("Training zone model...")
pipe_zone.fit(X_tr, y_zone_tr)

# ---------------------------------------------------------------------
# Validation metrics (simple check, not used in Kaggle submission)
# ---------------------------------------------------------------------
y_class_val_pred = pipe_class.predict(X_val)
y_zone_val_pred = pipe_zone.predict(X_val)

acc_class = accuracy_score(y_class_val, y_class_val_pred)
acc_zone = accuracy_score(y_zone_val, y_zone_val_pred)
overall_score = 0.7 * acc_class + 0.3 * acc_zone

print(f"\nValidation pitch_class accuracy: {acc_class:.4f}")
print(f"Validation zone accuracy:       {acc_zone:.4f}")
print(f"Validation competition score:   {overall_score:.4f}")

# ---------------------------------------------------------------------
# Retrain on full training data for final models (optional but better)
# ---------------------------------------------------------------------
print("\nRetraining pitch_class model on full training data...")
pipe_class_full = make_pipeline(random_state=42, n_estimators=300, max_depth=None)
pipe_class_full.fit(X_train_full, y_class)

print("Retraining zone model on full training data...")
pipe_zone_full = make_pipeline(random_state=43, n_estimators=400, max_depth=None)
pipe_zone_full.fit(X_train_full, y_zone)

# ---------------------------------------------------------------------
# Predict on test data
# ---------------------------------------------------------------------
print("\nPredicting on test set...")
test_pitch_class_pred = pipe_class_full.predict(X_test_full)
test_zone_pred = pipe_zone_full.predict(X_test_full)

# Ensure zone is integer from 1 to 14
test_zone_pred = test_zone_pred.astype(int)

# ---------------------------------------------------------------------
# Build submission based on template file
# ---------------------------------------------------------------------
submission = template_df.copy()

# Join predictions by file_name, to be safe with order
pred_df = pd.DataFrame(
    {
        "file_name": test_df["file_name"],
        "pitch_class": test_pitch_class_pred,
        "zone": test_zone_pred,
    }
)

submission = submission.drop(columns=["pitch_class", "zone"], errors="ignore")
submission = submission.merge(pred_df, on="file_name", how="left")

# Safety checks
missing_rows = submission["pitch_class"].isna().sum() + submission["zone"].isna().sum()
if missing_rows > 0:
    print(f"Warning: there are {missing_rows} missing predictions in the submission.")

# ---------------------------------------------------------------------
# Save submission
# ---------------------------------------------------------------------
OUTPUT_SUBMISSION_PATH = "submission_baseball_tabular_baseline.csv"
submission.to_csv(OUTPUT_SUBMISSION_PATH, index=False)
print(f"\nSubmission file written to: {OUTPUT_SUBMISSION_PATH}")
print(submission.head())

Train shape: (6000, 18)
Test shape: (4000, 14)
Template shape: (4000, 3)

Training pitch_class model...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_full[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test_full[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting

Training zone model...

Validation pitch_class accuracy: 0.5433
Validation zone accuracy:       0.1925
Validation competition score:   0.4381

Retraining pitch_class model on full training data...
Retraining zone model on full training data...

Predicting on test set...

Submission file written to: submission_baseball_tabular_baseline.csv
     file_name pitch_class  zone
0   pitch2.mp4        ball    12
1   pitch6.mp4      strike    11
2   pitch7.mp4        ball    12
3   pitch9.mp4      strike     6
4  pitch10.mp4        ball    12


In [None]:
# 0.46890

import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import cv2

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import torchvision
from torchvision import models


# =========================================================
# Device selection (macOS friendly)
# =========================================================

if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print("Device:", device)


# =========================================================
# Paths
# =========================================================

BASE_DIR = "data/Question4/baseball-pitch-tracking-cs-gy-6643/baseball_kaggle_dataset_trimmed_only"

TRAIN_CSV = os.path.join(BASE_DIR, "data", "train_ground_truth.csv")
TEST_FEATURES_CSV = os.path.join(BASE_DIR, "data", "test_features.csv")

TRAIN_VIDEO_DIR = os.path.join(BASE_DIR, "train_trimmed")
TEST_VIDEO_DIR = os.path.join(BASE_DIR, "test")

TEST_TEMPLATE_CSV = "data/Question4/baseball-pitch-tracking-cs-gy-6643/test_submission_template.csv"

OUTPUT_SUBMISSION = "submission_baseball_hybrid_macos.csv"


# =========================================================
# Settings optimized for macOS (lower memory)
# =========================================================

SEED = 42
NUM_FRAMES = 6         # slightly lower to avoid RAM issues
IMG_SIZE = 112
BATCH_SIZE = 4         # smaller batches recommended on mac
EPOCHS_VAL = 3
EPOCHS_FULL = 5
LR = 1e-4
WEIGHT_CLASS = 0.7
WEIGHT_ZONE = 0.3


# =========================================================
# Reproducibility
# =========================================================

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

set_seed(SEED)


# =========================================================
# Load Data
# =========================================================

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_FEATURES_CSV)
template_df = pd.read_csv(TEST_TEMPLATE_CSV)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Template shape:", template_df.shape)


# =========================================================
# Tabular preprocessing
# =========================================================

numeric_features = [
    "sz_top", "sz_bot", "release_speed", "effective_speed",
    "release_spin_rate", "release_pos_x", "release_pos_y",
    "release_pos_z", "release_extension", "pfx_x", "pfx_z",
]

categorical_features = ["stand", "p_throws"]

X_train_tab_raw = train_df[numeric_features + categorical_features].copy()
X_test_tab_raw  = test_df[numeric_features + categorical_features].copy()

# Fill missing values safely
for col in numeric_features:
    median_val = X_train_tab_raw[col].median()
    X_train_tab_raw[col] = X_train_tab_raw[col].fillna(median_val)
    X_test_tab_raw[col] = X_test_tab_raw[col].fillna(median_val)

for col in categorical_features:
    X_train_tab_raw[col] = X_train_tab_raw[col].fillna("Unknown")
    X_test_tab_raw[col] = X_test_tab_raw[col].fillna("Unknown")

# Prepare transformer
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

X_train_tab = preprocess.fit_transform(X_train_tab_raw)
X_test_tab = preprocess.transform(X_test_tab_raw)

# ensure numpy arrays
X_train_tab = np.asarray(X_train_tab, dtype=np.float32)
X_test_tab = np.asarray(X_test_tab, dtype=np.float32)

tabular_dim = X_train_tab.shape[1]
print("Tabular dim:", tabular_dim)


# =========================================================
# Targets
# =========================================================

class_to_idx = {"ball": 0, "strike": 1}
idx_to_class = {v: k for k, v in class_to_idx.items()}

y_class = np.array([class_to_idx[c] for c in train_df["pitch_class"]], dtype=np.int64)
y_zone  = train_df["zone"].values.astype(np.int64) - 1   # 0..13


# =========================================================
# Train/Val split indices
# =========================================================

indices = np.arange(len(train_df))

train_idx, val_idx = train_test_split(
    indices, test_size=0.2, random_state=SEED, stratify=y_class
)

print(f"Train size: {len(train_idx)}")
print(f"Val size: {len(val_idx)}")


# =========================================================
# Video loader (macOS safe)
# =========================================================

def load_video_frames(path, num_frames=NUM_FRAMES, img_size=IMG_SIZE):
    frames = []
    cap = cv2.VideoCapture(path)

    if not cap.isOpened():
        return np.zeros((num_frames, 3, img_size, img_size), dtype="float32")

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = cv2.resize(frame, (img_size, img_size))
        frames.append(frame)

    cap.release()

    if len(frames) == 0:
        frames = [np.zeros((img_size, img_size, 3), dtype=np.uint8)] * num_frames

    idxs = np.linspace(0, len(frames)-1, num_frames).astype(int)
    sampled = [frames[i] for i in idxs]

    arr = np.stack(sampled).astype("float32") / 255.0
    arr = np.transpose(arr, (0, 3, 1, 2))

    return arr


# =========================================================
# Dataset
# =========================================================

class PitchDataset(Dataset):
    def __init__(self, df, tab, video_dir, indices, y_class=None, y_zone=None, is_train=True):
        self.df = df
        self.tab = tab
        self.video_dir = video_dir
        self.indices = indices
        self.y_class = y_class
        self.y_zone = y_zone
        self.is_train = is_train

        # ImageNet normalization
        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(1,3,1,1)
        self.std  = torch.tensor([0.229, 0.224, 0.225]).view(1,3,1,1)

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        real_idx = self.indices[idx]
        row = self.df.iloc[real_idx]
        file_name = row["file_name"]
        path = os.path.join(self.video_dir, file_name)

        frames_np = load_video_frames(path)
        frames = torch.from_numpy(frames_np)
        frames = (frames - self.mean) / self.std

        tab_tensor = torch.from_numpy(self.tab[real_idx].astype("float32"))

        if self.is_train:
            c = torch.tensor(self.y_class[real_idx], dtype=torch.long)
            z = torch.tensor(self.y_zone[real_idx], dtype=torch.long)
            return frames, tab_tensor, c, z

        else:
            return frames, tab_tensor, file_name


# =========================================================
# Model (unchanged)
# =========================================================

class HybridModel(nn.Module):
    def __init__(self, tabular_dim, num_zones=14):
        super().__init__()

        res = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        self.cnn = nn.Sequential(*list(res.children())[:-1])  # remove final FC
        video_dim = 512

        self.tab_mlp = nn.Sequential(
            nn.Linear(tabular_dim, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 128),
            nn.ReLU(),
        )

        self.shared = nn.Sequential(
            nn.Linear(video_dim + 128, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256)
        )

        self.class_head = nn.Linear(256, 2)
        self.zone_head  = nn.Linear(256, num_zones)

    def forward(self, video, tab):
        B, T, C, H, W = video.shape

        # --- IMPORTANT FIX FOR macOS / MPS ---
        # Flatten time dimension safely
        video = video.reshape(B * T, C, H, W).contiguous()

        # Make sure it's contiguous for CNN (ResNet has internal view ops)
        video = video.contiguous()

        # CNN
        feat = self.cnn(video)  # (B*T, 512, 1, 1)

        # Ensure the CNN output is contiguous before reshaping
        feat = feat.contiguous()

        # Reshape back to (B, T, 512)
        feat = feat.reshape(B, T, -1).mean(dim=1)

        # Tabular branch
        tab_feat = self.tab_mlp(tab)

        # Fusion
        fused = torch.cat([feat, tab_feat], dim=1)
        shared = self.shared(fused)

        return self.class_head(shared), self.zone_head(shared)


# =========================================================
# Dataloaders (mac-safe)
# =========================================================

train_loader = DataLoader(
    PitchDataset(train_df, X_train_tab, TRAIN_VIDEO_DIR, train_idx, y_class, y_zone),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,       # macOS: must be 0
)

val_loader = DataLoader(
    PitchDataset(train_df, X_train_tab, TRAIN_VIDEO_DIR, val_idx, y_class, y_zone),
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)


# =========================================================
# Train one epoch
# =========================================================

def train_one_epoch(model, loader, opt, c_loss, z_loss):
    model.train()
    total = 0
    total_loss = 0

    for frames, tab_vec, c, z in tqdm(loader):
        frames = frames.to(device)
        tab_vec = tab_vec.to(device)
        c = c.to(device)
        z = z.to(device)

        opt.zero_grad()
        out_c, out_z = model(frames, tab_vec)

        lc = c_loss(out_c, c)
        lz = z_loss(out_z, z)
        loss = WEIGHT_CLASS*lc + WEIGHT_ZONE*lz

        loss.backward()
        opt.step()

        total_loss += loss.item() * frames.size(0)
        total += frames.size(0)

    return total_loss/total


def evaluate(model, loader):
    model.eval()
    correct_c, correct_z, total = 0, 0, 0

    with torch.no_grad():
        for frames, tab_vec, c, z in tqdm(loader):
            frames = frames.to(device)
            tab_vec = tab_vec.to(device)
            c = c.to(device)
            z = z.to(device)

            out_c, out_z = model(frames, tab_vec)

            pc = out_c.argmax(1)
            pz = out_z.argmax(1)

            correct_c += (pc == c).sum().item()
            correct_z += (pz == z).sum().item()
            total += frames.size(0)

    acc_c = correct_c / total
    acc_z = correct_z / total
    score = WEIGHT_CLASS*acc_c + WEIGHT_ZONE*acc_z
    return acc_c, acc_z, score


# =========================================================
# Stage 1: Train with val split
# =========================================================

model = HybridModel(tabular_dim).to(device)
opt = optim.Adam(model.parameters(), lr=LR)
c_loss = nn.CrossEntropyLoss()
z_loss = nn.CrossEntropyLoss()

print("\nTraining with validation split...\n")
for e in range(1, EPOCHS_VAL+1):
    print(f"Epoch {e}/{EPOCHS_VAL}")
    loss = train_one_epoch(model, train_loader, opt, c_loss, z_loss)
    ac, az, sc = evaluate(model, val_loader)
    print(f"Loss: {loss:.4f} | Class Acc: {ac:.4f} | Zone Acc: {az:.4f} | Score: {sc:.4f}\n")


# =========================================================
# Stage 2: Full training
# =========================================================

full_loader = DataLoader(
    PitchDataset(train_df, X_train_tab, TRAIN_VIDEO_DIR, np.arange(len(train_df)), y_class, y_zone),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

model_final = HybridModel(tabular_dim).to(device)
opt2 = optim.Adam(model_final.parameters(), lr=LR)

print("\nTraining on full dataset...\n")
for e in range(1, EPOCHS_FULL+1):
    print(f"Full Epoch {e}/{EPOCHS_FULL}")
    loss = train_one_epoch(model_final, full_loader, opt2, c_loss, z_loss)
    print(f"Loss: {loss:.4f}\n")


# =========================================================
# Inference
# =========================================================

test_loader = DataLoader(
    PitchDataset(test_df, X_test_tab, TEST_VIDEO_DIR, np.arange(len(test_df)), is_train=False),
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

model_final.eval()

file_names = []
pred_class = []
pred_zone = []

with torch.no_grad():
    for frames, tab_vec, names in tqdm(test_loader):
        frames = frames.to(device)
        tab_vec = tab_vec.to(device)

        out_c, out_z = model_final(frames, tab_vec)

        pc = out_c.argmax(1).cpu().numpy()
        pz = out_z.argmax(1).cpu().numpy() + 1

        file_names += list(names)
        pred_class += [idx_to_class[i] for i in pc]
        pred_zone += list(pz)


pred_df = pd.DataFrame({
    "file_name": file_names,
    "pitch_class": pred_class,
    "zone": pred_zone,
})

submission = template_df.merge(pred_df, on="file_name", how="left")
submission.to_csv(OUTPUT_SUBMISSION, index=False)

print("Saved:", OUTPUT_SUBMISSION)
print(submission.head())

Device: mps
Train shape: (6000, 18)
Test shape: (4000, 14)
Template shape: (4000, 3)
Tabular dim: 15
Train size: 4800
Val size: 1200

Training with validation split...

Epoch 1/3


100%|██████████| 1200/1200 [04:44<00:00,  4.22it/s]
100%|██████████| 300/300 [00:45<00:00,  6.64it/s]


Loss: 1.2957 | Class Acc: 0.5858 | Zone Acc: 0.1567 | Score: 0.4571

Epoch 2/3


100%|██████████| 1200/1200 [04:25<00:00,  4.53it/s]
100%|██████████| 300/300 [01:23<00:00,  3.60it/s]


Loss: 1.2301 | Class Acc: 0.5550 | Zone Acc: 0.1700 | Score: 0.4395

Epoch 3/3


100%|██████████| 1200/1200 [06:03<00:00,  3.30it/s]
100%|██████████| 300/300 [01:39<00:00,  3.02it/s]


Loss: 1.1986 | Class Acc: 0.5642 | Zone Acc: 0.1933 | Score: 0.4529


Training on full dataset...

Full Epoch 1/5


100%|██████████| 1500/1500 [08:09<00:00,  3.06it/s]


Loss: 1.2910

Full Epoch 2/5


100%|██████████| 1500/1500 [07:54<00:00,  3.16it/s]


Loss: 1.2292

Full Epoch 3/5


100%|██████████| 1500/1500 [06:59<00:00,  3.58it/s]


Loss: 1.2139

Full Epoch 4/5


100%|██████████| 1500/1500 [06:43<00:00,  3.72it/s]


Loss: 1.2024

Full Epoch 5/5


100%|██████████| 1500/1500 [05:01<00:00,  4.98it/s]


Loss: 1.1969



100%|██████████| 1000/1000 [02:43<00:00,  6.13it/s]


Saved: submission_baseball_hybrid_macos.csv
     file_name  pitch_class_x  zone_x pitch_class_y  zone_y
0   pitch2.mp4            NaN     NaN          ball      11
1   pitch6.mp4            NaN     NaN        strike      11
2   pitch7.mp4            NaN     NaN          ball      11
3   pitch9.mp4            NaN     NaN        strike      11
4  pitch10.mp4            NaN     NaN          ball      11


In [6]:
submission_clean = submission.copy()

submission_clean = submission_clean[["file_name", "pitch_class_y", "zone_y"]]

# Rename to Kaggle-required format
submission_clean = submission_clean.rename(columns={
    "pitch_class_y": "pitch_class",
    "zone_y": "zone"
})

# Save clean file
submission_clean.to_csv("submission_final_2.csv", index=False)

print(submission_clean.head())

     file_name pitch_class  zone
0   pitch2.mp4        ball    11
1   pitch6.mp4      strike    11
2   pitch7.mp4        ball    11
3   pitch9.mp4      strike    11
4  pitch10.mp4        ball    11


In [7]:
!pip3 install xgboost

Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-macosx_12_0_arm64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m85.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.1.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# 0.44605

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import xgboost as xgb

# =========================================================
# Paths
# =========================================================

BASE_DIR = "data/Question4/baseball-pitch-tracking-cs-gy-6643/baseball_kaggle_dataset_trimmed_only"

TRAIN_CSV = os.path.join(BASE_DIR, "data", "train_ground_truth.csv")
TEST_FEATURES_CSV = os.path.join(BASE_DIR, "data", "test_features.csv")
TEMPLATE_CSV = "data/Question4/baseball-pitch-tracking-cs-gy-6643/test_submission_template.csv"

OUTPUT_SUBMISSION = "submission_xgb_tabular_strikezone.csv"

SEED = 42
np.random.seed(SEED)


# =========================================================
# Load data
# =========================================================

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_FEATURES_CSV)
template_df = pd.read_csv(TEMPLATE_CSV)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Template shape:", template_df.shape)

# =========================================================
# Basic sanity check on columns
# =========================================================

print("\nTrain columns:", train_df.columns.tolist())
print("Test columns:", test_df.columns.tolist())


# =========================================================
# Feature engineering
# Only use features that exist in BOTH train_ground_truth and test_features
# =========================================================

base_numeric = [
    "sz_top",
    "sz_bot",
    "release_speed",
    "effective_speed",
    "release_spin_rate",
    "release_pos_x",
    "release_pos_y",
    "release_pos_z",
    "release_extension",
    "pfx_x",
    "pfx_z",
]

categorical = ["stand", "p_throws"]

for df in [train_df, test_df]:
    # Fill numeric missing values with median from train (we do this after computing medians)
    pass

# Compute medians from train for numeric columns
medians = {col: train_df[col].median() for col in base_numeric}

# Fill missing numeric values
for col in base_numeric:
    train_df[col] = train_df[col].fillna(medians[col])
    test_df[col] = test_df[col].fillna(medians[col])

# Fill categorical nulls with "Unknown"
for col in categorical:
    train_df[col] = train_df[col].fillna("Unknown")
    test_df[col] = test_df[col].fillna("Unknown")

# Derived numeric features (must be computable in both train and test)
for df in [train_df, test_df]:
    # Height of strike zone
    df["sz_height"] = df["sz_top"] - df["sz_bot"]
    # Center of strike zone
    df["sz_center"] = 0.5 * (df["sz_top"] + df["sz_bot"])
    # Break magnitude
    df["break_mag"] = np.sqrt(df["pfx_x"] ** 2 + df["pfx_z"] ** 2)
    # Absolute break components
    df["abs_pfx_x"] = df["pfx_x"].abs()
    df["abs_pfx_z"] = df["pfx_z"].abs()
    # Extension-speed interaction
    df["ext_speed"] = df["release_speed"] * df["release_extension"]
    # Effective speed ratio (guard against div by zero)
    safe_release_speed = df["release_speed"].replace(0, medians["release_speed"])
    df["eff_ratio"] = df["effective_speed"] / safe_release_speed

# Map categoricals manually: stand, p_throws
# L -> 0, R -> 1, Unknown -> -1
map_hand = {"L": 0, "R": 1}
for df in [train_df, test_df]:
    df["stand_enc"] = df["stand"].map(map_hand).fillna(-1).astype(int)
    df["p_throws_enc"] = df["p_throws"].map(map_hand).fillna(-1).astype(int)
    # Same-side vs opposite-side (batter vs pitcher)
    df["same_side"] = (df["stand_enc"] == df["p_throws_enc"]).astype(int)

# Final feature list
feature_cols = (
    base_numeric
    + [
        "sz_height",
        "sz_center",
        "break_mag",
        "abs_pfx_x",
        "abs_pfx_z",
        "ext_speed",
        "eff_ratio",
        "stand_enc",
        "p_throws_enc",
        "same_side",
    ]
)

print("\nUsing features:")
for c in feature_cols:
    print(" -", c)


# =========================================================
# Targets
# =========================================================

# pitch_class: "strike"/"ball"
class_to_idx = {"ball": 0, "strike": 1}
idx_to_class = {v: k for k, v in class_to_idx.items()}
y_class = train_df["pitch_class"].map(class_to_idx).astype(int).values

# zone: 1..14 -> 0..13 for XGBoost
y_zone_raw = train_df["zone"].astype(int).values
y_zone = (y_zone_raw - 1).astype(int)

# Features matrix
X_all = train_df[feature_cols].astype(float).values
X_test = test_df[feature_cols].astype(float).values


# =========================================================
# Train/validation split
# =========================================================

train_idx, val_idx = train_test_split(
    np.arange(len(train_df)),
    test_size=0.2,
    random_state=SEED,
    stratify=y_class,
)

X_tr = X_all[train_idx]
X_val = X_all[val_idx]
y_class_tr = y_class[train_idx]
y_class_val = y_class[val_idx]
y_zone_tr = y_zone[train_idx]
y_zone_val = y_zone[val_idx]

print(f"\nTrain samples: {len(train_idx)}")
print(f"Validation samples: {len(val_idx)}")


# =========================================================
# XGBoost: pitch_class model (binary)
# =========================================================

dtrain_class = xgb.DMatrix(X_tr, label=y_class_tr)
dval_class = xgb.DMatrix(X_val, label=y_class_val)

params_class = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.03,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "lambda": 1.0,
    "alpha": 0.0,
    "seed": SEED,
    "tree_method": "hist",
}

print("\nTraining XGBoost model for pitch_class...")
evals = [(dtrain_class, "train"), (dval_class, "val")]
bst_class = xgb.train(
    params_class,
    dtrain_class,
    num_boost_round=200,
    evals=evals,
    early_stopping_rounds=20,
    verbose_eval=20,
)

# Validation predictions for pitch_class
y_class_val_pred_proba = bst_class.predict(dval_class)
y_class_val_pred = (y_class_val_pred_proba >= 0.5).astype(int)

acc_class = accuracy_score(y_class_val, y_class_val_pred)
print(f"\nValidation pitch_class accuracy: {acc_class:.4f}")


# =========================================================
# XGBoost: zone model (multiclass 14 zones)
# =========================================================

dtrain_zone = xgb.DMatrix(X_tr, label=y_zone_tr)
dval_zone = xgb.DMatrix(X_val, label=y_zone_val)

params_zone = {
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "num_class": 14,
    "eta": 0.05,
    "max_depth": 7,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "lambda": 1.0,
    "alpha": 0.0,
    "seed": SEED,
    "tree_method": "hist",
}

print("\nTraining XGBoost model for zone...")
evals_zone = [(dtrain_zone, "train"), (dval_zone, "val")]
bst_zone = xgb.train(
    params_zone,
    dtrain_zone,
    num_boost_round=300,
    evals=evals_zone,
    early_stopping_rounds=25,
    verbose_eval=20,
)

# Validation predictions for zone
y_zone_val_proba = bst_zone.predict(dval_zone)  # shape (N_val, 14)
y_zone_val_pred = np.argmax(y_zone_val_proba, axis=1)  # 0..13

acc_zone = accuracy_score(y_zone_val, y_zone_val_pred)
print(f"\nValidation zone accuracy: {acc_zone:.4f}")

# Combined competition metric
score_combined = 0.7 * acc_class + 0.3 * acc_zone
print(f"Validation combined score (0.7 * class + 0.3 * zone): {score_combined:.4f}")


# =========================================================
# Retrain on FULL data
# =========================================================

print("\nRetraining on full training set for final models...")

dall_class = xgb.DMatrix(X_all, label=y_class)
bst_class_full = xgb.train(
    params_class,
    dall_class,
    num_boost_round=bst_class.best_iteration + 20,  # small buffer
)

dall_zone = xgb.DMatrix(X_all, label=y_zone)
bst_zone_full = xgb.train(
    params_zone,
    dall_zone,
    num_boost_round=bst_zone.best_iteration + 30,
)


# =========================================================
# Predict on test set
# =========================================================

dtest = xgb.DMatrix(X_test)

print("\nPredicting on test set...")

test_class_proba = bst_class_full.predict(dtest)
test_class_pred = (test_class_proba >= 0.5).astype(int)
test_class_str = [idx_to_class[i] for i in test_class_pred]

test_zone_proba = bst_zone_full.predict(dtest)  # (N_test, 14)
test_zone_pred_idx = np.argmax(test_zone_proba, axis=1)  # 0..13
test_zone_pred = (test_zone_pred_idx + 1).astype(int)  # 1..14


# =========================================================
# Build submission
# =========================================================

pred_df = pd.DataFrame(
    {
        "file_name": test_df["file_name"].values,
        "pitch_class": test_class_str,
        "zone": test_zone_pred,
    }
)

# Merge with template to ensure exact ordering and no missing files
submission = template_df.drop(columns=["pitch_class", "zone"], errors="ignore")
submission = submission.merge(pred_df, on="file_name", how="left")

missing = submission["pitch_class"].isna().sum() + submission["zone"].isna().sum()
if missing > 0:
    print(f"Warning: {missing} missing predictions in submission")

submission.to_csv(OUTPUT_SUBMISSION, index=False)
print("\nSaved submission to:", OUTPUT_SUBMISSION)
print(submission.head())

Train shape: (6000, 18)
Test shape: (4000, 14)
Template shape: (4000, 3)

Train columns: ['file_name', 'plate_x', 'plate_z', 'sz_top', 'sz_bot', 'release_speed', 'effective_speed', 'release_spin_rate', 'release_pos_x', 'release_pos_y', 'release_pos_z', 'release_extension', 'pfx_x', 'pfx_z', 'stand', 'p_throws', 'pitch_class', 'zone']
Test columns: ['file_name', 'sz_top', 'sz_bot', 'release_speed', 'effective_speed', 'release_spin_rate', 'release_pos_x', 'release_pos_y', 'release_pos_z', 'release_extension', 'pfx_x', 'pfx_z', 'stand', 'p_throws']

Using features:
 - sz_top
 - sz_bot
 - release_speed
 - effective_speed
 - release_spin_rate
 - release_pos_x
 - release_pos_y
 - release_pos_z
 - release_extension
 - pfx_x
 - pfx_z
 - sz_height
 - sz_center
 - break_mag
 - abs_pfx_x
 - abs_pfx_z
 - ext_speed
 - eff_ratio
 - stand_enc
 - p_throws_enc
 - same_side

Train samples: 4800
Validation samples: 1200

Training XGBoost model for pitch_class...
[0]	train-logloss:0.69094	val-logloss:0.69

In [None]:
# 0.49630

import os
import cv2
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ============================================================
# PATHS / CONFIG
# ============================================================

BASE = "data/Question4/baseball-pitch-tracking-cs-gy-6643/baseball_kaggle_dataset_trimmed_only"

TRAIN_VID_DIR = os.path.join(BASE, "train_trimmed")
TEST_VID_DIR  = os.path.join(BASE, "test")

TRAIN_CSV = os.path.join(BASE, "data", "train_ground_truth.csv")
TEST_FEATURES_CSV = os.path.join(BASE, "data", "test_features.csv")
TEMPLATE_CSV = "data/Question4/baseball-pitch-tracking-cs-gy-6643/test_submission_template.csv"

TRAIN_VIDEO_FEATS = "train_video_features_nn.csv"
TEST_VIDEO_FEATS  = "test_video_features_nn.csv"

SUBMISSION_OUT = "submission_hybrid_video_tabular_nn.csv"

SEED = 42
N_FRAMES = 16       # frames per clip for CNN
BATCH_SIZE = 4     # keep small for macOS MPS
EPOCHS = 10         # you can increase once it runs
LR = 1e-4

# Set random seeds
np.random.seed(SEED)
torch.manual_seed(SEED)

# Device (MPS-friendly)
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Using device:", device)

# ============================================================
# BALL DETECTION / TRAJECTORY FEATURES (NO CLICKING)
# ============================================================

def detect_ball(frame, prev=None, max_dist=60):
    """
    Simple ball detector: white-ish blob + proximity to previous.
    """
    hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
    lower = np.array([0, 0, 200])
    upper = np.array([180, 40, 255])
    mask = cv2.inRange(hsv, lower, upper)
    mask = cv2.GaussianBlur(mask, (5, 5), 0)
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, np.ones((3, 3), np.uint8))

    cnts, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    best = None
    best_dist = 1e9
    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)
        area = w * h
        if 8 < area < 500:  # small bright blob
            cx = x + w / 2.0
            cy = y + h / 2.0
            if prev is None:
                return (cx, cy)
            dist = np.hypot(cx - prev[0], cy - prev[1])
            if dist < best_dist and dist < max_dist:
                best_dist = dist
                best = (cx, cy)

    return best


def safe_polyfit(x, y, deg):
    """
    Robust polyfit wrapper that never fails and always returns a fixed-length
    coefficient list [a2, a1, a0] for quadratic.
    If deg=2 fit fails, fallback to deg=1, then to deg=0.
    """
    try:
        coeffs = np.polyfit(x, y, deg)
    except Exception:
        coeffs = None

    # Handle degeneracies: fewer samples or near-constant inputs
    if coeffs is None or len(coeffs) != deg + 1:
        # Try linear
        try:
            coeffs = np.polyfit(x, y, 1)
            # Promote to quadratic form: ax^2 + bx + c
            a = 0.0
            b, c = coeffs
            return np.array([a, b, c], dtype=float)
        except Exception:
            # Fallback constant
            mean_val = float(np.mean(y))
            return np.array([0.0, 0.0, mean_val], dtype=float)

    # If returned coefficients are the wrong length, patch them
    if len(coeffs) == 2:
        a = 0.0
        b, c = coeffs
        return np.array([a, b, c], dtype=float)

    if len(coeffs) == 1:
        mean_val = float(coeffs[0])
        return np.array([0.0, 0.0, mean_val], dtype=float)

    return coeffs.astype(float)



def compute_trajectory_features(xs, ys, frame_w, frame_h):
    """
    From xs, ys → compute normalized physics features robustly.
    """
    n = len(xs)
    if n < 5 or frame_w <= 0 or frame_h <= 0:
        return {
            "x_norm_final": 0.0,
            "y_norm_final": 0.0,
            "vx_mean": 0.0,
            "vy_mean": 0.0,
            "ax_mean": 0.0,
            "ay_mean": 0.0,
            "speed_mean": 0.0,
            "speed_max": 0.0,
            "curvature": 0.0,
            "px0": 0.0, "px1": 0.0, "px2": 0.0,
            "py0": 0.0, "py1": 0.0, "py2": 0.0,
            "traj_len": float(n),
        }

    xs = np.array(xs, float)
    ys = np.array(ys, float)
    t = np.arange(n, dtype=float)

    xs_n = xs / frame_w
    ys_n = ys / frame_h

    vx = np.gradient(xs_n)
    vy = np.gradient(ys_n)
    speed = np.hypot(vx, vy)

    ax = np.gradient(vx)
    ay = np.gradient(vy)

    curvature = np.mean(np.abs(ax * vy - ay * vx) / (speed**3 + 1e-6))

    # SAFE polyfits
    px_coeff = safe_polyfit(t, xs_n, deg=2)  # [a2, a1, a0]
    py_coeff = safe_polyfit(t, ys_n, deg=2)

    px2, px1, px0 = px_coeff
    py2, py1, py0 = py_coeff

    # create callable polynomials
    px = np.poly1d(px_coeff)
    py = np.poly1d(py_coeff)

    # extrapolate
    t_final = n / 0.8
    x_norm_final = float(px(t_final))
    y_norm_final = float(py(t_final))

    return {
        "x_norm_final": x_norm_final,
        "y_norm_final": y_norm_final,
        "vx_mean": float(np.mean(vx)),
        "vy_mean": float(np.mean(vy)),
        "ax_mean": float(np.mean(ax)),
        "ay_mean": float(np.mean(ay)),
        "speed_mean": float(np.mean(speed)),
        "speed_max": float(np.max(speed)),
        "curvature": float(curvature),
        "px0": float(px0), "px1": float(px1), "px2": float(px2),
        "py0": float(py0), "py1": float(py1), "py2": float(py2),
        "traj_len": float(n),
    }


def process_video_trajectory(video_path):
    """
    Track ball for a single video and return physics features.
    No ground-truth usage here. No strike zone heuristics.
    """
    cap = cv2.VideoCapture(video_path)
    ret, frame = cap.read()
    if not ret:
        cap.release()
        # empty features
        return compute_trajectory_features([], [], 1, 1)

    h, w = frame.shape[:2]
    xs, ys = [], []
    prev = None

    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        c = detect_ball(frame, prev)
        if c is not None:
            prev = c
            xs.append(c[0])
            ys.append(c[1])

    cap.release()
    return compute_trajectory_features(xs, ys, w, h)


def build_video_features_all():
    """
    Process all train and test videos and save physics features:
    They are purely learned-input features (no manual labels).
    """
    train_df = pd.read_csv(TRAIN_CSV)
    test_df = pd.read_csv(TEST_FEATURES_CSV)

    # Train video features
    rows = []
    print("Extracting trajectory features for TRAIN videos...")
    for fname in tqdm(train_df["file_name"]):
        vp = os.path.join(TRAIN_VID_DIR, fname)
        feats = process_video_trajectory(vp)
        feats["file_name"] = fname
        rows.append(feats)
    train_feats = pd.DataFrame(rows)
    train_feats.to_csv(TRAIN_VIDEO_FEATS, index=False)
    print("Saved", TRAIN_VIDEO_FEATS)

    # Test video features
    rows = []
    print("Extracting trajectory features for TEST videos...")
    for fname in tqdm(test_df["file_name"]):
        vp = os.path.join(TEST_VID_DIR, fname)
        feats = process_video_trajectory(vp)
        feats["file_name"] = fname
        rows.append(feats)
    test_feats = pd.DataFrame(rows)
    test_feats.to_csv(TEST_VIDEO_FEATS, index=False)
    print("Saved", TEST_VIDEO_FEATS)

    return train_feats, test_feats


# ============================================================
# TABULAR + PHYSICS FEATURE ENGINEERING
# ============================================================

def build_tabular_and_physics():
    """
    Build full tabular feature matrices from Statcast + physics.
    Returns:
        train_df_full, test_df_full, X_train (np), X_test (np),
        y_class (np), y_zone (np), scaler, feature_names
    """
    train_df = pd.read_csv(TRAIN_CSV)
    test_df = pd.read_csv(TEST_FEATURES_CSV)

    if os.path.exists(TRAIN_VIDEO_FEATS) and os.path.exists(TEST_VIDEO_FEATS):
        train_vid = pd.read_csv(TRAIN_VIDEO_FEATS)
        test_vid = pd.read_csv(TEST_VIDEO_FEATS)
    else:
        train_vid, test_vid = build_video_features_all()

    # Merge video features
    train = train_df.merge(train_vid, on="file_name", how="left")
    test = test_df.merge(test_vid, on="file_name", how="left")

    # Statcast numeric
    base_numeric = [
        "sz_top", "sz_bot",
        "release_speed", "effective_speed",
        "release_spin_rate",
        "release_pos_x", "release_pos_y", "release_pos_z",
        "release_extension",
        "pfx_x", "pfx_z",
    ]
    categorical = ["stand", "p_throws"]

    # Fill numeric
    medians = {}
    for col in base_numeric:
        med = train[col].median()
        medians[col] = med
        train[col] = train[col].fillna(med)
        test[col] = test[col].fillna(med)

    # Fill categorical
    for col in categorical:
        train[col] = train[col].fillna("Unknown")
        test[col] = test[col].fillna("Unknown")

    # Derived Statcast features
    for df in [train, test]:
        df["sz_height"] = df["sz_top"] - df["sz_bot"]
        df["sz_center"] = 0.5 * (df["sz_top"] + df["sz_bot"])
        df["break_mag"] = np.sqrt(df["pfx_x"]**2 + df["pfx_z"]**2)
        df["abs_pfx_x"] = df["pfx_x"].abs()
        df["abs_pfx_z"] = df["pfx_z"].abs()
        df["ext_speed"] = df["release_speed"] * df["release_extension"]
        safe_speed = df["release_speed"].replace(0, medians["release_speed"])
        df["eff_ratio"] = df["effective_speed"] / safe_speed

        # Encode handedness
        map_hand = {"L": 0, "R": 1}
        df["stand_enc"] = df["stand"].map(map_hand).fillna(-1).astype(int)
        df["p_throws_enc"] = df["p_throws"].map(map_hand).fillna(-1).astype(int)
        df["same_side"] = (df["stand_enc"] == df["p_throws_enc"]).astype(int)

    # Physics feature columns (from trajectory)
    physics_cols = [
        "x_norm_final", "y_norm_final",
        "vx_mean", "vy_mean",
        "ax_mean", "ay_mean",
        "speed_mean", "speed_max",
        "curvature",
        "px0", "px1", "px2",
        "py0", "py1", "py2",
        "traj_len",
    ]

    for col in physics_cols:
        train[col] = train[col].fillna(0.0)
        test[col] = test[col].fillna(0.0)

    feature_cols = (
        base_numeric
        + [
            "sz_height", "sz_center", "break_mag", "abs_pfx_x", "abs_pfx_z",
            "ext_speed", "eff_ratio",
            "stand_enc", "p_throws_enc", "same_side",
        ]
        + physics_cols
    )

    X_train_raw = train[feature_cols].astype(float).values
    X_test_raw = test[feature_cols].astype(float).values

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Targets
    class_map = {"ball": 0, "strike": 1}
    y_class = train["pitch_class"].map(class_map).astype(int).values
    y_zone = train["zone"].astype(int).values - 1  # 0–13

    return train, test, X_train, X_test, y_class, y_zone, scaler, feature_cols


# ============================================================
# VIDEO LOADING FOR CNN
# ============================================================

video_transform = transforms.Compose([
    transforms.ToTensor(),                                # HxWxC -> CxHxW, [0,1]
    transforms.Resize((224, 224)),                        # ResNet18 size
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],                      # ImageNet stats
        std=[0.229, 0.224, 0.225]
    ),
])

def load_video_frames_sampled(video_path, n_frames=N_FRAMES):
    """
    Load a mp4 and sample n_frames uniformly over its length.
    Returns tensor of shape (T, C, H, W).
    """
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    if total <= 0:
        cap.release()
        # return zeros
        dummy = torch.zeros(n_frames, 3, 224, 224, dtype=torch.float32)
        return dummy

    # indices spread across frames
    indices = np.linspace(0, total - 1, n_frames).astype(int)

    frames = []
    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ret, frame = cap.read()
        if not ret:
            # if read fails, use a black frame
            frame = np.zeros((224, 224, 3), dtype=np.uint8)
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pil_like = frame_rgb
        tensor = video_transform(pil_like)
        frames.append(tensor)

    cap.release()

    video_tensor = torch.stack(frames, dim=0)  # (T, C, H, W)
    return video_tensor


# ============================================================
# DATASET CLASS
# ============================================================

class PitchDataset(Dataset):
    def __init__(
        self,
        df,
        tab_array,
        video_dir,
        file_col="file_name",
        y_class=None,
        y_zone=None,
    ):
        self.df = df.reset_index(drop=True)
        self.tab = tab_array.astype(np.float32)
        self.video_dir = video_dir
        self.file_col = file_col
        self.y_class = y_class
        self.y_zone = y_zone

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        fname = self.df.loc[idx, self.file_col]
        video_path = os.path.join(self.video_dir, fname)

        # video frames (T,C,H,W)
        frames = load_video_frames_sampled(video_path)  # torch.float32

        # tabular
        tab = torch.from_numpy(self.tab[idx])

        if self.y_class is not None:
            c = int(self.y_class[idx])
            z = int(self.y_zone[idx])
            return frames, tab, c, z
        else:
            return frames, tab, fname


# ============================================================
# MODEL: CNN (video) + MLP (tabular) → 2 heads
# ============================================================

class HybridVideoTabModel(nn.Module):
    def __init__(self, tab_dim, num_zones=14):
        super().__init__()

        res = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        self.cnn_backbone = nn.Sequential(*list(res.children())[:-1])  # (B,512,1,1)
        video_feat_dim = 512

        self.tab_mlp = nn.Sequential(
            nn.Linear(tab_dim, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 128),
            nn.ReLU(),
        )

        self.shared = nn.Sequential(
            nn.Linear(video_feat_dim + 128, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),
        )

        self.class_head = nn.Linear(256, 2)
        self.zone_head = nn.Linear(256, num_zones)

    def forward(self, video, tab):
        # video: (B, T, C, H, W)
        B, T, C, H, W = video.shape

        # flatten time → (B*T, C, H, W)
        video = video.reshape(B * T, C, H, W).contiguous()

        feat = self.cnn_backbone(video)         # (B*T, 512, 1, 1)
        feat = feat.contiguous().reshape(B, T, -1).mean(dim=1)  # (B,512)

        tab_feat = self.tab_mlp(tab)           # (B,128)

        fused = torch.cat([feat, tab_feat], dim=1)  # (B,512+128)
        shared = self.shared(fused)

        class_logits = self.class_head(shared)
        zone_logits = self.zone_head(shared)
        return class_logits, zone_logits


# ============================================================
# TRAIN / EVAL LOOPS
# ============================================================

def train_one_epoch(model, loader, opt, loss_class, loss_zone):
    model.train()
    total_loss = 0.0

    for frames, tab, c, z in tqdm(loader, desc="Train", leave=False):
        frames = frames.to(device)
        tab = tab.to(device)
        c = c.to(device)
        z = z.to(device)

        opt.zero_grad()
        out_c, out_z = model(frames, tab)

        lc = loss_class(out_c, c)
        lz = loss_zone(out_z, z)
        loss = 0.7 * lc + 0.3 * lz

        loss.backward()
        opt.step()

        total_loss += loss.item() * frames.size(0)

    return total_loss / len(loader.dataset)


@torch.no_grad()
def eval_model(model, loader, loss_class, loss_zone):
    model.eval()
    all_c_true, all_c_pred = [], []
    all_z_true, all_z_pred = [], []

    total_loss = 0.0

    for frames, tab, c, z in tqdm(loader, desc="Val", leave=False):
        frames = frames.to(device)
        tab = tab.to(device)
        c = c.to(device)
        z = z.to(device)

        out_c, out_z = model(frames, tab)
        lc = loss_class(out_c, c)
        lz = loss_zone(out_z, z)
        loss = 0.7 * lc + 0.3 * lz
        total_loss += loss.item() * frames.size(0)

        c_hat = out_c.argmax(dim=1).cpu().numpy()
        z_hat = out_z.argmax(dim=1).cpu().numpy()
        all_c_true.append(c.cpu().numpy())
        all_c_pred.append(c_hat)
        all_z_true.append(z.cpu().numpy())
        all_z_pred.append(z_hat)

    all_c_true = np.concatenate(all_c_true)
    all_c_pred = np.concatenate(all_c_pred)
    all_z_true = np.concatenate(all_z_true)
    all_z_pred = np.concatenate(all_z_pred)

    acc_c = accuracy_score(all_c_true, all_c_pred)
    acc_z = accuracy_score(all_z_true, all_z_pred)
    avg_loss = total_loss / len(loader.dataset)
    score = 0.7 * acc_c + 0.3 * acc_z

    return avg_loss, acc_c, acc_z, score


# ============================================================
# MAIN PIPELINE
# ============================================================

def main():
    # 1) Build tabular + physics matrices (also builds video physics CSVs if missing)
    train_df_full, test_df_full, X_train, X_test, y_class, y_zone, scaler, feature_cols = build_tabular_and_physics()
    tab_dim = X_train.shape[1]
    print("Tabular feature dimension:", tab_dim)

    # 2) Train/val split
    idx = np.arange(len(train_df_full))
    train_idx, val_idx = train_test_split(
        idx, test_size=0.2, random_state=SEED, stratify=y_class
    )

    X_tr = X_train[train_idx]
    y_class_tr = y_class[train_idx]
    y_zone_tr = y_zone[train_idx]

    X_val = X_train[val_idx]
    y_class_val = y_class[val_idx]
    y_zone_val = y_zone[val_idx]

    df_tr = train_df_full.iloc[train_idx].reset_index(drop=True)
    df_val = train_df_full.iloc[val_idx].reset_index(drop=True)

    # 3) Datasets / loaders
    train_ds = PitchDataset(
        df_tr, X_tr, TRAIN_VID_DIR,
        y_class=y_class_tr, y_zone=y_zone_tr
    )
    val_ds = PitchDataset(
        df_val, X_val, TRAIN_VID_DIR,
        y_class=y_class_val, y_zone=y_zone_val
    )

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

    # 4) Model, loss, optimizer
    model = HybridVideoTabModel(tab_dim=tab_dim, num_zones=14).to(device)
    loss_class = nn.CrossEntropyLoss()
    loss_zone = nn.CrossEntropyLoss()
    opt = torch.optim.Adam(model.parameters(), lr=LR)

    # 5) Training loop with validation
    best_score = -1
    best_state = None

    for epoch in range(1, EPOCHS + 1):
        print(f"\nEpoch {epoch}/{EPOCHS}")
        train_loss = train_one_epoch(model, train_loader, opt, loss_class, loss_zone)
        val_loss, acc_c, acc_z, score = eval_model(model, val_loader, loss_class, loss_zone)
        print(f"Train loss: {train_loss:.4f} | Val loss: {val_loss:.4f}")
        print(f"Val pitch_class acc: {acc_c:.4f}, Val zone acc: {acc_z:.4f}, Combined: {score:.4f}")

        if score > best_score:
            best_score = score
            best_state = model.state_dict().copy()

    print(f"\nBest validation combined score: {best_score:.4f}")

    # 6) Optionally fine-tune on full train
    model.load_state_dict(best_state)
    full_ds = PitchDataset(
        train_df_full, X_train, TRAIN_VID_DIR,
        y_class=y_class, y_zone=y_zone
    )
    full_loader = DataLoader(full_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

    print("\nFine-tuning on full training data...")
    for epoch in range(1, 3):  # a couple extra passes
        ft_loss = train_one_epoch(model, full_loader, opt, loss_class, loss_zone)
        print(f"Fine-tune epoch {epoch}, loss: {ft_loss:.4f}")

    # 7) Predict on test set
    test_ds = PitchDataset(
        test_df_full, X_test, TEST_VID_DIR,
        y_class=None, y_zone=None
    )
    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

    model.eval()
    all_file_names = []
    all_class_pred = []
    all_zone_pred = []

    with torch.no_grad():
        for frames, tab, fnames in tqdm(test_loader, desc="Test", leave=False):
            frames = frames.to(device)
            tab = tab.to(device)
            out_c, out_z = model(frames, tab)

            c_hat = out_c.argmax(dim=1).cpu().numpy()
            z_hat = out_z.argmax(dim=1).cpu().numpy()

            all_class_pred.extend(c_hat.tolist())
            all_zone_pred.extend(z_hat.tolist())
            all_file_names.extend(fnames)

    idx_to_class = {0: "ball", 1: "strike"}
    pred_class_str = [idx_to_class[i] for i in all_class_pred]
    pred_zone = (np.array(all_zone_pred) + 1).astype(int)

    pred_df = pd.DataFrame({
        "file_name": all_file_names,
        "pitch_class": pred_class_str,
        "zone": pred_zone,
    })

    # 8) Build submission in template order
    template = pd.read_csv(TEMPLATE_CSV)
    submission = template.drop(columns=["pitch_class", "zone"], errors="ignore")
    submission = submission.merge(pred_df, on="file_name", how="left")

    submission.to_csv(SUBMISSION_OUT, index=False)
    print("\nSubmission written to:", SUBMISSION_OUT)
    print(submission.head())


# Run the whole thing
if __name__ == "__main__":
    main()

Using device: mps
Extracting trajectory features for TRAIN videos...


100%|██████████| 6000/6000 [06:24<00:00, 15.60it/s]


Saved train_video_features_nn.csv
Extracting trajectory features for TEST videos...


100%|██████████| 4000/4000 [04:28<00:00, 14.88it/s]


Saved test_video_features_nn.csv
Tabular feature dimension: 37

Epoch 1/5


                                                          

Train loss: 1.3086 | Val loss: 1.2907
Val pitch_class acc: 0.5625, Val zone acc: 0.1500, Combined: 0.4387

Epoch 2/5


                                                          

Train loss: 1.2473 | Val loss: 1.2564
Val pitch_class acc: 0.6117, Val zone acc: 0.1933, Combined: 0.4862

Epoch 3/5


                                                          

Train loss: 1.2077 | Val loss: 1.2414
Val pitch_class acc: 0.5675, Val zone acc: 0.1867, Combined: 0.4532

Epoch 4/5


                                                          

Train loss: 1.1792 | Val loss: 1.1950
Val pitch_class acc: 0.5858, Val zone acc: 0.2133, Combined: 0.4741

Epoch 5/5


                                                          

Train loss: 1.1356 | Val loss: 1.1550
Val pitch_class acc: 0.6400, Val zone acc: 0.2133, Combined: 0.5120

Best validation combined score: 0.5120

Fine-tuning on full training data...


                                                          

Fine-tune epoch 1, loss: 1.1217


                                                          

Fine-tune epoch 2, loss: 1.0620


                                                         


Submission written to: submission_hybrid_video_tabular_nn.csv
     file_name pitch_class  zone
0   pitch2.mp4        ball    12
1   pitch6.mp4      strike     4
2   pitch7.mp4        ball    11
3   pitch9.mp4      strike    11
4  pitch10.mp4      strike    14




In [13]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ============================================================
# PATHS / CONFIG
# ============================================================

BASE = "data/Question4/baseball-pitch-tracking-cs-gy-6643/baseball_kaggle_dataset_trimmed_only"

TRAIN_VID_DIR = os.path.join(BASE, "train_trimmed")
TEST_VID_DIR  = os.path.join(BASE, "test")

TRAIN_CSV = os.path.join(BASE, "data", "train_ground_truth.csv")
TEST_FEATURES_CSV = os.path.join(BASE, "data", "test_features.csv")
TEMPLATE_CSV = "data/Question4/baseball-pitch-tracking-cs-gy-6643/test_submission_template.csv"

TRAIN_VIDEO_FEATS = "train_video_features_nn.csv"
TEST_VIDEO_FEATS  = "test_video_features_nn.csv"

SUBMISSION_OUT = "submission_hybrid_video_tabular_3dcnn.csv"

SEED = 42
N_FRAMES = 16          # upscaled for better temporal modeling
BATCH_SIZE = 4
EPOCHS = 8             # start here, raise if stable
LR = 1e-4

np.random.seed(SEED)
torch.manual_seed(SEED)

device = (
    torch.device("mps") if torch.backends.mps.is_available()
    else torch.device("cuda") if torch.cuda.is_available()
    else torch.device("cpu")
)
print("Using device:", device)


# ============================================================
# SAFE POLYFIT WRAPPER
# ============================================================

def safe_polyfit(x, y, deg):
    """
    np.polyfit wrapper that always returns a 3-coefficient quadratic (a2,a1,a0).
    Falls back to linear or constant if needed.
    """
    try:
        coeffs = np.polyfit(x, y, deg)
    except Exception:
        coeffs = None

    # Degenerate → try linear
    if coeffs is None or len(coeffs) != deg + 1:
        try:
            coeffs = np.polyfit(x, y, 1)
            b, c = coeffs
            return np.array([0.0, b, c], float)
        except Exception:
            mean_val = float(np.mean(y))
            return np.array([0.0, 0.0, mean_val], float)

    # If it returns 1 or 2 coeffs
    if len(coeffs) == 2:
        b, c = coeffs
        return np.array([0.0, b, c], float)
    if len(coeffs) == 1:
        c = coeffs[0]
        return np.array([0.0, 0.0, c], float)

    return np.array(coeffs, float)


# ============================================================
# IMPROVED BALL DETECTION + TRAJECTORY FEATURES
# ============================================================

def detect_ball(frame, prev=None, max_dist=80, roi_radius=80):
    """
    Improved tracker:
      - If prev is known, only search in a local ROI around it.
      - If detection fails, return prev (hold last good position).
    """
    h, w = frame.shape[:2]

    if prev is not None:
        cx_prev, cy_prev = prev
        x1 = max(int(cx_prev - roi_radius), 0)
        x2 = min(int(cx_prev + roi_radius), w)
        y1 = max(int(cy_prev - roi_radius), 0)
        y2 = min(int(cy_prev + roi_radius), h)
        roi = frame[y1:y2, x1:x2]
        offset = (x1, y1)
    else:
        roi = frame
        offset = (0, 0)

    hsv = cv2.cvtColor(roi, cv2.COLOR_BGR2HSV)
    mask = cv2.inRange(hsv, np.array([0, 0, 200]), np.array([180, 40, 255]))
    mask = cv2.GaussianBlur(mask, (5, 5), 0)
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, np.ones((3, 3), np.uint8))

    cnts, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    best = None
    best_dist = 1e9

    for c in cnts:
        x, y, w_box, h_box = cv2.boundingRect(c)
        area = w_box * h_box
        if 8 < area < 500:
            cx = x + w_box / 2 + offset[0]
            cy = y + h_box / 2 + offset[1]
            if prev is None:
                return (cx, cy)
            d = np.hypot(cx - prev[0], cy - prev[1])
            if d < best_dist and d < max_dist:
                best, best_dist = (cx, cy), d

    if best is None and prev is not None:
        # hold last known position to keep trajectory continuous
        return prev

    return best


def compute_trajectory_features(xs, ys, frame_w, frame_h):
    n = len(xs)
    if n < 5 or frame_w <= 0 or frame_h <= 0:
        return {k: 0.0 for k in [
            "x_norm_final","y_norm_final",
            "vx_mean","vy_mean","ax_mean","ay_mean",
            "speed_mean","speed_max","curvature",
            "px0","px1","px2","py0","py1","py2",
            "traj_len"
        ]}

    xs = np.array(xs, float)
    ys = np.array(ys, float)
    t = np.arange(n, dtype=float)

    xs_n = xs / frame_w
    ys_n = ys / frame_h

    vx = np.gradient(xs_n)
    vy = np.gradient(ys_n)
    speed = np.hypot(vx, vy)

    ax = np.gradient(vx)
    ay = np.gradient(vy)
    curvature = np.mean(np.abs(ax * vy - ay * vx) / (speed**3 + 1e-6))

    px_coeff = safe_polyfit(t, xs_n, deg=2)
    py_coeff = safe_polyfit(t, ys_n, deg=2)
    px2, px1, px0 = px_coeff
    py2, py1, py0 = py_coeff

    px = np.poly1d(px_coeff)
    py = np.poly1d(py_coeff)

    t_final = n / 0.8
    return {
        "x_norm_final": float(px(t_final)),
        "y_norm_final": float(py(t_final)),
        "vx_mean": float(np.mean(vx)),
        "vy_mean": float(np.mean(vy)),
        "ax_mean": float(np.mean(ax)),
        "ay_mean": float(np.mean(ay)),
        "speed_mean": float(np.mean(speed)),
        "speed_max": float(np.max(speed)),
        "curvature": float(curvature),
        "px0": float(px0), "px1": float(px1), "px2": float(px2),
        "py0": float(py0), "py1": float(py1), "py2": float(py2),
        "traj_len": float(n),
    }


def process_video_trajectory(video_path):
    cap = cv2.VideoCapture(video_path)
    ret, frame = cap.read()
    if not ret:
        return compute_trajectory_features([], [], 1, 1)

    h, w = frame.shape[:2]
    xs, ys = [], []
    prev = None

    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        c = detect_ball(frame, prev)
        if c is not None:
            prev = c
            xs.append(c[0])
            ys.append(c[1])

    cap.release()
    return compute_trajectory_features(xs, ys, w, h)


def build_video_features_all():
    train_df = pd.read_csv(TRAIN_CSV)
    test_df  = pd.read_csv(TEST_FEATURES_CSV)

    rows = []
    print("Extracting video trajectory features for TRAIN...")
    for fname in tqdm(train_df["file_name"]):
        feats = process_video_trajectory(os.path.join(TRAIN_VID_DIR, fname))
        feats["file_name"] = fname
        rows.append(feats)
    pd.DataFrame(rows).to_csv(TRAIN_VIDEO_FEATS, index=False)

    rows = []
    print("Extracting video trajectory features for TEST...")
    for fname in tqdm(test_df["file_name"]):
        feats = process_video_trajectory(os.path.join(TEST_VID_DIR, fname))
        feats["file_name"] = fname
        rows.append(feats)
    pd.DataFrame(rows).to_csv(TEST_VIDEO_FEATS, index=False)


# ============================================================
# TABULAR + PHYSICS FUSION
# ============================================================

def build_tabular_and_physics():
    train_df = pd.read_csv(TRAIN_CSV)
    test_df  = pd.read_csv(TEST_FEATURES_CSV)

    if not os.path.exists(TRAIN_VIDEO_FEATS) or not os.path.exists(TEST_VIDEO_FEATS):
        build_video_features_all()

    train_vid = pd.read_csv(TRAIN_VIDEO_FEATS)
    test_vid  = pd.read_csv(TEST_VIDEO_FEATS)

    train = train_df.merge(train_vid, on="file_name", how="left")
    test  = test_df.merge(test_vid, on="file_name", how="left")

    base_numeric = [
        "sz_top","sz_bot","release_speed","effective_speed",
        "release_spin_rate","release_pos_x","release_pos_y",
        "release_pos_z","release_extension","pfx_x","pfx_z"
    ]
    categorical = ["stand","p_throws"]

    med = {c: train[c].median() for c in base_numeric}
    for df in [train, test]:
        for c in base_numeric:
            df[c] = df[c].fillna(med[c])

    for df in [train, test]:
        for c in categorical:
            df[c] = df[c].fillna("Unknown")

    for df in [train, test]:
        df["sz_height"] = df["sz_top"] - df["sz_bot"]
        df["sz_center"] = 0.5 * (df["sz_top"] + df["sz_bot"])
        df["break_mag"] = np.sqrt(df["pfx_x"]**2 + df["pfx_z"]**2)
        df["abs_pfx_x"] = df["pfx_x"].abs()
        df["abs_pfx_z"] = df["pfx_z"].abs()
        df["ext_speed"] = df["release_speed"] * df["release_extension"]
        safe = df["release_speed"].replace(0, med["release_speed"])
        df["eff_ratio"] = df["effective_speed"] / safe

        map_hand = {"L": 0, "R": 1}
        df["stand_enc"] = df["stand"].map(map_hand).fillna(-1).astype(int)
        df["p_throws_enc"] = df["p_throws"].map(map_hand).fillna(-1).astype(int)
        df["same_side"] = (df["stand_enc"] == df["p_throws_enc"]).astype(int)

    physics_cols = [
        "x_norm_final","y_norm_final",
        "vx_mean","vy_mean","ax_mean","ay_mean",
        "speed_mean","speed_max","curvature",
        "px0","px1","px2","py0","py1","py2",
        "traj_len"
    ]

    for col in physics_cols:
        train[col] = train[col].fillna(0.0)
        test[col]  = test[col].fillna(0.0)

    feature_cols = (
        base_numeric +
        ["sz_height","sz_center","break_mag","abs_pfx_x","abs_pfx_z",
         "ext_speed","eff_ratio","stand_enc","p_throws_enc","same_side"] +
        physics_cols
    )

    X_train_raw = train[feature_cols].astype(float).values
    X_test_raw  = test[feature_cols].astype(float).values

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test  = scaler.transform(X_test_raw)

    class_map = {"ball": 0, "strike": 1}
    y_class = train["pitch_class"].map(class_map).astype(int).values
    y_zone  = train["zone"].astype(int).values - 1

    return train, test, X_train, X_test, y_class, y_zone, scaler, feature_cols


# ============================================================
# VIDEO SAMPLING FOR 3D CNN
# ============================================================

video_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((128, 128)),  # smaller for 3D conv efficiency
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

def load_video_frames_sampled(video_path, n_frames=N_FRAMES):
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    if total <= 0:
        cap.release()
        return torch.zeros(n_frames, 3, 128, 128)

    indices = np.linspace(0, total - 1, n_frames).astype(int)
    frames = []

    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ret, img = cap.read()
        if not ret:
            frames.append(torch.zeros(3, 128, 128))
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        tensor = video_transform(img)
        frames.append(tensor)

    cap.release()
    return torch.stack(frames, dim=0)  # (T, C, H, W)


# ============================================================
# DATASET
# ============================================================

class PitchDataset(Dataset):
    def __init__(self, df, tab_arr, video_dir, y_class=None, y_zone=None):
        self.df = df.reset_index(drop=True)
        self.tab = tab_arr.astype(np.float32)
        self.dir = video_dir
        self.yc = y_class
        self.yz = y_zone

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        fname = self.df.loc[idx, "file_name"]
        video_tensor = load_video_frames_sampled(os.path.join(self.dir, fname))
        tab = torch.from_numpy(self.tab[idx])
        if self.yc is not None:
            return video_tensor, tab, int(self.yc[idx]), int(self.yz[idx]), fname
        else:
            return video_tensor, tab, None, None, fname


# ============================================================
# 3D CNN BACKBONE + TABULAR FUSION
# ============================================================

class VideoBackbone3D(nn.Module):
    """
    Simple 3D CNN encoder over (C,T,H,W).
    """
    def __init__(self, in_channels=3, base_channels=32):
        super().__init__()
        self.conv1 = nn.Conv3d(in_channels, base_channels, kernel_size=(3,7,7),
                               stride=(1,2,2), padding=(1,3,3))
        self.bn1 = nn.BatchNorm3d(base_channels)
        self.conv2 = nn.Conv3d(base_channels, base_channels*2, kernel_size=3,
                               stride=(1,2,2), padding=1)
        self.bn2 = nn.BatchNorm3d(base_channels*2)
        self.conv3 = nn.Conv3d(base_channels*2, base_channels*4, kernel_size=3,
                               stride=(2,2,2), padding=1)
        self.bn3 = nn.BatchNorm3d(base_channels*4)

        self.out_channels = base_channels * 4

    def forward(self, x):
        # x: (B, C, T, H, W)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        # global avg pool over (T,H,W)
        x = F.adaptive_avg_pool3d(x, (1,1,1))  # (B, C,1,1,1)
        x = x.view(x.size(0), -1)             # (B, C)
        return x


class HybridVideoTabModel3D(nn.Module):
    def __init__(self, tab_dim, num_zones=14):
        super().__init__()
        self.video_backbone = VideoBackbone3D(in_channels=3, base_channels=32)
        video_feat_dim = self.video_backbone.out_channels

        self.tab_mlp = nn.Sequential(
            nn.Linear(tab_dim, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 128),
            nn.ReLU()
        )

        self.shared = nn.Sequential(
            nn.Linear(video_feat_dim + 128, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3)
        )

        self.head_class = nn.Linear(256, 2)
        self.head_zone  = nn.Linear(256, num_zones)

    def forward(self, video, tab):
        # video: (B, T, C, H, W) → (B, C, T, H, W)
        video = video.permute(0, 2, 1, 3, 4).contiguous()
        vfeat = self.video_backbone(video)        # (B, video_feat_dim)
        tfeat = self.tab_mlp(tab)                 # (B, 128)
        fused = torch.cat([vfeat, tfeat], dim=1)  # (B, video_feat_dim+128)
        h = self.shared(fused)
        return self.head_class(h), self.head_zone(h)


# ============================================================
# TRAIN & EVAL
# ============================================================

def train_one_epoch(model, loader, opt, lc, lz):
    model.train()
    tot = 0
    for v, t, c, z, _ in tqdm(loader, desc="Train", leave=False):
        v, t, c, z = v.to(device), t.to(device), c.to(device), z.to(device)
        opt.zero_grad()
        oc, oz = model(v, t)
        loss = 0.7 * lc(oc, c) + 0.3 * lz(oz, z)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        opt.step()
        tot += loss.item() * v.size(0)
    return tot / len(loader.dataset)


@torch.no_grad()
def eval_model(model, loader, lc, lz):
    model.eval()
    tot = 0
    ac_true, ac_pred = [], []
    az_true, az_pred = [], []

    for v, t, c, z, _ in tqdm(loader, desc="Val", leave=False):
        v, t, c, z = v.to(device), t.to(device), c.to(device), z.to(device)
        oc, oz = model(v, t)
        loss = 0.7 * lc(oc, c) + 0.3 * lz(oz, z)
        tot += loss.item() * v.size(0)

        ac_pred.append(oc.argmax(1).cpu().numpy())
        ac_true.append(c.cpu().numpy())
        az_pred.append(oz.argmax(1).cpu().numpy())
        az_true.append(z.cpu().numpy())

    ac_pred = np.concatenate(ac_pred)
    ac_true = np.concatenate(ac_true)
    az_pred = np.concatenate(az_pred)
    az_true = np.concatenate(az_true)

    acc_c = accuracy_score(ac_true, ac_pred)
    acc_z = accuracy_score(az_true, az_pred)
    score = 0.7 * acc_c + 0.3 * acc_z

    return tot / len(loader.dataset), acc_c, acc_z, score


# ============================================================
# MAIN
# ============================================================

def main():
    # 1) Build fused features
    train_df_full, test_df_full, X_train, X_test, y_class, y_zone, scaler, feature_cols = build_tabular_and_physics()
    tab_dim = X_train.shape[1]
    print("Tabular dim:", tab_dim)

    # 2) Train/val split
    idx = np.arange(len(train_df_full))
    tr_idx, va_idx = train_test_split(idx, test_size=0.2, random_state=SEED, stratify=y_class)

    df_tr, df_va = train_df_full.iloc[tr_idx], train_df_full.iloc[va_idx]
    X_tr, yc_tr, yz_tr = X_train[tr_idx], y_class[tr_idx], y_zone[tr_idx]
    X_va, yc_va, yz_va = X_train[va_idx], y_class[va_idx], y_zone[va_idx]

    train_ds = PitchDataset(df_tr, X_tr, TRAIN_VID_DIR, yc_tr, yz_tr)
    val_ds   = PitchDataset(df_va, X_va, TRAIN_VID_DIR, yc_va, yz_va)

    train_ld = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_ld   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

    # 3) Model
    model = HybridVideoTabModel3D(tab_dim).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=LR)
    lc, lz = nn.CrossEntropyLoss(), nn.CrossEntropyLoss()

    best_score = -1
    best_state = None

    # 4) Training with validation, early selection
    for epoch in range(1, EPOCHS + 1):
        print(f"\nEpoch {epoch}/{EPOCHS}")
        tl = train_one_epoch(model, train_ld, opt, lc, lz)
        vl, ac, az, sc = eval_model(model, val_ld, lc, lz)

        print(f"Train {tl:.4f} | Val {vl:.4f}")
        print(f"Class Acc: {ac:.4f} | Zone Acc: {az:.4f} | Score: {sc:.4f}")

        if sc > best_score:
            best_score = sc
            best_state = model.state_dict().copy()

    print("\nBest validation score:", best_score)
    # Use the best validation model directly for testing
    model.load_state_dict(best_state)

    # 5) Predict on test
    test_ds = PitchDataset(test_df_full, X_test, TEST_VID_DIR)
    test_ld = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

    model.eval()
    all_cls, all_zone, all_names = [], [], []
    with torch.no_grad():
        for v, t, _, _, fn in tqdm(test_ld, desc="Test"):
            v, t = v.to(device), t.to(device)
            oc, oz = model(v, t)
            all_cls.extend(oc.argmax(1).cpu().tolist())
            all_zone.extend(oz.argmax(1).cpu().tolist())
            all_names.extend(fn)

    idx2cls = {0: "ball", 1: "strike"}
    cls_str = [idx2cls[i] for i in all_cls]
    zone = (np.array(all_zone) + 1).tolist()

    pred = pd.DataFrame({
        "file_name": all_names,
        "pitch_class": cls_str,
        "zone": zone
    })

    template = pd.read_csv(TEMPLATE_CSV)
    sub = template.drop(columns=["pitch_class", "zone"], errors="ignore")
    sub = sub.merge(pred, on="file_name", how="left")

    sub.to_csv(SUBMISSION_OUT, index=False)
    print("\nSaved submission:", SUBMISSION_OUT)
    print(sub.head())


if __name__ == "__main__":
    main()

Using device: mps
Tabular dim: 37

Epoch 1/8


                                                          

Train 1.3194 | Val 1.3885
Class Acc: 0.5308 | Zone Acc: 0.1300 | Score: 0.4106

Epoch 2/8


                                                          

Train 1.2700 | Val 1.4783
Class Acc: 0.5575 | Zone Acc: 0.1692 | Score: 0.4410

Epoch 3/8


                                                          

Train 1.2439 | Val 1.3814
Class Acc: 0.5292 | Zone Acc: 0.1867 | Score: 0.4264

Epoch 4/8


                                                          

Train 1.2227 | Val 1.4434
Class Acc: 0.5333 | Zone Acc: 0.1933 | Score: 0.4313

Epoch 5/8


                                                          

Train 1.2110 | Val 1.4195
Class Acc: 0.5275 | Zone Acc: 0.1950 | Score: 0.4277

Epoch 6/8


                                                          

Train 1.2053 | Val 1.5302
Class Acc: 0.5275 | Zone Acc: 0.1883 | Score: 0.4257

Epoch 7/8


                                                          

Train 1.2053 | Val 1.3540
Class Acc: 0.5083 | Zone Acc: 0.2067 | Score: 0.4178

Epoch 8/8


                                                          

Train 1.1951 | Val 1.4150
Class Acc: 0.5375 | Zone Acc: 0.1975 | Score: 0.4355

Best validation score: 0.441


Test:   0%|          | 0/1000 [00:00<?, ?it/s]


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>

In [15]:
def load_video_frames_sampled(video_path, n_frames=N_FRAMES):
    """
    Always returns a tensor of shape (T, C, H, W)
    Never returns None, even if video is corrupted.
    """
    try:
        cap = cv2.VideoCapture(video_path)
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        if total <= 1 or not cap.isOpened():
            cap.release()
            return torch.zeros(n_frames, 3, 128, 128)

        indices = np.linspace(0, total - 1, n_frames).astype(int)
        frames = []

        for idx in indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, img = cap.read()

            if not ret or img is None:
                # return black frame instead of None
                frames.append(torch.zeros(3, 128, 128))
                continue

            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            tensor = video_transform(img)
            frames.append(tensor)

        cap.release()
        return torch.stack(frames, dim=0)

    except Exception:
        # absolute fallback
        return torch.zeros(n_frames, 3, 128, 128)
    
train_df_full, test_df_full, X_train, X_test, y_class, y_zone, scaler, feature_cols = build_tabular_and_physics()
# 5) Predict on test
test_ds = PitchDataset(test_df_full, X_test, TEST_VID_DIR)
test_ld = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

model.eval()
all_cls, all_zone, all_names = [], [], []
with torch.no_grad():
    for v, t, _, _, fn in tqdm(test_ld, desc="Test"):
        v, t = v.to(device), t.to(device)
        oc, oz = model(v, t)
        all_cls.extend(oc.argmax(1).cpu().tolist())
        all_zone.extend(oz.argmax(1).cpu().tolist())
        all_names.extend(fn)

idx2cls = {0: "ball", 1: "strike"}
cls_str = [idx2cls[i] for i in all_cls]
zone = (np.array(all_zone) + 1).tolist()

pred = pd.DataFrame({
    "file_name": all_names,
    "pitch_class": cls_str,
    "zone": zone
})

template = pd.read_csv(TEMPLATE_CSV)
sub = template.drop(columns=["pitch_class", "zone"], errors="ignore")
sub = sub.merge(pred, on="file_name", how="left")

sub.to_csv(SUBMISSION_OUT, index=False)
print("\nSaved submission:", SUBMISSION_OUT)
print(sub.head())

Test:   0%|          | 0/1000 [00:00<?, ?it/s]


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>