In [1]:
# Method 2: using the baseball model

In [3]:
!pip3 install baseballcv ultralytics

Collecting baseballcv
  Using cached baseballcv-0.1.24-py3-none-any.whl.metadata (26 kB)
Collecting autodistill==0.1.29 (from baseballcv)
  Using cached autodistill-0.1.29-py3-none-any.whl.metadata (32 kB)
Collecting autodistill-grounded-sam==0.1.2 (from baseballcv)
  Using cached autodistill_grounded_sam-0.1.2-py3-none-any.whl.metadata (1.1 kB)
Collecting coco-eval==0.0.4 (from baseballcv)
  Using cached coco_eval-0.0.4.tar.gz (4.5 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting datasets==3.4.0 (from baseballcv)
  Using cached datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dotenv<0.10.0,>=0.9.9 (from baseballcv)
  Using cached dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting mediapipe==0.10.21 (from baseballcv)
  Using cached mediapipe-0.10.21-cp312-cp312-macosx_11_0_universal2.whl.metadata (9.9 kB)
Collecting pandas<3.0.0,>=2.2.3 (from baseballcv)
  Using cached pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (91 kB)
Collectin

In [None]:
#score 0.411

import os
import numpy as np
import pandas as pd
import cv2
import warnings
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.impute import SimpleImputer
import joblib
import subprocess
import sys

# Silence library warnings so the long extraction/training logs stay readable.
warnings.filterwarnings('ignore')
# Seed NumPy's global RNG for reproducibility.
np.random.seed(42)

# Install dependencies properly
print("Installing dependencies...")
# `sys.executable -m pip` targets the interpreter that is running this cell.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "cmake"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "ultralytics"])

try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "baseballcv"])
except (subprocess.CalledProcessError, OSError) as install_error:
    # BaseballCV is optional: only a failed install is tolerated here, so that
    # Ctrl-C / SystemExit still interrupt the cell instead of being swallowed.
    print("BaseballCV installation had issues, will use YOLO directly")
    print(f"  pip reported: {install_error}")

from ultralytics import YOLO

# Paths
data_path = 'data/Question4/baseball-pitch-tracking-cs-gy-6643/baseball_kaggle_dataset_trimmed_only/data'
train_video_path = 'data/Question4/baseball-pitch-tracking-cs-gy-6643/baseball_kaggle_dataset_trimmed_only/train_trimmed'
test_video_path = 'data/Question4/baseball-pitch-tracking-cs-gy-6643/baseball_kaggle_dataset_trimmed_only/test'
test_template_path = 'data/Question4/baseball-pitch-tracking-cs-gy-6643/test_submission_template.csv'

# Load CSVs
train_df = pd.read_csv(os.path.join(data_path, 'train_ground_truth.csv'))
test_df = pd.read_csv(os.path.join(data_path, 'test_features.csv'))
test_template = pd.read_csv(test_template_path)

print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")

# Load pre-trained YOLO ball tracking model
print("\nLoading YOLOv8 ball tracking model...")
try:
    # Try to load from baseballcv if available
    from baseballcv.functions import LoadTools
    load_tools = LoadTools()
    model_path = load_tools.load_model("ball_tracking")
    ball_model = YOLO(model_path)
    print("✓ Loaded BaseballCV ball tracking model")
except Exception as e:
    # Any failure (package missing, weights unavailable, ...) falls back to the
    # stock checkpoint so the rest of the pipeline can still run.
    # NOTE(review): "yolov8n.pt" is a generic pretrained checkpoint, not a
    # ball-specific model — confirm its detections are meaningful for this task.
    print(f"Using YOLOv8 nano as fallback: {e}")
    ball_model = YOLO("yolov8n.pt")

# Extract ball trajectory features using YOLO
def _trajectory_motion_features(centers):
    """Return 16 fixed-position motion features for an (n, 2) array of box centers.

    Slot layout:
        0-5   step distance: mean, std, max, min, sum, 75th percentile
        6-8   change between consecutive steps: mean, std, max
        9-11  change in step distance: mean, std, max absolute value
        12-15 mean |dx|, mean |dy|, std dx, std dy

    Groups that need more detections than are available stay at zero, so every
    slot always carries the same quantity regardless of the detection count.
    """
    motion = np.zeros(16)
    if len(centers) < 2:
        return motion

    # Position differences between consecutive detections (velocity proxy).
    diffs = np.diff(centers, axis=0)
    distances = np.linalg.norm(diffs, axis=1)
    motion[0:6] = [
        np.mean(distances), np.std(distances),
        np.max(distances), np.min(distances),
        np.sum(distances), np.percentile(distances, 75),
    ]

    # Direction consistency needs at least two steps.
    if len(distances) > 1:
        direction_changes = np.sqrt(np.diff(diffs[:, 0])**2 + np.diff(diffs[:, 1])**2)
        motion[6:9] = [
            np.mean(direction_changes), np.std(direction_changes),
            np.max(direction_changes),
        ]

    # Acceleration (rate of motion change) needs at least three steps.
    if len(distances) > 2:
        accel = np.diff(distances)
        motion[9:12] = [
            np.mean(accel), np.std(accel),
            np.max(np.abs(accel)),
        ]

    # Horizontal vs vertical motion.
    motion[12:16] = [
        np.mean(np.abs(diffs[:, 0])), np.mean(np.abs(diffs[:, 1])),
        np.std(diffs[:, 0]), np.std(diffs[:, 1]),
    ]
    return motion


def extract_yolo_ball_features(video_path, model, conf=0.3):
    """Extract ball position and trajectory features from YOLO predictions.

    Every box returned by ``model.predict`` for every frame is pooled, in
    order, into one list of ``[x_center, y_center, width, height, confidence]``
    rows, and summary statistics are computed over that list.

    Args:
        video_path: Path of the video, passed to ``model.predict`` and
            ``cv2.VideoCapture``.
        model: Object exposing ``predict(source, conf=..., verbose=...,
            stream=...)`` whose results have a ``boxes`` collection with
            ``xywh`` and ``conf`` per box.
        conf: Confidence threshold forwarded to ``model.predict``.

    Returns:
        A length-80 float array with a fixed layout:
            0-11   center x/y: mean, std, max, min, 25th/75th percentiles
            12-19  box width/height: mean, std, max, min
            20-25  confidence: mean, std, max, min, 25th/75th percentiles
            26-41  motion features (zero where too few detections exist)
            42-48  frame count, detections per frame, fps, normalized
                   center mean/std
            49-79  zero padding
        All zeros when nothing is detected or an error occurs.
    """
    try:
        results = model.predict(video_path, conf=conf, verbose=False, stream=True)

        ball_positions = []
        for result in results:
            if len(result.boxes) > 0:
                # Collect every box of this frame.
                for box in result.boxes:
                    x_center = float(box.xywh[0, 0])
                    y_center = float(box.xywh[0, 1])
                    width = float(box.xywh[0, 2])
                    height = float(box.xywh[0, 3])
                    # Separate name: must not shadow the `conf` threshold argument.
                    box_conf = float(box.conf[0])
                    ball_positions.append([x_center, y_center, width, height, box_conf])

        if len(ball_positions) == 0:
            return np.zeros(80)

        ball_pos = np.array(ball_positions)
        xs, ys = ball_pos[:, 0], ball_pos[:, 1]
        widths, heights = ball_pos[:, 2], ball_pos[:, 3]
        confidences = ball_pos[:, 4]

        features = np.zeros(80)

        # Position statistics (center coordinates).
        features[0:12] = [
            np.mean(xs), np.std(xs),
            np.mean(ys), np.std(ys),
            np.max(xs), np.min(xs),
            np.max(ys), np.min(ys),
            np.percentile(xs, 25), np.percentile(xs, 75),
            np.percentile(ys, 25), np.percentile(ys, 75),
        ]

        # Size statistics (bounding box).
        features[12:20] = [
            np.mean(widths), np.std(widths),
            np.mean(heights), np.std(heights),
            np.max(widths), np.min(widths),
            np.max(heights), np.min(heights),
        ]

        # Confidence statistics.
        features[20:26] = [
            np.mean(confidences), np.std(confidences),
            np.max(confidences), np.min(confidences),
            np.percentile(confidences, 25), np.percentile(confidences, 75),
        ]

        # Trajectory motion analysis, always occupying the same 16 slots.
        features[26:42] = _trajectory_motion_features(ball_pos[:, :2])

        # Video statistics; release the capture even if a property read fails.
        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = cap.get(cv2.CAP_PROP_FPS)
        finally:
            cap.release()

        # Centers are normalized by fixed 1920x1080 constants.
        # NOTE(review): assumes 1920x1080 source video — confirm.
        features[42:49] = [
            total_frames,
            len(ball_pos) / max(total_frames, 1),
            fps,
            np.mean(xs) / 1920 if total_frames > 0 else 0.5,
            np.mean(ys) / 1080 if total_frames > 0 else 0.5,
            np.std(xs) / 1920 if total_frames > 0 else 0.25,
            np.std(ys) / 1080 if total_frames > 0 else 0.25,
        ]

        return features
    except Exception as e:
        # Best effort: one bad clip must not abort the whole extraction run.
        print(f"Error in YOLO extraction: {e}")
        return np.zeros(80)

# Extract optical flow features as complementary
def extract_optical_flow_features(video_path, sample_frames=10):
    """Extract dense optical-flow and intensity statistics from sampled frames.

    Up to ``sample_frames`` frame indices are spread evenly over the video;
    each selected frame is resized to 64x64 and converted to grayscale.
    For every consecutive pair of sampled frames, 11 Farneback flow statistics
    are appended, followed by 6 intensity statistics over all sampled frames
    and 4 statistics of the frame-to-frame intensity differences.

    Args:
        video_path: Path of the video to read with ``cv2.VideoCapture``.
        sample_frames: Number of evenly spaced frame indices to sample.

    Returns:
        A length-70 float array (zero-padded or truncated to 70). All zeros
        when the video reports fewer than 2 frames, fewer than 2 frames could
        be read, or an error occurs.

    NOTE(review): with the default ``sample_frames=10`` there can be 9 frame
    pairs x 11 values = 99 flow values, so the truncation to 70 drops the
    later pairs and the intensity statistics — confirm this is intended.
    """
    try:
        cap = cv2.VideoCapture(video_path)
        gray_frames = []
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

            if total_frames < 2:
                return np.zeros(70)

            frame_indices = np.linspace(0, total_frames - 1, sample_frames, dtype=int)
            frame_set = set(frame_indices)
            frame_count = 0

            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_count in frame_set:
                    frame = cv2.resize(frame, (64, 64))
                    # Keep the 8-bit grayscale image: Farneback is documented
                    # for 8-bit single-channel input.
                    gray_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
                frame_count += 1
        finally:
            # Always release the capture, including on early return and on errors.
            cap.release()

        if len(gray_frames) < 2:
            return np.zeros(70)

        features = []

        # Optical flow between consecutive sampled frames.
        for i in range(len(gray_frames) - 1):
            flow = cv2.calcOpticalFlowFarneback(
                gray_frames[i], gray_frames[i + 1], None, 0.5, 3, 15, 3, 5, 1.2, 0
            )
            mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1])

            features.extend([
                np.mean(mag), np.std(mag), np.max(mag), np.min(mag),
                np.percentile(mag, 50), np.percentile(mag, 75), np.percentile(mag, 90),
                np.mean(np.abs(flow[..., 0])), np.mean(np.abs(flow[..., 1])),
                np.std(np.abs(flow[..., 0])), np.std(np.abs(flow[..., 1]))
            ])

        # Temporal statistics, computed in floating point.
        frames_array = np.array(gray_frames, dtype=float)
        features.extend([
            np.mean(frames_array), np.std(frames_array),
            np.max(frames_array), np.min(frames_array),
            np.percentile(frames_array, 25), np.percentile(frames_array, 75)
        ])

        # Intensity changes between consecutive sampled frames.
        intensity_diffs = np.diff(frames_array, axis=0)
        features.extend([
            np.mean(intensity_diffs), np.std(intensity_diffs),
            np.max(np.abs(intensity_diffs)), np.percentile(np.abs(intensity_diffs), 90)
        ])

        # Pad to size 70
        while len(features) < 70:
            features.append(0)

        return np.array(features[:70])
    except Exception as e:
        # Best effort, but report the failure like the YOLO extractor does
        # instead of silently returning zeros.
        print(f"Error in optical flow extraction: {e}")
        return np.zeros(70)

# Extract training features
# Walk the ground-truth table and compute both feature families for every
# clip that exists on disk; rows without a video file are skipped.
print("\nExtracting training features...")
train_yolo_features, train_flow_features, valid_indices = [], [], []

for idx, row in train_df.iterrows():
    video_path = os.path.join(train_video_path, row['file_name'])
    if os.path.exists(video_path):
        train_yolo_features.append(extract_yolo_ball_features(video_path, ball_model))
        train_flow_features.append(extract_optical_flow_features(video_path))
        valid_indices.append(idx)

    # Progress line every 500 rows.
    processed = idx + 1
    if processed % 500 == 0:
        print(f"  Processed {processed}/{len(train_df)}")

# Stack the per-clip vectors into 2-D arrays and keep only the matching rows.
train_yolo_features = np.array(train_yolo_features)
train_flow_features = np.array(train_flow_features)
train_df_valid = train_df.iloc[valid_indices].reset_index(drop=True)
print(f"✓ Extracted {len(train_yolo_features)} training samples")

# Prepare metadata
# Raw numeric columns taken straight from the CSV.
metadata_cols = [
    'release_speed', 'effective_speed', 'release_spin_rate',
    'release_pos_x', 'release_pos_y', 'release_pos_z',
    'release_extension', 'pfx_x', 'pfx_z', 'sz_top', 'sz_bot',
]

# Handedness codes: left = -1, right = +1.
stand_map = {'L': -1, 'R': 1}
throws_map = {'L': -1, 'R': 1}

# Raw columns, the two handedness codes, then five derived columns.
train_metadata = np.column_stack([
    train_df_valid[metadata_cols].to_numpy().astype(float),
    np.array([stand_map[s] for s in train_df_valid['stand']]),
    np.array([throws_map[t] for t in train_df_valid['p_throws']]),
    (train_df_valid['release_speed'] * train_df_valid['release_extension']).to_numpy(),
    (train_df_valid['pfx_x'] ** 2 + train_df_valid['pfx_z'] ** 2).to_numpy(),
    (train_df_valid['release_spin_rate'] / (train_df_valid['release_speed'] + 1e-6)).to_numpy(),
    ((train_df_valid['sz_top'] + train_df_valid['sz_bot']) / 2).to_numpy(),
    (train_df_valid['sz_top'] - train_df_valid['sz_bot']).to_numpy(),
])

# Combine features
X_train = np.hstack([train_yolo_features, train_flow_features, train_metadata])
# Binary target: 1 for 'strike', 0 for anything else.
y_class = (train_df_valid['pitch_class'] == 'strike').astype(int).to_numpy()
y_zone = train_df_valid['zone'].values.astype(int)

print(f"Training feature matrix shape: {X_train.shape}")

# Impute and scale
# Median-fill missing values, then zero out any remaining NaN/inf.
imputer = SimpleImputer(strategy='median')
X_train = np.nan_to_num(imputer.fit_transform(X_train), nan=0.0, posinf=0.0, neginf=0.0)

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train ensemble models
# Two targets (zone, strike/ball), each with a gradient-boosting, a random-forest
# and an extra-trees classifier fitted on the same scaled matrix.
print("\n" + "=" * 60)
print("TRAINING THOROUGH ENSEMBLE")
print("=" * 60)

# Settings shared by both gradient-boosting models (includes early stopping).
gb_shared = dict(
    min_samples_split=5, min_samples_leaf=2, random_state=42,
    validation_fraction=0.15, n_iter_no_change=25,
)
# Settings shared by all forest-style models.
forest_shared = dict(
    n_estimators=400, min_samples_split=5, min_samples_leaf=2,
    random_state=42, n_jobs=-1,
)

print("Training zone ensemble...")
zone_gb = GradientBoostingClassifier(
    n_estimators=350, max_depth=10, learning_rate=0.08, subsample=0.8, **gb_shared
).fit(X_train_scaled, y_zone)
zone_rf = RandomForestClassifier(max_depth=16, **forest_shared).fit(X_train_scaled, y_zone)
zone_et = ExtraTreesClassifier(max_depth=16, **forest_shared).fit(X_train_scaled, y_zone)

print("✓ Zone ensemble trained")

print("Training strike/ball ensemble...")
class_gb = GradientBoostingClassifier(
    n_estimators=300, max_depth=9, learning_rate=0.1, subsample=0.85, **gb_shared
).fit(X_train_scaled, y_class)
class_rf = RandomForestClassifier(max_depth=15, **forest_shared).fit(X_train_scaled, y_class)
class_et = ExtraTreesClassifier(max_depth=15, **forest_shared).fit(X_train_scaled, y_class)

print("✓ Strike/ball ensemble trained")

# Extract test features
# Video features are computed only for clips that exist on disk, so the index
# of every processed row is recorded and the metadata below is built from
# exactly those rows. Building metadata from the full test_df would make the
# hstack fail as soon as a single video is missing.
print("\nExtracting test features...")
test_yolo_features = []
test_flow_features = []
test_files = []
test_valid_indices = []

for idx, row in test_df.iterrows():
    video_path = os.path.join(test_video_path, row['file_name'])
    if os.path.exists(video_path):
        yolo_feat = extract_yolo_ball_features(video_path, ball_model)
        flow_feat = extract_optical_flow_features(video_path)
        test_yolo_features.append(yolo_feat)
        test_flow_features.append(flow_feat)
        test_files.append(row['file_name'])
        test_valid_indices.append(idx)

    if (idx + 1) % 500 == 0:
        print(f"  Processed {idx + 1}/{len(test_df)}")

test_yolo_features = np.array(test_yolo_features)
test_flow_features = np.array(test_flow_features)
print(f"✓ Extracted {len(test_yolo_features)} test samples")

missing_test_videos = len(test_df) - len(test_valid_indices)
if missing_test_videos:
    print(f"  Warning: {missing_test_videos} test videos were not found and are skipped")

# `idx` from iterrows() is an index label, hence .loc rather than .iloc.
test_df_valid = test_df.loc[test_valid_indices].reset_index(drop=True)

# Prepare test metadata (same column order as the training metadata)
test_metadata = test_df_valid[metadata_cols].values.astype(float)
test_metadata = np.hstack([
    test_metadata,
    np.array([stand_map[s] for s in test_df_valid['stand']]).reshape(-1, 1),
    np.array([throws_map[t] for t in test_df_valid['p_throws']]).reshape(-1, 1),
    (test_df_valid['release_speed'] * test_df_valid['release_extension']).values.reshape(-1, 1),
    (test_df_valid['pfx_x'] ** 2 + test_df_valid['pfx_z'] ** 2).values.reshape(-1, 1),
    (test_df_valid['release_spin_rate'] / (test_df_valid['release_speed'] + 1e-6)).values.reshape(-1, 1),
    ((test_df_valid['sz_top'] + test_df_valid['sz_bot']) / 2).values.reshape(-1, 1),
    (test_df_valid['sz_top'] - test_df_valid['sz_bot']).values.reshape(-1, 1)
])

assert len(test_metadata) == len(test_yolo_features) == len(test_flow_features), \
    "video features and metadata must describe the same test rows"

# Reuse the imputer and scaler fitted on the training matrix.
X_test = np.hstack([test_yolo_features, test_flow_features, test_metadata])
X_test = imputer.transform(X_test)
X_test = np.nan_to_num(X_test, nan=0.0, posinf=0.0, neginf=0.0)
X_test_scaled = scaler.transform(X_test)

# Ensemble predictions
# Weighted average of the three models' class probabilities
# (gradient boosting 0.5, random forest 0.25, extra trees 0.25).
print("\nMaking ensemble predictions...")

zone_proba_gb = zone_gb.predict_proba(X_test_scaled)
zone_proba_rf = zone_rf.predict_proba(X_test_scaled)
zone_proba_et = zone_et.predict_proba(X_test_scaled)

# predict_proba columns follow each estimator's `classes_`; averaging the
# matrices is only meaningful when all three use the same column order.
assert np.array_equal(zone_gb.classes_, zone_rf.classes_)
assert np.array_equal(zone_gb.classes_, zone_et.classes_)

zone_proba_ensemble = (zone_proba_gb * 0.5 + zone_proba_rf * 0.25 + zone_proba_et * 0.25)
# argmax yields a COLUMN index, not a zone label. Map it back through
# `classes_`; using the raw index (even clipped to 1..14) reports the wrong
# zone whenever the labels are not exactly 0..n-1. Predictions are now always
# labels seen in training, so no clipping is needed.
pred_zone = zone_gb.classes_[np.argmax(zone_proba_ensemble, axis=1)]

class_proba_gb = class_gb.predict_proba(X_test_scaled)
class_proba_rf = class_rf.predict_proba(X_test_scaled)
class_proba_et = class_et.predict_proba(X_test_scaled)

assert np.array_equal(class_gb.classes_, class_rf.classes_)
assert np.array_equal(class_gb.classes_, class_et.classes_)

class_proba_ensemble = (class_proba_gb * 0.5 + class_proba_rf * 0.25 + class_proba_et * 0.25)
pred_class = class_gb.classes_[np.argmax(class_proba_ensemble, axis=1)]

# Post-processing
# When the strike/ball ensemble is close to a coin flip (top probability
# below 0.51), defer to the zone prediction: zone <= 9 is a strike (1),
# otherwise a ball (0).
# NOTE(review): presumably zones 1-9 lie inside the strike zone — confirm
# against the dataset description.
class_conf = np.max(class_proba_ensemble, axis=1)
zone_based_class = np.where(pred_zone <= 9, 1, 0)
pred_class = np.where(class_conf < 0.51, zone_based_class, pred_class)

# Create submission
# One row per processed test clip: file name, strike/ball label, zone.
pitch_labels = ['strike' if c == 1 else 'ball' for c in pred_class]
submission = pd.DataFrame({
    'file_name': test_files,
    'pitch_class': pitch_labels,
    'zone': pred_zone.astype(int)
})

output_path = 'submission_claude.csv'
submission.to_csv(output_path, index=False)

# Summary banner.
banner = '=' * 60
print(f"\n{banner}")
print("SUBMISSION COMPLETE")
print(f"{banner}")
print(f"File: {output_path}")
print(f"Shape: {submission.shape}")
print("\nFirst 20 predictions:")
print(submission.head(20))

# Class balance of the predictions (count and percentage of all rows).
strike_count = (submission['pitch_class'] == 'strike').sum()
ball_count = (submission['pitch_class'] == 'ball').sum()
print(f"\nStrike: {strike_count} ({strike_count/len(submission)*100:.1f}%)")
print(f"Ball: {ball_count} ({ball_count/len(submission)*100:.1f}%)")
print("\nZone distribution:")
print(submission['zone'].value_counts().sort_index())

Installing dependencies...



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[183 lines of output][0m
  [31m   [0m   CMAKE = find_executable('cmake')
  [31m   [0m fatal: not a git repository (or any of the parent directories): .git
  [31m   [0m fatal: not a git repository (or any of the parent directories): .git
  [31m   [0m /Library/Frameworks/Python.framework/Versions/

BaseballCV installation had issues, will use YOLO directly
Training samples: 6000
Test samples: 4000

Loading YOLOv8 ball tracking model...
Using YOLOv8 nano as fallback: No module named 'baseballcv'

Extracting training features...
  Processed 500/6000
  Processed 1000/6000
  Processed 1500/6000
  Processed 2000/6000
  Processed 2500/6000
  Processed 3000/6000
  Processed 3500/6000
  Processed 4000/6000
  Processed 4500/6000
  Processed 5000/6000
  Processed 5500/6000
  Processed 6000/6000
✓ Extracted 6000 training samples
Training feature matrix shape: (6000, 168)

TRAINING THOROUGH ENSEMBLE
Training zone ensemble...
✓ Zone ensemble trained
Training strike/ball ensemble...
✓ Strike/ball ensemble trained

Extracting test features...
  Processed 500/4000
  Processed 1000/4000
  Processed 1500/4000
  Processed 2000/4000
  Processed 2500/4000
  Processed 3000/4000
  Processed 3500/4000
  Processed 4000/4000
✓ Extracted 4000 test samples

Making ensemble predictions...

SUBMISSION COMPLET