In [None]:
import os
from pathlib import Path
import sys
from datetime import datetime
import json

import pandas as pd
import polars as pl
import numpy as np
import lightgbm as lgb
import xgboost as xgb

# kaggle_evaluation is not always importable out of the box, so when the plain
# import fails we look for the package inside the attached datasets.
def _kaggle_evaluation_search_paths(input_root='/kaggle/input'):
    """Return dataset directories that contain a ``kaggle_evaluation`` folder.

    Args:
        input_root: Directory holding the attached datasets.

    Returns:
        A list of directory paths (as strings), highest priority first:
        datasets whose name contains 'nfl-big-data-bowl-2026-prediction'
        come before all others, and each directory is listed only once.
        Empty when *input_root* does not exist.
    """
    input_root = Path(input_root)
    if not input_root.exists():
        return []

    # Sorted so the outcome does not depend on filesystem listing order.
    datasets = sorted(p for p in input_root.iterdir() if p.is_dir())
    competition = [d for d in datasets
                   if 'nfl-big-data-bowl-2026-prediction' in d.name]
    others = [d for d in datasets if d not in competition]
    return [str(d) for d in competition + others
            if (d / 'kaggle_evaluation').exists()]


try:
    import kaggle_evaluation.nfl_inference_server as nfl_inf
except ModuleNotFoundError:
    # Prepend in one step so the list order is preserved: the competition
    # dataset ends up at sys.path[0] and therefore wins the import lookup.
    # (Repeated insert(0, ...) calls would reverse the intended priority.)
    sys.path[:0] = _kaggle_evaluation_search_paths()
    # If nothing was found this re-raises ModuleNotFoundError, as before.
    import kaggle_evaluation.nfl_inference_server as nfl_inf

# Submission tracking info.
# Both strings are derived from ONE timestamp: two separate datetime.now()
# calls can fall on either side of a second boundary and disagree.
_submission_time = datetime.now()
SUBMISSION_CREATED = _submission_time.strftime("%Y-%m-%d %H:%M:%S")
SUBMISSION_ID = _submission_time.strftime("%Y%m%d_%H%M%S")

print("\n" + "=" * 60)
print("LOADING ENSEMBLE MODELS")
print("=" * 60)
print(f"Submission Created: {SUBMISSION_CREATED}")
print(f"Submission ID: {SUBMISSION_ID}")


def _find_ensemble_dir():
    """Search all attached datasets for nfl_ensemble_v* directories.
    Returns path to the FIRST ensemble directory found that matches the pattern.
    """
    root = Path('/kaggle/input')
    if not root.exists():
        print("⚠️  /kaggle/input does not exist")
        return None
    
    print("\n=== Searching for Ensemble Models ===")
    folders = [p.name for p in root.iterdir() if p.is_dir()]
    print(f"Available folders: {folders}")
    print("Looking for: nfl_ensemble_v* directories")
    
    for dataset in root.iterdir():
        if not dataset.is_dir():
            continue
        
        # Skip the competition data folder
        if 'nfl-big-data-bowl-2026-prediction' in dataset.name.lower():
            print(f"  Skipping competition folder: {dataset.name}")
            continue
        
        print(f"  Checking dataset: {dataset.name}")
        
        # Search for nfl_ensemble_v* directories
        ensemble_candidates = list(dataset.glob('**/nfl_ensemble_v*'))
        ensemble_candidates = [p for p in ensemble_candidates if p.is_dir()]
        
        if ensemble_candidates:
            ensemble_dir = ensemble_candidates[0]
            print(f"✓ Found ensemble in: {dataset}")
            print(f"  Ensemble at: {ensemble_dir}")
            return ensemble_dir
    
    return None


def _find_features_module(ensemble_root=None):
    """Search all attached datasets for features.py
    
    Args:
        ensemble_root: Optional path where ensemble was found, to search there first
    """
    root = Path('/kaggle/input')
    if not root.exists():
        print("⚠️  /kaggle/input does not exist")
        return None
    
    print(f"\nSearching for features.py in {root}")
    
    # First check if features.py is in the same location as the ensemble
    if ensemble_root:
        print(f"  Checking ensemble location: {ensemble_root.parent.name}")
        for candidate in [
            ensemble_root / 'features.py',
            ensemble_root.parent / 'features.py',
            ensemble_root.parent / 'for_kaggle' / 'features.py'
        ]:
            if candidate.exists():
                print(f"✓ Found features.py with ensemble at: {candidate}")
                return candidate
    
    # Search all folders for features.py
    for dataset in root.iterdir():
        if not dataset.is_dir():
            continue
        features_candidates = list(dataset.rglob('features.py'))
        if features_candidates:
            print(f"✓ Found features.py in: {dataset.name}")
            return features_candidates[0]
    
    return None


def _to_pandas(df):
    """Convert a Polars DataFrame to pandas; return any other object as-is."""
    return df.to_pandas() if isinstance(df, pl.DataFrame) else df


# Resolve the ensemble directory first, then the feature module (looking next
# to the ensemble before anywhere else). Abort with setup instructions when
# either one cannot be located.
if not (ensemble_dir := _find_ensemble_dir()):
    raise FileNotFoundError(
        "❌ No valid ensemble found (nfl_ensemble_v* directory required).\n\n"
        "To fix:\n"
        "1. Train ensemble: python scripts/train_ensemble.py\n"
        "2. Upload ensemble directory to Kaggle as a dataset\n"
        "3. Attach the dataset to this notebook\n"
        "4. Re-run the notebook"
    )

if not (features_path := _find_features_module(ensemble_root=ensemble_dir)):
    raise FileNotFoundError(
        "\nfeatures.py not found in any attached dataset.\n"
        "Solutions:\n"
        "  1. Upload features.py as a separate dataset and attach it, OR\n"
        "  2. Include features.py in the ensemble dataset package"
    )

print(f"\n✓ Found features at: {features_path}")
print(f"\n✓ Found ensemble at: {ensemble_dir}")

# predict() reads both locations back from the environment, so only plain
# strings are shared with it rather than objects held in global scope.
os.environ.update({
    'NFL_ENSEMBLE_DIR': str(ensemble_dir),
    'NFL_FEATURES_PATH': str(features_path),
})

print("✓ Stored paths in environment")
print("=" * 60 + "\n")


def predict(test: pl.DataFrame, test_input: pl.DataFrame) -> pl.DataFrame | pd.DataFrame:
    """
    Inference function used by the NFL evaluation gateway.

    CRITICAL: This function is called IN THE SERVER PROCESS via gRPC.
    We read paths from environment variables and load models fresh here.
    Models are loaded using native formats (LightGBM .txt, XGBoost .json)
    to completely avoid pickle serialization issues.

    Args:
        test: Rows to predict; exactly one output row is produced per row.
        test_input: Additional input rows, left-joined onto ``test`` on
            whichever of game_id / play_id / nfl_id / frame_id both frames
            share. Ignored when they share none of those columns.

    Returns:
        pandas DataFrame with columns 'x' and 'y': the weighted average of
        the LightGBM and XGBoost predictions, using the weights stored in
        the ensemble's metadata.json.

    Raises:
        RuntimeError: If NFL_ENSEMBLE_DIR / NFL_FEATURES_PATH are not set, or
            if the number of predictions differs from the number of rows in
            ``test``.
    """
    # Read paths from environment (set in notebook process)
    ensemble_dir_str = os.environ.get('NFL_ENSEMBLE_DIR')
    features_path_str = os.environ.get('NFL_FEATURES_PATH')

    if not ensemble_dir_str or not features_path_str:
        raise RuntimeError("Ensemble/features paths not found in environment")

    ensemble_dir = Path(ensemble_dir_str)

    # Add features location to path so the project's features.py is importable
    features_parent = str(Path(features_path_str).parent)
    if features_parent not in sys.path:
        sys.path.insert(0, features_parent)

    from features import add_time_lag_features, prepare_features, transform_for_inference

    # Load metadata (explicit encoding: do not depend on the platform default)
    with open(ensemble_dir / 'metadata.json', 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    feat_cols = metadata['feature_columns']
    player_pos_vals = metadata.get('player_position_values')
    weights = metadata['ensemble_weights']
    model_files = metadata['model_files']

    # Load LightGBM models (native .txt format - NO PICKLE)
    lgb_x = lgb.Booster(model_file=str(ensemble_dir / model_files['lgb_x']))
    lgb_y = lgb.Booster(model_file=str(ensemble_dir / model_files['lgb_y']))

    # Load XGBoost models (native .json format - NO PICKLE)
    xgb_x = xgb.Booster()
    xgb_x.load_model(str(ensemble_dir / model_files['xgb_x']))
    xgb_y = xgb.Booster()
    xgb_y.load_model(str(ensemble_dir / model_files['xgb_y']))

    # Convert inputs to pandas for feature pipeline
    test_pd = _to_pandas(test)
    test_in_pd = _to_pandas(test_input)

    # Merge like training: left join on identifiers if available
    key_cols = [c for c in ['game_id', 'play_id', 'nfl_id', 'frame_id']
                if c in test_pd.columns and c in test_in_pd.columns]
    if key_cols:
        df = pd.merge(test_pd, test_in_pd, on=key_cols, how='left', suffixes=(None, '_in'))
    else:
        df = test_pd.copy()

    # Feature engineering for inference
    df = add_time_lag_features(df)
    # NOTE(review): the return value is discarded; presumably this call is
    # kept for its effect on `df` - confirm against features.py.
    prepare_features(df)
    X_pred = transform_for_inference(df, feat_cols, player_pos_vals)

    # Both XGBoost boosters score the same matrix, so build the DMatrix once.
    xgb_matrix = xgb.DMatrix(X_pred)
    w_lgb = weights['lightgbm']
    w_xgb = weights['xgboost']

    # Predict with ensemble (weighted average)
    px = w_lgb * lgb_x.predict(X_pred) + w_xgb * xgb_x.predict(xgb_matrix)
    py = w_lgb * lgb_y.predict(X_pred) + w_xgb * xgb_y.predict(xgb_matrix)

    predictions = pd.DataFrame({'x': px, 'y': py})

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(predictions) != len(test_pd):
        raise RuntimeError(
            f"Prediction count {len(predictions)} does not match the "
            f"{len(test_pd)} rows in `test`; the left join with `test_input` "
            f"on {key_cols} may have duplicated rows."
        )
    return predictions


# Wrap predict() in the evaluation server, then either serve the hidden test
# set (competition rerun) or drive it through the local gateway.
inference_server = nfl_inf.NFLInferenceServer(predict)

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Outside a rerun the local gateway needs the path to the published
    # public competition files.
    inference_server.run_local_gateway(('/kaggle/input/nfl-big-data-bowl-2026-prediction/',))
else:
    inference_server.serve()
