In [1]:
import polars as pl
import numpy as np
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [None]:

class ModelPreprocessor:
    """
    Build the training table from the tracking CSVs and fit an autoencoder over it.

    Workflow:
        1. ``preprocess(input_glob, output_glob)`` stacks the input frames
           (players flagged ``player_to_predict``) on top of the output frames for
           each (game, play, player) track and stores the result in ``train_data``.
        2. ``impute()`` encodes ``train_data`` into a feature matrix, trains a small
           autoencoder on it and splits ``train_data`` into the rows that came with
           motion features and the rows whose motion features were zero-filled.

    Attributes set by ``preprocess``:
        input_data_path, output_data_path: the paths/globs that were read.
        train_data: ``polars.DataFrame`` with one row per track frame.

    Attributes set by ``impute``:
        scaler: the fitted ``StandardScaler`` for the numeric columns.
        autoimputer: the trained Keras autoencoder.
        train_untouched: rows ranked before the last ``num_frames_output`` frames of a track.
        train_impute: the last ``num_frames_output`` rows of each track.
    """

    # Columns identifying one player's track within one play.
    _GROUP_KEYS = ("game_id", "play_id", "nfl_id")

    # Columns that are carried forward from the earlier rows of a track onto the
    # rows where they are null after the diagonal concat.
    _STATIC_COLUMNS = (
        "player_to_predict", "play_direction", "absolute_yardline_number",
        "player_name", "player_height", "player_weight", "player_birth_date",
        "player_position", "player_side", "player_role", "num_frames_output",
        "ball_land_x", "ball_land_y",
    )

    # Motion features whose nulls are replaced by 0.0 in preprocess().
    _MOTION_COLUMNS = ("a", "s", "dir", "o")

    # String columns that are one-hot encoded in impute().
    _CATEGORICAL_COLUMNS = ("play_direction", "player_position", "player_side", "player_role")

    # Numeric columns that are standardised in impute().
    _NUMERIC_COLUMNS = (
        "frame_id", "absolute_yardline_number", "player_height", "player_weight",
        "x", "y", "s", "a", "dir", "o", "num_frames_output",
        "ball_land_x", "ball_land_y", "global_frame_id", "current_player_age",
    )

    def __init__(self):
        # Everything starts as None so that calling the steps out of order can be
        # detected and reported clearly.
        self.input_data_path = None
        self.output_data_path = None
        self.train_data = None
        self.scaler = None
        self.autoimputer = None
        self.train_untouched = None
        self.train_impute = None

    def preprocess(self, input_data_path: str, output_data_path: str):
        """
        Read the input/output CSVs and build ``self.train_data``.

        Args:
            input_data_path: path or glob passed to ``pl.scan_csv`` for the input frames.
            output_data_path: path or glob passed to ``pl.scan_csv`` for the output frames.

        The two sources are concatenated diagonally (columns missing on one side
        become null) and sorted so that, within each track, input rows
        (``sort_flag`` 0) precede output rows (``sort_flag`` 1).
        """
        self.input_data_path = input_data_path
        self.output_data_path = output_data_path
        keys = list(self._GROUP_KEYS)

        train_input = (
            pl.scan_csv(self.input_data_path)
            .filter(pl.col("player_to_predict") == True)
            .with_columns(pl.lit(0).alias("sort_flag"))
        )
        train_output = pl.scan_csv(self.output_data_path).with_columns(
            pl.lit(1).alias("sort_flag")
        )
        combined = pl.concat([train_input, train_output], how="diagonal").sort(
            keys + ["sort_flag", "frame_id"]
        )

        # The first 8 characters of game_id are parsed as the game date (YYYYMMDD).
        game_date = (
            pl.col("game_id").cast(dtype=pl.Utf8).str.slice(0, 8)
            .str.strptime(dtype=pl.Date, format="%Y%m%d")
        )
        birth_date = pl.col("player_birth_date").str.strptime(dtype=pl.Date, format="%Y-%m-%d")

        # Height is parsed as one digit of feet, a separator character, then the
        # inches (everything from the third character on), and converted to inches.
        feet = pl.col("player_height").str.slice(0, 1).cast(pl.Int32)
        inches = pl.col("player_height").str.slice(2).cast(pl.Int32)

        self.train_data = (
            combined.with_columns(
                # forward_fill() must come BEFORE over(): `col.over(keys)` on its own
                # returns the column unchanged, so filling after it would run across
                # the whole frame and could leak values between tracks.
                *[pl.col(name).forward_fill().over(keys) for name in self._STATIC_COLUMNS],
                # 1-based position of the row inside its track.
                pl.col("play_id").cum_count().over(keys).alias("global_frame_id"),
                *[pl.col(name).fill_null(0.0) for name in self._MOTION_COLUMNS],
                # Expressions in one with_columns() all read the incoming frame, so
                # this uses the not-yet-filled birth date and is null wherever the
                # birth date is null; it is filled in the next step.
                ((game_date - birth_date).dt.total_days() / 365.25).alias("current_player_age"),
            )
            .with_columns(
                (feet * 12 + inches).alias("player_height"),
                pl.col("current_player_age").forward_fill().over(keys),
            )
            .drop(["player_birth_date", "player_name", "player_to_predict", "sort_flag"])
            .collect()
        )

    def impute(self, epochs: int = 20, batch_size: int = 2048):
        """
        Train the autoencoder on ``train_data`` and split the rows by imputation need.

        Args:
            epochs: number of training epochs (default 20).
            batch_size: training batch size (default 2048).

        Raises:
            RuntimeError: if ``preprocess`` has not been run yet.

        Results are stored on the instance (``scaler``, ``autoimputer``,
        ``train_untouched``, ``train_impute``); nothing is returned.
        """
        if self.train_data is None:
            raise RuntimeError("No training data available; call preprocess() before impute().")

        print("Creating autoimputer...")
        frame = self.train_data
        keys = list(self._GROUP_KEYS)
        embedding_dim = 8

        def one_hot(column):
            """One-hot encode a string column, matching each value verbatim."""
            values = frame[column]
            # unique() has no guaranteed order; sorting keeps the feature layout
            # identical from run to run.
            vocabulary = values.unique().drop_nulls().sort().to_list()
            # StringLookup compares whole strings as-is. TextVectorization would
            # lower-case and whitespace-split the inputs by default while using the
            # supplied vocabulary verbatim, so values containing upper-case letters
            # or spaces could never match and would all fall into the OOV slot.
            encoder = layers.StringLookup(vocabulary=vocabulary, output_mode="one_hot")
            # Nulls become "" which is not in the vocabulary and so maps to OOV.
            encoded = encoder(tf.constant(values.fill_null("").to_list()))
            return tf.cast(encoded, tf.float32)

        player_ids = frame["nfl_id"].cast(pl.Utf8)
        nfl_id_vectorizer = layers.TextVectorization(output_mode="int")
        nfl_id_vectorizer.adapt(player_ids.to_numpy())
        nfl_id_embedder = layers.Embedding(
            # Includes the reserved padding ('') and unknown ('[UNK]') entries.
            input_dim=nfl_id_vectorizer.vocabulary_size(),
            output_dim=embedding_dim,
        )
        vectorized_player_ids = nfl_id_vectorizer(player_ids.to_list())
        # NOTE(review): the embedding is applied here, outside the model that is
        # fitted below, so its weights keep their initial values - confirm intended.
        player_embedding = layers.Reshape((embedding_dim,))(nfl_id_embedder(vectorized_player_ids))

        encoded_fields = layers.Concatenate(axis=-1)(
            [one_hot(column) for column in self._CATEGORICAL_COLUMNS]
            + [tf.cast(player_embedding, tf.float32)]
        )

        scaler = StandardScaler()
        standardized_fields = scaler.fit_transform(
            frame.select(list(self._NUMERIC_COLUMNS)).to_numpy()
        )
        standardized_fields = tf.convert_to_tensor(standardized_fields, dtype=tf.float32)

        imputer_train_data = layers.Concatenate(axis=1)([
            encoded_fields,
            standardized_fields
        ])

        class AutoImputer(tf.keras.Model):
            """Dense autoencoder: input -> 64 -> 32 -> 64 -> input."""

            def __init__(self, input_dim):
                super().__init__()
                self.encoder = tf.keras.Sequential([
                    layers.Dense(64, activation='relu'),
                    layers.Dense(32, activation='relu')
                ])
                self.decoder = tf.keras.Sequential([
                    layers.Dense(64, activation='relu'),
                    layers.Dense(input_dim, activation='linear')
                ])

            def call(self, inputs):
                return self.decoder(self.encoder(inputs))

        autoimp = AutoImputer(input_dim=imputer_train_data.shape[1])
        autoimp.compile(optimizer='adam', loss='mse')
        print("Training autoimputer...")
        # The model is trained to reconstruct its own input (input == target).
        autoimp.fit(imputer_train_data, imputer_train_data, epochs=epochs, batch_size=batch_size)

        # Split into 2 datasets: one needing a, s, dir, and o imputed...and another
        # needing no imputation. new_rank counts rows from the END of each track
        # (last row == 1), so the last `num_frames_output` rows are the ones to impute.
        ranked = frame.with_columns(
            pl.col("play_id").cum_count(reverse=True).over(keys).alias("new_rank")
        )
        self.train_untouched = ranked.filter(pl.col("num_frames_output") < pl.col("new_rank"))
        self.train_impute = ranked.filter(pl.col("num_frames_output") >= pl.col("new_rank"))

        # Keep the fitted objects so later steps can reuse them.
        self.scaler = scaler
        self.autoimputer = autoimp

    def train_test_split(self, train_pct: float):
        """Placeholder for the train/test split; not implemented yet, returns None."""
        pass

In [6]:
# Glob patterns for the per-week tracking inputs and their matching outputs.
input_glob = './nfl-big-data-bowl-2026-prediction/train/input_*.csv'
output_glob = './nfl-big-data-bowl-2026-prediction/train/output_*.csv'

# Build the training table, then fit the autoencoder on it.
preprocessor = ModelPreprocessor()
preprocessor.preprocess(
    input_data_path = input_glob,
    output_data_path = output_glob,
)
preprocessor.impute()

Creating autoimputer...
Training autoimputer...
Epoch 1/20
[1m912/912[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.0258
Epoch 2/20
[1m912/912[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 1.9591e-04
Epoch 3/20
[1m912/912[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 1.2414e-04
Epoch 4/20
[1m912/912[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 9.1755e-05
Epoch 5/20
[1m912/912[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 7.0404e-05
Epoch 6/20
[1m912/912[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 5.6454e-05
Epoch 7/20
[1m912/912[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 4.8449e-05
Epoch 8/20
[1m912/912[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 4.1079e-05
Epoch 9/20
[1m912/912[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 3.7704e-05
Epoch 10/20

In [19]:
# Keep only the last `num_frames_output` rows of every (game, play, player) track:
# new_rank counts rows from the end of the track, so the final row has rank 1.
data = (
    preprocessor.train_data
    .with_columns(
        new_rank = pl.col("play_id").cum_count(reverse = True).over("game_id", "play_id", "nfl_id")
    )
    .filter(pl.col("new_rank") <= pl.col("num_frames_output"))
)

<tf.Tensor: shape=(1866376, 8), dtype=float32, numpy=
array([[-0.01560276, -0.02569612,  0.03832675, ...,  0.03581517,
        -0.02562199, -0.01934985],
       [-0.01560276, -0.02569612,  0.03832675, ...,  0.03581517,
        -0.02562199, -0.01934985],
       [-0.01560276, -0.02569612,  0.03832675, ...,  0.03581517,
        -0.02562199, -0.01934985],
       ...,
       [-0.04462384,  0.03885417,  0.03869602, ..., -0.02074506,
         0.02701716, -0.0063647 ],
       [-0.04462384,  0.03885417,  0.03869602, ..., -0.02074506,
         0.02701716, -0.0063647 ],
       [-0.04462384,  0.03885417,  0.03869602, ..., -0.02074506,
         0.02701716, -0.0063647 ]], dtype=float32)>