In [1]:
import numpy as np
import polars as pl
from pandas.core.common import random_state
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, MaxAbsScaler

In [2]:
import base64
from IPython.display import Image, display


def mm(graph):
    """Display a Mermaid diagram inline, rendered by the mermaid.ink service.

    Parameters
    ----------
    graph : str
        Mermaid diagram source. Non-ASCII characters are allowed.

    Notes
    -----
    The diagram source is embedded (base64-encoded) in a
    ``https://mermaid.ink/img/...`` URL, so the text is sent to that
    third-party host when the image is loaded.
    """
    # UTF-8 instead of ASCII so labels with accents/symbols don't raise
    # UnicodeEncodeError.
    graph_bytes = graph.encode("utf-8")
    # URL-safe alphabet: plain base64 can emit "+" and "/", and a "/" inside
    # the payload would be read as a path separator in the URL.
    encoded = base64.urlsafe_b64encode(graph_bytes).decode("ascii")
    url = "https://mermaid.ink/img/" + encoded

    display(Image(url=url, width=800, height=600))


In [3]:
# Overview of the preprocessing flow built in the cells below:
# categorical columns are ordinal-encoded; numerical columns go through
# variance filtering, imputation and scaling.
mm("""
graph LR

    PAR[classification.parquet]
    CAT(categorical columns)
    NUM(numerical columns)

    VAR[exclude low variance columns]
    IMP[impute missing values]
    OE[OrdinalEncoder]
    SCALE[MaxAbsScaler]

    PAR .-> CAT
    PAR .-> NUM

    subgraph cat_pipeline
        CAT --> OE
    end

    subgraph num_pipeline
        NUM --> VAR
        VAR --> IMP
        IMP --> SCALE
    end

    subgraph prepared_data

    end

    OE .-> prepared_data
    SCALE .-> prepared_data
""")

In [4]:
# Raw table as stored on disk; kept untouched in `df`.
df = pl.read_parquet('/Users/vajk/Desktop/GitHub/Big-data-Ball-2025/data/classification.parquet')

# Working copy: Boolean columns become String first, then every String
# column (including the former Booleans) has its nulls replaced by '0'.
# The order matters, so the two steps stay as separate `with_columns` calls.
pipe_df = (
    df
    .with_columns(pl.col(pl.Boolean).cast(pl.String))
    .with_columns(pl.col(pl.String).fill_null('0'))
)


In [5]:
# Separate features and target without mutating `pipe_df`: the previous
# `drop_in_place` removed 'NextPlay' from the frame, so re-running this cell
# failed on the missing column.
X_ = pipe_df.drop('NextPlay')
y_ = pipe_df.get_column('NextPlay')

# Bool -> String cast and String null-fill were applied when `pipe_df` was
# built; String encoding is still to come and is fitted on the training split.
X_train, X_test, y_train, y_test = train_test_split(
    X_, y_, test_size=0.2, random_state=42, stratify=y_
)

In [6]:
# Ordinal-encode only the String columns of the training split and get the
# result back as a polars frame.
oe = OrdinalEncoder()
oe.set_output(transform='polars')
oe_df = oe.fit_transform(X_train.select(pl.col(pl.String)))

In [7]:
# Swap the original String columns for their encoded versions: drop them by
# name, then append the encoded frame on the right.
X_train_encoded = (
    X_train
    .drop(oe_df.columns)
    .hstack(oe_df)
)

In [8]:
# Keep only the columns whose variance exceeds 0.8.
selector = VarianceThreshold(threshold=0.8).set_output(transform='polars')
selector.fit(X_train_encoded)
variance_df = selector.transform(X_train_encoded)

# Re-select the surviving columns from the encoded frame by name.
X_train_var0 = X_train_encoded.select(variance_df.columns)

In [9]:
# Per-column variance of the encoded training frame (one row, one value per
# column) — presumably shown to compare against the 0.8 cut-off used by the
# VarianceThreshold in the previous cell.
X_train_encoded.var()

gameId,playId,quarter,down,yardsToGo,yardlineNumber,preSnapHomeScore,preSnapVisitorScore,absoluteYardlineNumber,preSnapHomeTeamWinProbability,preSnapVisitorTeamWinProbability,expectedPoints,playClockAtSnap,passLength,targetX,targetY,dropbackDistance,timeToThrow,timeInTackleBox,timeToSack,qbKneel,penaltyYards,prePenaltyYardsGained,yardsGained,homeTeamWinProbabilityAdded,visitorTeamWinProbilityAdded,expectedPointsAdded,pff_runPassOption,epa,SeriesFirstDown,Yards,IsRush,IsPass,IsIncomplete,IsTouchdown,IsSack,IsChallenge,…,rain_sum,snowfall_sum,precipitation_hours,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,Closed,possessionTeam,defensiveTeam,yardlineSide,gameClock,playNullifiedByPenalty,offenseFormation,receiverAlignment,passResult,playAction,dropbackType,passLocationType,passTippedAtLine,unblockedPressure,qbSpike,qbSneak,rushLocationType,isDropback,pff_runConceptPrimary,pff_runConceptSecondary,pff_passCoverage,pff_manZone,Formation,PlayType,PassType,Challenger,PenaltyTeam,PenaltyType,homeTeamAbbr,visitorTeamAbbr,City
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
36013000.0,1397600.0,1.289683,0.683849,15.267079,160.201144,89.452717,85.300959,592.655137,0.084197,0.084197,2.728402,39.458964,100.904074,681.411281,227.276609,3.724776,1.016142,0.668734,1.77335,0.010035,78.188823,74.750862,78.58439,0.001658,0.001658,1.966845,0.091656,4.674644,0.206114,78.697387,0.241531,0.248405,0.147402,0.037175,0.036745,0.003405,…,44.800452,0.0,23.577329,61.677474,156.795181,9907.158365,0.233686,85.396324,86.846347,88.89202,73496.252575,0.0,2.4071,2.252378,1.468137,0.142287,14.022387,0.569892,0.270448,0.312755,0.246654,0.259867,2.006306,0.239291,13.531625,120.824492,20.674337,0.827336,0.47146,1.349658,8.802883,0.0,8.434968,7.634275,85.275961,86.88728,77.759838


The Pipeline so far

In [10]:
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
import numpy as np


class PolarsTransformer(BaseEstimator, TransformerMixin):
    """Stateless dtype clean-up for a polars frame.

    ``transform`` casts Boolean columns to String and fills nulls in String
    columns with ``'0'``. ``cat_columns`` / ``num_columns`` report which
    columns are String / non-String after that clean-up.
    """

    def __init__(self, df=None):
        # Stored as-is; none of the methods below read it.
        self.df = df

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        The transformer learns nothing from the data. The method exists
        because ``TransformerMixin.fit_transform`` and scikit-learn pipelines
        call ``fit`` before ``transform``; without it they raise
        ``AttributeError``.
        """
        return self

    def transform(self, X: pl.DataFrame, y=None) -> pl.DataFrame:
        """Return a copy of ``X`` with Booleans as Strings and String nulls filled.

        Parameters
        ----------
        X : pl.DataFrame
            Input frame; it is not modified.
        y : ignored

        Returns
        -------
        pl.DataFrame
            Frame in which Boolean columns are cast to String and every
            String column (including the former Booleans) has nulls replaced
            by ``'0'``.
        """
        # Cast first so the null-fill below also covers the former Booleans.
        pipe_df = X.with_columns(
            pl.col(pl.Boolean).cast(pl.String)
        )
        pipe_df = pipe_df.with_columns(pl.col(pl.String).fill_null('0'))

        return pipe_df

    def cat_columns(self, X: pl.DataFrame) -> list:
        """Names of the columns that are String after ``transform(X)``."""
        cat_df = self.transform(X)
        return cat_df.select(pl.col(pl.String)).columns

    def num_columns(self, X: pl.DataFrame) -> list:
        """Names of the columns that are not String after ``transform(X)``."""
        num_df = self.transform(X)
        return num_df.select(pl.all().exclude(pl.String)).columns


# Column groups are taken from the full `df` after the Boolean -> String cast.
# NOTE(review): on a fresh top-to-bottom run `df` still contains 'NextPlay'
# here (it is only removed from `df` in a later cell), so the target would be
# one of these columns — confirm whether that is intended.
cat_attribs = PolarsTransformer().cat_columns(df)
num_attribs = PolarsTransformer().num_columns(df)

# Categorical branch: categories encoded as int64 codes.
cat_pipeline = make_pipeline(
    OrdinalEncoder(dtype=np.int64).set_output(transform='polars'),
)

# Numerical branch: variance filter -> median imputation -> max-abs scaling.
num_pipeline = make_pipeline(
    VarianceThreshold(threshold=0.8).set_output(transform='polars'),
    SimpleImputer(strategy="median").set_output(transform='polars'),
    MaxAbsScaler().set_output(transform='polars'),
)

# Route each column group through its branch; output stays a polars frame.
preprocessing = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ]
).set_output(transform='polars')

In [11]:
def cat_pipiline(df: pl.DataFrame, variance_threshold: float = 0.8) -> pl.DataFrame:
    """Clean, encode, filter, impute and scale ``df`` in one call.

    Despite the name, both branches are built here: String columns are
    ordinal-encoded, and all other columns go through variance filtering,
    median imputation and max-abs scaling.

    Parameters
    ----------
    df : pl.DataFrame
        Feature frame. It is first passed through ``PolarsTransformer`` so
        Boolean columns are treated as String (categorical) columns.
    variance_threshold : float, default 0.8
        Threshold passed to ``VarianceThreshold`` in the numerical branch.

    Returns
    -------
    pl.DataFrame
        The fitted-and-transformed frame with every column's dtype shrunk
        via ``shrink_dtype``.

    Notes
    -----
    The ``ColumnTransformer`` is fitted inside this function and not
    returned, so the fitted preprocessing cannot be reused on another frame
    from here.
    """
    # Run the dtype clean-up once and reuse it for column discovery and
    # fitting; previously the whole frame was transformed three times.
    cleaned = PolarsTransformer().transform(df)
    cat_attribs = cleaned.select(pl.col(pl.String)).columns
    num_attribs = cleaned.select(pl.all().exclude(pl.String)).columns

    cat_pipeline = make_pipeline(
        OrdinalEncoder(dtype=np.int64).set_output(transform='polars')
    )

    num_pipeline = make_pipeline(
        VarianceThreshold(threshold=variance_threshold).set_output(transform='polars'),
        SimpleImputer(strategy="median").set_output(transform='polars'),
        MaxAbsScaler().set_output(transform='polars')
    )

    preprocessing = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ]).set_output(transform='polars')

    df_prepared = preprocessing.fit_transform(cleaned).select(pl.all().shrink_dtype())

    return df_prepared

In [12]:


# Separate features and target without mutating `df`: the previous
# `drop_in_place` removed 'NextPlay' from the raw frame, so re-running this
# cell failed and earlier cells that read `df` saw different columns on
# re-execution.
X_ = df.drop('NextPlay')
y_ = df.get_column('NextPlay')

# NOTE(review): unlike the earlier split on `pipe_df`, this one is not
# stratified on the target — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_, y_, test_size=0.2, random_state=42
)

In [13]:
# Fit the full preprocessing on the training split and keep the prepared frame.
df_ = cat_pipiline(X_train)