In [1]:
import fastkaggle
import polars as pl

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler, FunctionTransformer, SplineTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_validate

In [2]:
comp = "playground-series-s3e19"
path = fastkaggle.setup_comp(comp)

In [3]:
train_df = pl.read_csv(path/"train.csv", try_parse_dates=True)

In [4]:
train_df.head(4)

id,date,country,store,product,num_sold
i64,date,str,str,str,i64
0,2017-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",63
1,2017-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",66
2,2017-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",9
3,2017-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",59


In [5]:
def get_ordinal(df):
    return (
        df.with_columns(pl.col(pl.Date).dt.ordinal_day())
    )

get_ordinal(train_df).head(4)

id,date,country,store,product,num_sold
i64,i16,str,str,str,i64
0,1,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",63
1,1,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",66
2,1,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",9
3,1,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",59


In [6]:
# define processors for various types
time_processor = make_pipeline(
    FunctionTransformer(get_ordinal),
    SplineTransformer(n_knots=6, extrapolation="periodic"),
)

cat_processor = OneHotEncoder(sparse_output=False)

num_processor = StandardScaler()

# process all in parallel
processor = make_column_transformer(
    (cat_processor, ["country","store","product"]),
    (time_processor,["date"]),
    verbose_feature_names_out=False,
).set_output(transform='polars')

processor.fit_transform(train_df).head(4)

country_Argentina,country_Canada,country_Estonia,country_Japan,country_Spain,store_Kagglazon,store_Kaggle Learn,store_Kaggle Store,product_Using LLMs to Improve Your Coding,product_Using LLMs to Train More LLMs,product_Using LLMs to Win Friends and Influence People,product_Using LLMs to Win More Kaggle Competitions,product_Using LLMs to Write Better,date_sp_0,date_sp_1,date_sp_2,date_sp_3,date_sp_4
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.166667,0.666667,0.166667,0.0,0.0
1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.166667,0.666667,0.166667,0.0,0.0
1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.166667,0.666667,0.166667,0.0,0.0
1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.166667,0.666667,0.166667,0.0,0.0


In [7]:
pipe = make_pipeline(processor, HistGradientBoostingRegressor())
cv = TimeSeriesSplit()
X, y = train_df.drop("num_sold"), train_df.get_column("num_sold").to_numpy()

In [8]:
cross_validate(pipe, X, y, cv=cv, scoring="neg_mean_absolute_error")

{'fit_time': array([0.39704061, 0.52769208, 0.66638875, 0.82450032, 0.93134308]),
 'score_time': array([0.05116963, 0.05499387, 0.0515027 , 0.06411457, 0.05736375]),
 'test_score': array([-23.31735914, -17.0018814 , -21.03593693, -21.58537922,
        -22.77693762])}

In [9]:
pipe.fit(X,y)

In [10]:
X_test = pl.read_csv(path/"test.csv", try_parse_dates=True)
X_test.head(4)

id,date,country,store,product
i64,date,str,str,str
136950,2022-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …"
136951,2022-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …"
136952,2022-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …"
136953,2022-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …"


In [11]:
y_pred = pipe.predict(X_test)

In [12]:
ss = pl.read_csv(path/"sample_submission.csv")
ss.head(4)

id,num_sold
i64,i64
136950,100
136951,100
136952,100
136953,100


In [13]:
ss = ss.with_columns(num_sold=y_pred)
ss.write_csv(path/"subm.csv")
ss.head(4)

id,num_sold
i64,f64
136950,39.775445
136951,39.142544
136952,7.952741
136953,40.856892


In [14]:
# from kaggle import api
# api.competition_submit_cli(path/'subm.csv', 'ordinal-day-boost-tree5', comp)