In [1]:
import fastkaggle
import polars as pl
import holidays

from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, SplineTransformer
from sklearn.model_selection import TimeSeriesSplit, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.ensemble import HistGradientBoostingRegressor

In [2]:
comp = "playground-series-s3e19"
path = fastkaggle.setup_comp(comp)

In [3]:
train_df = pl.read_csv(path/"train.csv", try_parse_dates=True)

In [4]:
train_df.head(4)

id,date,country,store,product,num_sold
i64,date,str,str,str,i64
0,2017-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",63
1,2017-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",66
2,2017-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",9
3,2017-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",59


In [5]:
def get_holiday(df)->dict[str,dict]:
    countries = df.get_column("country").unique()
    year_range = df.get_column("date").dt.year().unique()
    country_holiday = {}
    for c in countries:
        country_holiday[c]= set(holidays.country_holidays(c,years=year_range))
    return country_holiday

holiday_map = get_holiday(train_df)

In [6]:
def check_holiday(df):
    return df.with_columns(
        holiday = pl.struct(["date","country"]).map_elements(lambda row: row["date"] in holiday_map[row["country"]])
    )

check_holiday(train_df).head(3)

id,date,country,store,product,num_sold,holiday
i64,date,str,str,str,i64,bool
0,2017-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",63,True
1,2017-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",66,True
2,2017-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",9,True


In [7]:
def get_ordinal(df):
    return (
        df.with_columns(pl.col(pl.Date).dt.ordinal_day())
    )

get_ordinal(train_df).head(4)

id,date,country,store,product,num_sold
i64,i16,str,str,str,i64
0,1,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",63
1,1,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",66
2,1,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",9
3,1,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",59


In [8]:
add_holiday = FunctionTransformer(check_holiday)

cat_processor = OneHotEncoder(sparse_output=False, drop="if_binary")

date_processor = make_pipeline(
    FunctionTransformer(get_ordinal),
    SplineTransformer(n_knots=6, extrapolation="periodic")
)

col_transformer = make_column_transformer(
    (cat_processor, ["country","store","product","holiday"]), # holiday feat generated in add_holiday step
    (date_processor, ["date"]),
    verbose_feature_names_out=False,
)

processor = make_pipeline(add_holiday, col_transformer).set_output(transform="polars")

processor

In [9]:
processor.fit_transform(train_df).head(2)

country_Argentina,country_Canada,country_Estonia,country_Japan,country_Spain,store_Kagglazon,store_Kaggle Learn,store_Kaggle Store,product_Using LLMs to Improve Your Coding,product_Using LLMs to Train More LLMs,product_Using LLMs to Win Friends and Influence People,product_Using LLMs to Win More Kaggle Competitions,product_Using LLMs to Write Better,holiday_True,date_sp_0,date_sp_1,date_sp_2,date_sp_3,date_sp_4
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.166667,0.666667,0.166667,0.0,0.0
1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.166667,0.666667,0.166667,0.0,0.0


In [10]:
mod_pipe= make_pipeline(processor, HistGradientBoostingRegressor())
cv = TimeSeriesSplit()
X, y = train_df.drop("num_sold"), train_df.get_column("num_sold").to_numpy()

In [11]:
cross_validate(mod_pipe, X, y, cv=cv, scoring="neg_mean_absolute_error")

{'fit_time': array([0.55826235, 0.63754964, 0.74954653, 1.04708934, 1.05370259]),
 'score_time': array([0.09883761, 0.08386254, 0.0686202 , 0.14131069, 0.14245558]),
 'test_score': array([-23.29121455, -17.06843877, -21.03071491, -21.60931814,
        -22.80976669])}

In [12]:
mod_pipe.fit(X,y)

In [13]:
X_test = pl.read_csv(path/"test.csv", try_parse_dates=True)
X_test.head(4)

id,date,country,store,product
i64,date,str,str,str
136950,2022-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …"
136951,2022-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …"
136952,2022-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …"
136953,2022-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …"


In [14]:
y_pred = mod_pipe.predict(X_test)

In [15]:
ss = pl.read_csv(path/"sample_submission.csv")
ss.head(4)

id,num_sold
i64,i64
136950,100
136951,100
136952,100
136953,100


In [16]:
ss = ss.with_columns(num_sold=y_pred)
ss.write_csv(path/"subm.csv")
ss.head(4)

id,num_sold
i64,f64
136950,39.453135
136951,41.645442
136952,5.0211
136953,38.715653


In [17]:
# from kaggle import api
# api.competition_submit_cli(path/'subm.csv', 'holiday-ordinal-day-boost-tree2', comp)