In [97]:
import fastkaggle
import polars as pl

In [98]:
# Kaggle competition slug; reused later when submitting predictions.
comp = "playground-series-s3e19"
# Directory the notebook reads train.csv / test.csv / sample_submission.csv from.
# NOTE(review): presumably setup_comp also downloads the data when it is missing — confirm against fastkaggle docs.
path = fastkaggle.setup_comp(comp)

In [99]:
# Eager load of the training data, used only for inspection in this notebook;
# the modelling inputs are built separately by create_ordinal_date.
train_df = pl.read_csv(path/"train.csv")

In [100]:
# Peek at the first four rows to check the parsed columns and dtypes (date is read as a string).
train_df.head(4)

id,date,country,store,product,num_sold
i64,str,str,str,str,i64
0,"""2017-01-01""","""Argentina""","""Kaggle Learn""","""Using LLMs to …",63
1,"""2017-01-01""","""Argentina""","""Kaggle Learn""","""Using LLMs to …",66
2,"""2017-01-01""","""Argentina""","""Kaggle Learn""","""Using LLMs to …",9
3,"""2017-01-01""","""Argentina""","""Kaggle Learn""","""Using LLMs to …",59


In [101]:
def create_ordinal_date(path, train=True):
    """Build the model inputs (and optionally the target) from a competition CSV.

    The CSV is scanned lazily, ``date`` is parsed and reduced to its ordinal
    day, and ``country``/``store``/``product`` are cast to ``pl.Categorical``.

    Note: per the polars docs ``dt.ordinal_day()`` is the day *within the
    year* (starting at 1), not a running day count, so the same calendar day
    in different years maps to the same ``ordinal_date`` value.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV. It must contain ``date``, ``country``, ``store``
        and ``product`` columns, plus ``num_sold`` when ``train`` is true.
    train : bool, default True
        When true, also return the ``num_sold`` target.

    Returns
    -------
    X : pandas.DataFrame
        Columns ``country``, ``store``, ``product``, ``ordinal_date``.
    y : numpy.ndarray
        1-D array of ``num_sold``; only returned when ``train`` is true.
    """
    feature_cols = ["country", "store", "product", "ordinal_date"]
    q = pl.scan_csv(path).with_columns(
        pl.col("date").str.to_date().dt.ordinal_day().alias("ordinal_date"),
        pl.col("country", "store", "product").cast(pl.Categorical),
    )

    if not train:
        return q.select(feature_cols).collect().to_pandas()

    # Materialise features and target with a single collect(): the CSV is
    # scanned and transformed once, and X and y are guaranteed to come from
    # the same in-memory frame (same rows, same order).
    df = q.select(feature_cols + ["num_sold"]).collect()
    X = df.select(feature_cols).to_pandas()
    # DataFrame.to_numpy() is 2-D (n, 1); flatten to the 1-D target sklearn expects.
    y = df.select("num_sold").to_numpy().flatten()
    return X, y

In [102]:
# Training features (pandas DataFrame) and num_sold target (1-D numpy array).
X, y = create_ordinal_date(path/"train.csv")

In [103]:
# Sanity check: features and target must have the same number of rows; also show the target dtype.
X.shape, y.shape, y.dtype

((136950, 4), (136950,), dtype('int64'))

In [104]:
from sklearn.model_selection import TimeSeriesSplit

In [105]:
# Preview the default TimeSeriesSplit folds by printing the shape of each
# train/test index array.
for train_idx, test_idx in TimeSeriesSplit().split(X):
    print(train_idx.shape, test_idx.shape)

(22825,) (22825,)
(45650,) (22825,)
(68475,) (22825,)
(91300,) (22825,)
(114125,) (22825,)


In [106]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.ensemble import HistGradientBoostingRegressor

In [107]:
# One-hot encode the three categorical columns, pass the remaining column
# (ordinal_date) through unchanged, then fit a histogram gradient-boosting
# regressor on the result.
pipe = make_pipeline(
    make_column_transformer(
        (OneHotEncoder(sparse_output=False), ["country","store","product"]),
        remainder="passthrough"
    ),
    # Fixed seed for reproducibility: with more than 10,000 training rows the
    # default early_stopping="auto" holds out a random validation split, so
    # without a seed the CV scores and predictions change from run to run.
    HistGradientBoostingRegressor(random_state=0)
)

In [108]:
# Display the pipeline so its steps can be checked before it is evaluated.
pipe

In [110]:
from sklearn.model_selection import cross_validate

In [111]:
# Score the pipeline on the TimeSeriesSplit folds using (negated) mean
# absolute error, so values closer to zero are better.
cv_res = cross_validate(
    estimator=pipe,
    X=X,
    y=y,
    scoring="neg_mean_absolute_error",
    cv=TimeSeriesSplit(),
)

In [112]:
# Show per-fold fit/score times and test scores; test_score holds negated MAE values.
cv_res

{'fit_time': array([0.3396132 , 0.57705975, 0.66372609, 0.7218039 , 0.7305038 ]),
 'score_time': array([0.04069471, 0.06114841, 0.05341673, 0.04270148, 0.05048275]),
 'test_score': array([-21.78317884, -16.64291447, -20.91937681, -21.77820542,
        -22.51108685])}

In [113]:
# Rough relative error: an absolute error of 20 as a fraction of the mean target.
# NOTE(review): 20 looks like a hand-rounded stand-in for the fold MAEs in cv_res — confirm, or derive it from cv_res["test_score"].
20/y.mean()

0.1208293952023363

In [114]:
# Fit the pipeline on the full training set; this fitted model produces the test predictions.
pipe.fit(X,y)

In [115]:
# Build the test features with the same transformation as training; train=False skips the target.
X_test = create_ordinal_date(path/"test.csv",train=False)

In [116]:
# Inspect the first test rows to confirm the feature columns match the training frame.
X_test.head(4)

Unnamed: 0,country,store,product,ordinal_date
0,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,1
1,Argentina,Kaggle Learn,Using LLMs to Train More LLMs,1
2,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People,1
3,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,1


In [117]:
# Predict num_sold for every test row, in the row order of X_test.
y_pred = pipe.predict(X_test)

In [118]:
# Preview the first ten predictions (floats, whereas the training target is integer-valued).
y_pred[:10]

array([39.07398254, 39.60684501,  8.57274156, 36.95253281, 30.33986549,
       60.8770722 , 59.74902168, 15.73220107, 56.73015771, 45.83114046])

In [119]:
# Load the sample submission; it supplies the id column and layout for the submission file.
ss = pl.read_csv(path/"sample_submission.csv")

In [120]:
# Display the sample submission (columns id and num_sold, with placeholder values).
ss

id,num_sold
i64,i64
136950,100
136951,100
136952,100
136953,100
136954,100
…,…
164320,100
164321,100
164322,100
164323,100


In [121]:
ss = ss.with_columns(num_sold=y_pred)
ss.write_csv(path/"subm.csv")
!head {path/"subm.csv"}

id,num_sold
136950,39.07398253927838
136951,39.60684500692635
136952,8.572741555717766
136953,36.95253280682472
136954,30.339865487828632
136955,60.877072202506945
136956,59.749021676951656
136957,15.732201074264564
136958,56.73015771479267


In [122]:
from kaggle import api
# Upload subm.csv to the competition identified by `comp`.
# NOTE(review): the second argument is presumably the submission description — confirm against the kaggle API.
# Side effect: every execution of this cell (including Restart & Run All) performs another upload.
api.competition_submit_cli(path/'subm.csv', 'ordinal-day-boost-tree2', comp)

100%|██████████| 678k/678k [00:00<00:00, 720kB/s] 


Successfully submitted to Forecasting Mini-Course Sales