In [1]:
import fastkaggle
import holidays
import polars as pl

from sklearn.compose import make_column_transformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, SplineTransformer

In [2]:
# Kaggle competition slug used to locate the competition's data files.
comp = "playground-series-s3e19"
# `path` is the base location from which train.csv and gdp.csv are read in later cells.
# NOTE(review): setup_comp presumably downloads the data when it is missing locally — confirm.
path = fastkaggle.setup_comp(comp)

In [3]:
# Load the training set; try_parse_dates asks polars to parse date-like columns
# (the preview below shows `date` with dtype `date`).
train_df = pl.read_csv(path/"train.csv", try_parse_dates=True)

In [4]:
# Preview the first four rows to check the column names and dtypes.
train_df.head(4)

id,date,country,store,product,num_sold
i64,date,str,str,str,i64
0,2017-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",63
1,2017-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",66
2,2017-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",9
3,2017-01-01,"""Argentina""","""Kaggle Learn""","""Using LLMs to …",59


In [5]:
# Total units sold per store per day, drawn as one line per store.
(
    train_df
    .group_by(["date", "store"])
    .agg(pl.sum("num_sold"))
    .sort(by="date")
    .plot.line(x="date", y="num_sold", by="store")
)

In [6]:
# Daily sales per store, with every store column divided by its own total so
# that each column sums to 1 over the dates.
daily_store_share = (
    train_df
    .group_by("date", "store").agg(pl.col("num_sold").sum())
    .pivot(index="date", columns="store", values="num_sold")
    .sort("date")
    .with_columns(pl.all().exclude("date") / pl.all().exclude("date").sum())
)
# Name the x column and the series explicitly so the x axis is the calendar
# date instead of being left to the plotting backend's default.
daily_store_share.plot.line(
    x="date",
    y=[name for name in daily_store_share.columns if name != "date"],
)

In [7]:
# Density of each store's normalised daily sales share.
(
    train_df
    .group_by(["date", "store"])
    .agg(pl.sum("num_sold"))
    .pivot(index="date", columns="store", values="num_sold")
    .sort(by="date")
    # Divide every non-date column by its own total.
    .with_columns(pl.exclude("date") / pl.exclude("date").sum())
    .plot.kde()
)

In [8]:
def foo(df, feat):
    """Plot a KDE of the daily sales share for each level of ``feat``.

    ``num_sold`` is summed per (date, ``feat``) pair, pivoted so that every
    level of ``feat`` becomes its own column, and each of those columns is
    divided by its own total so that it sums to 1 over the dates.

    Args:
        df: polars DataFrame with ``date``, ``num_sold`` and ``feat`` columns.
        feat: name of the column to split on (e.g. ``"country"``).

    Returns:
        The object returned by ``.plot.kde()`` on the normalised frame.
    """
    daily_totals = df.group_by(["date", feat]).agg(pl.sum("num_sold"))
    wide = daily_totals.pivot(index="date", columns=feat, values="num_sold").sort(by="date")
    # columns -> probability distributions
    shares = wide.with_columns(pl.exclude("date") / pl.exclude("date").sum())
    return shares.plot.kde()

# KDE of the normalised daily sales share, one curve per country.
foo(train_df,"country")

In [9]:
def boo(df, feat):
    """Plot the daily sales share over time for each level of ``feat``.

    ``num_sold`` is summed per (date, ``feat``) pair, pivoted so that every
    level of ``feat`` becomes its own column, and each of those columns is
    divided by its own total so that it sums to 1 over the dates.

    Args:
        df: polars DataFrame with ``date``, ``num_sold`` and ``feat`` columns.
        feat: name of the column to split on (e.g. ``"country"``).

    Returns:
        The object returned by ``.plot.line()``: one line per level of
        ``feat`` with ``date`` on the x axis.
    """
    shares = (
        df.group_by("date", feat).agg(pl.col("num_sold").sum())
        .pivot(index="date", columns=feat, values="num_sold")
        .sort("date")
        .with_columns(pl.all().exclude("date") / pl.all().exclude("date").sum())
    )
    # Pass x and the series explicitly so the x axis is the calendar date
    # instead of being left to the plotting backend's default.
    series = [name for name in shares.columns if name != "date"]
    return shares.plot.line(x="date", y=series)

# Normalised daily sales share over time, one line per country.
boo(train_df,"country")

In [11]:
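# One-time fetch of GDP per capita from the World Bank API; the result is saved to gdp.csv.
# Uncomment and run this cell only when gdp.csv does not exist yet.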

# import requests 

# def get_gdp_per_capita(country,year):
#     alpha3 = {'Argentina':'ARG','Canada':'CAN','Estonia':'EST','Japan':'JPN','Spain':'ESP'}
#     url="https://api.worldbank.org/v2/country/{0}/indicator/NY.GDP.PCAP.CD?date={1}&format=json".format(alpha3[country],year)
#     response = requests.get(url).json()
#     return response[1][0]['value']


# def get_gdp(df):
#     q = (
#         df.lazy()
#         .select(pl.col("country").unique())
#         .join(df.lazy().select(pl.col("date").dt.year().unique().alias("year")), how="cross")
#         .with_columns(
#             pl.struct(["country","year"])
#             .map_elements(lambda row: get_gdp_per_capita(row["country"],row["year"]))
#             .alias("GDP")
#         )
#     )
#     return q.collect()

# gdp_df = get_gdp(train_df)
# gdp_df.write_csv(path/"gdp.csv")

In [10]:
# Load the saved GDP table; later cells join on its `country` and `year`
# columns and plot its `GDP` column.
gdp_df = pl.read_csv(path/"gdp.csv")

In [13]:
# Yearly num_sold per country plotted against that country's GDP value for the year.
(
    train_df
    # Name the year key directly; the Int64 cast is presumably there to match
    # the dtype of gdp_df's `year` column for the join — confirm.
    .group_by(pl.col("date").dt.year().cast(pl.Int64).alias("year"), "country")
    # Sum the target by name: pl.last() only selected num_sold because it
    # happens to be the last column of train_df.
    .agg(pl.col("num_sold").sum())
    .join(gdp_df, on=["country", "year"])
    .plot.scatter(x="GDP", y="num_sold", by="country")
)

## What is the slope?

In [15]:
# One row per (year, country): total num_sold joined with the GDP value.
df = (
    train_df
    # Name the year key directly; the Int64 cast is presumably there to match
    # the dtype of gdp_df's `year` column for the join — confirm.
    .group_by(pl.col("date").dt.year().cast(pl.Int64).alias("year"), "country")
    # Sum the target by name: pl.last() only selected num_sold because it
    # happens to be the last column of train_df.
    .agg(pl.col("num_sold").sum())
    .join(gdp_df, on=["country", "year"])
)

In [26]:
# Fit GDP as a linear function of yearly num_sold and show slope and intercept.
# LinearRegression is imported in the notebook's top import cell.
# NOTE(review): this direction (GDP explained by num_sold) is the inverse of the
# scatter's axes (x="GDP", y="num_sold") — confirm it is the intended one.
X = df.select("num_sold").to_numpy()  # regressor as a single-column 2-D array
y = df.select("GDP").to_numpy()       # target as a single-column 2-D array
model = LinearRegression().fit(X, y)
model.coef_, model.intercept_

(array([[0.0331231]]), array([83.81165106]))