In [1]:
import polars as pl
import altair as alt
from read_parquet_and_reorder import read_parquet_and_reorder
from one_hot_encode import one_hot_encode
from blog import logger

# Lift Altair's limit on how many rows a chart may embed: the heatmap further
# down is built from a frame holding every train and test row.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
# Set the project logger's threshold to INFO; the notebook reports shapes and
# progress through logger.info(...) calls.
# NOTE(review): assumes `blog.logger` follows the stdlib logging.Logger API — confirm.
logger.setLevel("INFO")

In [3]:
# Load the product table through the project helper and report its size.
df = read_parquet_and_reorder("df.parquet")
logger.info(df.shape)

# Separate the per-100g columns (kept alongside the "code" key) from the
# remaining columns; `df` keeps everything that is not a *_100g column.
nutrient_columns = [name for name in df.columns if name.endswith("_100g")]
df_per_100g = df.select("code", *nutrient_columns)
df = df.drop(nutrient_columns)

# Columns handed to one_hot_encode, in the order they are joined later.
columns = [
    "categories_en",
    "ingredients_tags",
    "ingredients_analysis_tags",
    "traces_en",
    "food_groups_en",
    "nutrient_levels_tags",
    "main_category_en",
    "packaging_en",
]

# One encoded frame per column above, followed by the per-100g frame under
# the key "nutrients".
# NOTE(review): the meaning of n=10 and remove_prefix is defined by
# one_hot_encode, which is not visible here — confirm.
encoded_frames = {
    name: one_hot_encode(df, name, n=10, remove_prefix=["en:", "de:"])
    for name in columns
}
df_dict: dict[str, pl.DataFrame] = {**encoded_frames, "nutrients": df_per_100g}

In [4]:
# Start from the product key alone and attach every feature table to it.
df_for_ml = df.select("code")
for key in df_dict:
    logger.info(key)
    _df = df_dict[key]
    # NOTE(review): `suffix` is appended verbatim to clashing column names, so
    # duplicates end up named like "<column><key>" with no separator.
    df_for_ml = df_for_ml.join(_df, on="code", suffix=key)

# Replace nulls with 0.0, then remove the two nutrition-score columns
# (presumably because they encode the score being predicted — confirm).
excluded_columns = ("nutrition-score-fr_100g", "nutrition-score-uk_100g")
df_for_ml = df_for_ml.fill_null(0.0)
df_for_ml = df_for_ml.drop(*excluded_columns)

In [19]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
import collections
import numpy as np
from sklearn.linear_model import LinearRegression

# Seed shared by the train/test split and the tree so that a fresh
# Restart & Run All reproduces the same model and the same plots.
SEED = 2023

# Attach the target to the feature table by "code" instead of assuming that
# `df` and `df_for_ml` still share row order and row count after the chain of
# joins that built `df_for_ml`. X and y are then cut from the same frame, so
# row i of X always belongs to element i of y.
feature_names = [c for c in df_for_ml.columns if c != "code"]
df_xy = df_for_ml.join(df.select("code", "nutriscore_score"), on="code")

y = df_xy.get_column("nutriscore_score").to_numpy()
logger.info(y.shape)

# Column order equals `df_for_ml` minus "code", which is what the
# feature-importance cell relies on when it labels the importances.
X = df_xy.select(feature_names).to_numpy()
logger.info(X.shape)

# NOTE(review): Normalizer rescales each *row* to unit norm and is stateless,
# so fitting it on all of X leaks nothing into the test split. If per-feature
# scaling was intended, a scaler fitted on the training split is needed
# instead — confirm.
transformer = Normalizer().fit(X)
X_train, X_test, y_train, y_test = train_test_split(
    transformer.transform(X), y, test_size=0.10, random_state=SEED
)

# The tree permutes features at each split, so ties between equally good
# splits are broken randomly unless random_state is fixed.
# (Earlier experiments: tree.DecisionTreeClassifier(max_depth=15), LinearRegression().)
clf = tree.DecisionTreeRegressor(random_state=SEED).fit(X_train, y_train)

# Actual vs. predicted score for both splits (test first, then train), plus
# the signed prediction error.
df_tree = pl.concat(
    [
        pl.DataFrame(
            {
                "actual score": y_part,
                "predicted score": clf.predict(X_part),
                "label": label,
            }
        )
        for label, X_part, y_part in (
            ("test", X_test, y_test),
            ("train", X_train, y_train),
        )
    ]
).with_columns(err=pl.col("predicted score") - pl.col("actual score"))

In [43]:
# Binned heatmap of predicted vs. actual score, one panel per split label.
# Both axes use 1-wide bins on the same fixed domain; marks are clipped to the
# plotting area, and each panel gets its own colour scale.
score_domain = (-15, 40)

x_actual = alt.X("actual score:Q").bin(step=1).scale(domain=score_domain)
y_predicted = alt.Y("predicted score:Q").bin(step=1).scale(domain=score_domain)
cell_count = alt.Color("count():Q").scale(scheme="viridis", reverse=True)

heatmap = (
    alt.Chart(df_tree)
    .mark_rect(clip=True)
    .encode(
        x=x_actual,
        y=y_predicted,
        color=cell_count,
        column=alt.Column("label:N"),
    )
)
heatmap.properties(width=500, height=500).resolve_scale(color="independent")

In [44]:
# Pair every feature column (all of `df_for_ml` except "code") with the fitted
# model's importance for it, largest importance first.
feature_columns = df_for_ml.drop("code").columns
importance = pl.DataFrame(
    {"field": feature_columns, "value": clf.feature_importances_}
).sort("value", descending=True)

# Bar chart of the 20 most important features; sort=None keeps the frame's
# own (descending) order on the y axis.
top_features = importance.head(20)
alt.Chart(top_features).mark_bar().encode(
    x="value:Q",
    y=alt.Y("field:O", sort=None),
)

In [45]:
# Pairwise correlation matrix (pandas' default method) over all feature
# columns plus the target, which is attached by product code.
# NOTE(review): "code" is cast to integer first, presumably so that every
# column handed to pandas is numeric — confirm.
target = df.select("code", "nutriscore_score")
features_int_code = df_for_ml.with_columns(pl.col("code").cast(int))
corr = features_int_code.join(target, on="code").to_pandas().corr()

In [46]:
# Features whose absolute correlation with nutriscore_score exceeds 0.2,
# sorted from most negative to most positive and shown as a colour-graded
# table.
is_strong = corr["nutriscore_score"].abs() > 0.2
(
    corr.loc[is_strong, ["nutriscore_score"]]
    # Remove the target's correlation with itself by label. Slicing off the
    # last sorted row would drop a real feature if another column tied at 1.0
    # or if the target row had been filtered out (e.g. NaN correlation).
    .drop(index="nutriscore_score", errors="ignore")
    .sort_values("nutriscore_score")
    .style.format(precision=2)
    .background_gradient(cmap="RdBu", vmin=-1, vmax=1)
)

Unnamed: 0,nutriscore_score
saturated-fat-in-low-quantity,-0.52
fat-in-low-quantity,-0.41
Beverages,-0.39
Plant-based foods and beverages,-0.39
Plant-based foods,-0.39
Cereals and potatoesfood_groups_en,-0.33
Cereals and potatoes,-0.27
sugars-in-low-quantity,-0.27
Cereals and their products,-0.27
vegan,-0.25
