In [1]:
import os
import re
from datetime import datetime
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, train_test_split,KFold


In [2]:
# --- Experiment configuration -------------------------------------------------
exp_name = "001_tutorial"
input_dir = "../input"
output_dir = "../output"

# Derived locations: cached features live under the input tree,
# experiment artifacts under the output tree.
feature_dir = f"{input_dir}/feature"
exp_dir = f"{output_dir}/{exp_name}"

# Make sure both directories exist before anything is written to them.
Path(exp_dir).mkdir(parents=True, exist_ok=True)
Path(feature_dir).mkdir(parents=True, exist_ok=True)

# %%
# --- Load the raw tables ------------------------------------------------------
anime = pl.read_csv(f"{input_dir}/anime.csv")
raw_train = pl.read_csv(f"{input_dir}/train.csv")
raw_test = pl.read_csv(f"{input_dir}/test.csv")
sub = pl.read_csv(f"{input_dir}/sample_submission.csv")

In [3]:
def join_anime(df, anime_df):
    """Left-join anime metadata onto ``df`` and make sure it has a ``row_nr`` column.

    Args:
        df: Frame containing an ``anime_id`` column (train or test rows).
        anime_df: Frame keyed by ``anime_id`` whose columns are attached to ``df``.

    Returns:
        The joined frame. If the joined frame has no column named ``row_nr``,
        ``with_row_count()`` is called on it to add one.
    """
    joined = df.join(anime_df, on="anime_id", how="left")

    # Guard clause: a frame that already carries `row_nr` is returned untouched.
    if "row_nr" in joined.columns:
        return joined
    return joined.with_row_count()

# Attach the anime metadata to both splits (join_anime also ensures a `row_nr` column).
# NOTE: `raw_train` is rebound to its own joined version here, so this cell is not
# idempotent: re-running it joins `anime` onto the already-joined frame. Reload the
# CSVs (or restart the kernel) before executing this cell again.
raw_X_test = join_anime(raw_test, anime)
raw_train = join_anime(raw_train, anime)



In [4]:

from scorta.utils.logger import timing_logger
from abc import ABC

class PlFeature(ABC):
    """Base class for a polars feature that is computed and cached as a parquet file.

    Subclasses override :meth:`fit` to build the feature frame. :meth:`create_feature`
    runs ``fit`` and writes the result to ``feature_path``; :meth:`load` reads it back.
    """

    def __init__(self, feature_dir: Path | str, suffix: str = ""):
        """Resolve the on-disk location of this feature.

        Args:
            feature_dir: Directory that holds the parquet file; created if missing.
            suffix: Appended to the file name after an underscore, giving
                ``<ClassName>_<suffix>.parquet``.
        """
        # The feature name is the subclass name verbatim (e.g. "MemberRatio") and is
        # used as the parquet file stem. A snake_case conversion used to be computed
        # here but was always overwritten by this assignment, so it was dead code;
        # dropping it keeps the file names exactly as they were.
        self.name = self.__class__.__name__
        self.feature_dir = Path(feature_dir)
        self.feature_dir.mkdir(exist_ok=True, parents=True)
        self.feature_path = self.feature_dir / f"{self.name}_{suffix}.parquet"

    def fit(self) -> pl.DataFrame:
        """Compute and return the feature frame. Subclasses must override this."""
        raise NotImplementedError

    def create_feature(self) -> None:
        """Compute the feature with :meth:`fit` and persist it with :meth:`save`."""
        df = self.fit()
        self.save(df)

    def create_feautre(self) -> None:
        """Backward-compatible alias for :meth:`create_feature` (original misspelled name)."""
        self.create_feature()

    def save(self, df: pl.DataFrame) -> None:
        """Write ``df`` to ``feature_path`` as parquet."""
        df.write_parquet(self.feature_path)

    def load(self) -> pl.DataFrame:
        """Read the previously saved feature frame from ``feature_path``."""
        return pl.read_parquet(self.feature_path)

class MemberRatio(PlFeature):
    """Ratio of each watch-status count to ``members`` for every row of ``df``.

    The result keeps the ``user_id`` / ``anime_id`` key columns plus four
    ``*_rate`` columns, each computed as ``<status> / members``.
    """

    def __init__(self, df: pl.DataFrame, feature_dir: str, suffix: str):
        """Store the source frame and declare the key and feature column names."""
        super().__init__(feature_dir, suffix)
        self.df = df
        self.key_cols = ["user_id", "anime_id"]
        self.feature_cols = [
            "watching_rate",
            "completed_rate",
            "on_hold_rate",
            "dropped_rate",
        ]

    @timing_logger
    def fit(self) -> pl.DataFrame:
        """Add the four rate columns and return only key + feature columns."""
        # Output column name -> numerator column; the denominator is always `members`.
        numerator_of = {
            "watching_rate": "watching",
            "completed_rate": "completed",
            "on_hold_rate": "on_hold",
            "dropped_rate": "dropped",
        }
        members = pl.col("members")
        rate_exprs = [
            (pl.col(numerator) / members).alias(rate_name)
            for rate_name, numerator in numerator_of.items()
        ]

        with_rates = self.df.with_columns(rate_exprs)
        return with_rates.select(self.key_cols + self.feature_cols)


# Build the train/test MemberRatio features and cache them as parquet.
# `feature_dir` (defined in the config cell as f"{input_dir}/feature") is used instead
# of rebuilding the same path inline, so the writer here and the loaders that read the
# features back share one definition of the directory.
feats = [
    MemberRatio(raw_train, feature_dir, "train"),
    MemberRatio(raw_X_test, feature_dir, "test"),
]

for feat in feats:
    # Method name is spelled this way on PlFeature (sic).
    feat.create_feautre()


[32m2023-12-08 18:09:12[0m | INFO | [34mlogger.py:wrapper:49[0m | [1m[PID:75885] Starting fit() at ../../../../../../../../../var/folders/z0/sjx59b5j2ql1n4gd96t8rlg40000gq/T/ipykernel_75885/481370097.py:20[0m
[32m2023-12-08 18:09:12[0m | INFO | [34mlogger.py:wrapper:55[0m | [1m[PID:75885] Finished fit() in 0.1354 seconds.
[0m
[32m2023-12-08 18:09:12[0m | INFO | [34mlogger.py:wrapper:49[0m | [1m[PID:75885] Starting fit() at ../../../../../../../../../var/folders/z0/sjx59b5j2ql1n4gd96t8rlg40000gq/T/ipykernel_75885/481370097.py:20[0m
[32m2023-12-08 18:09:12[0m | INFO | [34mlogger.py:wrapper:55[0m | [1m[PID:75885] Finished fit() in 0.0112 seconds.
[0m


In [10]:

class FeatureMerger:
    """Left-joins a sequence of cached features onto a base frame."""

    def __init__(self, features: list[PlFeature]) -> None:
        """Keep the features to merge; each must expose ``load()`` and ``key_cols``."""
        self.features = features

    def merge(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return ``df`` with every feature's stored frame left-joined on its key columns.

        Features are applied in list order, each join operating on the result of
        the previous one.
        """
        merged = df
        for feat in self.features:
            stored = feat.load()
            merged = merged.join(stored, on=feat.key_cols, how="left")
        return merged

# The selector module must be imported before its first use below; previously the
# import came after `cs.string()` was already referenced, which raises NameError
# on a fresh kernel (Restart & Run All).
import polars.selectors as cs

tr_fm = FeatureMerger([MemberRatio(raw_train, feature_dir, "train")])
te_fm = FeatureMerger([MemberRatio(raw_X_test, feature_dir, "test")])

# Merge the cached features back onto the joined raw frames.
train_merged = tr_fm.merge(raw_train)
test_merged = te_fm.merge(raw_X_test)

# Target is taken from the merged train frame; model inputs exclude every string
# column, and the train inputs additionally exclude the target itself.
y_train = train_merged["score"].to_numpy()
X_train = train_merged.select(~cs.string()).drop("score")
X_test = test_merged.select(~cs.string())



In [7]:
# Train the gradient-boosting model on the merged feature matrix.
# GBTWrapper comes from the external `scorta` package; "lgb" / "reg" presumably select
# LightGBM in regression mode (the captured training log is LightGBM reporting an
# l2 metric) — confirm against the scorta documentation.
from scorta.model.gradient_boost import GBTWrapper
gbdt = GBTWrapper("lgb","reg")
# fit() returns two values; presumably the fitted per-fold models and the
# out-of-fold predictions — TODO confirm.
models, oof = gbdt.fit(X_train,y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000856 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2777
[LightGBM] [Info] Number of data points in the train set: 109120, number of used features: 11
[LightGBM] [Info] Start training from score 7.768759
Training until validation scores don't improve for 1 rounds
Training until validation scores don't improve for 1 rounds
Early stopping, best iteration is:
[96]	valid_0's l2: 1.97115
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000819 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2777
[LightGBM] [Info] Number of data points in the train set: 109121, number of used features: 11
[LightGBM] [Info] Start training from score 7.768725
Trai

In [15]:
# Average the test predictions of the individual models.
# The second argument to predict() is an index 0..4; presumably it selects one of
# five cross-validation fold models produced by gbdt.fit — confirm against scorta.
fold_preds = [gbdt.predict(X_test, fold) for fold in range(5)]
preds = np.asarray(fold_preds).mean(axis=0)

