In [None]:
import sys

import numpy as np
import pandas as pd
import polars as pl

sys.path.append("../src")
from preprocess import Preprocessor
from model import Model
from cfg import cfg

In [None]:
# Load the input CSV files.
data, submit = (
    pl.read_csv(csv_path)
    for csv_path in (
        "../Team_Project_doc_data_24/train+test.csv",
        "../Team_Project_doc_data_24/submit.csv",
    )
)

In [None]:
# Create the preprocessing object.
# scale=True applies standardization; encoding selects the encoding method ("onehot" or "label").
preprocessor = Preprocessor(encoding="label", scale=False)

In [None]:
# Run the preprocessing on the train_test data.
# Note: the target column is removed from the preprocessed result.
transformed_data = preprocessor.preprocess_data(data)
transformed_data.head(5)

In [None]:
# Encode the target column "y" as integers: "yes" -> 1, "no" -> 0.
replace_map = {"yes": 1, "no": 0}
# `data` is overwritten in place, so only convert while "y" still holds strings.
# This keeps the cell idempotent: re-running it after the conversion is a no-op
# instead of re-applying the string mapping to an already-integer column.
if data["y"].dtype == pl.String:
    data = data.with_columns(pl.col("y").replace(replace_map).cast(pl.Int8))

In [None]:
# Show how often each value of the target column occurs.
data.get_column("y").value_counts()

# LightGBM

In [None]:
# Define the model (model_type="lgb") and fit it on the preprocessed
# features against the encoded target column.
model = Model(cfg.lgbm_params, model_type="lgb")
model.fit(
    transformed_data.to_pandas(),
    data.get_column("y").to_numpy(),
)

In [None]:
# Run the preprocessing on the submit data (mode="test").
transformed_submit = preprocessor.preprocess_data(submit, mode="test")
transformed_submit.head(5)

In [None]:
# Predict on the preprocessed submit data.
y_prob, y_pred = model.predict(transformed_submit.to_pandas())
# Turn the predicted class (1 / anything else) into the "yes" / "no" labels.
y_pred = list(map(lambda flag: "yes" if flag == 1 else "no", y_pred))
# Write the probabilities and labels into the submission frame.
submit = submit.with_columns(
    [
        pl.Series(name="probability of yes (or score)", values=y_prob),
        pl.Series(name="y (yes or no)  ", values=y_pred),
    ]
)

In [None]:
# Preview the first rows of the submission frame.
submit.head(5)

# CatBoost

In [None]:
# Define the model (model_type="catboost") and fit it.
# Categorical columns are the string-typed columns of the raw `data`, restricted
# to columns that actually exist in `transformed_data` (the frame the model is
# fitted on). Without this restriction the list could name columns that the
# preprocessing removed or renamed, or the target "y" if it has not been
# converted to integers yet.
cat_cols = [
    col
    for col in data.columns
    if col in transformed_data.columns and data[col].dtype == pl.String
]
model = Model(cfg.cat_params, model_type="catboost", categorical_cols=cat_cols)
model.fit(transformed_data.to_pandas(), data["y"].to_numpy())

In [None]:
# Predict on the preprocessed submit data.
y_prob, y_pred = model.predict(transformed_submit.to_pandas())
# Turn the predicted class (1 / anything else) into the "yes" / "no" labels.
y_pred = list(map(lambda flag: "yes" if flag == 1 else "no", y_pred))
# Write the probabilities and labels into the submission frame.
submit = submit.with_columns(
    [
        pl.Series(name="probability of yes (or score)", values=y_prob),
        pl.Series(name="y (yes or no)  ", values=y_pred),
    ]
)

In [None]:
# Preview the first rows of the submission frame.
submit.head(5)