In [17]:
import sys

import numpy as np
import pandas as pd
import polars as pl

sys.path.append("../src")
from preprocess import Preprocessor
from model import Model
from cfg import cfg

In [18]:
# Load the input tables: the combined train+test set and the submission sheet.
data = pl.read_csv(source="../Team_Project_doc_data_24/train+test.csv")
submit = pl.read_csv(source="../Team_Project_doc_data_24/submit.csv")

In [19]:
# Build the preprocessor.
# scale=True standardizes the features; encoding selects the categorical
# encoding method ("onehot" or "label").
preprocessor = Preprocessor(
    scale=False,
    encoding="label",
)

In [20]:
# Preprocess the train+test data.
# Note: the target column is removed from the preprocessed result.
transformed_data = preprocessor.preprocess_data(data)
transformed_data.head(5)

age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
i64,i64,i64,i64,f64,f64,f64,f64,f64,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
47,2,999,0,-0.1,93.2,-42.0,4.021,5195.8,0,1,6,0,2,0,0,7,0,1
52,3,999,0,1.1,93.994,-36.4,4.857,5191.0,7,0,3,0,0,0,1,6,1,1
30,1,999,0,-0.1,93.2,-42.0,4.076,5195.8,0,2,2,0,0,0,1,7,2,1
28,2,999,1,-1.8,92.893,-46.2,1.344,5099.1,7,2,7,0,0,0,0,6,3,0
29,1,999,0,1.4,93.918,-42.7,4.961,5228.1,9,1,5,0,2,2,0,3,3,1


In [21]:
# Encode the target column "y" as 1 ("yes") / 0 ("no"), stored as Int8.
# The conversion is guarded on the column dtype so this cell is safe to
# re-run: once "y" is already numeric, the string replacement is skipped
# instead of being applied to an integer column.
replace_map = {"yes": 1, "no": 0}
if data["y"].dtype == pl.String:
    data = data.with_columns(pl.col("y").replace(replace_map).cast(pl.Int8))

In [22]:
# Show the class balance of the target column.
data.get_column("y").value_counts()

y,count
i8,u32
0,8887
1,1113


# LightGBM

In [23]:
# Define the LightGBM model and fit it on the preprocessed features
# against the target column "y".
model = Model(cfg.lgbm_params, model_type="lgb")
model.fit(
    transformed_data.to_pandas(),
    data.get_column("y").to_numpy(),
)

Fold 0: 0.7846
Fold 1: 0.8113
Fold 2: 0.7931
Fold 3: 0.8080
Fold 4: 0.7973
AUC: 0.7890
F1: 0.4896


In [24]:
# Preprocess the submission data, running the preprocessor in "test" mode.
transformed_submit = preprocessor.preprocess_data(submit, mode="test")
transformed_submit.head(5)

age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
i64,i64,i64,i64,f64,f64,f64,f64,f64,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
59,2,999,0,1.1,93.994,-36.4,4.857,5191.0,5,1,5,1,0,0,1,6,1,1
29,1,999,0,1.4,93.918,-42.7,4.962,5228.1,0,1,3,0,2,0,0,3,3,1
58,8,999,0,1.4,93.918,-42.7,4.961,5228.1,5,1,5,1,0,0,1,3,3,1
35,1,999,0,-0.1,93.2,-42.0,4.021,5195.8,4,1,6,0,2,0,0,7,0,1
34,4,999,0,1.4,93.444,-36.1,4.963,5228.1,9,1,5,1,2,0,0,1,2,1


In [25]:
# Predict on the preprocessed submission features; the model returns a score
# and a predicted label per row.
y_prob, y_pred = model.predict(transformed_submit.to_pandas())
# Turn the numeric labels into the "yes"/"no" strings stored in the sheet.
y_pred = list(map(lambda label: "yes" if label == 1 else "no", y_pred))
# Write both result columns into the submission frame.
submit = submit.with_columns(
    [
        pl.Series(name="probability of yes (or score)", values=y_prob),
        pl.Series(name="y (yes or no)  ", values=y_pred),
    ]
)

In [26]:
# Inspect the first rows of the submission frame with the predictions filled in.
submit.head(5)

age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y (yes or no),probability of yes (or score)
i64,str,str,str,str,str,str,str,str,str,i64,i64,i64,str,f64,f64,f64,f64,f64,str,f32
59,"""retired""","""married""","""professional.course""","""unknown""","""no""","""no""","""telephone""","""may""","""mon""",2,999,0,"""nonexistent""",1.1,93.994,-36.4,4.857,5191.0,"""no""",0.037189
29,"""admin.""","""married""","""high.school""","""no""","""yes""","""no""","""cellular""","""jul""","""tue""",1,999,0,"""nonexistent""",1.4,93.918,-42.7,4.962,5228.1,"""no""",0.068227
58,"""retired""","""married""","""professional.course""","""unknown""","""no""","""no""","""telephone""","""jul""","""tue""",8,999,0,"""nonexistent""",1.4,93.918,-42.7,4.961,5228.1,"""no""",0.045656
35,"""management""","""married""","""university.degree""","""no""","""yes""","""no""","""cellular""","""nov""","""fri""",1,999,0,"""nonexistent""",-0.1,93.2,-42.0,4.021,5195.8,"""no""",0.086182
34,"""technician""","""married""","""professional.course""","""unknown""","""yes""","""no""","""cellular""","""aug""","""thu""",4,999,0,"""nonexistent""",1.4,93.444,-36.1,4.963,5228.1,"""no""",0.076053


# CatBoost

In [33]:
# Define and train the CatBoost model.
# Categorical features are the columns that are strings in the raw frame,
# restricted to columns that actually exist in the feature matrix. This keeps
# the target "y" (or any column not present after preprocessing) out of the
# categorical list regardless of whether the target-encoding cell has run.
cat_cols = [
    col
    for col in data.columns
    if col in transformed_data.columns and data[col].dtype == pl.String
]
model = Model(cfg.cat_params, model_type="catboost", categorical_cols=cat_cols)
model.fit(transformed_data.to_pandas(), data["y"].to_numpy())

0:	test: 0.7446899	best: 0.7446899 (0)	total: 175ms	remaining: 29m 12s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7753258039
bestIteration = 21

Shrink model to first 22 iterations.
Fold 0: 0.7753
0:	test: 0.7362509	best: 0.7362509 (0)	total: 17.9ms	remaining: 2m 59s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8170887423
bestIteration = 36

Shrink model to first 37 iterations.
Fold 1: 0.8171
0:	test: 0.7515627	best: 0.7515627 (0)	total: 15ms	remaining: 2m 30s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7966681892
bestIteration = 43

Shrink model to first 44 iterations.
Fold 2: 0.7967
0:	test: 0.7485647	best: 0.7485647 (0)	total: 29.7ms	remaining: 4m 57s
250:	test: 0.8044167	best: 0.8054665 (195)	total: 4.82s	remaining: 3m 7s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8078562398
bestIteration = 287

Shrink model to first 288 iterations.
Fold 3: 0.8079
0:	test: 0.7225876	best: 0.7225876 

In [34]:
# Predict on the preprocessed submission features with the current model; it
# returns a score and a predicted label per row.
y_prob, y_pred = model.predict(transformed_submit.to_pandas())
# Turn the numeric labels into the "yes"/"no" strings stored in the sheet.
y_pred = list(map(lambda label: "yes" if label == 1 else "no", y_pred))
# Overwrite both result columns in the submission frame.
submit = submit.with_columns(
    [
        pl.Series(name="probability of yes (or score)", values=y_prob),
        pl.Series(name="y (yes or no)  ", values=y_pred),
    ]
)

In [35]:
# Inspect the first rows of the submission frame after the latest prediction.
submit.head(5)

age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y (yes or no),probability of yes (or score)
i64,str,str,str,str,str,str,str,str,str,i64,i64,i64,str,f64,f64,f64,f64,f64,str,f32
59,"""retired""","""married""","""professional.course""","""unknown""","""no""","""no""","""telephone""","""may""","""mon""",2,999,0,"""nonexistent""",1.1,93.994,-36.4,4.857,5191.0,"""no""",0.034427
29,"""admin.""","""married""","""high.school""","""no""","""yes""","""no""","""cellular""","""jul""","""tue""",1,999,0,"""nonexistent""",1.4,93.918,-42.7,4.962,5228.1,"""no""",0.070402
58,"""retired""","""married""","""professional.course""","""unknown""","""no""","""no""","""telephone""","""jul""","""tue""",8,999,0,"""nonexistent""",1.4,93.918,-42.7,4.961,5228.1,"""no""",0.039274
35,"""management""","""married""","""university.degree""","""no""","""yes""","""no""","""cellular""","""nov""","""fri""",1,999,0,"""nonexistent""",-0.1,93.2,-42.0,4.021,5195.8,"""no""",0.075668
34,"""technician""","""married""","""professional.course""","""unknown""","""yes""","""no""","""cellular""","""aug""","""thu""",4,999,0,"""nonexistent""",1.4,93.444,-36.1,4.963,5228.1,"""no""",0.064705
