In [2]:
import os, sys

import sgpp, sgml, dproc, sgutil
import pandas as pd
import polars as pl
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

import warnings
# Suppress FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Record the interpreter version in the notebook output.
print(sys.version)

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

3.12.9 (main, Mar 15 2025, 13:36:28) [GCC 13.3.0]


In [3]:
sc = sgutil.SGCache('img', 'result', 'model')


def _feature_exprs():
    """Return the column-name -> polars expression mapping passed to sgpp.ExprProcessor."""
    ep_len = pl.col('Episode_Length_minutes')
    guest_pop = pl.col('Guest_Popularity_percentage')
    host_pop = pl.col('Host_Popularity_percentage')
    n_ads = pl.col('Number_of_Ads')
    # Ad counts above 4 are replaced by 0; every other value passes through unchanged.
    n_ads_capped = pl.when(n_ads > 4).then(0).otherwise(n_ads)
    return {
        # Episode length clipped to [5, 120] and rescaled to [0, 1].
        'ELm_num': (ep_len.clip(5, 120) - 5) / 115,
        # Popularity percentages clipped to [0, 100] and rescaled to [0, 1].
        'GP': guest_pop.clip(0, 100) / 100,
        'HP': host_pop.clip(0, 100) / 100,
        # Float version of the ad count (nulls -> 0.0) divided by 3.
        'NAd': n_ads_capped.fill_null(0.0) / 3,
        # Integer version of the ad count (nulls -> 0) stored as Int8.
        'Number_of_Ads': n_ads_capped.fill_null(0).cast(pl.Int8),
        # Missing-value indicator flags.
        'ELm_na': ep_len.is_null(),
        'GPp_na': guest_pop.is_null(),
        # NOTE(review): the mean used for filling is presumably evaluated on whichever
        # frame is being transformed (train or test) - confirm against sgpp.ExprProcessor.
        'ELm_sqrt': ((ep_len.fill_null(ep_len.mean()).clip(5, 120) - 5) / 115) ** 0.5,
    }


p = make_pipeline(
    sgpp.PolarsProcessor(),
    sgpp.ExprProcessor(_feature_exprs()),
    sgpp.PandasConverter(index_col = 'id'),
    # Impute the remaining missing values of the two listed numeric columns.
    sgpp.ApplyWrapper(SimpleImputer().set_output(transform = 'pandas'), ['ELm_num', 'GP']),
)
# Fit on the training file only, then transform both files with the fitted pipeline.
p.fit(['data/train.csv'])
df_train = p.transform(['data/train.csv'])
df_test = p.transform(['data/test.csv'])

In [4]:
# Column names of the prediction target and of the residual target.
target = 'Listening_Time_minutes'
resi = 'resi'
# Keep only the rows whose ELm_na flag is False.
df_train_1 = df_train[~df_train['ELm_na']]
df_test_1 = df_test[~df_test['ELm_na']]
# Every column of the training frame except the target.
X_all = [col for col in df_train.columns if col != target]

In [5]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold, cross_validate, cross_val_score, ShuffleSplit, train_test_split
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
# Seeded splitters: 5 shuffled folds, and a single shuffled split with 80% of rows for training.
kf = KFold(n_splits = 5, shuffle = True, random_state = 123)
ss = ShuffleSplit(n_splits = 1, train_size = 0.8, random_state = 123)

def get_validation_splitter(validation_fraction, random_state = 123):
    """Build a callable that splits data into train and validation parts.

    Parameters
    ----------
    validation_fraction
        Passed to ``train_test_split`` as ``test_size``.
    random_state
        Seed forwarded to ``train_test_split`` so the hold-out split is the
        same on every run. Defaults to 123, the seed used by the other
        splitters in this notebook; pass ``None`` for an unseeded split.

    Returns
    -------
    callable
        ``split(x)`` returning the result of ``train_test_split`` for ``x``.
    """
    def split(x):
        # Without a fixed seed the validation rows differ between runs, which
        # makes scores and cached results non-reproducible.
        return train_test_split(x, test_size = validation_fraction, random_state = random_state)
    return split

# Training configuration for models fitted on the original target column.
config = {
    # Predict with model m on columns X of df; the result is a Series sharing df's index.
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index),
    # RMSE between df's target column and the predictions (target is looked up at call time).
    'score_func': lambda df, prds: root_mean_squared_error(df[target], prds),
    # Factory that turns a validation fraction into a train/validation splitting callable.
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    'y': target,
}

# Same layout as config, but scored against and targeting the residual column,
# with its own ProgressCallBack instance and train scores switched off.
config2 = {
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index),
    # RMSE between df's residual column and the predictions (resi is looked up at call time).
    'score_func': lambda df, prds: root_mean_squared_error(df[resi], prds),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': False,
    'y': resi,
}

# Adapters wrapping each estimator class for use with sgml.
# NOTE(review): progress = 50 presumably sets a progress-reporting interval - confirm in sgml.
lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor, progress = 50)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor, progress = 50)

- 선형 모델의 잔차를 타겟으로 하는 모델을 만듭니다.
- 이를 위해 Feature Engineering 단계의 Target Encoding용 속성 선택 과정에서 도출한 변수를 사용합니다.

In [6]:
# Training rows: attach the stored 'LR_Elm_notna' predictions, then derive the
# residual (target minus prediction) that the later models are trained on.
df_train_1 = (
    df_train_1
    .assign(prd = sc.read_prd('LR_Elm_notna'))
    .assign(resi = lambda frame: frame[target] - frame['prd'])
)
# Test rows: obtain the prediction from the predictor returned for the same cached model.
df_test_1 = df_test_1.assign(prd = sc.get_predictor_cv('LR_Elm_notna', config)(df_test_1))

In [7]:
# Load the stored 'cv_lgb2' result and score its predictions on the rows whose
# ELm_na flag is False; the value is displayed as the cell output.
rmse_oof, prd_oof_lgb = sc.read_result('cv_lgb2')
root_mean_squared_error(
    y_true = df_train.loc[~df_train['ELm_na'], target],
    y_pred = prd_oof_lgb.loc[~df_train['ELm_na']],
)

9.462844973040315

In [14]:
X_tgt_sel, _ = sc.read_result('tgt_sel_resi2') # Combined features derived through the feature-selection process
from sklearn.preprocessing import TargetEncoder

def tge_proc(df_train, df_valid):
    """Target-encode the selected feature combinations against the residual.

    Each entry of ``X_tgt_sel`` is split on ``'__'`` into source column names.
    Those columns are combined by ``sgpp.CatCombiner2`` and then encoded with
    a ``TargetEncoder`` fitted on ``df_train`` with ``df_train[resi]`` as the
    target.

    Parameters
    ----------
    df_train
        Frame used to fit the encoder; must contain the source columns and
        the ``resi`` column.
    df_valid
        Frame encoded with the fitted encoder; must contain the source columns.

    Returns
    -------
    tuple
        ``(encoded_train, encoded_valid)`` as pandas output.
    """
    # Split the combined names once and reuse the result for both purposes.
    combos = pd.Series(X_tgt_sel).str.split('__')
    X_tgt_var = combos.explode().unique().tolist()
    tge = make_pipeline(
        sgpp.CatCombiner2(combos.tolist()),
        # fit_transform cross-fits with shuffled folds; a fixed seed keeps the
        # cached encodings reproducible. The residual is a regression target,
        # so the target type is stated explicitly instead of being inferred.
        TargetEncoder(target_type = 'continuous', random_state = 123)
    ).set_output(transform = 'pandas')
    return tge.fit_transform(df_train[X_tgt_var], df_train[resi]), tge.transform(df_valid[X_tgt_var])

In [17]:
# For every fold of kf, compute the target-encoded train/validation frames with tge_proc
# and store them under the key 'kf_sp_resi_<fold number>'.
# NOTE(review): rerun = 0 presumably reuses an existing cached result - confirm in sgutil.
for i, (train_idx, valid_idx) in enumerate(kf.split(df_train_1[X_all], df_train_1[resi])):
    print("generating {} Fold".format(i))
    sc.cache_result(
        'kf_sp_resi_{}'.format(i),
        lambda : tge_proc(df_train_1.iloc[train_idx], df_train_1.iloc[valid_idx]), rerun = 0
    )

generating 0 Fold
generating 1 Fold
generating 2 Fold
generating 3 Fold
generating 4 Fold


In [18]:
# Encoder fitted on all of df_train_1 and applied to df_test_1, stored under 'tgt_enc_resi'.
# The return value is discarded here.
_ =sc.cache_result(
    'tgt_enc_resi', lambda : tge_proc(df_train_1, df_test_1), rerun = 0
)

In [20]:
def cv_tgt_resi(hparams, adapter, test_run = False, **argv):
    """Cross-validate a residual model on the folds of ``kf``.

    For each fold, the cached target-encoded frames ('kf_sp_resi_<fold>') are
    joined to the matching rows of ``df_train_1``, a model is trained with
    ``sgml.train`` under ``config2``, and the validation rows are predicted.

    Parameters
    ----------
    hparams
        Hyper-parameter specification forwarded to ``sgml.train``.
    adapter
        Model adapter forwarded to ``sgml.train``.
    test_run
        When true, stop after the first fold.
    **argv
        Extra keyword arguments forwarded to ``sgml.train``.

    Returns
    -------
    tuple
        ``(per-fold RMSE list, index-sorted Series of validation predictions)``.
    """
    short_names = {'Episode_Title': 'ET', 'Genre': 'G'}

    def with_encodings(row_idx, df_tgt):
        # Pick rows by position, shorten two column names, then join the encoded frame.
        return dproc.join_and_assign(
            df_train_1.iloc[row_idx].rename(columns = short_names),
            df_tgt
        )

    fold_scores, fold_preds = [], []
    for fold_no, (train_idx, valid_idx) in enumerate(kf.split(df_train_1[X_all], df_train_1[target])):
        df_tgt_train, df_tgt_valid = sc.read_result('kf_sp_resi_{}'.format(fold_no))
        result = sgml.train(
            with_encodings(train_idx, df_tgt_train), hparams, config2, adapter, **argv
        )
        predictor = sgml.assemble_predictor(**result[0], spec = result[1], config = config2)
        fold_prd = predictor(with_encodings(valid_idx, df_tgt_valid))
        fold_preds.append(fold_prd)
        # Score the fold against the residual of its validation rows.
        fold_scores.append(
            root_mean_squared_error(df_train_1.iloc[valid_idx][resi], fold_prd)
        )
        if test_run:
            break
    return fold_scores, pd.concat(fold_preds).sort_index()

# LGB3

In [56]:
# Hyper-parameters for the LGB3 run: model settings, categorical and numeric
# feature lists (numeric includes the target-encoded columns), and the
# fraction handed to the validation splitter.
hparams = {
    'model_params': {
        'num_leaves': 255,
        'n_estimators': 3000,
        'learning_rate': 0.01,
        'colsample_bytree': 0.7,
        'metric': 'rmse',
    },
    'X_cat': [
        'Podcast_Name', 'ET', 'G',
        'Publication_Day', 'Publication_Time', 'Episode_Sentiment',
    ],
    'X_num': ['GP', 'HP', 'NAd', 'ELm_num'] + X_tgt_sel,
    'validation_fraction': 0.1,
}

In [57]:
# Run the residual cross-validation with the LightGBM adapter and store it as "cv_lgb3".
# NOTE(review): test_run = True makes cv_tgt_resi stop after the first fold, yet the
# result is stored under the final "cv_lgb3" key; rerun = True presumably forces a
# recomputation on every execution (other cells pass rerun = 0). Confirm both settings
# are intended and not leftover debugging values.
result = sc.cache_result(
    "cv_lgb3", 
    lambda : cv_tgt_resi(hparams, lgb_adapter, test_run = True),
    rerun = True
)

Round:   0%|          | 0/3000 [00:00<?, ?it/s]



In [58]:
# Display the (per-fold RMSE list, validation predictions) pair returned by cv_tgt_resi.
result

([9.494932022843424],
 id
 7        -11.925543
 11         2.764987
 12        -0.947146
 13        -5.415752
 15        10.825530
             ...    
 749986    -4.286009
 749988    -0.393125
 749993    -0.410707
 749995    -1.558802
 749997    -2.453536
 Length: 132582, dtype: float64)