In [1]:
import os, sys

import sgpp, sgml, dproc, sgutil, sgnn
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

print(sys.version)

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

2025-04-28 21:23:07.558767: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-28 21:23:07.689836: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745842987.748185     720 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745842987.766046     720 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745842987.882439     720 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

3.12.9 (main, Mar 15 2025, 13:36:28) [GCC 13.3.0]


In [2]:
# Cache object; later cells call sc.cache_result / sc.read_result on it.
sc = sgutil.SGCache('img', 'result', 'model')
# Preprocessing pipeline: derive features with polars expressions, convert to
# pandas (indexed by 'id'), then run a SimpleImputer through sgpp.ApplyWrapper.
p = make_pipeline(
    sgpp.PolarsProcessor(),
    sgpp.ExprProcessor({
        # Episode length clipped to [5, 120] and rescaled to [0, 1]; nulls stay null here.
        'ELm_num': (pl.col('Episode_Length_minutes').clip(5, 120) - 5) / 115,
        # Popularity percentages clipped to [0, 100] and rescaled to [0, 1].
        'GP': pl.col('Guest_Popularity_percentage').clip(0, 100) / 100,
        'HP': pl.col('Host_Popularity_percentage').clip(0, 100) / 100,
        # Ad counts above 4 are replaced by 0, nulls become 0, then the value is divided by 3.
        'NAd': (pl.when(pl.col('Number_of_Ads') > 4).then(0).otherwise(pl.col('Number_of_Ads'))).fill_null(0.0) /3 ,
        # Same cleaning as NAd, but kept as an Int8 count.
        'Number_of_Ads': (pl.when(pl.col('Number_of_Ads') > 4).then(0).otherwise(pl.col('Number_of_Ads'))).fill_null(0).cast(pl.Int8),
        # Missing-value indicator columns.
        'ELm_na': pl.col('Episode_Length_minutes').is_null(),
        'GPp_na': pl.col('Guest_Popularity_percentage').is_null(),
        # Square root of the rescaled episode length; nulls are filled with the column mean first.
        # NOTE(review): if ExprProcessor evaluates this expression at transform time, the mean is
        # taken over whichever file is being transformed (train vs. test) — confirm in sgpp.
        'ELm_sqrt': ((pl.col('Episode_Length_minutes').fill_null(pl.col('Episode_Length_minutes').mean()).clip(5, 120) - 5) / 115)  ** 0.5
    }),
    sgpp.PandasConverter(index_col = 'id'),
    # presumably the imputer is applied to the listed columns only — confirm in sgpp.ApplyWrapper
    sgpp.ApplyWrapper(SimpleImputer().set_output(transform = 'pandas'), ['ELm_num', 'GP'])
)
# Fit on the training file only, then transform both files with the fitted pipeline.
p.fit(['data/train.csv'])
df_train = p.transform(['data/train.csv'])
df_test = p.transform(['data/test.csv'])

In [3]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold, cross_validate, cross_val_score, ShuffleSplit
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
# Both splitters use the same fixed seed (123).
kf = KFold(n_splits=5, shuffle=True, random_state=123)
ss = ShuffleSplit(n_splits=1, train_size=0.8, random_state=123)

# Regression target; every other column of df_train is listed in X_all.
target = 'Listening_Time_minutes'
X_all = [col for col in df_train.columns if col != target]

def get_validation_splitter(validation_fraction):
    """Build a splitter that holds out part of a dataset for validation.

    Parameters
    ----------
    validation_fraction
        Forwarded unchanged to ``train_test_split`` as ``test_size``.

    Returns
    -------
    callable
        One-argument function returning
        ``train_test_split(x, test_size=validation_fraction)``.
    """
    def split(x):
        # train_test_split is not among the names imported by the notebook's
        # import cells, so the splitter used to fail with NameError on first
        # use. Importing lazily here keeps this cell self-contained.
        from sklearn.model_selection import train_test_split
        return train_test_split(x, test_size = validation_fraction)

    return split

# Shared settings handed to sgml.train / sgml.assemble_predictor in later cells.
config = dict(
    # Wrap raw model predictions in a Series that keeps the input frame's index.
    predict_func=lambda m, df, X: pd.Series(m.predict(df[X]), index=df.index),
    # RMSE between the target column of the frame and the predictions.
    score_func=lambda df, prds: root_mean_squared_error(df[target], prds),
    validation_splitter=get_validation_splitter,
    progress_callback=sgml.ProgressCallBack(),
    return_train_scores=True,
    y=target,
)

# One adapter per model family used in the sections below.
lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor, progress=50)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor, progress=50)
cb_adapter = sgml.CBAdapter(cb.CatBoostRegressor, progress=50)
nn_adapter = sgnn.NNAdapter(model=sgnn.NNRegressor, progress=50)

`2.Feature Engineering.ipynb`에서 발견한 속성들로 모델을 학습시킵니다.

In [4]:
# Reload the stored 'tgt_sel' result; only its first element is kept (as X_tgt_sel).
X_tgt_sel, _ = sc.read_result('tgt_sel')
from sklearn.preprocessing import TargetEncoder

def tge_proc(df_train, df_valid):
    """Target-encode the selected category combinations.

    The encoder pipeline (``sgpp.CatCombiner2`` followed by ``TargetEncoder``)
    is fitted on ``df_train`` only and then applied to ``df_valid``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Rows used to fit the encoder; must contain the ``target`` column and
        every column named in ``X_tgt_sel``.
    df_valid : pandas.DataFrame
        Rows that are only transformed with the fitted encoder.

    Returns
    -------
    tuple
        ``(encoded_train, encoded_valid)`` as produced by
        ``fit_transform`` on ``df_train`` and ``transform`` on ``df_valid``.
    """
    # Each entry of X_tgt_sel is a comma-separated combination of column names.
    combos = pd.Series(X_tgt_sel).str.split(',')
    X_tgt_var = combos.explode().unique().tolist()
    tge = make_pipeline(
        sgpp.CatCombiner2(combos.tolist()),
        # TargetEncoder shuffles its internal cross-fitting folds by default;
        # without a seed the encoded training features change on every run.
        # Results already stored in the cache are unaffected until recomputed.
        TargetEncoder(random_state = 123)
    ).set_output(transform = 'pandas')
    encoded_train = tge.fit_transform(df_train[X_tgt_var], df_train[target])
    return encoded_train, tge.transform(df_valid[X_tgt_var])

In [6]:
# Store the target-encoded (train, valid) pair of every CV fold under 'kf_sp_<i>2'.
for i, (train_idx, valid_idx) in enumerate(
    kf.split(df_train[X_all], df_train[target])
):
    print("generating {} Fold".format(i))
    sc.cache_result(
        'kf_sp_{}2'.format(i),
        lambda: tge_proc(df_train.iloc[train_idx], df_train.iloc[valid_idx]),
        rerun=0,
    )

generating 0 Fold
generating 1 Fold
generating 2 Fold
generating 3 Fold
generating 4 Fold


In [7]:
# Encoding for the final models: fitted on all of df_train, applied to df_test.
_ = sc.cache_result(
    'tgt_enc2',
    lambda: tge_proc(df_train, df_test),
    rerun=0,
)

In [5]:
def cv_tgt2(hparams, adapter, test_run = False, **argv):
    """Cross-validate ``adapter`` on the ``kf`` folds with target-encoded features.

    For each fold, the target-encoded frames stored under ``'kf_sp_<i>2'`` are
    joined onto the raw training / validation rows, a model is trained with
    ``sgml.train`` and the validation rows are predicted and scored.

    Parameters
    ----------
    hparams : dict
        Hyper-parameters forwarded to ``sgml.train``.
    adapter
        Model adapter forwarded to ``sgml.train``.
    test_run : bool, default False
        When true, stop after the first fold.
    **argv
        Extra keyword arguments forwarded to ``sgml.train``.

    Returns
    -------
    tuple
        ``(per-fold RMSE list, out-of-fold predictions concatenated and sorted by index)``.
    """
    short_names = {'Episode_Title': 'ET', 'Genre': 'G'}

    def with_encodings(rows, encoded):
        # Commas in the encoded column names are replaced by underscores
        # before the encoded frame is joined onto the raw rows.
        return dproc.join_and_assign(
            rows.rename(columns = short_names),
            encoded.rename(columns = lambda x: x.replace(',', '_'))
        )

    fold_scores, fold_preds = [], []
    for i, (train_idx, valid_idx) in enumerate(kf.split(df_train[X_all], df_train[target])):
        rows_train = df_train.iloc[train_idx]
        rows_valid = df_train.iloc[valid_idx]
        df_tgt_train, df_tgt_valid = sc.cache_result(
            'kf_sp_{}2'.format(i),
            lambda : tge_proc(rows_train, rows_valid)
        )
        result = sgml.train(
            with_encodings(rows_train, df_tgt_train), hparams, config, adapter, **argv
        )
        predictor = sgml.assemble_predictor(**result[0], spec = result[1], config = config)
        fold_pred = predictor(with_encodings(rows_valid, df_tgt_valid))
        fold_preds.append(fold_pred)
        fold_scores.append(root_mean_squared_error(rows_valid[target], fold_pred))
        if test_run:
            break
    return fold_scores, pd.concat(fold_preds).sort_index()

# Linear Regression 2

In [17]:
X_tgt = X_tgt_sel
# Linear-regression inputs: the numeric features plus the target-encoded
# columns (commas in their names replaced by underscores).
hparams = dict(
    X_num=['GP', 'HP', 'NAd', 'ELm_sqrt', 'ELm_num', 'ELm_na']
    + [name.replace(',', '_') for name in X_tgt],
)

In [18]:
# Cross-validated linear regression, stored under the 'lr2' cache key.
rmse_oof, prd_oof = sc.cache_result('lr2', lambda: cv_tgt2(hparams, lr_adapter))
rmse_oof

[12.805628602113751,
 12.831107080031513,
 12.869922126139686,
 12.89564465274633,
 12.911804869939672]

In [19]:
df_tgt_train, df_tgt_test = sc.read_result('tgt_enc2')
# Train the linear model on the whole training set (cache key "lr2_m2").
result = sc.cache_result(
    "lr2_m2",
    lambda: sgml.train(
        dproc.join_and_assign(
            df_train,
            df_tgt_train.rename(columns=lambda name: name.replace(',', '_')),
        ),
        hparams, config, lr_adapter,
    ),
)
predictor = sgml.assemble_predictor(**result[0], config=config, spec=result[1])
# Predict the test set and write the submission file.
prd = predictor(
    dproc.join_and_assign(
        df_test,
        df_tgt_test.rename(columns=lambda name: name.replace(',', '_')),
    )
)
prd.rename(target).to_csv('result/submission7_2.csv')

# LGB 2

In [20]:
X_tgt = X_tgt_sel
# LightGBM settings; features are the numeric columns plus the target-encoded ones.
hparams = dict(
    model_params=dict(
        num_leaves=1024,
        n_estimators=1500,
        learning_rate=0.01,
        colsample_bytree=0.7,
    ),
    X_num=['GP', 'HP', 'NAd', 'ELm_num', 'ELm_na']
    + [name.replace(',', '_') for name in X_tgt],
    metric='rmse',
)

In [8]:
# Cross-validated LightGBM, stored under the 'cv_lgb2' cache key.
rmse_oof, prd_oof_lgb = sc.cache_result(
    'cv_lgb2',
    lambda: cv_tgt2(hparams, lgb_adapter),
)
np.mean(rmse_oof), rmse_oof

(11.927139537250884,
 [11.883674552343056,
  11.891175999570464,
  11.909504203491974,
  11.974334369606053,
  11.977008561242876])

In [22]:
df_tgt_train, df_tgt_test = sc.read_result('tgt_enc2')
# Train LightGBM on the whole training set (cache key "lgb2_2").
result = sc.cache_result(
    "lgb2_2",
    lambda: sgml.train(
        dproc.join_and_assign(
            df_train,
            df_tgt_train.rename(columns=lambda name: name.replace(',', '_')),
        ),
        hparams, config, lgb_adapter,
    ),
)
predictor = sgml.assemble_predictor(**result[0], config=config, spec=result[1])
# Predict the test set and write the submission file.
prd = predictor(
    dproc.join_and_assign(
        df_test,
        df_tgt_test.rename(columns=lambda name: name.replace(',', '_')),
    )
)
prd.rename(target).to_csv('result/submission5_2.csv')

Round:   0%|          | 0/1500 [00:00<?, ?it/s]

LR2, LGB2 모두 교차 검증 성능과 Public Score 사이에 상당한 차이가 납니다. 원인을 찾기 위한 디버깅이 필요해 보입니다.

> 디버깅 완료: Train과 Test 각각에서 ELm과 GP의 결측치를 복원한 것이 문제였습니다.

# XGB 2

In [15]:
X_tgt = X_tgt_sel
# XGBoost settings; same feature list as the LightGBM run.
hparams = dict(
    model_params=dict(
        max_depth=10,
        n_estimators=1500,
        learning_rate=0.02,
        colsample_bytree=0.7,
    ),
    X_num=['GP', 'HP', 'NAd', 'ELm_num', 'ELm_na']
    + [name.replace(',', '_') for name in X_tgt],
)

In [9]:
# Cross-validated XGBoost (use_gpu=True is forwarded to sgml.train), cache key 'cv_xgb2'.
rmse_oof, prd_oof_xgb = sc.cache_result(
    'cv_xgb2',
    lambda: cv_tgt2(hparams, xgb_adapter, use_gpu=True),
    rerun=0,
)
np.mean(rmse_oof), rmse_oof

(11.955599975585937,
 [11.905922889709473,
  11.917500495910645,
  11.939695358276367,
  12.005415916442871,
  12.009465217590332])

In [9]:
df_tgt_train, df_tgt_test = sc.read_result('tgt_enc2')
# Train XGBoost on the whole training set (cache key "xgb2").
# NOTE(review): the cross-validation run for this configuration passes
# use_gpu=True, while this final fit does not — confirm that is intended.
result = sc.cache_result(
    "xgb2",
    lambda: sgml.train(
        dproc.join_and_assign(
            df_train,
            df_tgt_train.rename(columns=lambda name: name.replace(',', '_')),
        ),
        hparams, config, xgb_adapter,
    ),
)

Round:   0%|          | 0/1500 [00:00<?, ?it/s]

# CAT 1

In [21]:
X_tgt = X_tgt_sel
# CatBoost settings: raw categorical columns (X_cat) in addition to the
# numeric and target-encoded features (X_num).
hparams = dict(
    model_params=dict(max_depth=12, n_estimators=1500, learning_rate=0.05),
    X_cat=[
        'Podcast_Name', 'ET', 'G',
        'Publication_Day', 'Publication_Time', 'Episode_Sentiment',
    ],
    X_num=['GP', 'HP', 'NAd', 'ELm_num', 'ELm_na']
    + [name.replace(',', '_') for name in X_tgt],
)

In [10]:
# Cross-validated CatBoost (use_gpu=True is forwarded to sgml.train), cache key 'cv_cb1'.
rmse_oof, prd_oof_cb = sc.cache_result(
    'cv_cb1',
    lambda: cv_tgt2(hparams, cb_adapter, use_gpu=True),
)
np.mean(rmse_oof), rmse_oof

(12.024958809105506,
 [11.97769259753349,
  11.992119435823222,
  12.008359769471664,
  12.072927122912313,
  12.073695119786848])

In [23]:
df_tgt_train, df_tgt_test = sc.read_result('tgt_enc2')
# Train CatBoost on the whole training set (cache key "cb1"). The raw frame's
# 'Episode_Title' / 'Genre' columns are renamed to the 'ET' / 'G' names used in X_cat.
# NOTE(review): the cross-validation run for this configuration passes
# use_gpu=True, while this final fit does not — confirm that is intended.
result = sc.cache_result(
    "cb1",
    lambda: sgml.train(
        dproc.join_and_assign(
            df_train.rename(columns={'Episode_Title': 'ET', 'Genre': 'G'}),
            df_tgt_train.rename(columns=lambda name: name.replace(',', '_')),
        ),
        hparams, config, cb_adapter,
    ),
)

Round:   0%|          | 0/1500 [00:00<?, ?it/s]

# NN1

In [12]:
X_tgt = X_tgt_sel
# Four ReLU layers without batch norm: 128, 128, 64 and 64 units.
nn_params = {
    'config': [
        {'unit': width, 'activation': 'relu', 'batch_norm': False}
        for width in (128, 128, 64, 64)
    ]
}

# Only the target-encoded columns are fed to the network.
hparams = dict(
    model_params=dict(
        model_params=nn_params,
        epochs=30,
        optimizer=('Adam', {'learning_rate': 0.0001}),
        batch_size=128,
        shuffle_size=102400,
        early_stopping=None,
        reduce_lr_on_plateau=None,
        lr_scheduler=None,
    ),
    X_num=[name.replace(',', '_') for name in X_tgt],
    validation_fraction=0.1,
)

In [None]:
# Smoke test of the NN configuration: test_run=True stops after the first fold.
cv_tgt2(
    hparams,
    nn_adapter,
    test_run=True,
    use_gpu=True,
)

# Ensemble 2

In [23]:
# Out-of-fold RMSE of the 0.7 * LGB + 0.3 * XGB blend.
root_mean_squared_error(
    df_train[target],
    pd.concat([prd_oof_lgb, prd_oof_xgb], axis=1) @ [0.7, 0.3],
)

11.917298470770655

In [24]:
df_tgt_train, df_tgt_test = sc.read_result('tgt_enc2')
# Rebuild the LightGBM predictor from the stored "lgb2_2" result and predict the test set.
result = sc.read_result("lgb2_2")
predictor = sgml.assemble_predictor(**result[0], config=config, spec=result[1])
prd_lgb2_2 = predictor(
    dproc.join_and_assign(
        df_test,
        df_tgt_test.rename(columns=lambda name: name.replace(',', '_')),
    )
)

In [25]:
# Rebuild the XGBoost predictor from the stored "xgb2" result and predict the test set.
result = sc.read_result("xgb2")
predictor = sgml.assemble_predictor(**result[0], config=config, spec=result[1])
prd_xgb2 = predictor(
    dproc.join_and_assign(
        df_test,
        df_tgt_test.rename(columns=lambda name: name.replace(',', '_')),
    )
)

In [29]:
# Write the 0.7 * LGB + 0.3 * XGB blend of the test predictions as a submission.
(
    (pd.concat([prd_lgb2_2, prd_xgb2], axis=1) @ [0.7, 0.3])
    .rename(target)
    .to_csv('result/submission10.csv')
)

# Ensemble 3

In [20]:
# Out-of-fold RMSE of the 0.6 * LGB + 0.3 * XGB + 0.1 * CatBoost blend.
root_mean_squared_error(
    df_train[target],
    pd.concat([prd_oof_lgb, prd_oof_xgb, prd_oof_cb], axis=1) @ [0.6, 0.3, 0.1],
)

11.916032568369

In [26]:
# Rebuild the CatBoost predictor from the stored "cb1" result and predict the
# test set (with the same 'ET' / 'G' column renames used for training).
result = sc.read_result("cb1")
predictor = sgml.assemble_predictor(**result[0], config=config, spec=result[1])
prd_cb = predictor(
    dproc.join_and_assign(
        df_test.rename(columns={'Episode_Title': 'ET', 'Genre': 'G'}),
        df_tgt_test.rename(columns=lambda name: name.replace(',', '_')),
    )
)

In [27]:
# Write the 0.6 / 0.3 / 0.1 blend of LGB, XGB and CatBoost test predictions as a submission.
(
    (pd.concat([prd_lgb2_2, prd_xgb2, prd_cb], axis=1) @ [0.6, 0.3, 0.1])
    .rename(target)
    .to_csv('result/submission11.csv')
)