In [5]:
import os, sys

import sgpp, sgml, dproc, sgutil
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

print(sys.version)

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

3.12.9 (main, Mar 15 2025, 13:36:28) [GCC 13.3.0]


In [6]:
# Project cache helper; the three arguments are presumably the image / result /
# model cache directories -- confirm against sgutil.SGCache.
sc = sgutil.SGCache('img', 'result', 'model')

# Bounds used to clip and min-max scale Episode_Length_minutes.
ELM_LO, ELM_HI = 5, 120
ELM_SPAN = ELM_HI - ELM_LO

elm = pl.col('Episode_Length_minutes')
# Ad counts above 4 are replaced by 0; nulls are filled per feature below.
ads_capped = pl.when(pl.col('Number_of_Ads') > 4).then(0).otherwise(pl.col('Number_of_Ads'))

feature_exprs = {
    # Scaled numeric features; nulls in ELm_num / GP are left for the imputer step.
    'ELm_num': (elm.clip(ELM_LO, ELM_HI) - ELM_LO) / ELM_SPAN,
    'GP': pl.col('Guest_Popularity_percentage').clip(0, 100) / 100,
    'HP': pl.col('Host_Popularity_percentage').clip(0, 100) / 100,
    'NAd': ads_capped.fill_null(0.0) / 3,
    # Overwrites the raw column with the cleaned integer ad count.
    'Number_of_Ads': ads_capped.fill_null(0).cast(pl.Int8),
    # Missing-value indicators.
    'ELm_na': elm.is_null(),
    'GPp_na': pl.col('Guest_Popularity_percentage').is_null(),
    # NOTE(review): nulls are filled with the column mean expression here, while
    # ELm_num nulls go through the SimpleImputer below -- confirm both fills are
    # meant to use the same reference data.
    'ELm_sqrt': ((elm.fill_null(elm.mean()).clip(ELM_LO, ELM_HI) - ELM_LO) / ELM_SPAN) ** 0.5,
}

# SimpleImputer with default settings, emitting pandas output.
mean_imputer = SimpleImputer().set_output(transform='pandas')

p = make_pipeline(
    sgpp.PolarsProcessor(),
    sgpp.ExprProcessor(feature_exprs),
    sgpp.PandasConverter(index_col='id'),
    sgpp.ApplyWrapper(mean_imputer, ['ELm_num', 'GP']),
)

# Fit on the training file only, then transform train and test with the fitted pipeline.
p.fit(['data/train.csv'])
df_train = p.transform(['data/train.csv'])
df_test = p.transform(['data/test.csv'])

In [7]:
# Column names: the competition target and the residual column created later.
target = 'Listening_Time_minutes'
resi = 'resi'

# Keep only rows whose Episode_Length_minutes was observed (ELm_na is False).
elm_present_train = ~df_train['ELm_na']
elm_present_test = ~df_test['ELm_na']
df_train_1 = df_train.loc[elm_present_train]
df_test_1 = df_test.loc[elm_present_test]

# Every training column except the target.
X_all = [col for col in df_train.columns if col != target]

- 거의 동일한 방법으로 Target Encoder 기반으로 제출셋을 만들었지만 5 Fold로 했을 때 성능이 좋으면서 
  Leader Board상의 스코어와 비슷한 결과를 보여주므로, 4 Fold에서 5 Fold로 변경합니다.

- 1-5에서 ELm_num이 결측인 행을 제외하지 않고 진행한 실험 결과로 미루어 볼 때, ELm_num, ELm_sqrt를 제외한 나머지 변수는 Target Encoder로 모델링하는 편이 오히려 낫다고 판단됩니다.

- Elm_num이 미결측일 때는 잔차를 이용한 Target Encoder가 더 도움이 된다고 판단이 되어 Elm_num이 미결측일 때는 따로 모델을 만듭니다.

- df_train_1에서 'GP', 'HP', 'NAd', 'ELm_num', 'ELm_sqrt'를 사용하여 선형회귀모델을 만들고, 그 잔차를 resi 변수로 두어 이를 Target Encoding하여 모델을 제작합니다.

In [8]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold, cross_validate, cross_val_score, ShuffleSplit
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import TargetEncoder
from sklearn.model_selection import train_test_split

# 5-fold shuffled CV and a single 80/20 shuffle split, both seeded with 123.
kf = KFold(n_splits=5, shuffle=True, random_state=123)
ss = ShuffleSplit(n_splits=1, train_size=0.8, random_state=123)

def get_validation_splitter(validation_fraction, random_state=123):
    """Return a callable that holds out ``validation_fraction`` of its input.

    The returned callable forwards its single argument to
    ``train_test_split`` and returns that function's result.

    Parameters
    ----------
    validation_fraction : float or int
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, default 123
        Seed for the split. It defaults to the seed used by the other
        splitters in this notebook so repeated runs produce the same
        validation subset. Pass ``None`` for an unseeded split.
    """
    def split(x):
        return train_test_split(
            x, test_size=validation_fraction, random_state=random_state
        )

    return split

def _make_cv_config(y_col):
    """Build the config dict passed to the sgml/sgutil training helpers.

    ``y_col`` is bound once, so the ``'y'`` entry and the column scored by
    ``score_func`` always refer to the same column.

    Parameters
    ----------
    y_col : str
        Name of the column the model is trained to predict.

    Returns
    -------
    dict
        A fresh config with its own ``sgml.ProgressCallBack`` instance.
    """
    return {
        # Predictions come back as a Series aligned to the input frame's index.
        'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index),
        # RMSE between the frame's y_col and the predictions.
        'score_func': lambda df, prds: root_mean_squared_error(df[y_col], prds),
        'validation_splitter': get_validation_splitter,
        'progress_callback': sgml.ProgressCallBack(),
        'return_train_scores': True,
        'y': y_col,
    }


# config predicts the raw target; config2 predicts the residual column.
config = _make_cv_config(target)
config2 = _make_cv_config(resi)

# sgml adapters wrapping each estimator class; the boosting adapters share one
# progress setting.
boosting_progress = 50
lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor, progress=boosting_progress)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor, progress=boosting_progress)

# LR_Elm_notna

Target_1 'GP', 'HP', 'NAd',  Elm, Elm_sqrt로 선형회귀모델 만들기

In [9]:
# Numeric inputs of the linear model fitted on rows with an observed episode length.
lr_features = ['GP', 'HP', 'NAd', 'ELm_sqrt', 'ELm_num']
hparams = {'X_num': lr_features}

result = sc.cv_result(
    'LR_Elm_notna',
    df_train_1,
    kf,
    hparams,
    config,
    lr_adapter,
    result_proc=[sgml.lr_learning_result],
)
# Mean validation score across the folds.
np.mean(result['valid_scores'])

10.683446884155273

In [10]:
# Train the 'LR_Elm_notna' model on df_train_1 through the cache helper
# (presumably reusing the settings registered by the cv_result call for the
# same name -- confirm against sgutil).
sc.train_cv('LR_Elm_notna', df_train_1, config)

{'model': LinearRegression(),
 'preprocessor': None,
 'spec': ['ELm_num', 'GP', 'NAd', 'HP', 'ELm_sqrt']}

In [11]:
# Predictions stored for 'LR_Elm_notna' (presumably the out-of-fold CV
# predictions -- confirm against sgutil.SGCache.read_prd).
lr_train_prd = sc.read_prd('LR_Elm_notna')
df_train_1 = df_train_1.assign(
    prd=lr_train_prd,
    # Residual of the linear model: target minus its prediction.
    resi=lambda frame: frame[target] - frame['prd'],
)

# Score the test rows with the predictor built from the trained model.
lr_predict = sc.get_predictor_cv('LR_Elm_notna', config)
df_test_1 = df_test_1.assign(prd=lr_predict(df_test_1))

# Target Encoding1

- 상호작용 수준이 낮은 것에서 시작하여, target encoding 결과의 RMSE를 구해 RMSE가 낮은 순(오름차순)으로 변수를 하나씩 추가하고, 성능이 향상되면 해당 변수를 선택합니다.

- 4수준의 상호작용까지 변수 선택 작업을 합니다.

In [8]:
from itertools import combinations

# Raw columns that are combined and target-encoded.
X_values = [
    'Episode_Length_minutes',
    'Host_Popularity_percentage',
    'Guest_Popularity_percentage',
    'Number_of_Ads',
    'Episode_Sentiment',
    'Episode_Title',
    'Genre',
    'Podcast_Name',
    'Publication_Day',
    'Publication_Time',
]

# 75/25 hold-out split of the full training frame used for feature selection.
df_train_tgt1, df_test_tgt1 = train_test_split(
    df_train,
    train_size=0.75,
    shuffle=True,
    random_state=123,
)

In [9]:
def get_best_combi_target(combi):
    """Greedy forward selection of target-encoded column combinations.

    A ``CatCombiner2(combi)`` + ``TargetEncoder`` pipeline is fitted on
    ``df_train_tgt1`` against ``target`` and applied to ``df_test_tgt1``.
    Each encoded column is scored by its hold-out RMSE used directly as a
    prediction. Starting from the best column, the remaining columns are
    tried in ascending RMSE order. A column is kept only when a
    ``LinearRegression`` on the kept columns plus the candidate lowers the
    hold-out RMSE.

    Parameters
    ----------
    combi : list of list of str
        Column combinations handed to ``sgpp.CatCombiner2``.

    Returns
    -------
    tuple
        ``(selected_columns, best_rmse)`` measured on ``df_test_tgt1``.
    """
    encoder = make_pipeline(
        sgpp.CatCombiner2(combi),
        TargetEncoder(cv=4, random_state=123),
    ).set_output(transform='pandas')

    y_fit = df_train_tgt1[target]
    y_holdout = df_test_tgt1[target]
    enc_fit = encoder.fit_transform(df_train_tgt1[X_values], y_fit)
    enc_holdout = encoder.transform(df_test_tgt1[X_values])

    # Hold-out RMSE of every encoded column on its own, best first.
    ranked = enc_holdout.apply(
        lambda col: root_mean_squared_error(y_holdout, col)
    ).sort_values()

    selected = [ranked.index[0]]
    best_rmse = ranked.iloc[0]
    model = LinearRegression()
    for candidate in ranked.index[1:]:
        trial = selected + [candidate]
        model.fit(enc_fit[trial], y_fit)
        trial_rmse = root_mean_squared_error(
            y_holdout, model.predict(enc_holdout[trial])
        )
        # Keep the candidate only on a strict improvement.
        if trial_rmse < best_rmse:
            selected, best_rmse = trial, trial_rmse
    return selected, best_rmse

In [10]:
# Candidates: every single column followed by every pair of columns.
single_and_pair_combis = [
    list(combo) for size in (1, 2) for combo in combinations(X_values, size)
]
X_lr_best, rmse_best = get_best_combi_target(single_and_pair_combis)
X_lr_best, rmse_best

(['Episode_Length_minutes',
  'Episode_Length_minutes,Number_of_Ads',
  'Episode_Length_minutes,Episode_Sentiment',
  'Episode_Length_minutes,Publication_Time',
  'Episode_Length_minutes,Publication_Day',
  'Episode_Length_minutes,Genre',
  'Episode_Length_minutes,Podcast_Name',
  'Episode_Length_minutes,Episode_Title',
  'Episode_Length_minutes,Guest_Popularity_percentage',
  'Episode_Length_minutes,Host_Popularity_percentage',
  'Host_Popularity_percentage,Guest_Popularity_percentage',
  'Number_of_Ads,Episode_Title',
  'Number_of_Ads,Podcast_Name',
  'Host_Popularity_percentage',
  'Number_of_Ads,Episode_Sentiment',
  'Number_of_Ads,Genre',
  'Number_of_Ads,Publication_Day',
  'Number_of_Ads',
  'Episode_Sentiment,Episode_Title',
  'Guest_Popularity_percentage',
  'Episode_Title,Publication_Time',
  'Episode_Title,Publication_Day',
  'Episode_Title',
  'Host_Popularity_percentage,Number_of_Ads',
  'Episode_Sentiment,Podcast_Name',
  'Episode_Title,Podcast_Name',
  'Podcast_Name,Publ

In [11]:
# NOTE(review): names are split on ',' here while the residual-encoding cells
# split on '__' -- confirm which separator sgpp.CatCombiner2 currently emits.
kept_combis = [name.split(',') for name in X_lr_best]
# Base columns that appear in any selected pair, in first-seen order.
pair_columns = pd.Series(
    [parts for parts in kept_combis if len(parts) == 2]
).explode().unique()
three_way_combis = [list(combo) for combo in combinations(pair_columns, 3)]

X_lr_best, rmse_best = get_best_combi_target(kept_combis + three_way_combis)
X_lr_best, rmse_best

(['Episode_Length_minutes',
  'Episode_Length_minutes,Number_of_Ads',
  'Episode_Length_minutes,Episode_Sentiment',
  'Episode_Length_minutes,Publication_Time',
  'Episode_Length_minutes,Publication_Day',
  'Episode_Length_minutes,Genre',
  'Episode_Length_minutes,Number_of_Ads,Episode_Sentiment',
  'Episode_Length_minutes,Episode_Sentiment,Publication_Time',
  'Episode_Length_minutes,Number_of_Ads,Publication_Time',
  'Episode_Length_minutes,Episode_Sentiment,Publication_Day',
  'Episode_Length_minutes,Number_of_Ads,Publication_Day',
  'Episode_Length_minutes,Publication_Time,Publication_Day',
  'Episode_Length_minutes,Episode_Sentiment,Genre',
  'Episode_Length_minutes,Number_of_Ads,Genre',
  'Episode_Length_minutes,Publication_Time,Genre',
  'Episode_Length_minutes,Podcast_Name',
  'Episode_Length_minutes,Publication_Day,Genre',
  'Episode_Length_minutes,Episode_Title',
  'Episode_Length_minutes,Episode_Sentiment,Podcast_Name',
  'Episode_Length_minutes,Number_of_Ads,Podcast_Name',


In [12]:
def _select_tgt_four_way():
    """Re-run the target selection with 4-way combinations added.

    Candidates are the currently selected combinations plus every 4-way
    combination of the base columns that appear in a selected triple.
    """
    # NOTE(review): names are split on ',' here while the residual-encoding
    # cells split on '__' -- confirm which separator CatCombiner2 currently emits.
    kept_combis = [name.split(',') for name in X_lr_best]
    triple_columns = pd.Series(
        [parts for parts in kept_combis if len(parts) == 3]
    ).explode().unique()
    four_way_combis = [list(combo) for combo in combinations(triple_columns, 4)]
    return get_best_combi_target(kept_combis + four_way_combis)


# Stored under 'tgt_sel'; rerun = 0 presumably reuses an existing cached
# result instead of recomputing -- confirm against sgutil.SGCache.cache_result.
sc.cache_result('tgt_sel', _select_tgt_four_way, rerun = 0)

(['Episode_Length_minutes',
  'Episode_Length_minutes,Number_of_Ads',
  'Episode_Length_minutes,Episode_Sentiment',
  'Episode_Length_minutes,Publication_Time',
  'Episode_Length_minutes,Publication_Day',
  'Episode_Length_minutes,Genre',
  'Episode_Length_minutes,Number_of_Ads,Episode_Sentiment',
  'Episode_Length_minutes,Episode_Sentiment,Publication_Time',
  'Episode_Length_minutes,Number_of_Ads,Publication_Time',
  'Episode_Length_minutes,Episode_Sentiment,Publication_Day',
  'Episode_Length_minutes,Number_of_Ads,Publication_Day',
  'Episode_Length_minutes,Publication_Time,Publication_Day',
  'Episode_Length_minutes,Episode_Sentiment,Genre',
  'Episode_Length_minutes,Number_of_Ads,Genre',
  'Episode_Length_minutes,Publication_Time,Genre',
  'Episode_Length_minutes,Number_of_Ads,Episode_Sentiment,Publication_Time',
  'Episode_Length_minutes,Podcast_Name',
  'Episode_Length_minutes,Publication_Day,Genre',
  'Episode_Length_minutes,Number_of_Ads,Episode_Sentiment,Publication_Day',
  '

# Target Encoding2

- Episode_Length_minutes가 미결측인 경우의 resi에 대한 Target Encoding

In [12]:
# Raw columns that are combined and target-encoded against the residual.
numeric_like_columns = [
    'Episode_Length_minutes', 'Host_Popularity_percentage',
    'Guest_Popularity_percentage', 'Number_of_Ads',
]
categorical_columns = [
    'Episode_Sentiment', 'Episode_Title', 'Genre',
    'Podcast_Name', 'Publication_Day', 'Publication_Time',
]
X_values = numeric_like_columns + categorical_columns

# 75/25 hold-out split of the rows with an observed episode length.
df_train_tgt2, df_test_tgt2 = train_test_split(
    df_train_1, train_size=0.75, shuffle=True, random_state=123
)

In [13]:
def get_best_combi_resi(combi):
    """Greedy forward selection of target-encoded combinations for ``'resi'``.

    Works on the ``df_train_tgt2`` / ``df_test_tgt2`` hold-out split. The
    combinations in ``combi`` are encoded against the ``'resi'`` column and
    each encoded column is ranked by its hold-out RMSE. Columns are then
    added in that order whenever a linear regression on the enlarged set
    lowers the hold-out RMSE.

    Parameters
    ----------
    combi : list of list of str
        Column combinations handed to ``sgpp.CatCombiner2``.

    Returns
    -------
    tuple
        ``(selected_columns, best_rmse)`` measured on ``df_test_tgt2``.
    """
    pipeline = make_pipeline(
        sgpp.CatCombiner2(combi),
        TargetEncoder(cv=4, random_state=123),
    ).set_output(transform='pandas')
    fit_encoded = pipeline.fit_transform(df_train_tgt2[X_values], df_train_tgt2['resi'])
    holdout_encoded = pipeline.transform(df_test_tgt2[X_values])

    def holdout_rmse(prediction):
        # RMSE of a prediction against the hold-out residuals.
        return root_mean_squared_error(df_test_tgt2['resi'], prediction)

    # Each encoded column scored on its own, lowest RMSE first.
    column_scores = holdout_encoded.apply(lambda col: holdout_rmse(col)).sort_values()
    ordered_columns = column_scores.index

    chosen = [ordered_columns[0]]
    lowest_rmse = column_scores.iloc[0]
    regressor = LinearRegression()
    for column in ordered_columns[1:]:
        attempt = chosen + [column]
        regressor.fit(fit_encoded[attempt], df_train_tgt2['resi'])
        attempt_rmse = holdout_rmse(regressor.predict(holdout_encoded[attempt]))
        if lowest_rmse > attempt_rmse:
            lowest_rmse = attempt_rmse
            chosen = attempt
    return chosen, lowest_rmse

In [11]:
from itertools import combinations
# Candidates for the residual encoding: all single columns, then all pairs.
resi_candidates = []
for size in (1, 2):
    resi_candidates.extend(list(combo) for combo in combinations(X_values, size))

X_lr_best, rmse_best = get_best_combi_resi(resi_candidates)
X_lr_best, rmse_best

(['Episode_Length_minutes__Host_Popularity_percentage',
  'Episode_Length_minutes__Guest_Popularity_percentage',
  'Episode_Length_minutes',
  'Host_Popularity_percentage__Guest_Popularity_percentage',
  'Episode_Length_minutes__Number_of_Ads',
  'Episode_Length_minutes__Episode_Sentiment',
  'Episode_Sentiment__Podcast_Name',
  'Episode_Sentiment__Genre',
  'Number_of_Ads__Podcast_Name',
  'Number_of_Ads__Episode_Sentiment',
  'Episode_Sentiment__Episode_Title',
  'Episode_Sentiment__Publication_Day',
  'Episode_Sentiment__Publication_Time',
  'Podcast_Name',
  'Podcast_Name__Publication_Day',
  'Podcast_Name__Publication_Time',
  'Episode_Sentiment',
  'Number_of_Ads__Episode_Title',
  'Episode_Title__Genre',
  'Episode_Title',
  'Episode_Title__Publication_Time',
  'Publication_Day__Publication_Time',
  'Episode_Title__Publication_Day',
  'Publication_Time',
  'Host_Popularity_percentage',
  'Episode_Title__Podcast_Name',
  'Guest_Popularity_percentage',
  'Episode_Length_minutes__P

In [26]:
def _select_resi_three_way():
    """Re-run the residual selection with every 3-way combination added.

    Candidates are the currently selected combinations (names split on
    ``'__'``) plus all 3-way combinations of ``X_values``.
    """
    kept_combis = [name.split('__') for name in X_lr_best]
    three_way_combis = [list(combo) for combo in combinations(X_values, 3)]
    return get_best_combi_resi(kept_combis + three_way_combis)


# Stored under 'tgt_sel_resi'; rerun = 0 presumably reuses an existing cached
# result instead of recomputing -- confirm against sgutil.SGCache.cache_result.
X_lr_best, rmse_best = sc.cache_result(
    'tgt_sel_resi', _select_resi_three_way, rerun = 0
)
X_lr_best, rmse_best

(['Episode_Length_minutes__Host_Popularity_percentage',
  'Episode_Length_minutes__Host_Popularity_percentage__Number_of_Ads',
  'Episode_Length_minutes__Host_Popularity_percentage__Episode_Sentiment',
  'Episode_Length_minutes__Host_Popularity_percentage__Publication_Time',
  'Episode_Length_minutes__Host_Popularity_percentage__Publication_Day',
  'Episode_Length_minutes__Guest_Popularity_percentage',
  'Episode_Length_minutes__Episode_Title__Podcast_Name',
  'Episode_Length_minutes__Host_Popularity_percentage__Genre',
  'Episode_Length_minutes__Host_Popularity_percentage__Episode_Title',
  'Episode_Length_minutes__Host_Popularity_percentage__Podcast_Name',
  'Episode_Length_minutes',
  'Episode_Length_minutes__Host_Popularity_percentage__Guest_Popularity_percentage',
  'Episode_Length_minutes__Guest_Popularity_percentage__Episode_Title',
  'Host_Popularity_percentage__Episode_Title__Podcast_Name',
  'Episode_Length_minutes__Guest_Popularity_percentage__Episode_Sentiment',
  'Host_Pop

In [27]:
def _select_resi_four_way():
    """Re-run the residual selection with 4-way combinations added.

    Candidates are the currently selected combinations (names split on
    ``'__'``) plus every 4-way combination of the base columns that appear
    in a selected triple.
    """
    kept_combis = [name.split('__') for name in X_lr_best]
    triple_columns = pd.Series(
        [parts for parts in kept_combis if len(parts) == 3]
    ).explode().unique()
    four_way_combis = [list(combo) for combo in combinations(triple_columns, 4)]
    return get_best_combi_resi(kept_combis + four_way_combis)


# Stored under 'tgt_sel_resi2'; rerun = 0 presumably reuses an existing cached
# result instead of recomputing -- confirm against sgutil.SGCache.cache_result.
X_lr_best, rmse_best = sc.cache_result(
    'tgt_sel_resi2', _select_resi_four_way, rerun = 0
)
X_lr_best, rmse_best

(['Episode_Length_minutes__Host_Popularity_percentage',
  'Episode_Length_minutes__Host_Popularity_percentage__Number_of_Ads',
  'Episode_Length_minutes__Host_Popularity_percentage__Episode_Sentiment',
  'Episode_Length_minutes__Host_Popularity_percentage__Publication_Time',
  'Episode_Length_minutes__Host_Popularity_percentage__Number_of_Ads__Episode_Sentiment',
  'Episode_Length_minutes__Host_Popularity_percentage__Publication_Day',
  'Episode_Length_minutes__Host_Popularity_percentage__Episode_Sentiment__Publication_Time',
  'Episode_Length_minutes__Number_of_Ads__Publication_Day__Episode_Title',
  'Episode_Length_minutes__Host_Popularity_percentage__Number_of_Ads__Publication_Time',
  'Episode_Length_minutes__Publication_Time__Publication_Day__Episode_Title',
  'Episode_Length_minutes__Host_Popularity_percentage__Number_of_Ads__Publication_Day',
  'Episode_Length_minutes__Host_Popularity_percentage__Episode_Sentiment__Publication_Day',
  'Episode_Length_minutes__Episode_Sentiment__