In [1]:
import os, sys

import sgpp, sgml, dproc, sgutil, sgnn
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

print(sys.version)

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

2025-04-30 01:45:51.310014: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-30 01:45:51.347987: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745945151.368395   74212 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745945151.374811   74212 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745945151.402705   74212 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

3.12.9 (main, Mar 15 2025, 13:36:28) [GCC 13.3.0]


In [2]:
sc = sgutil.SGCache('img', 'result', 'model')

# Column expressions reused by several derived features below.
elm_expr = pl.col('Episode_Length_minutes')
ads_expr = pl.col('Number_of_Ads')
# Ad counts above 4 are replaced by 0; every other value is passed through.
ads_capped = pl.when(ads_expr > 4).then(0).otherwise(ads_expr)

feature_exprs = {
    # Clip to [5, 120], then rescale that range to [0, 1].
    'ELm_num': (elm_expr.clip(5, 120) - 5) / 115,
    # Percentages are clipped to [0, 100] and expressed as fractions.
    'GP': pl.col('Guest_Popularity_percentage').clip(0, 100) / 100,
    'HP': pl.col('Host_Popularity_percentage').clip(0, 100) / 100,
    'NAd': ads_capped.fill_null(0.0) / 3,
    'Number_of_Ads': ads_capped.fill_null(0).cast(pl.Int8),
    # Missing-value indicators.
    'ELm_na': elm_expr.is_null(),
    'GPp_na': pl.col('Guest_Popularity_percentage').is_null(),
    # Square root of the rescaled episode length, nulls filled with the column mean first.
    'ELm_sqrt': ((elm_expr.fill_null(elm_expr.mean()).clip(5, 120) - 5) / 115) ** 0.5,
}

p = make_pipeline(
    sgpp.PolarsProcessor(),
    sgpp.ExprProcessor(feature_exprs),
    sgpp.PandasConverter(index_col = 'id'),
    sgpp.ApplyWrapper(SimpleImputer().set_output(transform = 'pandas'), ['ELm_num', 'GP']),
)
# Fit on the training file only, then push both files through the fitted pipeline.
p.fit(['data/train.csv'])
df_train, df_test = (p.transform([csv_path]) for csv_path in ('data/train.csv', 'data/test.csv'))

In [3]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold, cross_validate, cross_val_score, ShuffleSplit, train_test_split
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
# 4-fold shuffled CV splitter with a fixed seed.
kf = KFold(n_splits = 4, shuffle = True, random_state = 123)
# Single shuffled hold-out split: 75% train, the remainder for validation.
ss = ShuffleSplit(n_splits = 1, train_size = 0.75, random_state = 123)
target = 'Listening_Time_minutes'
# Every column of the prepared training frame except the target.
X_all = [col for col in df_train.columns if col != target]

def get_validation_splitter(validation_fraction, random_state = None):
    """Build a one-argument function that splits off a validation part.

    Relies on ``train_test_split`` being imported from
    ``sklearn.model_selection`` at notebook level.

    Parameters
    ----------
    validation_fraction
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state
        Optional seed forwarded to ``train_test_split``. The default
        ``None`` keeps the split unseeded, as before.

    Returns
    -------
    callable
        ``split(x)`` returning ``train_test_split(x, test_size=..., random_state=...)``.
    """
    def split(x):
        return train_test_split(x, test_size = validation_fraction, random_state = random_state)
    return split

# Shared settings passed to sgml.train and sgml.assemble_predictor in cv_tgt3.
config = {
    # Predict with model `m` on columns `X` of `df`; the result is a Series on df's index.
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index),
    # RMSE between the target column of `df` and the predictions `prds`.
    'score_func': lambda df, prds: root_mean_squared_error(df[target], prds),
    # Factory taking a validation fraction and returning the splitting function.
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    # NOTE(review): presumably makes sgml report training-set scores as well — confirm in sgml.
    'return_train_scores': True,
    # Name of the target column.
    'y': target,
}

# sgml adapters wrapping the estimator classes used in this notebook.
# NOTE(review): `progress = 50` presumably sets a progress-reporting interval — confirm in sgml.
lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor, progress = 50)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor, progress = 50)

In [4]:
from sklearn.preprocessing import TargetEncoder
# Raw input columns whose combinations are target-encoded.
X_tgt = [
    'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre', 'Host_Popularity_percentage', 'Publication_Day', 
    'Publication_Time','Guest_Popularity_percentage', 'Number_of_Ads','Episode_Sentiment'
]
def tge_proc(df_train, df_valid, X_vars, X_base = tuple(X_tgt)):
    """Target-encode column combinations for a train/validation pair.

    The raw column list is bound as a default argument when the function is
    defined. Later cells rebind the global ``X_tgt`` to '__'-joined feature
    names; reading the global at call time would then select columns that do
    not exist in the input frames.

    Parameters
    ----------
    df_train : DataFrame the encoder is fitted on; must contain the columns
        in ``X_base`` and the global ``target`` column.
    df_valid : DataFrame encoded with the fitted encoder; must contain the
        columns in ``X_base``.
    X_vars : list of column-name lists handed to ``sgpp.CatCombiner2``
        (presumably one combined categorical feature per inner list — confirm in sgpp).
    X_base : raw columns selected from both frames before encoding;
        defaults to the raw column list defined next to this function.

    Returns
    -------
    tuple
        ``(encoded_train, encoded_valid)``. The first comes from
        ``fit_transform`` and the second from ``transform`` of the same pipeline.
    """
    base_cols = list(X_base)
    tge = make_pipeline(
        sgpp.CatCombiner2(X_vars), TargetEncoder(cv = 4)
    ).set_output(transform = 'pandas')
    return tge.fit_transform(df_train[base_cols], df_train[target]), tge.transform(df_valid[base_cols])

In [5]:
from itertools import combinations
# Every combination of 2, 3 and 4 columns from X_tgt, each stored as a list of names.
X_inter2_4 = [
    list(combo)
    for order in range(2, 5)
    for combo in combinations(X_tgt, order)
]
# Encode each hold-out split and store the (train, valid) pair in the cache.
for i, (train_idx, valid_idx) in enumerate(ss.split(df_train[X_all], df_train[target])):
    print(f"generating {i} Fold")
    sc.cache_result(
        f'ss_tgt_2_4_{i}',
        lambda: tge_proc(df_train.iloc[train_idx], df_train.iloc[valid_idx], X_inter2_4),
        rerun = 0,
    )

generating 0 Fold


In [6]:
# Every 5-column combination from X_tgt, each stored as a list of names.
X_inter5 = list(map(list, combinations(X_tgt, 5)))
# Encode each hold-out split and store the (train, valid) pair in the cache.
for i, (train_idx, valid_idx) in enumerate(ss.split(df_train[X_all], df_train[target])):
    print(f"generating {i} Fold")
    sc.cache_result(
        f'ss_tgt_5_{i}',
        lambda: tge_proc(df_train.iloc[train_idx], df_train.iloc[valid_idx], X_inter5),
        rerun = 0,
    )

generating 0 Fold


In [7]:
# Every 6-column combination from X_tgt, each stored as a list of names.
X_inter6 = list(map(list, combinations(X_tgt, 6)))
# Encode each hold-out split and store the (train, valid) pair in the cache.
for i, (train_idx, valid_idx) in enumerate(ss.split(df_train[X_all], df_train[target])):
    print(f"generating {i} Fold")
    sc.cache_result(
        f'ss_tgt_6_{i}',
        lambda: tge_proc(df_train.iloc[train_idx], df_train.iloc[valid_idx], X_inter6),
        rerun = 0,
    )

generating 0 Fold


In [8]:
# Every 7-column combination from X_tgt, each stored as a list of names.
X_inter7 = list(map(list, combinations(X_tgt, 7)))
# Encode each hold-out split and store the (train, valid) pair in the cache.
for i, (train_idx, valid_idx) in enumerate(ss.split(df_train[X_all], df_train[target])):
    print(f"generating {i} Fold")
    sc.cache_result(
        f'ss_tgt_7_{i}',
        lambda: tge_proc(df_train.iloc[train_idx], df_train.iloc[valid_idx], X_inter7),
        rerun = 0,
    )

generating 0 Fold


In [9]:
# Every 8-column combination from X_tgt, each stored as a list of names.
X_inter8 = list(map(list, combinations(X_tgt, 8)))
# Encode each hold-out split and store the (train, valid) pair in the cache.
for i, (train_idx, valid_idx) in enumerate(ss.split(df_train[X_all], df_train[target])):
    print(f"generating {i} Fold")
    sc.cache_result(
        f'ss_tgt_8_{i}',
        lambda: tge_proc(df_train.iloc[train_idx], df_train.iloc[valid_idx], X_inter8),
        rerun = 0,
    )

generating 0 Fold


In [10]:
# Every 9-column combination from X_tgt, each stored as a list of names.
X_inter9 = list(map(list, combinations(X_tgt, 9)))
# Encode each hold-out split and store the (train, valid) pair in the cache.
for i, (train_idx, valid_idx) in enumerate(ss.split(df_train[X_all], df_train[target])):
    print(f"generating {i} Fold")
    sc.cache_result(
        f'ss_tgt_9_{i}',
        lambda: tge_proc(df_train.iloc[train_idx], df_train.iloc[valid_idx], X_inter9),
        rerun = 0,
    )

generating 0 Fold


In [15]:
def cv_tgt3(hparams, adapter, tgt_set, test_run = False, **argv):
    """Train and score a model on the cached target-encoded features.

    For every split produced by the global ``ss`` splitter, the cached
    ``(train, valid)`` encodings named ``'<name>_<fold>'`` are read for each
    name in ``tgt_set``, concatenated column-wise and joined to the matching
    rows of ``df_train``. A model is trained on the training part and used
    to predict the validation part.

    Parameters
    ----------
    hparams : hyper-parameter dict forwarded to ``sgml.train``.
    adapter : model adapter forwarded to ``sgml.train``.
    tgt_set : iterable of cache-key prefixes of the encodings to use.
    test_run : when true, stop after the first split.
    **argv : extra keyword arguments forwarded to ``sgml.train``.

    Returns
    -------
    tuple
        ``(rmse_per_split, predictions)``: a list with one RMSE per split and
        the validation predictions of all splits concatenated and sorted by index.
    """
    fold_scores, fold_preds = [], []
    splits = ss.split(df_train[X_all], df_train[target])
    for fold, (train_idx, valid_idx) in enumerate(splits):
        # Each cache entry is a (train_encoding, valid_encoding) pair.
        cached = [sc.read_result('{}_{}'.format(name, fold)) for name in tgt_set]
        enc_train = pd.concat([enc_tr for enc_tr, _ in cached], axis=1)
        enc_valid = pd.concat([enc_va for _, enc_va in cached], axis=1)

        train_frame = dproc.join_and_assign(df_train.iloc[train_idx], enc_train)
        result = sgml.train(train_frame, hparams, config, adapter, **argv)
        predictor = sgml.assemble_predictor(**result[0], spec = result[1], config = config)

        valid_frame = dproc.join_and_assign(df_train.iloc[valid_idx], enc_valid)
        fold_preds.append(predictor(valid_frame))
        fold_scores.append(
            root_mean_squared_error(df_train.iloc[valid_idx][target], fold_preds[-1])
        )
        if test_run:
            break
    return fold_scores, pd.concat(fold_preds).sort_index()

# XGB 3

In [16]:
# One '__'-joined name per column combination of order 2 through 9.
# Note: this rebinds X_tgt, which previously held the raw column names.
X_tgt = [
    '__'.join(combo)
    for combos in (X_inter2_4, X_inter5, X_inter6, X_inter7, X_inter8, X_inter9)
    for combo in combos
]
hparams = {
    'model_params': dict(max_depth = 10, n_estimators = 1500, learning_rate = 0.02, colsample_bytree = 0.7),
    'X_num': X_tgt,
    'validation_fraction': 0.1,
}

In [27]:
# XGBoost on the encodings of order 2-9, stored under the cache key 'cv_xgb3'.
rmse_oof, prd_oof_xgb3 = sc.cache_result(
    'cv_xgb3',
    lambda: cv_tgt3(
        hparams, xgb_adapter,
        ['ss_tgt_2_4', *(f'ss_tgt_{order}' for order in range(5, 10))],
        use_gpu = True,
    ),
    rerun = 0,
)
np.mean(rmse_oof), rmse_oof

(12.128799438476562, [12.128799438476562])

# XGB 4

In [28]:
# One '__'-joined name per column combination of order 2 through 8.
X_tgt = [
    '__'.join(combo)
    for combos in (X_inter2_4, X_inter5, X_inter6, X_inter7, X_inter8)
    for combo in combos
]
hparams = {
    'model_params': dict(max_depth = 10, n_estimators = 1500, learning_rate = 0.02, colsample_bytree = 0.7),
    'X_num': X_tgt,
    'validation_fraction': 0.1,
}

In [29]:
# XGBoost on the encodings of order 2-8, stored under the cache key 'cv_xgb4'.
rmse_oof, prd_oof_xgb4 = sc.cache_result(
    'cv_xgb4',
    lambda: cv_tgt3(
        hparams, xgb_adapter,
        ['ss_tgt_2_4', *(f'ss_tgt_{order}' for order in range(5, 9))],
        use_gpu = True,
    ),
    rerun = 0,
)
np.mean(rmse_oof), rmse_oof

(12.12886905670166, [12.12886905670166])

# XGB 5

In [30]:
# One '__'-joined name per column combination of order 2 through 7.
X_tgt = [
    '__'.join(combo)
    for combos in (X_inter2_4, X_inter5, X_inter6, X_inter7)
    for combo in combos
]
hparams = {
    'model_params': dict(max_depth = 10, n_estimators = 1500, learning_rate = 0.02, colsample_bytree = 0.7),
    'X_num': X_tgt,
    'validation_fraction': 0.1,
}

In [31]:
# XGBoost on the encodings of order 2-7, stored under the cache key 'cv_xgb5'.
rmse_oof, prd_oof_xgb5 = sc.cache_result(
    'cv_xgb5',
    lambda: cv_tgt3(
        hparams, xgb_adapter,
        ['ss_tgt_2_4', *(f'ss_tgt_{order}' for order in range(5, 8))],
        use_gpu = True,
    ),
    rerun = 0,
)
np.mean(rmse_oof), rmse_oof

(12.129864692687988, [12.129864692687988])

In [34]:
# RMSE of the unweighted mean of the three XGBoost hold-out predictions.
(
    prd_oof_xgb3.add(prd_oof_xgb4).add(prd_oof_xgb5).div(3)
    .pipe(lambda blend: root_mean_squared_error(df_train.loc[blend.index, target], blend))
)

12.109758377075195

# LGB 5

In [40]:
# One '__'-joined name per column combination of order 2 through 7.
X_tgt = [
    '__'.join(combo)
    for combos in (X_inter2_4, X_inter5, X_inter6, X_inter7)
    for combo in combos
]
hparams = {
    'model_params': dict(num_leaves = 31, n_estimators = 1500, learning_rate = 0.02, colsample_bytree = 0.7, metric = 'RMSE'),
    'X_num': X_tgt,
    'validation_fraction': 0.1,
}

In [None]:
# LightGBM on the encodings of order 2-7, stored under the cache key 'cv_lgb5'.
rmse_oof, prd_oof_lgb5 = sc.cache_result(
    'cv_lgb5',
    lambda: cv_tgt3(
        hparams, lgb_adapter,
        ['ss_tgt_2_4', *(f'ss_tgt_{order}' for order in range(5, 8))],
    ),
    rerun = 0,
)
np.mean(rmse_oof), rmse_oof

# XGB 6

In [None]:
# One '__'-joined name per column combination of order 2 through 6.
X_tgt = [
    '__'.join(combo)
    for combos in (X_inter2_4, X_inter5, X_inter6)
    for combo in combos
]
hparams = {
    'model_params': dict(max_depth = 10, n_estimators = 1500, learning_rate = 0.02, colsample_bytree = 0.7),
    'X_num': X_tgt,
    'validation_fraction': 0.1,
}

In [None]:
# XGBoost on the encodings of order 2-6, stored under the cache key 'cv_xgb6'.
rmse_oof, prd_oof_xgb = sc.cache_result(
    'cv_xgb6',
    lambda: cv_tgt3(
        hparams, xgb_adapter,
        ['ss_tgt_2_4', *(f'ss_tgt_{order}' for order in range(5, 7))],
        use_gpu = True,
    ),
    rerun = 0,
)
np.mean(rmse_oof), rmse_oof

In [None]:
# Fit the order 2-4 encoders on the full training frame and encode the test frame.
# The key has no placeholder, so the former `.format(i)` was a no-op that only
# added a dependency on a leaked loop variable; the cache key is unchanged.
sc.cache_result(
    'tgt_2_4',
    lambda : tge_proc(df_train, df_test, X_inter2_4), rerun = 0
)

In [None]:
# Fit the order-5 encoders on the full training frame and encode the test frame.
# The key has no placeholder, so the former `.format(i)` was a no-op that only
# added a dependency on a leaked loop variable; the cache key is unchanged.
sc.cache_result(
    'tgt_5',
    lambda : tge_proc(df_train, df_test, X_inter5), rerun = 0
)

In [None]:
# Fit the order-6 encoders on the full training frame and encode the test frame.
# The key has no placeholder, so the former `.format(i)` was a no-op that only
# added a dependency on a leaked loop variable; the cache key is unchanged.
sc.cache_result(
    'tgt_6',
    lambda : tge_proc(df_train, df_test, X_inter6), rerun = 0
)

In [None]:
# Fit the order-7 encoders on the full training frame and encode the test frame.
# The key has no placeholder, so the former `.format(i)` was a no-op that only
# added a dependency on a leaked loop variable; the cache key is unchanged.
sc.cache_result(
    'tgt_7',
    lambda : tge_proc(df_train, df_test, X_inter7), rerun = 0
)

In [None]:
# Fit the order-8 encoders on the full training frame and encode the test frame.
# The key has no placeholder, so the former `.format(i)` was a no-op that only
# added a dependency on a leaked loop variable; the cache key is unchanged.
sc.cache_result(
    'tgt_8',
    lambda : tge_proc(df_train, df_test, X_inter8), rerun = 0
)

In [None]:
# Fit the order-9 encoders on the full training frame and encode the test frame.
# The key has no placeholder, so the former `.format(i)` was a no-op that only
# added a dependency on a leaked loop variable; the cache key is unchanged.
sc.cache_result(
    'tgt_9',
    lambda : tge_proc(df_train, df_test, X_inter9), rerun = 0
)