In [1]:
import os, sys

import sgpp, sgml, dproc, sgutil
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Install a global filter that ignores every FutureWarning raised from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Show the interpreter version in the cell output.
print(sys.version)

from sklearn.pipeline import make_pipeline

3.12.9 (main, Mar 15 2025, 13:36:28) [GCC 13.3.0]


In [2]:
# Column groups used as combined categorical features.
# Each entry is either a single column name or several column names joined by ','.
# The pipeline cell splits every entry on ',' and passes the groups to sgpp.CatCombiner2.
X_tgt = [
    'Episode_Length_minutes',
    'Number_of_Ads,Episode_Title,Episode_Sentiment',
    'Number_of_Ads,Episode_Title',
    'Number_of_Ads,Podcast_Name,Episode_Sentiment',
    'Number_of_Ads,Podcast_Name',
    'Host_Popularity_percentage',
    'Number_of_Ads,Episode_Sentiment,Genre,Publication_Time',
    'Number_of_Ads,Episode_Sentiment,Genre,Publication_Day',
    'Number_of_Ads,Episode_Sentiment,Publication_Time,Publication_Day',
    'Number_of_Ads,Publication_Time',
    'Episode_Title,Episode_Sentiment,Publication_Time',
    'Guest_Popularity_percentage',
    'Episode_Title,Publication_Day',
    'Podcast_Name,Episode_Sentiment,Publication_Day',
    'Podcast_Name,Episode_Sentiment,Genre',
    'Episode_Sentiment,Publication_Time,Publication_Day',
    'Publication_Time,Publication_Day,Genre',
    'Genre,Publication_Time',
]

In [8]:
# NOTE(review): the three arguments look like directory names for images, results and
# models -- confirm against sgutil.SGCache.
sc = sgutil.SGCache('img', 'result', 'model')

# Column handles reused by several derived features below.
episode_len = pl.col('Episode_Length_minutes')
guest_pop = pl.col('Guest_Popularity_percentage')
host_pop = pl.col('Host_Popularity_percentage')
n_ads = pl.col('Number_of_Ads')

# Episode length: nulls -> column mean, clamp to [5, 120], then rescale to [0, 1].
# NOTE(review): the mean is part of the expression, so it is presumably recomputed on
# whichever frame is being transformed (train or test) -- confirm in sgpp.ExprProcessor.
elm_scaled = (episode_len.fill_null(episode_len.mean()).clip(5, 120) - 5) / 115

# Ad counts above 4 are replaced by 0; nulls are filled separately per output column.
ads_capped = pl.when(n_ads > 4).then(0).otherwise(n_ads)

# Split each X_tgt entry into its column group, and collect the distinct columns involved.
tgt_groups = [entry.split(',') for entry in X_tgt]
tgt_columns = pd.Series(tgt_groups).explode().unique().tolist()

p = make_pipeline(
    sgpp.PolarsProcessor(),
    sgpp.ExprProcessor({
        'ELm_num': elm_scaled,
        'GP': guest_pop.fill_null(guest_pop.clip(0, 100).mean()).clip(0, 100) / 100,
        'HP': host_pop.clip(0, 100) / 100,
        'NAd': ads_capped.fill_null(0.0) / 3,
        'Number_of_Ads': ads_capped.fill_null(0).cast(pl.Int8),
        'ELm_na': episode_len.is_null(),
        'GPp_na': guest_pop.is_null(),
        'ELm_sqrt': elm_scaled ** 0.5,
    }),
    sgpp.PandasConverter(index_col='id'),
    sgpp.ApplyWrapper(sgpp.CatCombiner2(tgt_groups), tgt_columns),
)

# Fit on the training file only, then apply the fitted pipeline to both files.
p.fit(['data/train.csv'])
df_train = p.transform(['data/train.csv'])
df_test = p.transform(['data/test.csv'])

In [4]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold, cross_validate, cross_val_score, ShuffleSplit
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
# Name of the regression target column.
target = 'Listening_Time_minutes'

# Seeded splitters: a shuffled 5-fold CV and a single shuffled split with 80% for training.
kf = KFold(n_splits=5, shuffle=True, random_state=123)
ss = ShuffleSplit(n_splits=1, train_size=0.8, random_state=123)

# Every column of the transformed training frame except the target.
X_all = [col for col in df_train.columns if col != target]

def get_validation_splitter(validation_fraction):
    """Build a one-argument splitter that holds out part of a dataset for validation.

    Parameters
    ----------
    validation_fraction : float or int
        Forwarded unchanged as ``test_size`` to
        ``sklearn.model_selection.train_test_split``.

    Returns
    -------
    callable
        ``splitter(x)`` returning the result of
        ``train_test_split(x, test_size=validation_fraction)``. No ``random_state``
        is passed, so the split is not seeded here.
    """
    # Imported locally: no visible import cell brings train_test_split into scope, so on a
    # fresh kernel the returned splitter would otherwise raise NameError when called.
    from sklearn.model_selection import train_test_split

    return lambda x: train_test_split(x, test_size = validation_fraction)

# Shared settings passed as the `config` argument to sc.cv_result / sgml.cv in the cells below.
# NOTE(review): how each key is interpreted is defined inside sgml, which is not visible
# here -- confirm the key contract there before renaming anything.
config = {
    # Predict with model `m` on columns `X` of `df`; the result is a Series on df's index.
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index),
    # RMSE between the true target column of `df` and the predictions `prds`.
    'score_func': lambda df, prds: root_mean_squared_error(df[target], prds),
    # The factory itself is stored, not a ready splitter; presumably sgml calls it with
    # hparams['validation_fraction'] -- TODO confirm.
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    'y': target,
}

# One adapter per model family, each wrapping the estimator class it is given.
# NOTE(review): `progress = 50` is presumably a progress-reporting interval -- confirm in sgml.
lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor, progress = 50)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor, progress = 50)

# Linear Regression 1

In [10]:
# Linear-regression run over the combined categorical groups plus the numeric features.
hparams = dict(
    X_tgt=X_tgt,
    X_num=['GP', 'HP', 'NAd', 'ELm_sqrt', 'ELm_num', 'ELm_na'],
)

# 5-fold run through the cache object under the name 'LR_1'.
# NOTE(review): rerun=0 presumably reuses a stored result when one exists -- confirm in sgutil.
result = sc.cv_result(
    'LR_1', df_train, kf, hparams, config, lr_adapter,
    result_proc=[sgml.lr_learning_result],
    rerun=0,
)

# Mean validation score across folds (RMSE, per config['score_func']).
np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

13.16960240605351

# LGB 1

In [22]:
# LightGBM run on the same feature set as the linear model, with 10% held out for validation.
# Disabled alternative kept for reference (treat episode length as a categorical feature):
#   'X_cat': ['Episode_Length_minutes'], 'cat': {'handle_unknown': 'use_encoded_value', 'unknown_value': 1000000, 'encoded_missing_value': 1000000, 'min_frequency': 2}
hparams = {
    'model_params': {
        'num_leaves': 511,
        'n_estimators': 1500,
        'learning_rate': 0.01,
        'colsample_bytree': 0.7,
    },
    'X_tgt': X_tgt,
    'X_num': ['GP', 'HP', 'NAd', 'ELm_sqrt', 'ELm_num', 'ELm_na'],
    'validation_fraction': 0.1,
    'metric': 'rmse',
}

# Single shuffled split (ss) instead of the full 5-fold CV.
result = sgml.cv(df_train, ss, hparams, config, lgb_adapter, rerun=0)

# Mean validation score over the split(s).
np.mean(result['valid_scores'])

Fold:   0%|          | 0/1 [00:00<?, ?it/s]

Round:   0%|          | 0/1500 [00:00<?, ?it/s]



12.997998424254941

In [17]:
# For the first entry of model_result, show the index label at which each (metric, set)
# column of the recorded validation history reaches its minimum.
first_model_result = result['model_result'][0]
first_model_result['valid_result'].idxmin()

metric  set     
l2      training    2999
        valid_1     2177
dtype: int64