In [1]:
import os, sys

import sgpp, sgml, dproc, sgutil
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

print(sys.version)

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer

3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]


In [2]:
sc = sgutil.SGCache('img', 'result', 'model')
p = make_pipeline(
    sgpp.PolarsProcessor(),
    sgpp.ExprProcessor({
        'ELm_num': (pl.col('Episode_Length_minutes').clip(5, 120) - 5) / 115,
        'GP': pl.col('Guest_Popularity_percentage').clip(0, 100) / 100,
        'HP': pl.col('Host_Popularity_percentage').clip(0, 100) / 100,
        'NAd': (pl.when(pl.col('Number_of_Ads') > 4).then(0).otherwise(pl.col('Number_of_Ads'))).fill_null(0.0) /3 ,
        'Number_of_Ads': (pl.when(pl.col('Number_of_Ads') > 4).then(0).otherwise(pl.col('Number_of_Ads'))).fill_null(0).cast(pl.Int8),
        'ELm_na': pl.col('Episode_Length_minutes').is_null(),
        'GPp_na': pl.col('Guest_Popularity_percentage').is_null(),
        'ELm_sqrt': ((pl.col('Episode_Length_minutes').fill_null(pl.col('Episode_Length_minutes').mean()).clip(5, 120) - 5) / 115)  ** 0.5,
        'ELm_num2': pl.col('Episode_Length_minutes').clip(5, 120),
    }),
    sgpp.PandasConverter(index_col = 'id'),
    sgpp.ApplyWrapper(SimpleImputer().set_output(transform = 'pandas'), ['ELm_num', 'ELm_num2', 'GP']),
    sgpp.ApplyWrapper(FunctionTransformer(np.log1p).set_output(transform = 'pandas'), ['ELm_num', 'GP', 'HP'], postfix = '_log1p')
)
p.fit(['data/train.csv'])
df_train = p.transform(['data/train.csv'])
df_test = p.transform(['data/test.csv'])

In [3]:
target = 'Listening_Time_minutes'
resi = 'resi'
X_all = [i for i in df_train.columns if i != target]

In [4]:
df_train[target + '_log1p'] = np.log1p(df_train[target])

In [7]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold, cross_validate, cross_val_score, ShuffleSplit, train_test_split
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
kf = KFold(5, shuffle = True, random_state = 123)
ss = ShuffleSplit(1, train_size = 0.8, random_state = 123)

def get_validation_splitter(validation_fraction):
    return lambda x: train_test_split(x, test_size = validation_fraction)

config = {
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index),
    'score_func': lambda df, prds: root_mean_squared_error(df[target], prds),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    'target_func': lambda X, y: np.log1p(y),
    'target_invfunc': lambda X, y: np.exp(y) - 1,
    'y': target,
}
lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor, progress = 50)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor, progress = 50)

# LR_log1p

In [6]:
hparams = {
    'X_ohe': ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment'],
    'ohe': {'drop': 'first'},
    'X_num': ['GP_log1p', 'HP_log1p', 'NAd', 'ELm_num_log1p']
}

result = sc.cv_result('LR_log1p', df_train.loc[~df_train['ELm_na']], kf, hparams, config, 
                      lr_adapter, result_proc = [sgml.lr_learning_result], rerun = 0)
np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

np.float64(13.968929283906679)

# LR_div

In [8]:
config2 = {
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index),
    'score_func': lambda df, prds: root_mean_squared_error(df[target], prds),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    'target_func': lambda X, y: y / X['ELm_num2'],
    'target_invfunc': lambda X, y: y * X['ELm_num2'],
    'y': target,
}

In [9]:
hparams = {
    'X_ohe': ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment'],
    'ohe': {'drop': 'first'},
    'X_num': ['GP', 'HP', 'NAd', 'ELm_num']
}

result = sc.cv_result('LR_div', df_train.loc[~df_train['ELm_na']], kf, hparams, config, 
                      lr_adapter, result_proc = [sgml.lr_learning_result], rerun = 0)
np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

np.float64(15.778656439978969)

# LGB_div

In [12]:
hparams = {
    'model_params': {'num_leaves': 255, 'n_estimators': 3000, 'learning_rate': 0.05, 'colsample_bytree': 0.7, 'metric': 'rmse'},
    'X_cat': ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment'],
    'X_num': ['GP', 'HP', 'NAd', 'ELm_num']
}

result = sc.cv_result('lgb_div', df_train.loc[~df_train['ELm_na']], kf, hparams, config, 
                      lgb_adapter, rerun = 0)
np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

np.float64(10.294693914658714)

# LGB_ELm_impute 

In [44]:
df_train_ELm = pd.concat([
    df_train.loc[~df_train['ELm_na']],
    df_test.loc[~df_test['ELm_na']],
])

In [46]:
import lightgbm as lgb
from sklearn.model_selection import cross_validate

X_cat = ['Episode_Sentiment', 'Episode_Title', 'Genre', 'Podcast_Name', 'Publication_Day', 'Publication_Time']
X_lgb_num = ['Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads']
X_lgb = X_cat + X_lgb_num
cross_validate(
    lgb.LGBMRegressor(verbose=-1, num_leaves = 15, n_estimators = 500, learning_rate = 0.05), 
    df_train_ELm[X_lgb], df_train_ELm['ELm_num'], scoring = 'r2', cv = kf, 
    params = {'categorical_feature': X_cat}, return_train_score = True
)

{'fit_time': array([2.53317261, 2.52076435, 2.4920342 , 2.4984982 , 2.49239779]),
 'score_time': array([0.63646126, 0.64848351, 0.65622282, 0.63845563, 0.67402911]),
 'test_score': array([0.02980957, 0.029526  , 0.0285018 , 0.03142865, 0.03078145]),
 'train_score': array([0.05117267, 0.05116901, 0.05130769, 0.0506525 , 0.05057917])}

In [54]:
def lgb_imp_elm():
    reg_lgb = lgb.LGBMRegressor(verbose=-1, num_leaves = 15, n_estimators = 500, learning_rate = 0.05)
    reg_lgb.fit(df_train_ELm[X_lgb], df_train_ELm['ELm_num'])
    return df_train.loc[df_train['ELm_na']].pipe(
        lambda x: pd.Series(reg_lgb.predict(x[X_lgb]), index = x.index)
    ), df_test.loc[df_test['ELm_na']].pipe(
        lambda x: pd.Series(reg_lgb.predict(x[X_lgb]), index = x.index)
    )
s_train_elm, s_test_elm = sc.cache_result('lgb_impute_elm', lgb_imp_elm)

# LR_ELm_imp

In [48]:
config3 = {
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index),
    'score_func': lambda df, prds: root_mean_squared_error(df[target], prds),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    'y': target,
}

In [60]:
df_train.loc[df_train['ELm_na'], 'ELm_num'] = df_train['ELm_num'].mean()
df_test.loc[df_test['ELm_na'], 'ELm_num'] = df_train['ELm_num'].mean()

In [61]:
hparams = {
    'X_ohe': ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment'],
    'ohe': {'drop': 'first'},
    'X_num': ['GP', 'HP', 'NAd', 'ELm_num']
}

result = sc.cv_result('LR_ELm_imp', df_train, kf, hparams, config, 
                      lr_adapter, result_proc = [sgml.lr_learning_result], rerun = 0)
np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

np.float64(17.39910022213298)

In [70]:
root_mean_squared_error(
    df_train.loc[df_train['ELm_na'], target],
    sc.read_prd('LR_ELm_imp', df_train.index).loc[df_train['ELm_na']]
)

np.float64(27.136443178239656)

In [71]:
root_mean_squared_error(
    df_train.loc[~df_train['ELm_na'], target],
    sc.read_prd('LR_ELm_imp', df_train.index).loc[~df_train['ELm_na']]
)

np.float64(15.676603280220364)