In [1]:
import os, sys

import sgpp, sgml, dproc, sgutil
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

print(sys.version)

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]


In [2]:
sc = sgutil.SGCache('img', 'result', 'model')
p = make_pipeline(
    sgpp.PolarsProcessor(),
    sgpp.ExprProcessor({
        'ELm_num': (pl.col('Episode_Length_minutes').clip(5, 120) - 5) / 115,
        'GP': pl.col('Guest_Popularity_percentage').clip(0, 100) / 100,
        'HP': pl.col('Host_Popularity_percentage').clip(0, 100) / 100,
        'NAd': (pl.when(pl.col('Number_of_Ads') > 4).then(0).otherwise(pl.col('Number_of_Ads'))).fill_null(0.0) /3 ,
        'Number_of_Ads': (pl.when(pl.col('Number_of_Ads') > 4).then(0).otherwise(pl.col('Number_of_Ads'))).fill_null(0).cast(pl.Int8),
        'ELm_na': pl.col('Episode_Length_minutes').is_null(),
        'GPp_na': pl.col('Guest_Popularity_percentage').is_null(),
        'ELm_sqrt': ((pl.col('Episode_Length_minutes').fill_null(pl.col('Episode_Length_minutes').mean()).clip(5, 120) - 5) / 115)  ** 0.5
    }),
    sgpp.PandasConverter(index_col = 'id'),
    sgpp.ApplyWrapper(SimpleImputer().set_output(transform = 'pandas'), ['ELm_num', 'GP'])
)
p.fit(['data/train.csv'])
df_train = p.transform(['data/train.csv'])
df_test = p.transform(['data/test.csv'])

In [3]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold, cross_validate, cross_val_score, ShuffleSplit
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
kf = KFold(4, shuffle = True, random_state = 123)
ss = ShuffleSplit(1, train_size = 0.75, random_state = 123)
target = 'Listening_Time_minutes'
X_all = [i for i in df_train.columns if i != target]

def get_validation_splitter(validation_fraction):
    return lambda x: train_test_split(x, test_size = validation_fraction)

config = {
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index),
    'score_func': lambda df, prds: root_mean_squared_error(df[target], prds),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': False,
    'y': target,
}

lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor, progress = 50)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor, progress = 50)

In [4]:
from sklearn.preprocessing import TargetEncoder
X_tgt = [
    'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre', 'Host_Popularity_percentage', 'Publication_Day', 
    'Publication_Time','Guest_Popularity_percentage', 'Number_of_Ads','Episode_Sentiment'
]
def tge_proc(df_train, df_valid, X_vars):
    tge = make_pipeline(
        sgpp.CatCombiner2(X_vars), TargetEncoder(cv = 4)
    ).set_output(transform = 'pandas')
    return tge.fit_transform(df_train[X_tgt], df_train[target]), tge.transform(df_valid[X_tgt])

In [5]:
from itertools import combinations
X_inter2_4 = list()
for i in range(2, 5):
    X_inter2_4.extend([list(i) for i in combinations(X_tgt, i)])
X_inter5 = [list(i) for i in combinations(X_tgt, 5)]
X_inter6 = [list(i) for i in combinations(X_tgt, 6)]
X_inter7 = [list(i) for i in combinations(X_tgt, 7)]
X_inter8 = [list(i) for i in combinations(X_tgt, 8)]
X_inter9 = [list(i) for i in combinations(X_tgt, 9)]
X_inter1 = [list(i) for i in combinations(X_tgt, 1)]

In [6]:
for i, (train_idx, valid_idx) in enumerate(ss.split(df_train[X_all], df_train[target])):
    print("generating {} Fold".format(i))
    sc.cache_result(
        'ss_tgt_2_4_{}'.format(i),
        lambda : tge_proc(df_train.iloc[train_idx], df_train.iloc[valid_idx], X_inter2_4), rerun = 0
    )

generating 0 Fold


In [7]:
for i, (train_idx, valid_idx) in enumerate(ss.split(df_train[X_all], df_train[target])):
    print("generating {} Fold".format(i))
    sc.cache_result(
        'ss_tgt_5_{}'.format(i),
        lambda : tge_proc(df_train.iloc[train_idx], df_train.iloc[valid_idx], X_inter5), rerun = 0
    )

generating 0 Fold


In [8]:
for i, (train_idx, valid_idx) in enumerate(ss.split(df_train[X_all], df_train[target])):
    print("generating {} Fold".format(i))
    sc.cache_result(
        'ss_tgt_6_{}'.format(i),
        lambda : tge_proc(df_train.iloc[train_idx], df_train.iloc[valid_idx], X_inter6), rerun = 0
    )

generating 0 Fold


In [9]:
for i, (train_idx, valid_idx) in enumerate(ss.split(df_train[X_all], df_train[target])):
    print("generating {} Fold".format(i))
    sc.cache_result(
        'ss_tgt_7_{}'.format(i),
        lambda : tge_proc(df_train.iloc[train_idx], df_train.iloc[valid_idx], X_inter7), rerun = 0
    )

generating 0 Fold


In [10]:
for i, (train_idx, valid_idx) in enumerate(ss.split(df_train[X_all], df_train[target])):
    print("generating {} Fold".format(i))
    sc.cache_result(
        'ss_tgt_8_{}'.format(i),
        lambda : tge_proc(df_train.iloc[train_idx], df_train.iloc[valid_idx], X_inter8), rerun = 0
    )

generating 0 Fold


In [11]:
for i, (train_idx, valid_idx) in enumerate(ss.split(df_train[X_all], df_train[target])):
    print("generating {} Fold".format(i))
    sc.cache_result(
        'ss_tgt_9_{}'.format(i),
        lambda : tge_proc(df_train.iloc[train_idx], df_train.iloc[valid_idx], X_inter9), rerun = 0
    )

generating 0 Fold


In [12]:
for i, (train_idx, valid_idx) in enumerate(ss.split(df_train[X_all], df_train[target])):
    print("generating {} Fold".format(i))
    sc.cache_result(
        'ss_tgt_1_{}'.format(i),
        lambda : tge_proc(df_train.iloc[train_idx], df_train.iloc[valid_idx], X_inter1), rerun = 0
    )

generating 0 Fold


In [5]:
def cv_tgt3(hparams, adapter, tgt_set, test_run = False, **argv):
    rmse_oof = list()
    prd_oof = list()
    for i, (train_idx, valid_idx) in enumerate(ss.split(df_train[X_all], df_train[target])):
        df_tgts_train, df_tgts_valid = list(), list()
        for j in tgt_set:
            df_tgt_train, df_tgt_valid = sc.read_result('{}_{}'.format(j, i))
            df_tgts_train.append(df_tgt_train)
            df_tgts_valid.append(df_tgt_valid)
        df_tgts_train = pd.concat(df_tgts_train, axis=1)
        df_tgts_valid = pd.concat(df_tgts_valid, axis=1)
        result = sgml.train(
            dproc.join_and_assign(
                df_train.iloc[train_idx], df_tgts_train
            ), hparams, config, adapter, **argv
        )
        predictor = sgml.assemble_predictor(**result[0], spec = result[1], config = config)
        prd_oof.append(
            predictor(dproc.join_and_assign(
                df_train.iloc[valid_idx], df_tgts_valid
            ))
        )
        rmse_oof.append(
            root_mean_squared_error(df_train.iloc[valid_idx][target], prd_oof[-1])
        )
        if test_run:
            break
    return rmse_oof, pd.concat(prd_oof).sort_index()

# XGB 3

In [7]:
X_tgt = ['__'.join(i) for i in X_inter1 + X_inter2_4 + X_inter5 + X_inter6 + X_inter7 + X_inter8 + X_inter9]
hparams_xgb3 = {
    'model_params': {'max_depth': 10, 'n_estimators': 1500, 'learning_rate': 0.02, 'colsample_bytree': 0.25},
    'X_num': ['GP', 'HP', 'NAd', 'ELm_num'] + X_tgt, 'validation_fraction': 0.1
}

In [8]:
rmse_oof, prd_oof_xgb3 = sc.cache_result(
    'cv_xgb3', lambda : cv_tgt3(
        hparams_xgb3, xgb_adapter, 
        ['ss_tgt_1', 'ss_tgt_2_4', 'ss_tgt_5', 'ss_tgt_6', 'ss_tgt_7', 'ss_tgt_8', 'ss_tgt_9'], use_gpu = True
    ), rerun = 0
)
np.mean(rmse_oof), rmse_oof

(np.float32(11.934904), [np.float32(11.934904)])

# XGB 4

In [8]:
X_tgt = ['__'.join(i) for i in  X_inter1 + X_inter2_4 + X_inter5 + X_inter6 + X_inter7 + X_inter8]
hparams_xgb4 = {
    'model_params': {'max_depth': 10, 'n_estimators': 1500, 'learning_rate': 0.02, 'colsample_bytree': 0.5},
    'X_num': ['GP', 'HP', 'NAd', 'ELm_num'] + X_tgt, 'validation_fraction': 0.1
}

In [10]:
rmse_oof, prd_oof_xgb4 = sc.cache_result(
    'cv_xgb4', lambda : cv_tgt3(
        hparams_xgb4, xgb_adapter, 
        ['ss_tgt_1', 'ss_tgt_2_4', 'ss_tgt_5', 'ss_tgt_6', 'ss_tgt_7', 'ss_tgt_8'], use_gpu = True
    ), rerun = 0
)
np.mean(rmse_oof), rmse_oof

(np.float32(11.934055), [np.float32(11.934055)])

# XGB 5

In [9]:
X_tgt = ['__'.join(i) for i in X_inter1 + X_inter2_4 + X_inter5 + X_inter6 + X_inter7]
hparams_xgb5 = {
    'model_params': {'max_depth': 10, 'n_estimators': 1500, 'learning_rate': 0.02, 'colsample_bytree': 0.7},
    'X_num': ['GP', 'HP', 'NAd', 'ELm_num'] + X_tgt, 'validation_fraction': 0.1
}

In [12]:
rmse_oof, prd_oof_xgb5 = sc.cache_result(
    'cv_xgb5', lambda : cv_tgt3(
        hparams_xgb5, xgb_adapter, 
        ['ss_tgt_1', 'ss_tgt_2_4', 'ss_tgt_5', 'ss_tgt_6', 'ss_tgt_7'], use_gpu = True
    ), rerun = 0
)
np.mean(rmse_oof), rmse_oof

(np.float32(11.944567), [np.float32(11.944567)])

In [13]:
((prd_oof_xgb3 + prd_oof_xgb4  + prd_oof_xgb5) / 3).pipe(
    lambda x: root_mean_squared_error(df_train.loc[x.index, target], x)
)

np.float32(11.913565)

# LGB 5

In [10]:
X_tgt = ['__'.join(i) for i in X_inter1 + X_inter2_4 + X_inter5 + X_inter6 + X_inter7]
hparams_lgb5 = {
    'model_params': {'num_leaves': 1024, 'n_estimators': 1500, 'learning_rate': 0.02, 'colsample_bytree': 0.7, 'metric': 'RMSE'},
    'X_num': ['GP', 'HP', 'NAd', 'ELm_num'] + X_tgt, 'validation_fraction': 0.1
}

In [15]:
rmse_oof, prd_oof_lgb5 = sc.cache_result(
    'cv_lgb5', lambda : cv_tgt3(
        hparams_lgb5, lgb_adapter, 
        ['ss_tgt_1', 'ss_tgt_2_4', 'ss_tgt_5', 'ss_tgt_6', 'ss_tgt_7']
    ), rerun = 0
)
np.mean(rmse_oof), rmse_oof

(np.float64(11.933838613850925), [np.float64(11.933838613850925)])

In [16]:
((prd_oof_xgb3 + prd_oof_xgb4  + prd_oof_xgb5 + prd_oof_lgb5) / 4).pipe(
    lambda x: root_mean_squared_error(df_train.loc[x.index, target], x)
)

np.float64(11.899139085079813)

# LGB 6

In [11]:
X_tgt = ['__'.join(i) for i in X_inter1 + X_inter2_4 + X_inter5 + X_inter6 + X_inter7 + X_inter8]
hparams_lgb6 = {
    'model_params': {'num_leaves': 512, 'n_estimators': 1500, 'learning_rate': 0.02, 'colsample_bytree': 0.7, 'metric': 'RMSE'},
    'X_num': ['GP', 'HP', 'NAd', 'ELm_num'] + X_tgt, 'validation_fraction': 0.1
}

In [18]:
rmse_oof, prd_oof_lgb6 = sc.cache_result(
    'cv_lgb6', lambda : cv_tgt3(
        hparams_lgb5, lgb_adapter, 
        ['ss_tgt_1', 'ss_tgt_2_4', 'ss_tgt_5', 'ss_tgt_6', 'ss_tgt_7', 'ss_tgt_8']
    ), rerun = 0
)
np.mean(rmse_oof), rmse_oof

Round:   0%|          | 0/1500 [00:00<?, ?it/s]

(np.float64(11.927822328915358), [np.float64(11.927822328915358)])

In [32]:
((prd_oof_xgb3 + prd_oof_xgb4  + prd_oof_xgb5 + prd_oof_lgb5 + prd_oof_lgb6) / 5).pipe(
    lambda x: root_mean_squared_error(df_train.loc[x.index, target], x)
)

np.float64(11.891587539908157)

# XGB 6

In [34]:
X_tgt = ['__'.join(i) for i in X_inter1 + X_inter2_4 + X_inter5 + X_inter6]
hparams_xgb6 = {
    'model_params': {'max_depth': 10, 'n_estimators': 1500, 'learning_rate': 0.02, 'colsample_bytree': 0.8},
    'X_num': ['GP', 'HP', 'NAd', 'ELm_num'] + X_tgt, 'validation_fraction': 0.1
}

In [35]:
rmse_oof, prd_oof_xgb6 = sc.cache_result(
    'cv_xgb6', lambda : cv_tgt3(hparams_xgb6, xgb_adapter, ['ss_tgt_1', 'ss_tgt_2_4', 'ss_tgt_5', 'ss_tgt_6'], use_gpu = True), rerun = 1
)
np.mean(rmse_oof), rmse_oof

Round:   0%|          | 0/1500 [00:00<?, ?it/s]

(np.float32(11.950178), [np.float32(11.950178)])

In [36]:
((prd_oof_xgb3 + prd_oof_xgb4  + prd_oof_xgb5 + prd_oof_lgb5 + prd_oof_lgb6 + prd_oof_xgb6) / 6).pipe(
    lambda x: root_mean_squared_error(df_train.loc[x.index, target], x)
)

np.float64(11.893755357580401)

# XGB 7

In [9]:
X_tgt = ['__'.join(i) for i in  X_inter1 + X_inter2_4 + X_inter5 + X_inter7]
hparams_xgb7 = {
    'model_params': {'max_depth': 11, 'n_estimators': 1500, 'learning_rate': 0.03, 'colsample_bytree': 0.5, 'min_child_weight': 50},
    'X_num': ['GP', 'HP', 'NAd', 'ELm_num'] + X_tgt, 'validation_fraction': 0.1
}

In [None]:
rmse_oof, prd_oof_xgb7 = sc.cache_result(
    'cv_xgb7', lambda : cv_tgt3(
        hparams_xgb7, xgb_adapter, 
        ['ss_tgt_1', 'ss_tgt_2_4', 'ss_tgt_5', 'ss_tgt_6', 'ss_tgt_7', 'ss_tgt_8'], use_gpu = True
    ), rerun = 1
)
np.mean(rmse_oof), rmse_oof

In [None]:
_ = sc.cache_result(
    'tgt_2_4', lambda : tge_proc(df_train, df_test, X_inter2_4), rerun = 0
)

In [None]:
_ = sc.cache_result(
    'tgt_5', lambda : tge_proc(df_train, df_test, X_inter5), rerun = 0
)

In [None]:
_ = sc.cache_result(
    'tgt_6', lambda : tge_proc(df_train, df_test, X_inter6), rerun = 0
)

In [None]:
_ = sc.cache_result(
    'tgt_7', lambda : tge_proc(df_train, df_test, X_inter7), rerun = 0
)

In [None]:
_ = sc.cache_result(
    'tgt_8', lambda : tge_proc(df_train, df_test, X_inter8), rerun = 0
)

In [None]:
_ = sc.cache_result(
    'tgt_9', lambda : tge_proc(df_train, df_test, X_inter9), rerun = 0
)

In [None]:
_ = sc.cache_result(
    'tgt_1', lambda : tge_proc(df_train, df_test, X_inter1), rerun = 0
)

In [6]:
df_tgts_train, df_tgts_test = list(), list()
for i in ['tgt_1', 'tgt_2_4', 'tgt_5', 'tgt_6', 'tgt_7', 'tgt_8', 'tgt_9']:
    df_tgt_train, df_tgt_valid = sc.read_result(i)
    df_tgts_train.append(df_tgt_train)
    df_tgts_test.append(df_tgt_valid)
df_tgts_train = pd.concat(df_tgts_train, axis=1)
df_tgts_test = pd.concat(df_tgts_test, axis=1)
df_train = dproc.join_and_assign(df_train, df_tgts_train)
df_test = dproc.join_and_assign(df_test, df_tgts_test)

In [13]:
models = [
    ('xgb3_model', hparams_xgb3, xgb_adapter, {'use_gpu': True}), 
    ('xgb4_model', hparams_xgb4, xgb_adapter, {'use_gpu': True}),
    ('xgb5_model', hparams_xgb5, xgb_adapter, {'use_gpu': True}),
    ('lgb5_model', hparams_lgb5, lgb_adapter, {}),
    ('lgb6_model', hparams_lgb6, lgb_adapter, {}),
]

prds = list()
for i, hparams, adapter, argv in models:
    print(i)
    if 'validation_fraction' in hparams:
        hparams.pop('validation_fraction')
    result = sc.cache_result(
        i, lambda : sgml.train(
            df_train, hparams, config, adapter, **argv
        )
    )
    predictor = sgml.assemble_predictor(**result[0], spec = result[1], config = config)
    prds.append(predictor(df_test).rename(i))

xgb3_model


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




xgb4_model
xgb5_model
lgb5_model
lgb6_model


In [16]:
pd.concat(prds, axis = 1).mean(axis = 1).rename(target).to_csv('result/submission12.csv')

# Ensemble 2

In [None]:
!kaggle kernels output pirhosseinlou/xgboost-single-model -p /result/e1.csv

In [None]:
!kaggle kernels output itasps/xgboost-for-predicting-podcast-listening-time -p /result/e2.csv

In [15]:
pd.concat([
    pd.read_csv('result/submission12.csv', index_col = 'id')[target],
    pd.read_csv('result/e1.csv', index_col = 'id')[target],
    pd.read_csv('result/e2.csv', index_col = 'id')[target]
], axis=1).dot([0.4, 0.3, 0.3]).rename(target).to_csv('result/submission13.csv')

In [16]:
!head result/submission13.csv

id,Listening_Time_minutes
750000,54.25828380531418
750001,23.687879584149115
750002,48.06980681735792
750003,80.54972500361517
750004,50.04464040831872
750005,22.07875733989915
750006,93.67984337914278
750007,39.95214575307881
750008,63.836863387589446
