In [1]:
import os, sys

import sgpp, sgml, dproc, sgutil
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Record the interpreter version in the cell output.
print(sys.version)

# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.pipeline import make_pipeline

3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]


In [2]:
def _feature_exprs():
    """Return the output-column -> polars expression map handed to sgpp.ExprProcessor."""
    length = pl.col('Episode_Length_minutes')
    guest = pl.col('Guest_Popularity_percentage')
    host = pl.col('Host_Popularity_percentage')
    ads = pl.col('Number_of_Ads')

    # Episode length clipped to [5, 120] and rescaled to [0, 1].
    length_scaled = (length.clip(5, 120) - 5) / 115
    # Ad counts above 4 are replaced by 0; nulls are filled separately per output column.
    ads_capped = pl.when(ads > 4).then(0).otherwise(ads)

    return {
        'ELm_num': length_scaled,
        # Nulls are filled with the mean of the clipped column before rescaling.
        'GP': guest.fill_null(guest.clip(0, 100).mean()).clip(0, 100) / 100,
        'HP': host.clip(0, 100) / 100,
        'NAd': ads_capped.fill_null(0.0) / 3,
        'Number_of_Ads': ads_capped.fill_null(0).cast(pl.Int8),
        'ELm_na': length.is_null(),
        'GPp_na': guest.is_null(),
        'ELm_sqrt': length_scaled ** 0.5,
    }


sc = sgutil.SGCache('img', 'result', 'model')

# CSV paths -> polars frame -> derived columns -> pandas frame indexed by 'id'.
p = make_pipeline(
    sgpp.PolarsProcessor(),
    sgpp.ExprProcessor(_feature_exprs()),
    sgpp.PandasConverter(index_col='id'),
)
p.fit(['data/train.csv'])

In [3]:
# Column names used throughout the notebook.
target = 'Listening_Time_minutes'
resi = 'resi'

df_train = p.transform(['data/train.csv'])
df_test = p.transform(['data/test.csv'])

# Subsets restricted to rows whose 'ELm_na' flag is False (episode length present).
df_train_1 = df_train.loc[~df_train['ELm_na']]
df_test_1 = df_test.loc[~df_test['ELm_na']]

# Every transformed training column except the target.
X_all = [col for col in df_train.columns if col != target]

In [4]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold, cross_validate, cross_val_score, ShuffleSplit, train_test_split
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
# 5-fold shuffled CV and a single 80% train split, both seeded with 123.
kf = KFold(n_splits=5, shuffle=True, random_state=123)
ss = ShuffleSplit(n_splits=1, train_size=0.8, random_state=123)

def get_validation_splitter(validation_fraction, random_state = 123):
    """Build a one-argument splitter that holds out `validation_fraction` of its input.

    Args:
        validation_fraction: forwarded to `train_test_split` as `test_size`.
        random_state: seed forwarded to `train_test_split`. Defaults to 123, the seed
            used by the `kf` / `ss` splitters in this notebook, so repeated runs draw
            the same validation rows. Pass None to get an unseeded split.

    Returns:
        A callable `f(x)` that returns
        `train_test_split(x, test_size=validation_fraction, random_state=random_state)`.
    """
    def split(x):
        return train_test_split(x, test_size = validation_fraction, random_state = random_state)
    return split

def _make_cv_config(y_col):
    """Build the option dict passed to the sgml / SGCache CV helpers for one label column.

    Args:
        y_col: name of the column used both as the 'y' entry and as the ground truth
            inside 'score_func'. It is bound once here so the two can never disagree.

    Returns:
        A fresh dict with its own `sgml.ProgressCallBack()` instance.
    """
    return {
        # Predict on the selected columns and keep the frame's index on the result.
        'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index),
        # RMSE between the label column and the predictions.
        'score_func': lambda df, prds: root_mean_squared_error(df[y_col], prds),
        'validation_splitter': get_validation_splitter,
        'progress_callback': sgml.ProgressCallBack(),
        'return_train_scores': True,
        'y': y_col,
    }


# config scores against the raw target; config2 scores against the residual column.
config = _make_cv_config(target)
config2 = _make_cv_config(resi)

# sgml adapters wrapping each estimator class; the LightGBM and XGBoost adapters are
# later passed to sgml.cv / sc.cv_result.
# NOTE(review): progress=50 is presumably a progress-reporting interval; confirm in sgml.
lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor, progress=50)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor, progress=50)

In [5]:
# Training rows: attach the predictions stored under 'LR_Elm_notna', then derive the
# residual (target minus prediction) that the later models are fitted on.
df_train_1 = df_train_1.assign(prd=sc.read_prd('LR_Elm_notna')).assign(
    resi=lambda frame: frame[target] - frame['prd']
)

# Test rows: obtain the same model's predictor from the cache object and apply it.
df_test_1 = df_test_1.assign(prd=sc.get_predictor_cv('LR_Elm_notna', config)(df_test_1))

In [6]:
from itertools import combinations
# Raw input columns; every 1- to 4-column combination of them is processed below.
X = [
    'Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads',
    'Episode_Sentiment', 'Episode_Title', 'Genre', 'Podcast_Name', 'Publication_Day', 'Publication_Time'
]

# NOTE(review): get_tgt_rmse_list / get_tgt_fit_transform are not defined or imported in the
# visible cells; they are only referenced inside the callables handed to sc.cache_result.
df_c_list, df_tgt_list = [], []
for j in range(1, 5):
    df_c, df_tgt = sc.cache_result(
        f'tgt_enc_resi_{j}',
        lambda: get_tgt_rmse_list(df_train_1, list(map(list, combinations(X, j))), resi),
        rerun=0,
    )
    df_c_list.append(df_c)
    df_tgt_list.append(df_tgt)

# Stack the per-size summary frames by row and the per-size feature frames by column;
# every feature column gets an '_r' suffix.
df_c2 = pd.concat(df_c_list, axis=0)
df_tgt2 = pd.concat(df_tgt_list, axis=1).rename(columns=lambda name: name + '_r')

# Feature-name lists per combination size: 'colA__colB_r', keyed 'tgt2_1' .. 'tgt2_4'.
X_tgt2s = {
    f'tgt2_{j}': ['__'.join(combo) + '_r' for combo in combinations(X, j)]
    for j in range(1, 5)
}
del df_c_list, df_tgt_list

# Test-side features: one frame per ('X_tgt', 'smooth') row of df_c2, joined column-wise.
df_test_tgt2 = sc.cache_result(
    'tgt_enc_resi_test',
    lambda: pd.concat(
        [
            get_tgt_fit_transform(df_train_1, df_test_1, cols, resi, smooth)
            for cols, smooth in df_c2[['X_tgt', 'smooth']].values
        ],
        axis=1,
    ),
    rerun=0,
).rename(columns=lambda name: name + '_r')

df_train_1 = dproc.join_and_assign(df_train_1, df_tgt2)
df_test_1 = dproc.join_and_assign(df_test_1, df_test_tgt2)
del df_tgt2, df_test_tgt2

# LGB_2

전체 변수를 범주화하는 것은 효과가 없습니다.

In [None]:
"""
hparams = {
    'X_num': X_tgt2s['tgt2_1'] + X_tgt2s['tgt2_2'] + X_tgt2s['tgt2_3'] + X_tgt2s['tgt2_4'],
    'X_cat': [
        'Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads',
        'Episode_Sentiment', 'Episode_Title', 'Genre', 'Podcast_Name', 'Publication_Day', 'Publication_Time'
    ], 'cat': {'handle_unknown': 'use_encoded_value', 'unknown_value' : -1, 'min_frequency': 2, 'encoded_missing_value': -1},
    'model_params': {'num_leaves': 255, 'n_estimators': 1000, 'colsample_bytree': 0.7, 'learning_rate': 0.005, 'max_bins': 511},
    'validation_fraction': 0.1,
}

result = sgml.cv(df_train_1, ss, hparams, config2, lgb_adapter)
np.mean(result['valid_scores'])
"""

# LGB_3

기반 모델의 복잡도를 높일수록 성능이 좋아짐을 보았습니다. 기반 모델의 복잡도를 높이는 시도를 어느 선까지 할 수 있는지 살펴봅니다.

In [None]:
# LightGBM on the residual, using every 1- to 4-way '_r' feature as a numeric input.
hparams = {
    'X_num': [
        name
        for key in ('tgt2_1', 'tgt2_2', 'tgt2_3', 'tgt2_4')
        for name in X_tgt2s[key]
    ],
    'model_params': {
        'num_leaves': 511,
        'n_estimators': 1000,
        'colsample_bytree': 0.7,
        'learning_rate': 0.01,
        'max_bins': 511,
    },
    'validation_fraction': 0.1,
}

# The run goes through sc.cv_result under the key 'LGB_R2'.
result = sc.cv_result('LGB_R2', df_train_1, ss, hparams, config2, lgb_adapter)
np.mean(result['valid_scores'])

# XGB_3

In [None]:
# XGBoost on the residual: the '_r' features as numeric inputs plus the raw columns
# passed as categorical features.
hparams = {
    'X_num': [
        name
        for key in ('tgt2_1', 'tgt2_2', 'tgt2_3', 'tgt2_4')
        for name in X_tgt2s[key]
    ],
    'X_cat': [
        'Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads',
        'Episode_Sentiment', 'Episode_Title', 'Genre', 'Podcast_Name', 'Publication_Day', 'Publication_Time'
    ],
    # Encoder settings: unknown and missing categories both map to -1.
    'cat': {
        'handle_unknown': 'use_encoded_value',
        'unknown_value': -1,
        'min_frequency': 2,
        'encoded_missing_value': -1,
    },
    'model_params': {
        'max_depth': 10,
        'n_estimators': 1000,
        'colsample_bytree': 0.7,
        'learning_rate': 0.005,
        'enable_categorical': True,
    },
    'validation_fraction': 0.1,
}

# This run calls sgml.cv directly rather than going through sc.cv_result.
result = sgml.cv(df_train_1, ss, hparams, config2, xgb_adapter)
np.mean(result['valid_scores'])