In [4]:
import os, sys

import sgpp, sgml, dproc, sgutil
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

print(sys.version)

from sklearn.pipeline import make_pipeline

3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]


In [5]:
# Project cache helper used throughout the notebook (CV results, predictions, cached computations).
sc = sgutil.SGCache('img', 'result', 'model')


def _build_feature_exprs():
    """Return the name -> polars expression mapping handed to ExprProcessor.

    Shared sub-expressions are built once and reused; the resulting expression
    trees and the key order are the same as writing each entry out in full.
    """
    episode_len = pl.col('Episode_Length_minutes')
    guest_pop = pl.col('Guest_Popularity_percentage')
    host_pop = pl.col('Host_Popularity_percentage')
    n_ads = pl.col('Number_of_Ads')

    # Episode length clipped to [5, 120] and rescaled to [0, 1].
    episode_len_scaled = (episode_len.clip(5, 120) - 5) / 115
    # Ad counts above 4 are replaced with 0; other values are kept as they are.
    n_ads_capped = pl.when(n_ads > 4).then(0).otherwise(n_ads)

    return {
        'ELm_num': episode_len_scaled,
        # Nulls are filled with the mean of the clipped column before scaling to [0, 1].
        'GP': (guest_pop.fill_null(guest_pop.clip(0, 100).mean()).clip(0, 100) / 100),
        'HP': host_pop.clip(0, 100) / 100,
        'NAd': n_ads_capped.fill_null(0.0) / 3,
        'Number_of_Ads': n_ads_capped.fill_null(0).cast(pl.Int8),
        # Missing-value indicators for the two columns that contain nulls.
        'ELm_na': episode_len.is_null(),
        'GPp_na': guest_pop.is_null(),
        'ELm_sqrt': episode_len_scaled ** 0.5,
    }


p = make_pipeline(
    sgpp.PolarsProcessor(),
    sgpp.ExprProcessor(_build_feature_exprs()),
    sgpp.PandasConverter(index_col = 'id'),
)
p.fit(['data/train.csv'])

In [25]:
# Column names used throughout the notebook.
target = 'Listening_Time_minutes'
resi = 'resi'


def _rows_with_episode_length(df):
    """Keep only the rows whose Episode_Length_minutes was not null (ELm_na is False)."""
    return df.loc[~df['ELm_na']]


# Run the fitted preprocessing pipeline on both files (train first, then test).
df_train, df_test = (
    p.transform([csv_path]) for csv_path in ('data/train.csv', 'data/test.csv')
)

# Subsets used for the residual model, which is only built where the episode length is known.
df_train_1 = _rows_with_episode_length(df_train)
df_test_1 = _rows_with_episode_length(df_test)

# Every column except the target.
X_all = [col for col in df_train.columns if col != target]

- 거의 동일한 방법으로 Target Encoder 기반으로 제출셋을 만들었지만 5 Fold로 했을 때 성능이 좋으면서 
  Leader Board상의 스코어와 비슷한 결과를 보여주므로, 4 Fold에서 5 Fold로 변경합니다.

- 1-5에서 ELm_num이 결측인 행을 제외하지 않고 진행한 실험 결과를 미루어 볼 때, ELm_num, ELm_sqrt를 제외한 나머지 변수는 Target Encoder로 모델링하는 편이 오히려 낫다고 판단됩니다.

- Elm_num이 미결측일 때는 잔차를 이용한 Target Encoder가 더 도움이 된다고 판단이 되어 Elm_num이 미결측일 때는 따로 모델을 만듭니다.

- df_train_1 'ELm_num', 'ELm_sqrt' 만을 사용하여 선형회귀모델을 만들고, 잔차를 구하여 resi 변수를 두고 이를 Target Encoding 하여 모델을 제작합니다.

In [8]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold, cross_validate, cross_val_score, ShuffleSplit
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
# 5-fold shuffled CV splitter shared by every cross-validation in this notebook.
kf = KFold(n_splits = 5, shuffle = True, random_state = 123)
# Single 80/20 shuffled hold-out split.
ss = ShuffleSplit(n_splits = 1, train_size = 0.8, random_state = 123)

def get_validation_splitter(validation_fraction, random_state = None):
    """Build a function that splits a dataset into train / validation parts.

    Parameters
    ----------
    validation_fraction : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``; the default keeps the previous
        (unseeded) behaviour, pass an int for a reproducible split.

    Returns
    -------
    callable
        ``split(x)`` returning ``train_test_split(x, ...)``, i.e. the train part
        followed by the validation part.
    """
    def split(x):
        # `train_test_split` is not imported in the notebook's import cells, so
        # the original lambda raised NameError when called; import it here so
        # the splitter works regardless of which cells have been run.
        from sklearn.model_selection import train_test_split
        return train_test_split(x, test_size = validation_fraction, random_state = random_state)
    return split

def _predict_as_series(m, df, X):
    """Predict with model `m` on columns `X` of `df`, keeping the index of `df`."""
    return pd.Series(m.predict(df[X]), index = df.index)


def _rmse_vs_target(df, prds):
    """RMSE of `prds` against the column named by the global `target`."""
    return root_mean_squared_error(df[target], prds)


def _rmse_vs_resi(df, prds):
    """RMSE of `prds` against the column named by the global `resi`."""
    return root_mean_squared_error(df[resi], prds)


# Settings for models that predict the original target.
config = dict(
    predict_func = _predict_as_series,
    score_func = _rmse_vs_target,
    validation_splitter = get_validation_splitter,
    progress_callback = sgml.ProgressCallBack(),
    return_train_scores = True,
    y = target,
)

# Same settings for models that predict the residual column instead.
config2 = dict(
    predict_func = _predict_as_series,
    score_func = _rmse_vs_resi,
    validation_splitter = get_validation_splitter,
    progress_callback = sgml.ProgressCallBack(),
    return_train_scores = True,
    y = resi,
)

# Model adapters from the project's sgml module.
lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor, progress = 50)

# LR_Elm_notna

Target_1 Elm, Elm_sqrt로 선형회귀모델 만들기

In [79]:
# Numeric inputs of the linear model fitted on rows with a known episode length.
hparams = dict(X_num = ['GP', 'HP', 'NAd', 'ELm_sqrt', 'ELm_num'])

result = sc.cv_result(
    'LR_Elm_notna', df_train_1, kf, hparams, config, lr_adapter,
    result_proc = [sgml.lr_learning_result],
)
# Mean validation RMSE over the folds.
np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

np.float32(10.683447)

In [82]:
# Train the 'LR_Elm_notna' model on df_train_1 through the cache helper.
# NOTE(review): presumably this reuses the hyper-parameters and adapter registered by the
# earlier sc.cv_result('LR_Elm_notna', ...) call — confirm in sgutil.SGCache.train_cv.
sc.train_cv('LR_Elm_notna', df_train_1, config)

(LinearRegression(), None, ['NAd', 'HP', 'ELm_num', 'GP', 'ELm_sqrt'])

In [83]:
# Attach the stored 'LR_Elm_notna' predictions, then the residual (target minus prediction)
# that the second-stage target encoding is fitted on.
df_train_1 = (
    df_train_1
    .assign(prd = sc.read_prd('LR_Elm_notna'))
    .assign(resi = lambda frame: frame[target] - frame['prd'])
)
# Test rows only get the linear-model prediction.
df_test_1 = df_test_1.pipe(
    lambda frame: frame.assign(prd = sc.get_predictor_cv('LR_Elm_notna', config)(frame))
)

In [84]:
from itertools import combinations
from tqdm.notebook import tqdm 
from cuml.preprocessing import TargetEncoder

def get_tgt_rmse(df, X_tgt, target, smooth_space = np.linspace(1, 10, 10)):
    """Cross-validate a TargetEncoder on `X_tgt` for each smoothing value.

    For every value in `smooth_space` the encoder is fitted on the training part
    of each fold of the module-level `kf` and its output on the held-out part is
    scored directly against `target` with RMSE.

    Parameters
    ----------
    df : DataFrame holding the `X_tgt` columns and the `target` column.
    X_tgt : column name(s) to encode.
    target : name of the column to encode against.
    smooth_space : sequence of `smooth` values to try.

    Returns
    -------
    tuple
        (lowest mean fold RMSE, the smoothing value that achieved it,
        the concatenated held-out encodings for that smoothing value).
    """
    mean_rmse_per_smooth = []
    best_prd, best_rmse = None, np.inf
    for smooth in smooth_space:
        encoder = TargetEncoder(smooth = smooth, split_method ='continuous')
        fold_prds, fold_rmses = [], []
        for fit_pos, valid_pos in kf.split(df[X_tgt], df[target]):
            df_fit = df.iloc[fit_pos]
            df_valid = df.iloc[valid_pos]
            # The same encoder object is refitted on every fold.
            encoder.fit(df_fit[X_tgt], df_fit[target])
            encoded = pd.Series(encoder.transform(df_valid[X_tgt]), index = df_valid.index)
            fold_prds.append(encoded)
            fold_rmses.append(root_mean_squared_error(df_valid[target], encoded))
        mean_rmse = np.mean(fold_rmses)
        mean_rmse_per_smooth.append(mean_rmse)
        # Strict comparison: on ties the first smoothing value wins, matching np.argmin.
        if mean_rmse < best_rmse:
            best_rmse = mean_rmse
            best_prd = pd.concat(fold_prds)
    best_pos = np.argmin(mean_rmse_per_smooth)
    return np.min(mean_rmse_per_smooth), smooth_space[best_pos], best_prd

def get_tgt_rmse_list(df, X_values, target, smooth_space = [0.01, 0.1, 1, 10, 100, 1000]):
    """Run `get_tgt_rmse` for every column combination in `X_values`.

    Parameters
    ----------
    df : DataFrame passed through to `get_tgt_rmse`.
    X_values : iterable of column-name lists; each list is encoded jointly.
    target : name of the column to encode against.
    smooth_space : smoothing values tried for every combination.

    Returns
    -------
    tuple
        (DataFrame with columns X_tgt / RMSE / smooth sorted by RMSE,
        DataFrame of the best held-out encodings, one column per combination
        named by joining its column names with '__', sorted by index).
    """
    summary_rows, prd_columns = [], []
    for cols in tqdm(X_values):
        rmse, best_smooth, best_prd = get_tgt_rmse(df, cols, target, smooth_space = smooth_space)
        summary_rows.append(
            pd.Series([cols, rmse, best_smooth], index = ['X_tgt', 'RMSE', 'smooth'])
        )
        prd_columns.append(best_prd.rename('__'.join(cols)))
    summary = pd.DataFrame(summary_rows).sort_values('RMSE')
    encoded = pd.concat(prd_columns, axis=1).sort_index()
    return summary, encoded

In [85]:
def get_tgt_fit_transform(df_train, df_test, X_tgt, target, smooth):
    """Fit a TargetEncoder on the training frame and encode the test frame.

    Parameters
    ----------
    df_train : DataFrame holding the `X_tgt` columns and the `target` column.
    df_test : DataFrame holding the `X_tgt` columns to be encoded.
    X_tgt : list of column names that are encoded jointly.
    target : name of the column to encode against.
    smooth : smoothing value passed to the encoder.

    Returns
    -------
    pandas.Series
        Encoded values indexed like `df_test`, named by joining `X_tgt` with '__'.
    """
    # The unused `results` / `best_prd_list` locals of the original were dropped.
    tgt = TargetEncoder(smooth = smooth, split_method ='continuous')
    return pd.Series(
        tgt.fit(df_train[X_tgt], df_train[target]).transform(df_test[X_tgt]),
        index = df_test.index, name = '__'.join(X_tgt)
    )

# Target Encoding1

- Episode_Length_minutes를 제외한 타겟인코딩

In [65]:
X = [
    'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads',
    'Episode_Sentiment', 'Episode_Title', 'Genre', 'Podcast_Name', 'Publication_Day', 'Publication_Time'
]


def _compute_tgt_enc_target(X_cols, n_comb):
    """Target-encode every `n_comb`-way combination of `X_cols` against `target`.

    Returns ((df_c, df_tgt), df_test_tgt): the per-combination RMSE / best-smooth
    table, the held-out encodings for df_train, and the encodings for df_test
    fitted on all of df_train with each combination's best smoothing value.
    """
    df_c, df_tgt = get_tgt_rmse_list(
        df_train, [list(cols) for cols in combinations(X_cols, n_comb)], target
    )
    # Use the table computed just above. The original read `df_c1`, which is only
    # assigned after the loop: a NameError on a fresh kernel, and on a warm kernel
    # stale results covering every combination size instead of this one.
    df_test_tgt = pd.concat([
        get_tgt_fit_transform(df_train, df_test, cols, target, smooth)
        for cols, smooth in df_c[['X_tgt', 'smooth']].values
    ], axis = 1)
    return (df_c, df_tgt), df_test_tgt


df_c_list, df_tgt_list, df_test_list = list(), list(), list()
for j in range(1, 4):
    # NOTE(review): presumably `rerun = 0` reuses a stored result when one exists; results
    # cached by the previous version of this cell should be recomputed once — confirm in sgutil.
    (df_c, df_tgt), df_test_tgt = sc.cache_result(
        'tgt_enc_target_{}'.format(j),
        lambda n_comb = j: _compute_tgt_enc_target(X, n_comb),
        rerun = 0
    )
    df_c_list.append(df_c)
    df_tgt_list.append(df_tgt)
    df_test_list.append(df_test_tgt)

df_c1 = pd.concat(df_c_list, axis = 0)
df_tgt1 = pd.concat(df_tgt_list, axis = 1).rename(columns = lambda x: x + '_t')
df_test_tgt1 = pd.concat(df_test_list, axis = 1).rename(columns = lambda x: x + '_t')
# Only the 1- and 2-way column names are listed here although 3-way encodings are computed above.
X_tgt1s = {'tgt1_{}'.format(j): ['__'.join(i) + '_t' for i in combinations(X, j)] for j in range(1, 3)}
del df_c_list, df_tgt_list, df_test_list

In [69]:
# Add the '_t' target-encoding columns to the train and test frames (train first, then test).
df_train, df_test = (
    dproc.join_and_assign(base, encoded)
    for base, encoded in ((df_train, df_tgt1), (df_test, df_test_tgt1))
)
# The standalone encoding frames are no longer needed once joined.
del df_tgt1, df_test_tgt1

# Target Encoding2

- Episode_Length_minutes가 미결측인 경우의 resi에 대한 Target Encoding

In [66]:
X = [
    'Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads',
    'Episode_Sentiment', 'Episode_Title', 'Genre', 'Podcast_Name', 'Publication_Day', 'Publication_Time'
]


def _compute_tgt_enc_resi(X_cols, n_comb):
    """Target-encode every `n_comb`-way combination of `X_cols` against `resi`.

    Works on the rows with a known episode length (df_train_1 / df_test_1).
    Returns ((df_c, df_tgt), df_test_tgt): the per-combination RMSE / best-smooth
    table, the held-out encodings for df_train_1, and the encodings for df_test_1
    fitted on all of df_train_1 with each combination's best smoothing value.
    """
    df_c, df_tgt = get_tgt_rmse_list(
        df_train_1, [list(cols) for cols in combinations(X_cols, n_comb)], resi
    )
    # Use the residual table computed just above. The original iterated `df_c1`,
    # the table built for `target` in the previous section, so the test encodings
    # used the wrong column combinations (none with Episode_Length_minutes) and
    # smoothing values tuned for a different column.
    df_test_tgt = pd.concat([
        get_tgt_fit_transform(df_train_1, df_test_1, cols, resi, smooth)
        for cols, smooth in df_c[['X_tgt', 'smooth']].values
    ], axis = 1)
    return (df_c, df_tgt), df_test_tgt


df_c_list, df_tgt_list, df_test_list = list(), list(), list()
for j in range(1, 3):
    # NOTE(review): presumably `rerun = 0` reuses a stored result when one exists; results
    # cached by the previous version of this cell should be recomputed once — confirm in sgutil.
    (df_c, df_tgt), df_test_tgt = sc.cache_result(
        'tgt_enc_resi_{}'.format(j),
        lambda n_comb = j: _compute_tgt_enc_resi(X, n_comb),
        rerun = 0
    )
    df_c_list.append(df_c)
    df_tgt_list.append(df_tgt)
    df_test_list.append(df_test_tgt)

df_c2 = pd.concat(df_c_list, axis = 0)
df_tgt2 = pd.concat(df_tgt_list, axis = 1).rename(columns = lambda x: x + '_r')
df_test_tgt2 = pd.concat(df_test_list, axis = 1).rename(columns = lambda x: x + '_r')
X_tgt2s = {'tgt2_{}'.format(j): ['__'.join(i) + '_r' for i in combinations(X, j)] for j in range(1, 3)}
del df_c_list, df_tgt_list, df_test_list

In [71]:
# Add the '_r' residual-encoding columns to the episode-length-known frames (train first, then test).
df_train_1, df_test_1 = (
    dproc.join_and_assign(base, encoded)
    for base, encoded in ((df_train_1, df_tgt2), (df_test_1, df_test_tgt2))
)
# The standalone encoding frames are no longer needed once joined.
del df_tgt2, df_test_tgt2

# Linear Regression

In [74]:
# Linear regression on the 1-way and 2-way target encodings of the original target.
hparams = dict(X_num = [*X_tgt1s['tgt1_1'], *X_tgt1s['tgt1_2']])

result = sc.cv_result(
    'LR_A1', df_train, kf, hparams, config, lr_adapter,
    result_proc = [sgml.lr_learning_result],
)
# Mean validation RMSE over the folds.
np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

np.float64(25.53951626849385)

In [76]:
# Linear regression on the 1-way and 2-way target encodings of the residual,
# scored against the residual column via config2.
hparams = dict(X_num = [*X_tgt2s['tgt2_1'], *X_tgt2s['tgt2_2']])

result = sc.cv_result(
    'LR_R1', df_train_1, kf, hparams, config2, lr_adapter,
    result_proc = [sgml.lr_learning_result],
)
# Mean validation RMSE over the folds.
np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

np.float64(9.896696106429346)

In [78]:
# Combine the two CV scores into one RMSE estimate: squared mean RMSE of each model,
# weighted by the number of rows it is applied to (LR_A1 where the episode length is
# missing, LR_R1 elsewhere).
# NOTE(review): LR_A1's score was measured on all of df_train, not only on the
# ELm_na rows, so this is an approximation — confirm that is intended.
mse_a1 = np.mean(sc.read_cv('LR_A1')['valid_scores']) ** 2
mse_r1 = np.mean(sc.read_cv('LR_R1')['valid_scores']) ** 2
elm_missing = df_train['ELm_na']
(
    (mse_a1 * elm_missing.sum() + mse_r1 * (~elm_missing).sum()) / df_train.shape[0]
) ** 0.5

np.float64(12.740279286065407)