In [2]:
import os, sys

import sgpp, sgml, dproc, sgutil
import pandas as pd
import polars as pl
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

import warnings
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Record the interpreter version in the cell output.
print(sys.version)

from sklearn.pipeline import make_pipeline
# Record the scikit-learn version in the cell output.
print(sklearn.__version__)

3.12.9 (main, Mar 15 2025, 13:36:28) [GCC 13.3.0]
1.6.1


In [3]:
sc = sgutil.SGCache('img', 'result', 'model')

# Raw columns used by the derived features below.
episode_len = pl.col('Episode_Length_minutes')
guest_pop = pl.col('Guest_Popularity_percentage')
host_pop = pl.col('Host_Popularity_percentage')
ad_count = pl.col('Number_of_Ads')

# Episode length: nulls replaced by the column mean, clipped to [5, 120], rescaled to [0, 1].
# NOTE(review): the mean is part of the expression itself, so it is presumably evaluated on
# whichever frame is being transformed (train or test) -- confirm against ExprProcessor.
episode_len_scaled = (episode_len.fill_null(episode_len.mean()).clip(5, 120) - 5) / 115

# Ad counts above 4 are replaced by 0; null handling differs per output column below.
ad_count_capped = pl.when(ad_count > 4).then(0).otherwise(ad_count)

feature_exprs = {
    'ELm_num': episode_len_scaled,
    # Guest popularity: nulls replaced by the mean of the clipped column, then clipped and scaled.
    'GP': (guest_pop.fill_null(guest_pop.clip(0, 100).mean()).clip(0, 100) / 100),
    'HP': host_pop.clip(0, 100) / 100,
    'NAd': ad_count_capped.fill_null(0.0) / 3,
    'Number_of_Ads': ad_count_capped.fill_null(0).cast(pl.Int8),
    # Missing-value indicators for the two columns that are imputed above.
    'ELm_na': episode_len.is_null(),
    'GPp_na': guest_pop.is_null(),
    'ELm_sqrt': episode_len_scaled ** 0.5,
}

p = make_pipeline(
    sgpp.PolarsProcessor(),
    sgpp.ExprProcessor(feature_exprs),
    sgpp.PandasConverter(index_col = 'id'),
)

# Fit on the training file only, then apply the same pipeline to both files.
p.fit(['data/train.csv'])
df_train = p.transform(['data/train.csv'])
df_test = p.transform(['data/test.csv'])

In [4]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold, cross_validate, cross_val_score, ShuffleSplit
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
# Seeded splitters shared by the cells below: 5-fold CV and a single 80% train split.
kf = KFold(n_splits = 5, shuffle = True, random_state = 123)
ss = ShuffleSplit(n_splits = 1, train_size = 0.8, random_state = 123)

# Name of the regression target; every other column of df_train is a candidate feature.
target = 'Listening_Time_minutes'
X_all = [col for col in df_train.columns if col != target]

def get_validation_splitter(validation_fraction):
    """Build a one-argument splitter that holds out a fraction of its input.

    Parameters
    ----------
    validation_fraction
        Forwarded as ``test_size`` to ``sklearn.model_selection.train_test_split``.

    Returns
    -------
    callable
        ``splitter(x)`` returning whatever ``train_test_split(x, test_size=validation_fraction)``
        returns (the train part followed by the held-out part).
    """
    # Imported here because the notebook's sklearn.model_selection import line does not
    # include train_test_split; without this the returned splitter raised NameError
    # the first time it was called.
    from sklearn.model_selection import train_test_split

    return lambda x: train_test_split(x, test_size = validation_fraction)

# Settings dictionary handed to sgml.train / sgml.assemble_predictor in later cells.
# NOTE(review): how sgml interprets each key is not visible in this notebook -- confirm
# against the sgml module before changing any of them.
config = {
    # Predict with model m on the feature columns X of df; the result keeps df's index.
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index),
    # RMSE between the target column of df and the predictions prds.
    'score_func': lambda df, prds: root_mean_squared_error(df[target], prds),
    # Factory (not a splitter instance): called with a fraction, returns a splitter.
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    # Name of the target column.
    'y': target,
}

# Adapters wrapping the estimator classes used with sgml.
# NOTE(review): the meaning of progress = 50 is defined by sgml -- presumably a
# progress-reporting interval; verify.
lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor, progress = 50)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor, progress = 50)

`2.Feature Engineering.ipynb`에서 발견한 속성들로 학습을 시킵니다.

In [5]:
# Load the stored 'tgt_sel' result; only the first element of the returned pair is used.
# The cells below treat each entry as a comma-separated combination of column names.
X_tgt_sel, _ = sc.read_result('tgt_sel')
from sklearn.preprocessing import TargetEncoder

def tge_proc(df_train, df_valid, df_test):
    """Target-encode the selected column combinations for one train/valid/test triple.

    Each entry of the module-level ``X_tgt_sel`` is a comma-separated list of column
    names. The combinations are passed to ``sgpp.CatCombiner2`` and the result is
    target-encoded against ``df_train[target]``.

    Parameters
    ----------
    df_train
        Frame the encoder is fitted on; must contain the ``target`` column.
    df_valid, df_test
        Frames encoded with the encoder fitted on ``df_train``.

    Returns
    -------
    tuple
        ``(encoded_train, encoded_valid, encoded_test)`` as pandas outputs.
    """
    # NOTE(review): TargetEncoder is created without a random_state, so the training
    # encodings from fit_transform are presumably not repeatable between runs -- confirm.
    combos = pd.Series(X_tgt_sel).str.split(',')
    # Distinct source columns needed by any of the combinations.
    source_cols = combos.explode().unique().tolist()

    encoder = make_pipeline(
        sgpp.CatCombiner2(combos.tolist()), TargetEncoder()
    ).set_output(transform = 'pandas')

    encoded_train = encoder.fit_transform(df_train[source_cols], df_train[target])
    encoded_valid = encoder.transform(df_valid[source_cols])
    encoded_test = encoder.transform(df_test[source_cols])
    return encoded_train, encoded_valid, encoded_test

In [5]:
# Compute (or reuse from the cache) the target encodings of every CV fold.
fold_splits = kf.split(df_train[X_all], df_train[target])
for i, (train_idx, valid_idx) in enumerate(fold_splits):
    print("generating {} Fold".format(i))
    cache_key = 'kf_sp_{}_3'.format(i)

    def encode_fold():
        """Encode the current fold; only called by the cache when it needs the value."""
        return tge_proc(df_train.iloc[train_idx], df_train.iloc[valid_idx], df_test)

    sc.cache_result(cache_key, encode_fold, rerun = 0)

# The encoded feature names used downstream are derived from the selected combinations.
X_tgt = X_tgt_sel

generating 0 Fold
generating 1 Fold
generating 2 Fold
generating 3 Fold
generating 4 Fold


In [13]:
def cv_tgt3(hparams, adapter):
    """Cross-validate a model on the base features joined with per-fold target encodings.

    For every fold of the module-level ``kf`` splitter, the fold's target encodings are
    taken from the cache entry ``'kf_sp_{i}_3'`` (computed with ``tge_proc`` when absent),
    joined to the matching rows, and used to train a model through ``sgml.train``.
    The fitted predictor is then applied to the validation rows and to ``df_test``.

    Parameters
    ----------
    hparams
        Passed unchanged to ``sgml.train``.
    adapter
        Passed unchanged to ``sgml.train``.

    Returns
    -------
    tuple
        ``(rmse_oof, prd_oof, test_prd)``: the list of per-fold validation RMSE values,
        the out-of-fold predictions concatenated and sorted by index, and the list of
        per-fold predictions for ``df_test``.
    """
    def with_tgt(df_base, df_tgt):
        # Encoded columns are named after comma-separated combinations; the feature
        # names in hparams use underscores instead, so rename before joining.
        return dproc.join_and_assign(
            df_base, df_tgt.rename(columns = lambda x: x.replace(',', '_'))
        )

    rmse_oof = list()
    prd_oof = list()
    test_prd = list()

    for i, (train_idx, valid_idx) in enumerate(kf.split(df_train[X_all], df_train[target])):
        # tge_proc takes three frames. The original callback omitted df_test, which
        # raised TypeError whenever the cache entry was missing.
        df_tgt_train, df_tgt_valid, df_tgt_test = sc.cache_result(
            'kf_sp_{}_3'.format(i),
            lambda : tge_proc(df_train.iloc[train_idx], df_train.iloc[valid_idx], df_test)
        )
        result = sgml.train(
            with_tgt(df_train.iloc[train_idx], df_tgt_train), hparams, config, adapter
        )
        predictor = sgml.assemble_predictor(**result[0], config = config)
        prd_oof.append(
            predictor(with_tgt(df_train.iloc[valid_idx], df_tgt_valid))
        )
        rmse_oof.append(
            root_mean_squared_error(df_train.iloc[valid_idx][target], prd_oof[-1])
        )
        test_prd.append(
            predictor(with_tgt(df_test, df_tgt_test))
        )
    return rmse_oof, pd.concat(prd_oof).sort_index(), test_prd

# Linear Regression 2

In [7]:
# Numeric base features plus every target-encoded combination
# (commas in the combination names become underscores, matching the joined columns).
hparams = {
    'X_num': [
        'GP', 'HP', 'NAd', 'ELm_sqrt', 'ELm_num', 'ELm_na',
        *(name.replace(',', '_') for name in X_tgt),
    ]
}

In [12]:
# Cross-validated linear regression; stored under the cache key 'lr3'.
lr3_result = sc.cache_result('lr3', lambda : cv_tgt3(hparams, lr_adapter), rerun = 0)
rmse_oof, prd_oof, prd_test = lr3_result

In [17]:
# Average the per-fold test predictions row-wise and write them as the submission file.
test_mean = pd.concat(prd_test, axis=1).mean(axis=1)
test_mean.rename(target).to_csv('result/submission8.csv')

In [19]:
# Overall out-of-fold RMSE.
# NOTE(review): the comparison is positional, so this assumes df_train is ordered by
# its index like the sorted prd_oof -- confirm.
root_mean_squared_error(y_true = df_train[target], y_pred = prd_oof)

12.863499267030816

- `Podcast_Name`을 제외하고 결과를 뽑아 봅니다.

In [15]:
# Same feature set as before, minus every encoded combination that involves Podcast_Name.
hparams = {
    'X_num': [
        'GP', 'HP', 'NAd', 'ELm_sqrt', 'ELm_num', 'ELm_na',
        *(
            name.replace(',', '_')
            for name in X_tgt
            if 'Podcast_Name' not in name.split(',')
        ),
    ]
}

In [16]:
# Cross-validated linear regression without Podcast_Name encodings; cache key 'lr3_2'.
lr3_2_result = sc.cache_result('lr3_2', lambda : cv_tgt3(hparams, lr_adapter), rerun = 0)
rmse_oof, prd_oof, prd_test = lr3_2_result

# Overall out-of-fold RMSE, displayed as the cell output.
root_mean_squared_error(y_true = df_train[target], y_pred = prd_oof)

12.884181703021346

In [17]:
# Average the per-fold test predictions row-wise and write them as the submission file.
test_mean = pd.concat(prd_test, axis=1).mean(axis=1)
test_mean.rename(target).to_csv('result/submission9.csv')