In [2]:
import os, sys

import sgpp
import dproc
import sgutil
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

print(sys.version)

from sklearn.pipeline import make_pipeline

3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]


In [3]:
# Preprocessing pipeline: PolarsProcessor -> ExprProcessor -> PandasConverter.
# NOTE(review): the sgpp steps are project helpers not shown in this notebook; presumably they
# load the listed CSV paths with polars, append the expression columns below and return a
# pandas frame indexed by 'id' — confirm against sgpp.
p = make_pipeline(
    sgpp.PolarsProcessor(),
    sgpp.ExprProcessor({
        # Missing-value indicator for Episode_Length_minutes.
        'ELm_na': pl.col('Episode_Length_minutes').is_null(),
        # Square root of Episode_Length_minutes.
        'ELm_sqrt': pl.col('Episode_Length_minutes') ** 0.5,
        # Missing-value indicator for Guest_Popularity_percentage.
        'GPp_na': pl.col('Guest_Popularity_percentage').is_null(),
    }),
    sgpp.PandasConverter(index_col = 'id')
)
# The pipeline is fitted on the training file only.
p.fit(['data/train.csv'])

In [4]:
# Run the fitted pipeline over the train and test CSV files.
df_train = p.transform(['data/train.csv'])
df_test = p.transform(['data/test.csv'])

In [5]:
# Feature names grouped by kind; ELm_sqrt, ELm_na and GPp_na are the columns derived in the pipeline cell.
X_num = ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads', 'ELm_sqrt']
X_cat = ['Episode_Sentiment', 'Episode_Title', 'Genre', 'Podcast_Name', 'Publication_Day', 'Publication_Time']
X_bool = ['ELm_na', 'GPp_na']
# NOTE(review): sgutil.SGCache is a project helper not shown here; the three arguments look like
# directory names for images, results and models — confirm.
sc = sgutil.SGCache('img', 'result', 'model')
# Name of the regression target column.
target = 'Listening_Time_minutes'

In [6]:
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold
# Shuffled 4-fold splitter with a fixed seed; get_tgt_rmse reads it as a global and the
# cross_validate cells pass it as cv=.
kf = KFold(4, shuffle = True, random_state = 123)

In [7]:
from itertools import combinations
from tqdm.notebook import tqdm 

In [9]:
from cuml.preprocessing import TargetEncoder

In [10]:
def get_tgt_rmse(df, X_tgt, smooth_space = np.linspace(1, 10, 10)):
    """Find the target-encoding smoothing value with the lowest cross-validated RMSE.

    For each candidate in ``smooth_space`` a ``TargetEncoder`` is fitted on the
    training part of every ``kf`` fold; the encoded validation rows are then
    scored against the target, i.e. the encoding itself is used as the prediction.

    Relies on two notebook-level globals: ``kf`` (the fold splitter) and
    ``target`` (name of the target column in ``df``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns in ``X_tgt`` and the ``target`` column.
    X_tgt : list
        Column names that are target-encoded together.
    smooth_space : iterable
        Candidate values for the encoder's ``smooth`` argument, tried in order.
        Any iterable is accepted (it is materialised once).

    Returns
    -------
    tuple
        ``(best_rmse, best_smooth, best_prd)`` where ``best_rmse`` is the lowest
        fold-averaged RMSE, ``best_smooth`` the candidate that produced it (the
        first one on ties) and ``best_prd`` a Series with the validation-fold
        encodings of that candidate, concatenated in fold order and indexed by
        the labels of ``df``.

    Raises
    ------
    ValueError
        If ``smooth_space`` is empty.
    """
    candidates = list(smooth_space)
    if not candidates:
        # Fail with a readable message instead of numpy's zero-size reduction error.
        raise ValueError("smooth_space must contain at least one smoothing value")

    # Single source of truth for the winner: score, smoothing value and predictions move together.
    best_rmse, best_smooth, best_prd = np.inf, None, None
    for smooth in candidates:
        tgt = TargetEncoder(smooth = smooth, split_method ='continuous')
        prds = list()
        rmses = list()
        for train_idx, test_idx in kf.split(df[X_tgt], df[target]):
            # The same encoder object is re-fitted on each fold's training rows.
            df_fit = df.iloc[train_idx]
            tgt.fit(df_fit[X_tgt], df_fit[target])
            df_valid = df.iloc[test_idx]
            prd = pd.Series(tgt.transform(df_valid[X_tgt]), index = df_valid.index)
            prds.append(prd)
            rmses.append(root_mean_squared_error(df_valid[target], prd))
        rmse = np.mean(rmses)
        # Strict comparison keeps the first candidate when several tie.
        if rmse < best_rmse:
            best_rmse, best_smooth, best_prd = rmse, smooth, pd.concat(prds)
    return best_rmse, best_smooth, best_prd

In [11]:
def get_tgt_rmse_list(df, X_values, smooth_space = [0.01, 0.1, 1, 10, 100, 1000]):
    """Evaluate ``get_tgt_rmse`` for every column group in ``X_values``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed through unchanged to ``get_tgt_rmse``.
    X_values : iterable of list
        Column-name groups; each group is target-encoded jointly.
    smooth_space : sequence
        Candidate smoothing values forwarded to ``get_tgt_rmse``.

    Returns
    -------
    tuple
        ``(summary, predictions)``. ``summary`` has one row per group with the
        columns ``X_tgt``, ``RMSE`` and ``smooth`` and is ordered by ascending
        RMSE. ``predictions`` has one column per group, named by joining the
        group's column names with ``'__'``, with rows sorted by index.
    """
    summary_rows = []
    prd_columns = []
    for cols in tqdm(X_values):
        rmse, best_smooth, best_prd = get_tgt_rmse(df, cols, smooth_space = smooth_space)
        row = pd.Series([cols, rmse, best_smooth], index = ['X_tgt', 'RMSE', 'smooth'])
        summary_rows.append(row)
        # Column label encodes which features were combined.
        prd_columns.append(best_prd.rename('__'.join(cols)))
    summary = pd.DataFrame(summary_rows).sort_values('RMSE')
    predictions = pd.concat(prd_columns, axis=1).sort_index()
    return summary, predictions

In [12]:
# Candidate columns for the target-encoding combinations: all ten raw features.
# X_val is a notebook global that is reassigned in later sections, so cells that
# read it depend on execution order.
X_val = [
    'Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads',
    'Episode_Sentiment', 'Episode_Title', 'Genre', 'Podcast_Name', 'Publication_Day', 'Publication_Time'
]

# ALL

In [37]:
# Target-encoding RMSE for every single column of X_val on the full training frame.
# NOTE(review): sc.cache_result is a project helper not shown here; presumably it stores/loads
# the lambda's result under the key 'tgt_rmse_c1' and rerun = 0 means "reuse the cache" — confirm.
# X_val is looked up only when the lambda actually runs.
df_c1, df_prd_1 = sc.cache_result(
    'tgt_rmse_c1', lambda : get_tgt_rmse_list(df_train, [list(i) for i in combinations(X_val, 1)]), rerun = 0
)
df_c1

Unnamed: 0,X_tgt,RMSE,smooth
0,[Episode_Length_minutes],13.478717,0.1
1,[Host_Popularity_percentage],26.851036,10.0
3,[Number_of_Ads],26.923498,0.01
2,[Guest_Popularity_percentage],26.991587,100.0
5,[Episode_Title],27.064105,100.0
7,[Podcast_Name],27.104318,100.0
4,[Episode_Sentiment],27.117177,100.0
9,[Publication_Time],27.130197,1000.0
6,[Genre],27.131325,1000.0
8,[Publication_Day],27.13408,1000.0


In [38]:
# Target-encoding RMSE for every pair of X_val columns on the full training frame.
# NOTE(review): presumably cached under the key 'tgt_rmse_c2' by the project helper
# sc.cache_result (not shown here); no rerun argument is passed, so its default applies — confirm.
df_c2, df_prd_2 = sc.cache_result(
    'tgt_rmse_c2', lambda : get_tgt_rmse_list(df_train, [list(i) for i in combinations(X_val, 2)])
)
df_c2

Unnamed: 0,X_tgt,RMSE,smooth
2,"[Episode_Length_minutes, Number_of_Ads]",13.597791,0.1
3,"[Episode_Length_minutes, Episode_Sentiment]",13.658922,0.1
8,"[Episode_Length_minutes, Publication_Time]",13.801916,0.1
7,"[Episode_Length_minutes, Publication_Day]",14.249536,0.1
5,"[Episode_Length_minutes, Genre]",14.790121,0.1
6,"[Episode_Length_minutes, Podcast_Name]",19.202019,0.1
4,"[Episode_Length_minutes, Episode_Title]",21.356184,0.1
0,"[Episode_Length_minutes, Host_Popularity_perce...",26.004004,0.01
1,"[Episode_Length_minutes, Guest_Popularity_perc...",26.308444,0.01
9,"[Host_Popularity_percentage, Guest_Popularity_...",26.6142,0.1


In [39]:
# Target-encoding RMSE for every triple of X_val columns on the full training frame.
# NOTE(review): presumably cached under the key 'tgt_rmse_c3' by the project helper
# sc.cache_result (not shown here); no rerun argument is passed, so its default applies — confirm.
df_c3, df_prd_3 = sc.cache_result(
    'tgt_rmse_c3', lambda : get_tgt_rmse_list(df_train, [list(i) for i in combinations(X_val, 3)])
)
df_c3

Unnamed: 0,X_tgt,RMSE,smooth
15,"[Episode_Length_minutes, Number_of_Ads, Episod...",14.822682,0.1
25,"[Episode_Length_minutes, Episode_Sentiment, Pu...",15.034009,0.1
20,"[Episode_Length_minutes, Number_of_Ads, Public...",15.451455,0.1
24,"[Episode_Length_minutes, Episode_Sentiment, Pu...",16.407360,0.1
19,"[Episode_Length_minutes, Number_of_Ads, Public...",17.047765,0.1
...,...,...,...
116,"[Genre, Podcast_Name, Publication_Day]",27.100219,1.0
106,"[Episode_Sentiment, Genre, Publication_Time]",27.102671,100.0
105,"[Episode_Sentiment, Genre, Publication_Day]",27.105426,100.0
109,"[Episode_Sentiment, Publication_Day, Publicati...",27.105580,100.0


In [40]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_validate
# Cross-validate a default Lasso on the 1-, 2- and 3-column target encodings of the full train set.
# NOTE(review): X and y are matched by position, not by index label; the encoding frames come
# back index-sorted from get_tgt_rmse_list, so this relies on df_train already being in index order.
result = cross_validate(
    Lasso(),
    pd.concat([df_prd_1, df_prd_2, df_prd_3], axis=1),
    df_train[target],
    cv = kf,
    scoring = 'neg_root_mean_squared_error',
    return_train_score = True,
    return_estimator = True,
)
result

{'fit_time': array([16.79773188, 16.39941287, 16.69040608, 17.15137982]),
 'score_time': array([0.07062602, 0.07138562, 0.08125687, 0.070755  ]),
 'estimator': [Lasso(), Lasso(), Lasso(), Lasso()],
 'test_score': array([-13.02415835, -13.06627079, -13.08771192, -13.11668459]),
 'train_score': array([-13.08860006, -13.07544134, -13.06702481, -13.05846032])}

In [41]:
# Sum each feature's Lasso coefficient over the fold estimators, keep the non-zero ones,
# and show the first 50 in descending order.
(
    pd.concat(
        [
            pd.Series(est.coef_, index = [*df_prd_1.columns, *df_prd_2.columns, *df_prd_3.columns])
            for est in result['estimator']
        ],
        axis=1,
    )
    .sum(axis=1)
    .sort_values(ascending = False)
    .loc[lambda s: s.abs() > 0]
    .iloc[:50]
)

Episode_Length_minutes                                                        2.164473
Episode_Length_minutes__Number_of_Ads                                         0.715038
Episode_Length_minutes__Episode_Sentiment                                     0.396872
Number_of_Ads__Episode_Sentiment__Podcast_Name                                0.349240
Host_Popularity_percentage__Guest_Popularity_percentage                       0.318062
Number_of_Ads__Episode_Sentiment__Episode_Title                               0.309409
Host_Popularity_percentage__Number_of_Ads                                     0.246002
Host_Popularity_percentage__Episode_Title__Publication_Day                    0.201470
Host_Popularity_percentage                                                    0.187487
Episode_Length_minutes__Publication_Time                                      0.181315
Host_Popularity_percentage__Number_of_Ads__Episode_Title                      0.174746
Guest_Popularity_percentage__Episode_Title_

In [150]:
# Same stacked target-encoding features, with a much stronger L1 penalty (alpha = 10).
# NOTE(review): X and y are matched by position, so this relies on df_train being in index order.
result = cross_validate(
    Lasso(alpha = 10),
    pd.concat([df_prd_1, df_prd_2, df_prd_3], axis=1),
    df_train[target],
    cv = kf,
    scoring = 'neg_root_mean_squared_error',
    return_train_score = True,
    return_estimator = True,
)

In [151]:
# Display the dict returned by the most recent cross_validate call.
result

{'fit_time': array([14.32147956, 14.64381146, 14.30359864, 14.31751776]),
 'score_time': array([0.06859541, 0.07409096, 0.07515168, 0.07828474]),
 'estimator': [Lasso(alpha=10),
  Lasso(alpha=10),
  Lasso(alpha=10),
  Lasso(alpha=10)],
 'test_score': array([-13.29321151, -13.3422136 , -13.36211833, -13.39238578]),
 'train_score': array([-13.36512335, -13.34892969, -13.34194783, -13.33328577])}

In [152]:
# Sum each feature's Lasso coefficient over the fold estimators, keep the non-zero ones,
# and show the first 50 in descending order.
(
    pd.concat(
        [
            pd.Series(est.coef_, index = [*df_prd_1.columns, *df_prd_2.columns, *df_prd_3.columns])
            for est in result['estimator']
        ],
        axis=1,
    )
    .sum(axis=1)
    .sort_values(ascending = False)
    .loc[lambda s: s.abs() > 0]
    .iloc[:50]
)

Episode_Length_minutes                                         1.660493
Episode_Length_minutes__Number_of_Ads                          1.283022
Episode_Length_minutes__Episode_Sentiment                      0.430794
Episode_Length_minutes__Publication_Time                       0.164059
Episode_Length_minutes__Number_of_Ads__Episode_Sentiment       0.110863
Episode_Length_minutes__Publication_Day                        0.110240
Episode_Length_minutes__Number_of_Ads__Publication_Time        0.090717
Episode_Length_minutes__Number_of_Ads__Publication_Day         0.066607
Episode_Length_minutes__Episode_Title                          0.040763
Episode_Length_minutes__Episode_Sentiment__Publication_Time    0.036794
Episode_Length_minutes__Number_of_Ads__Genre                   0.010159
Episode_Length_minutes__Genre                                  0.002412
dtype: float64

In [153]:
# Same stacked target-encoding features, with a weaker L1 penalty (alpha = 0.1).
# NOTE(review): X and y are matched by position, so this relies on df_train being in index order.
result = cross_validate(
    Lasso(alpha = 0.1),
    pd.concat([df_prd_1, df_prd_2, df_prd_3], axis=1),
    df_train[target],
    cv = kf,
    scoring = 'neg_root_mean_squared_error',
    return_train_score = True,
    return_estimator = True,
)

In [155]:
# Sum each feature's Lasso coefficient over the fold estimators, keep the non-zero ones,
# and show the first 50 in descending order.
(
    pd.concat(
        [
            pd.Series(est.coef_, index = [*df_prd_1.columns, *df_prd_2.columns, *df_prd_3.columns])
            for est in result['estimator']
        ],
        axis=1,
    )
    .sum(axis=1)
    .sort_values(ascending = False)
    .loc[lambda s: s.abs() > 0]
    .iloc[:50]
)

Episode_Length_minutes                                                        2.277317
Episode_Length_minutes__Number_of_Ads                                         0.590363
Host_Popularity_percentage                                                    0.392128
Episode_Length_minutes__Episode_Sentiment                                     0.376052
Host_Popularity_percentage__Episode_Title__Podcast_Name                       0.346543
Host_Popularity_percentage__Guest_Popularity_percentage                       0.345084
Number_of_Ads__Episode_Sentiment__Publication_Time                            0.322413
Host_Popularity_percentage__Episode_Title__Publication_Time                   0.303683
Episode_Length_minutes__Host_Popularity_percentage                            0.265906
Guest_Popularity_percentage__Podcast_Name__Publication_Day                    0.253806
Number_of_Ads__Episode_Sentiment__Episode_Title                               0.234274
Host_Popularity_percentage__Guest_Popularit

# Episode_Length_minutes

In [13]:
# Split the training rows by whether Episode_Length_minutes is present.
df_train_1 = df_train.loc[df_train['Episode_Length_minutes'].notna()]  # rows with a value
df_train_2 = df_train.loc[df_train['Episode_Length_minutes'].isna()]  # rows where it is missing

## Not NA

In [None]:
# Re-declares X_val with all ten raw features for the "Not NA" experiments.
# This cell carries no execution count (In [None]), i.e. it was not run in the saved session.
X_val = [
    'Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads',
    'Episode_Sentiment', 'Episode_Title', 'Genre', 'Podcast_Name', 'Publication_Day', 'Publication_Time'
]

In [14]:
# Single-column target-encoding RMSE, restricted to rows where Episode_Length_minutes is present (df_train_1).
# NOTE(review): sc.cache_result is a project helper not shown here; presumably the result is
# cached under 'tgt_rmse_c1_1' and rerun = 0 reuses it — confirm.
df_c1_1, df_prd_1_1 = sc.cache_result(
    'tgt_rmse_c1_1', lambda : get_tgt_rmse_list(df_train_1, [list(i) for i in combinations(X_val, 1)]), rerun = 0
)
df_c1_1

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,X_tgt,RMSE,smooth
0,[Episode_Length_minutes],10.766287,0.1
1,[Host_Popularity_percentage],26.993083,10.0
3,[Number_of_Ads],27.043055,0.01
2,[Guest_Popularity_percentage],27.128409,100.0
5,[Episode_Title],27.192795,100.0
7,[Podcast_Name],27.23439,100.0
4,[Episode_Sentiment],27.245425,1000.0
9,[Publication_Time],27.260513,1000.0
6,[Genre],27.261399,1000.0
8,[Publication_Day],27.265196,1000.0


In [15]:
# Column-pair target-encoding RMSE, restricted to rows where Episode_Length_minutes is present (df_train_1).
# NOTE(review): sc.cache_result is a project helper not shown here; presumably the result is
# cached under 'tgt_rmse_c2_1' and rerun = 0 reuses it — confirm.
df_c2_1, df_prd_2_1 = sc.cache_result(
    'tgt_rmse_c2_1', lambda : get_tgt_rmse_list(df_train_1, [list(i) for i in combinations(X_val, 2)]), rerun = 0
)
df_c2_1

  0%|          | 0/45 [00:00<?, ?it/s]

Unnamed: 0,X_tgt,RMSE,smooth
2,"[Episode_Length_minutes, Number_of_Ads]",10.938435,0.1
3,"[Episode_Length_minutes, Episode_Sentiment]",11.020969,0.1
8,"[Episode_Length_minutes, Publication_Time]",11.225651,0.1
7,"[Episode_Length_minutes, Publication_Day]",11.826413,0.1
5,"[Episode_Length_minutes, Genre]",12.558555,0.1
6,"[Episode_Length_minutes, Podcast_Name]",18.098049,0.1
4,"[Episode_Length_minutes, Episode_Title]",20.657897,0.1
0,"[Episode_Length_minutes, Host_Popularity_perce...",25.989086,0.01
1,"[Episode_Length_minutes, Guest_Popularity_perc...",26.329916,0.01
9,"[Host_Popularity_percentage, Guest_Popularity_...",26.76602,0.1


In [33]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression, Lasso
# Cross-validate a default Lasso on the 1- and 2-column target encodings of the
# rows that have Episode_Length_minutes (df_train_1).
# NOTE(review): X and y are matched by position, not by index label; the encoding frames are
# index-sorted, so this relies on df_train_1 already being in index order.
result = cross_validate(
    Lasso(),
    pd.concat([df_prd_1_1, df_prd_2_1], axis=1),
    df_train_1[target],
    cv = kf,
    scoring = 'neg_root_mean_squared_error',
    return_train_score = True,
    return_estimator = True,
)
result

{'fit_time': array([3.64599133, 3.60197043, 4.45727563, 3.75624061]),
 'score_time': array([0.02736497, 0.01269889, 0.0370388 , 0.01283908]),
 'estimator': [Lasso(), Lasso(), Lasso(), Lasso()],
 'test_score': array([-10.412633  , -10.42713913, -10.42408984, -10.43631323]),
 'train_score': array([-10.4285498 , -10.42380062, -10.42402606, -10.42135685])}

In [34]:
# Sum each feature's Lasso coefficient over the fold estimators, keep the non-zero ones,
# and show the first 50 in descending order.
(
    pd.concat(
        [
            pd.Series(est.coef_, index = [*df_prd_1_1.columns, *df_prd_2_1.columns])
            for est in result['estimator']
        ],
        axis=1,
    )
    .sum(axis=1)
    .sort_values(ascending = False)
    .loc[lambda s: s.abs() > 0]
    .iloc[:50]
)

Episode_Length_minutes                                     1.865710
Episode_Length_minutes__Number_of_Ads                      0.986703
Number_of_Ads__Episode_Sentiment                           0.653130
Episode_Length_minutes__Episode_Sentiment                  0.510490
Episode_Length_minutes__Publication_Time                   0.276558
Number_of_Ads__Podcast_Name                                0.264427
Episode_Length_minutes__Host_Popularity_percentage         0.242214
Episode_Length_minutes__Guest_Popularity_percentage        0.238117
Host_Popularity_percentage__Number_of_Ads                  0.229185
Number_of_Ads__Episode_Title                               0.227157
Episode_Length_minutes__Publication_Day                    0.188935
Host_Popularity_percentage__Guest_Popularity_percentage    0.164947
Episode_Length_minutes__Episode_Title                      0.116951
Host_Popularity_percentage                                 0.099850
Guest_Popularity_percentage__Number_of_Ads      

# Except Episode_Length_minutes (3-column combinations)

In [18]:
# Candidate columns without Episode_Length_minutes (nine features).
# This overwrites the notebook-global X_val used by the earlier sections.
X_val = [
    'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads',
    'Episode_Sentiment', 'Episode_Title', 'Genre', 'Podcast_Name', 'Publication_Day', 'Publication_Time'
]

In [19]:
# Target-encoding RMSE for every triple of the current X_val columns on the full training frame.
# NOTE(review): sc.cache_result is a project helper not shown here; presumably the result is
# cached under 'tgt_rmse_c3_2' and rerun = 0 reuses it — confirm.
df_c3_2, df_prd_3_2 = sc.cache_result(
    'tgt_rmse_c3_2', lambda : get_tgt_rmse_list(df_train, [list(i) for i in combinations(X_val, 3)]), rerun = 0
)
df_c3_2

  0%|          | 0/84 [00:00<?, ?it/s]

Unnamed: 0,X_tgt,RMSE,smooth
1,"[Host_Popularity_percentage, Guest_Popularity_...",26.624471,0.10
6,"[Host_Popularity_percentage, Guest_Popularity_...",26.636031,0.10
0,"[Host_Popularity_percentage, Guest_Popularity_...",26.639306,0.10
19,"[Host_Popularity_percentage, Episode_Title, Po...",26.654809,1.00
5,"[Host_Popularity_percentage, Guest_Popularity_...",26.683332,0.01
...,...,...,...
80,"[Genre, Podcast_Name, Publication_Day]",27.100219,1.00
70,"[Episode_Sentiment, Genre, Publication_Time]",27.102671,100.00
69,"[Episode_Sentiment, Genre, Publication_Day]",27.105426,100.00
73,"[Episode_Sentiment, Publication_Day, Publicati...",27.105580,100.00


In [35]:
# Frequency of each Number_of_Ads value in the training set.
df_train['Number_of_Ads'].value_counts()

Number_of_Ads
0.000000      217592
1.000000      214069
3.000000      160173
2.000000      158156
103.250000         2
53.369999          1
103.000000         1
103.910004         1
53.419998          1
103.750000         1
12.000000          1
103.879997         1
Name: count, dtype: int64

In [36]:
# Frequency of each Number_of_Ads value in the test set.
df_test['Number_of_Ads'].value_counts()

Number_of_Ads
0.000000       72863
1.000000       71015
3.000000       53556
2.000000       52564
89.120003          1
2063.000000        1
Name: count, dtype: int64

In [43]:
# Summary statistics of the numeric features in the training set.
df_train[X_num].describe()

Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,ELm_sqrt
count,662907.0,750000.0,603970.0,749999.0,662907.0
mean,64.504738,59.859909,52.236454,1.348855,7.707816
std,32.969604,22.873098,28.451242,1.15113,2.257057
min,0.0,1.3,0.0,0.0,0.0
25%,35.73,39.41,28.379999,0.0,5.977458
50%,63.84,60.049999,53.580002,1.0,7.989994
75%,94.07,79.529999,76.599998,2.0,9.698969
max,325.23999,119.459999,119.910004,103.910004,18.03441


In [54]:
# Summary statistics of the numeric features in the test set.
df_test[X_num].describe()

Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,ELm_sqrt
count,221264.0,250000.0,201168.0,250000.0,221264.0
mean,419.2987,59.716492,52.192799,1.355852,7.75162
std,166854.5,22.880028,28.445034,4.274398,18.952908
min,2.47,2.49,0.0,0.0,1.571623
25%,35.78,39.25,28.32,0.0,5.981638
50%,63.97,59.900002,53.360001,1.0,7.998125
75%,94.15,79.389999,76.559998,2.0,9.703093
max,78486260.0,117.760002,116.82,2063.0,8859.24707


- Host_Popularity_percentage 범주형 변수화하여 따로 저장
  
- Guest_Popularity_percentage 범주형 변수화하여 따로 저장

- Host_Popularity_percentage 를 -1 ~ 1 사이로 MinMaxScaling 

- Number_of_Ads에 이상치 존재: 이상치를 최빈값으로 대체한 뒤 0 ~ 1 사이로 MinMaxScaling

- Guest_Popularity_percentage 를 -1 ~ 1 사이로 MinMaxScaling

- Target Encoding 실험 재설계:
  > Episode_Length_minutes를 제외한 모든 변수를 사용하여, 전체 데이터셋에 대해 Combination 1 ~ 4
  >
  > Episode_Length_minutes를 포함한 모든 변수를 사용하여, Episode_Length_minutes 미결측 데이터셋에 대해 Combination 0 ~ 3

In [48]:
# Number of training rows whose Host_Popularity_percentage exceeds 100.
(df_train['Host_Popularity_percentage'] > 100).sum()

np.int64(25)

In [49]:
# Number of training rows whose Guest_Popularity_percentage exceeds 100.
(df_train['Guest_Popularity_percentage'] > 100).sum()

np.int64(19)

In [51]:
# Which Host_Popularity_percentage values above 100 occur in train, and how often.
df_train.loc[
    df_train['Host_Popularity_percentage'] > 100, 'Host_Popularity_percentage'
].value_counts()

Host_Popularity_percentage
117.139999    5
117.760002    2
112.250000    2
103.000000    2
118.730003    1
115.180000    1
105.570000    1
114.970001    1
107.019997    1
118.690002    1
118.930000    1
101.110001    1
109.379997    1
112.440002    1
101.309998    1
114.730003    1
119.459999    1
104.599998    1
Name: count, dtype: int64

In [52]:
# Which Guest_Popularity_percentage values above 100 occur in train, and how often.
df_train.loc[
    df_train['Guest_Popularity_percentage'] > 100, 'Guest_Popularity_percentage'
].value_counts()

Guest_Popularity_percentage
114.720001    2
107.809998    2
115.620003    2
106.900002    1
110.139999    1
102.129997    1
105.480003    1
107.080002    1
119.910004    1
102.379997    1
114.879997    1
115.430000    1
115.410004    1
107.339996    1
107.580002    1
105.440002    1
Name: count, dtype: int64