In [3]:
from tqdm import tqdm
from itertools import combinations

import numpy as np
import pandas as pd
import polars as pl

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.preprocessing import TargetEncoder

import lightgbm as lgb

import warnings

In [4]:
def feature_eng(df):
    podc_dict = {'Mystery Matters': 0, 'Joke Junction': 1, 'Study Sessions': 2, 'Digital Digest': 3, 'Mind & Body': 4, 'Fitness First': 5, 'Criminal Minds': 6, 'News Roundup': 7, 'Daily Digest': 8, 'Music Matters': 9, 'Sports Central': 10, 'Melody Mix': 11, 'Game Day': 12, 'Gadget Geek': 13, 'Global News': 14, 'Tech Talks': 15, 'Sport Spot': 16, 'Funny Folks': 17, 'Sports Weekly': 18, 'Business Briefs': 19, 'Tech Trends': 20, 'Innovators': 21, 'Health Hour': 22, 'Comedy Corner': 23, 'Sound Waves': 24, 'Brain Boost': 25, "Athlete's Arena": 26, 'Wellness Wave': 27, 'Style Guide': 28, 'World Watch': 29, 'Humor Hub': 30, 'Money Matters': 31, 'Healthy Living': 32, 'Home & Living': 33, 'Educational Nuggets': 34, 'Market Masters': 35, 'Learning Lab': 36, 'Lifestyle Lounge': 37, 'Crime Chronicles': 38, 'Detective Diaries': 39, 'Life Lessons': 40, 'Current Affairs': 41, 'Finance Focus': 42, 'Laugh Line': 43, 'True Crime Stories': 44, 'Business Insights': 45, 'Fashion Forward': 46, 'Tune Time': 47}
    genr_dict = {'True Crime': 0, 'Comedy': 1, 'Education': 2, 'Technology': 3, 'Health': 4, 'News': 5, 'Music': 6, 'Sports': 7, 'Business': 8, 'Lifestyle': 9}
    week_dict = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}
    time_dict = {'Morning': 0, 'Afternoon': 1, 'Evening': 2, 'Night': 3}
    sent_dict = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
    
    df['Episode_Num'] = df['Episode_Title'].str[8:].astype('category')
    
    df['Genre'] = df['Genre'].replace(genr_dict)
    df['Podcast_Name'] = df['Podcast_Name'].replace(podc_dict)
    df['Publication_Day'] = df['Publication_Day'].replace(week_dict)
    df['Publication_Time'] = df['Publication_Time'].replace(time_dict)
    df['Episode_Sentiment'] = df['Episode_Sentiment'].replace(sent_dict)
    
    df['Genre'] = df['Genre'].astype('category')
    df['Podcast_Name'] = df['Podcast_Name'].astype('category')
    df['Publication_Day'] = df['Publication_Day'].astype('category')
    df['Publication_Time'] = df['Publication_Time'].astype('category')
    df['Episode_Sentiment'] = df['Episode_Sentiment'].astype('category')
    
    df = df.drop(columns=['Episode_Title'])
    return df

In [6]:
df_train = pd.read_csv('data/train.csv', index_col='id')
df_train = feature_eng(df_train)

df_test = pd.read_csv('data/test.csv', index_col='id')
df_test = feature_eng(df_test)

df_subm = pd.read_csv('data/sample_submission.csv', index_col='id')

  df['Genre'] = df['Genre'].replace(genr_dict)
  df['Podcast_Name'] = df['Podcast_Name'].replace(podc_dict)
  df['Publication_Day'] = df['Publication_Day'].replace(week_dict)
  df['Publication_Time'] = df['Publication_Time'].replace(time_dict)
  df['Episode_Sentiment'] = df['Episode_Sentiment'].replace(sent_dict)
  df['Genre'] = df['Genre'].replace(genr_dict)
  df['Podcast_Name'] = df['Podcast_Name'].replace(podc_dict)
  df['Publication_Day'] = df['Publication_Day'].replace(week_dict)
  df['Publication_Time'] = df['Publication_Time'].replace(time_dict)
  df['Episode_Sentiment'] = df['Episode_Sentiment'].replace(sent_dict)


In [7]:
encode_columns = ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment', 'Publication_Day', 'Publication_Time']
pair_size = [2, 3, 4]

for r in pair_size:
    for cols in tqdm(list(combinations(encode_columns, r))):
        new_col_name = '_'.join(cols)
        
        df_train[new_col_name] = df_train[list(cols)].astype(str).agg('_'.join, axis=1)
        df_train[new_col_name] = df_train[new_col_name].astype('category')
        
        df_test[new_col_name] = df_test[list(cols)].astype(str).agg('_'.join, axis=1)
        df_test[new_col_name] = df_test[new_col_name].astype('category')

100%|███████████████████████████████████████████████████████████████████████████████████| 21/21 [00:42<00:00,  2.05s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 35/35 [01:29<00:00,  2.56s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 35/35 [01:44<00:00,  2.98s/it]


In [8]:
X = df_train.drop(columns=['Listening_Time_minutes'])
y = df_train['Listening_Time_minutes']

In [10]:
cv = KFold(5, random_state=42, shuffle=True)
y_pred = np.zeros(len(df_subm))

for idx_train, idx_valid in cv.split(X, y):
    X_train, y_train = X.iloc[idx_train].copy(), y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid].copy(), y.iloc[idx_valid]
    X_test = df_test[X.columns].copy()
    
    encoded_columns = df_train.columns[11:]
    encoder = TargetEncoder(random_state=42)
    
    X_train[encoded_columns] = encoder.fit_transform(X_train[encoded_columns], y_train)
    X_valid[encoded_columns] = encoder.transform(X_valid[encoded_columns])
    X_test[encoded_columns] = encoder.transform(X_test[encoded_columns])

    model = lgb.LGBMRegressor(
        n_iter=1000,
        max_depth=-1,
        num_leaves=1024,
        colsample_bytree=0.7,
        learning_rate=0.03,
        objective='l2',
        metric='rmse', 
        verbosity=-1,
        max_bin=1024,
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(100)],
    )
    
    y_pred += model.predict(X_test)

[100]	valid_0's rmse: 12.3248
[200]	valid_0's rmse: 12.2214
[300]	valid_0's rmse: 12.2177
[400]	valid_0's rmse: 12.2184
[500]	valid_0's rmse: 12.22
[600]	valid_0's rmse: 12.2222
[700]	valid_0's rmse: 12.2222
[800]	valid_0's rmse: 12.2238
[900]	valid_0's rmse: 12.2245
[1000]	valid_0's rmse: 12.2269
[100]	valid_0's rmse: 12.3654
[200]	valid_0's rmse: 12.2599
[300]	valid_0's rmse: 12.2544
[400]	valid_0's rmse: 12.255
[500]	valid_0's rmse: 12.2556
[600]	valid_0's rmse: 12.2555
[700]	valid_0's rmse: 12.2568
[800]	valid_0's rmse: 12.2578
[900]	valid_0's rmse: 12.2601
[1000]	valid_0's rmse: 12.2607
[100]	valid_0's rmse: 12.3798
[200]	valid_0's rmse: 12.2757
[300]	valid_0's rmse: 12.2795
[400]	valid_0's rmse: 12.28
[500]	valid_0's rmse: 12.2821
[600]	valid_0's rmse: 12.2838
[700]	valid_0's rmse: 12.286
[800]	valid_0's rmse: 12.2874
[900]	valid_0's rmse: 12.2891
[1000]	valid_0's rmse: 12.2915
[100]	valid_0's rmse: 12.3539
[200]	valid_0's rmse: 12.2495
[300]	valid_0's rmse: 12.2462
[400]	valid_0

In [12]:
df_subm['Listening_Time_minutes'] = y_pred / 5
df_subm.to_csv('result/submission.csv')
df_subm.head()

Unnamed: 0_level_0,Listening_Time_minutes
id,Unnamed: 1_level_1
750000,53.652669
750001,23.661658
750002,48.053122
750003,80.324293
750004,49.207982


In [13]:
!kaggle competitions submit -c playground-series-s5e4 -f result/submission.csv -m "0"

100%|██████████████████████████████████████| 6.01M/6.01M [00:06<00:00, 1.02MB/s]
Successfully submitted to Predict Podcast Listening Time

In [17]:
X_train = X.copy()
X_test = df_test[X.columns].copy()

encoded_columns = df_train.columns[11:]
encoder = TargetEncoder(random_state=42)

X_train[encoded_columns] = encoder.fit_transform(X_train[encoded_columns], y)
X_test[encoded_columns] = encoder.transform(X_test[encoded_columns])

model = lgb.LGBMRegressor(
    n_iter=1000,
    max_depth=-1,
    num_leaves=1024,
    colsample_bytree=0.7,
    learning_rate=0.03,
    objective='l2',
    metric='rmse', 
    verbosity=-1,
    max_bin=1024,
)

model.fit(
    X_train, y,
)

In [18]:
y_pred = model.predict(X_test)
y_pred

array([53.85206762, 23.85606308, 49.92819131, ...,  6.98550643,
       76.91054486, 57.28212225])

In [19]:
df_subm['Listening_Time_minutes'] = y_pred
df_subm.to_csv('result/submission.csv')
df_subm.head()

Unnamed: 0_level_0,Listening_Time_minutes
id,Unnamed: 1_level_1
750000,53.852068
750001,23.856063
750002,49.928191
750003,83.628084
750004,47.611073


In [20]:
!kaggle competitions submit -c playground-series-s5e4 -f result/submission.csv -m "0"

100%|██████████████████████████████████████| 6.02M/6.02M [00:02<00:00, 2.51MB/s]
Successfully submitted to Predict Podcast Listening Time

In [22]:
X_test[encoded_columns]

Unnamed: 0_level_0,Episode_Length_minutes_Episode_Num,Episode_Length_minutes_Host_Popularity_percentage,Episode_Length_minutes_Number_of_Ads,Episode_Length_minutes_Episode_Sentiment,Episode_Length_minutes_Publication_Day,Episode_Length_minutes_Publication_Time,Episode_Num_Host_Popularity_percentage,Episode_Num_Number_of_Ads,Episode_Num_Episode_Sentiment,Episode_Num_Publication_Day,...,Episode_Num_Host_Popularity_percentage_Publication_Day_Publication_Time,Episode_Num_Number_of_Ads_Episode_Sentiment_Publication_Day,Episode_Num_Number_of_Ads_Episode_Sentiment_Publication_Time,Episode_Num_Number_of_Ads_Publication_Day_Publication_Time,Episode_Num_Episode_Sentiment_Publication_Day_Publication_Time,Host_Popularity_percentage_Number_of_Ads_Episode_Sentiment_Publication_Day,Host_Popularity_percentage_Number_of_Ads_Episode_Sentiment_Publication_Time,Host_Popularity_percentage_Number_of_Ads_Publication_Day_Publication_Time,Host_Popularity_percentage_Episode_Sentiment_Publication_Day_Publication_Time,Number_of_Ads_Episode_Sentiment_Publication_Day_Publication_Time
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
750000,45.437406,45.437406,59.298337,56.889110,57.366209,56.432761,45.437406,43.533727,44.584864,43.102532,...,45.437406,48.500120,46.258279,40.083214,42.546831,45.437406,52.324656,45.437406,45.437406,47.089503
750001,24.764820,24.764820,17.722471,18.692653,19.220093,19.380794,34.229613,46.801058,44.757831,43.761284,...,45.437406,48.313367,51.199762,50.178555,46.539327,26.343640,78.645460,45.437406,16.890039,47.614155
750002,40.112350,45.437406,49.830596,49.568865,45.965990,49.386953,84.984920,48.873363,48.443006,46.128162,...,45.437406,49.264396,50.807530,39.229723,45.183892,45.437406,46.969430,45.437406,45.437406,49.119337
750003,45.437406,45.437406,88.841151,85.041137,76.574034,85.310660,54.154784,41.597400,43.799138,43.632533,...,45.437406,39.990001,40.099336,42.187936,42.260115,5.983080,45.437406,45.437406,45.437406,43.681061
750004,45.437406,49.077140,51.025148,51.213051,53.746846,49.285640,45.437406,41.834676,44.429629,44.531006,...,45.437406,42.440226,40.317511,41.339426,48.877886,65.794390,50.125694,45.437406,33.720920,43.668055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,45.437406,45.437406,8.052275,9.339370,10.732991,9.691316,45.437406,39.757320,42.758038,43.199160,...,45.437406,42.787113,39.052328,38.431658,43.439962,27.494420,34.028659,45.437406,35.494420,38.047102
999996,45.437406,45.437406,60.836422,56.476661,67.541160,62.512972,60.756966,43.213860,44.625555,46.226580,...,45.437406,39.650551,49.577913,46.519745,44.561467,18.680516,29.805920,45.437406,66.705920,42.984524
999997,0.700480,45.437406,8.054990,6.260637,6.196765,6.017504,38.665220,41.677286,41.976295,38.241558,...,45.437406,34.234232,42.708892,42.948684,37.469210,38.200956,22.032895,68.958700,27.543670,47.449897
999998,87.542960,45.437406,73.254715,81.560069,78.504922,79.232990,79.454930,41.738861,45.926289,45.777990,...,45.437406,43.564049,42.523783,43.148549,45.784873,45.437406,54.516704,45.437406,45.437406,42.332598
