In [1]:
import pandas as pd
from tqdm import tqdm
from itertools import combinations

import numpy as np
import pandas as pd
import polars as pl

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.preprocessing import TargetEncoder
import warnings
warnings.filterwarnings("ignore")

In [2]:
df_train = pd.read_csv('train.csv', index_col='id', dtype={'Episode_Length_minutes': str})
df_test = pd.read_csv('test.csv', index_col='id', dtype={'Episode_Length_minutes': str})
df_subm = pd.read_csv('sample_submission.csv', index_col='id')
original = pd.read_csv('podcast_dataset.csv', dtype={'Episode_Length_minutes': str})

In [3]:
def count_decimal_digits(s):
    if pd.isna(s):
        return None
    if '.' in s:
        return len(s.split('.')[1]) 
    return 0
df_train['decimal_digits_e'] = df_train['Episode_Length_minutes'].apply(count_decimal_digits)
df_test['decimal_digits_e'] = df_test['Episode_Length_minutes'].apply(count_decimal_digits)
original['decimal_digits_e'] = original['Episode_Length_minutes'].apply(count_decimal_digits)

df_train['Episode_Length_minutes'] = df_train['Episode_Length_minutes'].astype('float')
df_test['Episode_Length_minutes'] = df_test['Episode_Length_minutes'].astype('float')
original['Episode_Length_minutes'] = original['Episode_Length_minutes'].astype('float')

In [4]:
df_train = df_train[df_train['Number_of_Ads'].notna()]
original = original[original['Listening_Time_minutes'].notna()]

In [5]:
original = original.drop_duplicates()
df_train = pd.concat([df_train,original])

In [6]:
df_train["NaNs"] = np.float32(0)
df_test["NaNs"] = np.float32(0)

for i,c in enumerate(['Episode_Length_minutes', 'Guest_Popularity_percentage']):

    # NEW FEATURE - ENCODE ALL NAN AS ONE BASE-2 FEATURE
    df_train["NaNs"] += df_train[c].isna()*2**i
    df_test["NaNs"] += df_test[c].isna()*2**i
    
    n = f"{c}_nan_wc"
    df_train[n] = df_train[c].isna()*100 + df_train["Episode_Length_minutes"]    
    df_test[n] = df_test[c].isna()*100 + df_test["Episode_Length_minutes"]  

In [7]:
for i in ['Episode_Length_minutes','Host_Popularity_percentage','Guest_Popularity_percentage']:
    for k in range(0,1):
        n = i + f"_round{k}"
        df_train[n] = df_train["Episode_Length_minutes"].round(k)
        df_test[n] = df_test["Episode_Length_minutes"].round(k)

for i in ['Episode_Length_minutes','Host_Popularity_percentage','Guest_Popularity_percentage']:
    for k in range(0,2):
        n = i + f'_digit{k}'
        df_train[n] = ((df_train['Episode_Length_minutes'] * 10**k) % 10).fillna(-1).astype("int8")
        df_test[n] = ((df_test['Episode_Length_minutes'] * 10**k) % 10).fillna(-1).astype("int8")

for i in ['Episode_Length_minutes','Host_Popularity_percentage','Guest_Popularity_percentage']:
    k = i + "_digit0_1"
    n = i + '_digit0'
    m = i + '_digit1'
    df_train[k] = ((df_train[n]+1)*11 + df_train[m]+1).astype("int8")
    df_test[k] = ((df_test[n]+1)*11 + df_test[m]+1).astype("int8")

In [8]:
def feature_eng(df):
    podc_dict = {'Mystery Matters': 48, 'Joke Junction': 1, 'Study Sessions': 2, 'Digital Digest': 3, 'Mind & Body': 4, 'Fitness First': 5, 'Criminal Minds': 6, 'News Roundup': 7, 'Daily Digest': 8, 'Music Matters': 9, 'Sports Central': 10, 'Melody Mix': 11, 'Game Day': 12, 'Gadget Geek': 13, 'Global News': 14, 'Tech Talks': 15, 'Sport Spot': 16, 'Funny Folks': 17, 'Sports Weekly': 18, 'Business Briefs': 19, 'Tech Trends': 20, 'Innovators': 21, 'Health Hour': 22, 'Comedy Corner': 23, 'Sound Waves': 24, 'Brain Boost': 25, "Athlete's Arena": 26, 'Wellness Wave': 27, 'Style Guide': 28, 'World Watch': 29, 'Humor Hub': 30, 'Money Matters': 31, 'Healthy Living': 32, 'Home & Living': 33, 'Educational Nuggets': 34, 'Market Masters': 35, 'Learning Lab': 36, 'Lifestyle Lounge': 37, 'Crime Chronicles': 38, 'Detective Diaries': 39, 'Life Lessons': 40, 'Current Affairs': 41, 'Finance Focus': 42, 'Laugh Line': 43, 'True Crime Stories': 44, 'Business Insights': 45, 'Fashion Forward': 46, 'Tune Time': 47}
    genr_dict = {'True Crime': 10, 'Comedy': 1, 'Education': 2, 'Technology': 3, 'Health': 4, 'News': 5, 'Music': 6, 'Sports': 7, 'Business': 8, 'Lifestyle': 9}
    week_dict = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7}
    time_dict = {'Morning': 1, 'Afternoon': 2, 'Evening': 3, 'Night': 4}
    sent_dict = {'Negative': -1, 'Neutral': 1, 'Positive': 2}
    
    df['Episode_Title'] = df['Episode_Title'].str[8:].astype('int')
    
    df['Genre'] = df['Genre'].replace(genr_dict)
    df['Podcast_Name'] = df['Podcast_Name'].replace(podc_dict)
    df['Publication_Day'] = df['Publication_Day'].replace(week_dict)
    df['Publication_Time'] = df['Publication_Time'].replace(time_dict)
    df['Episode_Sentiment'] = df['Episode_Sentiment'].replace(sent_dict)
    

    return df

import pandas as pd
df_train = feature_eng(df_train)
df_test = feature_eng(df_test)

In [11]:
Cal_columns = ['Episode_Length_minutes', 'Episode_Title', 'Host_Popularity_percentage', 'Number_of_Ads'
                  ,'Episode_Sentiment', 'Publication_Day', 'Publication_Time','Podcast_Name','Guest_Popularity_percentage'
                 ,'Genre']
import itertools
for x in list(itertools.combinations(Cal_columns,2)):
    sum = x[0] + '_sum_' + x[1]
    divide = x[0] + '_divide_' + x[1]
    multiply = x[0] + '_mul_' + x[1]
    substract = x[0] + '_sub_' + x[1]
    
    df_train[sum] = df_train[x[0]] + df_train[x[1]]
    df_train[divide] = df_train[x[0]] / (df_train[x[1]]+0.1)
    df_train[multiply] = df_train[x[0]] * df_train[x[1]]
    df_train[substract] = df_train[x[0]] - df_train[x[1]]

    df_test[sum] = df_test[x[0]] + df_test[x[1]]
    df_test[divide] = df_test[x[0]] / (df_test[x[1]]+0.1)
    df_test[multiply] = df_test[x[0]] * df_test[x[1]]
    df_test[substract] = df_test[x[0]] - df_test[x[1]]
    

In [13]:
df_train['Genre'] = df_train['Genre'].astype('category')
df_train['Podcast_Name'] = df_train['Podcast_Name'].astype('category')
df_train['Publication_Day'] = df_train['Publication_Day'].astype('category')
df_train['Publication_Time'] = df_train['Publication_Time'].astype('category')
df_train['Episode_Sentiment'] = df_train['Episode_Sentiment'].astype('category')
df_train['Episode_Title'] = df_train['Episode_Title'].astype('category')

df_test['Genre'] = df_test['Genre'].astype('category')
df_test['Podcast_Name'] = df_test['Podcast_Name'].astype('category')
df_test['Publication_Day'] = df_test['Publication_Day'].astype('category')
df_test['Publication_Time'] = df_test['Publication_Time'].astype('category')
df_test['Episode_Sentiment'] = df_test['Episode_Sentiment'].astype('category')
df_test['Episode_Title'] = df_test['Episode_Title'].astype('category')

In [14]:
encode_columns = ['Episode_Length_minutes', 'Episode_Title', 'Host_Popularity_percentage', 'Number_of_Ads'
                  ,'Episode_Sentiment', 'Publication_Day', 'Publication_Time','Podcast_Name','Guest_Popularity_percentage'
                 ,'Genre']
pair_size = [2, 3, 4]

for r in pair_size:
    for cols in tqdm(list(combinations(encode_columns, r))):
        new_col_name = '_'.join(cols)
        
        df_train[new_col_name] = df_train[list(cols)].astype(str).agg('_'.join, axis=1)
        df_train[new_col_name] = df_train[new_col_name].astype('category')
        
        df_test[new_col_name] = df_test[list(cols)].astype(str).agg('_'.join, axis=1)
        df_test[new_col_name] = df_test[new_col_name].astype('category')

100%|███████████████████████████████████████████| 45/45 [01:35<00:00,  2.12s/it]
100%|█████████████████████████████████████████| 120/120 [05:04<00:00,  2.54s/it]
100%|█████████████████████████████████████████| 210/210 [10:13<00:00,  2.92s/it]


In [15]:
df_test.to_parquet('df_test.parquet.gzip',
              compression='gzip') 

In [16]:
X = df_train
y = df_train['Listening_Time_minutes']

In [17]:
X.to_parquet('df_X.parquet.gzip',
              compression='gzip') 

In [18]:
X

Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,...,Episode_Sentiment_Publication_Day_Guest_Popularity_percentage_Genre,Episode_Sentiment_Publication_Time_Podcast_Name_Guest_Popularity_percentage,Episode_Sentiment_Publication_Time_Podcast_Name_Genre,Episode_Sentiment_Publication_Time_Guest_Popularity_percentage_Genre,Episode_Sentiment_Podcast_Name_Guest_Popularity_percentage_Genre,Publication_Day_Publication_Time_Podcast_Name_Guest_Popularity_percentage,Publication_Day_Publication_Time_Podcast_Name_Genre,Publication_Day_Publication_Time_Guest_Popularity_percentage_Genre,Publication_Day_Podcast_Name_Guest_Popularity_percentage_Genre,Publication_Time_Podcast_Name_Guest_Popularity_percentage_Genre
0,48,98,,10,74.81,4,4,,0.0,2,...,2_4_nan_10,2_4_48_nan,2_4_48_10,2_4_nan_10,2_48_nan_10,4_4_48_nan,4_4_48_10,4_4_nan_10,4_48_nan_10,4_48_nan_10
1,1,26,119.80,1,66.95,6,2,75.95,2.0,-1,...,-1_6_75.95_1,-1_2_1_75.95,-1_2_1_1,-1_2_75.95_1,-1_1_75.95_1,6_2_1_75.95,6_2_1_1,6_2_75.95_1,6_1_75.95_1,2_1_75.95_1
2,2,16,73.90,2,69.97,2,3,8.97,0.0,-1,...,-1_2_8.97_2,-1_3_2_8.97,-1_3_2_2,-1_3_8.97_2,-1_2_8.97_2,2_3_2_8.97,2_3_2_2,2_3_8.97_2,2_2_8.97_2,3_2_8.97_2
3,3,45,67.17,3,57.22,1,1,78.70,2.0,2,...,2_1_78.7_3,2_1_3_78.7,2_1_3_3,2_1_78.7_3,2_3_78.7_3,1_1_3_78.7,1_1_3_3,1_1_78.7_3,1_3_78.7_3,1_3_78.7_3
4,4,86,110.51,4,80.07,1,2,58.68,3.0,1,...,1_1_58.68_4,1_2_4_58.68,1_2_4_4,1_2_58.68_4,1_4_58.68_4,1_2_4_58.68,1_2_4_4,1_2_58.68_4,1_4_58.68_4,2_4_58.68_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49991,42,77,92.44,8,27.34,4,1,63.54,3.0,2,...,2_4_63.54_8,2_1_42_63.54,2_1_42_8,2_1_63.54_8,2_42_63.54_8,4_1_42_63.54,4_1_42_8,4_1_63.54_8,4_42_63.54_8,1_42_63.54_8
49993,27,100,50.30,4,20.61,4,1,55.63,0.0,2,...,2_4_55.63_4,2_1_27_55.63,2_1_27_4,2_1_55.63_4,2_27_55.63_4,4_1_27_55.63,4_1_27_4,4_1_55.63_4,4_27_55.63_4,1_27_55.63_4
49996,25,12,89.78,2,93.08,6,3,93.12,3.0,2,...,2_6_93.12_2,2_3_25_93.12,2_3_25_2,2_3_93.12_2,2_25_93.12_2,6_3_25_93.12,6_3_25_2,6_3_93.12_2,6_25_93.12_2,3_25_93.12_2
49997,20,18,77.86,3,41.04,6,2,52.33,2.0,2,...,2_6_52.33_3,2_2_20_52.33,2_2_20_3,2_2_52.33_3,2_20_52.33_3,6_2_20_52.33,6_2_20_3,6_2_52.33_3,6_20_52.33_3,2_20_52.33_3
