<a href="https://www.kaggle.com/code/vanpatangan/predict-podcast-listening-time?scriptVersionId=232645886" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Import Libraries and Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the competition's train and test splits from the Kaggle input directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s5e4/train.csv",
        "/kaggle/input/playground-series-s5e4/test.csv",
    )
)

# EDA

In [None]:
def check(df):
    """
    Generates a concise per-column summary of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the fields:
        ``column`` (name), ``dtype``, ``instances`` (non-null count),
        ``unique`` (number of distinct non-null values), ``sum_null``
        (number of nulls) and ``duplicates`` (number of fully duplicated
        rows in ``df``; the same value is repeated on every row).
    """
    # The duplicated-row count is a property of the whole frame, not of a
    # single column, so compute it once instead of once per column.
    n_duplicates = df.duplicated().sum()

    # Build one summary row per column
    summary = [
        [col, df[col].dtype, df[col].count(), df[col].nunique(), df[col].isnull().sum(), n_duplicates]
        for col in df.columns
    ]

    # Create a DataFrame from the list of rows
    df_check = pd.DataFrame(summary, columns=["column", "dtype", "instances", "unique", "sum_null", "duplicates"])

    return df_check

In [None]:
# Show the column summary and the first rows for each dataset in turn
for label, frame in (("Training Data Summary", train), ("Test Data Summary", test)):
    print(label)
    display(check(frame))
    display(frame.head())

## Visualizations

In [None]:
# Histogram (30 bins, with KDE overlay) of the target column
target_ax = sns.histplot(
    data=train,
    x='Listening_Time_minutes',
    bins=30,
    kde=True,
)
target_ax.set_title('Distribution of Listening Time (Minutes)')
target_ax.set_xlabel('Listening Time (minutes)')
plt.show()

In [None]:
# Box plot of the target for every Genre category
plt.figure(figsize=(12, 6))
genre_ax = sns.boxplot(x='Genre', y='Listening_Time_minutes', data=train)
plt.xticks(rotation=45)  # tilt the category labels
genre_ax.set_title('Listening Time by Genre')
plt.show()

In [None]:
# Scatter plot: episode length (x) against listening time (y)
plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(
    data=train,
    x='Episode_Length_minutes',
    y='Listening_Time_minutes',
    alpha=0.5,
)
scatter_ax.set_title('Episode Length vs. Listening Time')
scatter_ax.set_xlabel('Episode Length (minutes)')
scatter_ax.set_ylabel('Listening Time (minutes)')
plt.show()

In [None]:
# Pairwise correlations between the numerical features and the target
plt.figure(figsize=(8, 6))
numeric_features = [
    'Episode_Length_minutes',
    'Host_Popularity_percentage',
    'Guest_Popularity_percentage',
    'Number_of_Ads',
    'Listening_Time_minutes',
]
corr_matrix = train[numeric_features].corr()
# Fix the colour scale to [-1, 1] so the diverging palette is centred on zero
sns.heatmap(corr_matrix, annot=True, cmap='RdBu', vmin=-1, vmax=1)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Box plot of the target for every Episode_Sentiment category
plt.figure(figsize=(12, 6))
sentiment_ax = sns.boxplot(x='Episode_Sentiment', y='Listening_Time_minutes', data=train)
sentiment_ax.set_title('Listening Time by Episode Sentiment')
plt.show()

In [None]:
# Mean listening time per publication day, with bars in calendar order
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
plt.figure(figsize=(12, 6))
day_ax = sns.barplot(
    data=train,
    x='Publication_Day',
    y='Listening_Time_minutes',
    estimator='mean',
    order=weekday_order,
)
day_ax.set_title('Average Listening Time by Publication Day')
plt.show()

# Preprocess

In [None]:
from sklearn.impute import SimpleImputer

# Numerical columns that get median imputation
num_cols = ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads']

# Learn the medians from the training data only ...
imputer = SimpleImputer(strategy='median')
imputer.fit(train[num_cols])

# ... then fill both datasets with those training medians
train[num_cols] = imputer.transform(train[num_cols])
test[num_cols] = imputer.transform(test[num_cols])

In [None]:
# Map each Publication_Time label to a numeric hour-like bin
time_mapping = {'Night': 0, 'Morning': 6, 'Afternoon': 12, 'Evening': 18}

# Add the numeric column to both datasets, then discard the original text column
for frame in (train, test):
    frame['Publication_Hour'] = frame['Publication_Time'].map(time_mapping)
    frame.drop(columns='Publication_Time', inplace=True)

## Interaction Features

In [None]:
# Build the same interaction features on both datasets
for frame in (train, test):
    episode_len = frame['Episode_Length_minutes']

    # Episode length multiplied by host / guest popularity
    frame['Length_Host_Popularity'] = episode_len * frame['Host_Popularity_percentage']
    frame['Length_Guest_Popularity'] = episode_len * frame['Guest_Popularity_percentage']

    # Ads per minute of episode; the small constant avoids division by zero
    frame['Ads_Length_Ratio'] = frame['Number_of_Ads'] / (episode_len + 1e-6)


In [None]:
# Categorical encoding

# High-cardinality columns: replace each category with the mean target observed
# for it in the training data (one-hot encoding would create too many columns)
for col in ['Podcast_Name', 'Episode_Title']:
    encoded_name = col + '_encoded'
    mean_target = train.groupby(col)['Listening_Time_minutes'].mean()
    unseen_fill = mean_target.mean()  # value used for categories absent from train
    train[encoded_name] = train[col].map(mean_target)
    test[encoded_name] = test[col].map(mean_target).fillna(unseen_fill)

# Low-cardinality columns: one-hot encode, dropping the first level of each
cat_cols = ['Genre', 'Publication_Day', 'Episode_Sentiment']
dummy_kwargs = dict(columns=cat_cols, drop_first=True)
train = pd.get_dummies(train, **dummy_kwargs)
test = pd.get_dummies(test, **dummy_kwargs)

# Give test exactly train's columns; columns missing from test are filled with 0
train, test = train.align(test, join='left', axis=1, fill_value=0)

# Keep the test ids for the submission file before the id column is dropped
test_id = test['id'].copy()

# Remove the raw text columns and the identifier from both datasets
columns_to_drop = ['Podcast_Name','Episode_Title', 'id']
train = train.drop(columns=columns_to_drop)
test = test.drop(columns=columns_to_drop)

In [None]:
# Host popularity multiplied by the positive-sentiment dummy column
# (falls back to 0 when that dummy column does not exist)
for frame in (train, test):
    positive_flag = frame.get('Episode_Sentiment_Positive', 0)
    frame['Host_Popularity_Sentiment_Positive'] = frame['Host_Popularity_percentage'] * positive_flag

# Modeling

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the feature matrix
y = train['Listening_Time_minutes']
X = train.drop(columns=['Listening_Time_minutes'])

# 80/20 hold-out split with a fixed seed.
# Note: the K-fold loop in the modeling cell reassigns all four of these names.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# LightGBM hyper-parameters. They are the same for every fold, so they are
# defined once before the cross-validation loop.
params = {
    'objective': 'regression',          # For predicting target var
    'metric': 'rmse',                   # Root Mean Squared Error
    'boosting_type': 'gbdt',            # Gradient Boosted Decision Trees
    'num_leaves': 74,                   # Max leaves per tree
    'learning_rate': 0.09,              # Shrinkage applied to each tree
    'feature_fraction': 0.9,            # Feature subsampling
    'bagging_freq': 7,                  # NOTE: per the LightGBM docs this has no effect
                                        # unless bagging_fraction < 1.0 is also set
    'min_child_samples': 93,
    'max_depth': 8,
    'verbose': -1                       # Suppress warnings
}

# Initialize KFold for cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []

# Train and score one model per fold. Everything below must stay inside the
# loop so that every fold contributes an RMSE to `rmse_scores`.
for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
    # Split data for this fold
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Create LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    # Train with early stopping using callbacks
    model = lgb.train(
        params,
        train_data,
        num_boost_round=2000,               # Max iterations
        valid_sets=[val_data],              # Validation set for monitoring
        valid_names=['validation'],         # Name for validation set in output
        callbacks=[
            lgb.early_stopping(stopping_rounds=100), # Stop if no improvement for 100 rounds
            lgb.log_evaluation(period=10)            # Print progress every 10 rounds
        ]
    )

    # Predict on this fold's validation rows and calculate RMSE
    y_val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    rmse_scores.append(rmse)

    print(f"Fold {fold} RMSE: {rmse:.4f}")
    print(f"Fold {fold} Best Iteration: {model.best_iteration}\n")

# Cross-validated performance over all folds.
# `model`, `X_train` and `X_val` still refer to the last fold after the loop.
print(f"Average Validation RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")


In [None]:
# Bar chart of the ten most important features of the trained model
lgb.plot_importance(booster=model, max_num_features=10)

## Final Prediction

In [None]:
# Put the validation and test features in the column order used for training
feature_order = X_train.columns
X_val = X_val[feature_order]
test = test[feature_order]

# Score the test set with the model truncated at its best iteration
test_pred = model.predict(test, num_iteration=model.best_iteration)

# Submission

In [None]:
# Pair each test id with its prediction, write the CSV, and preview the result
submission_columns = {'id': test_id, 'Listening_Time_minutes': test_pred}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)
submission.head()