# 🎬 OTT Viewer Drop-Off & Retention Dataset - Comprehensive Data Science Analysis

This notebook demonstrates comprehensive capabilities in:
- **Exploratory Data Analysis (EDA)**
- **Machine Learning** (Classification & Regression)
- **Feature Engineering**
- **Advanced Analytics** (Clustering, Time Series)
- **Business Intelligence**

---

## 📚 Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix, roc_auc_score,
                             mean_squared_error, r2_score, accuracy_score, roc_curve)
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

print("✓ Libraries imported successfully!")

## 📊 Section 1: Data Loading and Overview

In [None]:
# Read the episode-level CSV into the notebook-wide frame `df`
df = pd.read_csv('ott_viewer_dropoff_retention_us_v1.0.csv')

# Headline numbers: frame shape, row count, then distinct values per key column
print(f"Dataset Shape: {df.shape}")
episode_total = len(df)
print(f"\nTotal Episodes: {episode_total:,}")
show_total = df['show_id'].nunique()
print(f"Unique Shows: {show_total}")
platform_total = df['platform'].nunique()
print(f"Platforms: {platform_total}")
genre_total = df['genre'].nunique()
print(f"Genres: {genre_total}")

# Preview the first ten rows (shown as the cell's output)
df.head(10)

In [None]:
# Data types and info
# Prints each column's name, non-null count and dtype, plus the frame's memory usage.
df.info()

In [None]:
# Per-column null counts; report only the columns that actually contain nulls
missing = df.isnull().sum()
has_missing_values = missing.sum() != 0
if has_missing_values:
    print("Missing values:\n", missing[missing > 0])
else:
    print("✓ No missing values found!")

In [None]:
# Statistical summary
# With default arguments describe() summarizes the numeric columns
# (count, mean, std, min, quartiles, max); the table is the cell's output.
df.describe()

## 📈 Section 2: Exploratory Data Analysis (EDA)

In [None]:
# Target variable distributions: drop-off flag, retention-risk label and
# drop-off probability, one panel each.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Drop-off distribution
# value_counts() orders rows by frequency, not by class value. Sorting by the
# class value pins the first bar to class 0 and the second to class 1, so the
# fixed colors and tick labels below cannot end up swapped when drop-offs are
# the majority class.
drop_off_counts = df['drop_off'].value_counts().sort_index()
drop_off_counts.plot(kind='bar', ax=axes[0], color=['green', 'red'])
axes[0].set_title('Drop-off Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Drop-off (0=No, 1=Yes)')
axes[0].set_ylabel('Count')
axes[0].set_xticklabels(['No Drop-off', 'Drop-off'], rotation=0)

# Retention risk distribution
risk_counts = df['retention_risk'].value_counts()
# Color each bar by its risk label rather than by its frequency rank, so that
# e.g. a dominant 'high' class is not painted green.
# NOTE(review): only the label 'high' is used elsewhere in this notebook;
# 'low' and 'medium' are assumed. If any label is not in the palette, the
# original positional colors are used unchanged.
risk_palette = {'low': 'green', 'medium': 'orange', 'high': 'red'}
risk_keys = [str(level).lower() for level in risk_counts.index]
if all(key in risk_palette for key in risk_keys):
    risk_colors = [risk_palette[key] for key in risk_keys]
else:
    risk_colors = ['green', 'orange', 'red']
risk_counts.plot(kind='bar', ax=axes[1], color=risk_colors)
axes[1].set_title('Retention Risk Distribution', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Retention Risk')
axes[1].set_ylabel('Count')
axes[1].tick_params(axis='x', rotation=45)

# Drop-off probability distribution, with the mean marked by a dashed line
axes[2].hist(df['drop_off_probability'], bins=50, color='steelblue', edgecolor='black')
axes[2].set_title('Drop-off Probability Distribution', fontsize=14, fontweight='bold')
axes[2].set_xlabel('Drop-off Probability')
axes[2].set_ylabel('Frequency')
axes[2].axvline(df['drop_off_probability'].mean(), color='red', linestyle='--', label='Mean')
axes[2].legend()

plt.tight_layout()
plt.show()

# Mean of the 0/1 flag is the share of rows flagged as drop-off
print(f"Drop-off Rate: {df['drop_off'].mean()*100:.2f}%")

In [None]:
# Episode counts for the ten most frequent platforms and genres, side by side
fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# One (column, panel title, bar color) spec per panel, left to right
panel_specs = [
    ('platform', 'Top 10 Platforms', 'steelblue'),
    ('genre', 'Top 10 Genres', 'coral'),
]

for panel_ax, (column, panel_title, bar_color) in zip(axes, panel_specs):
    top_counts = df[column].value_counts().head(10)
    top_counts.plot(kind='barh', ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Number of Episodes')
    # barh draws the first row at the bottom; flip so the largest count is on top
    panel_ax.invert_yaxis()

plt.tight_layout()
plt.show()

In [None]:
# Histograms of six per-episode metrics on a 2x3 grid, each with its mean marked
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# (column name, panel title) pairs in row-major panel order
metrics = [
    ('avg_watch_percentage', 'Average Watch Percentage'),
    ('hook_strength', 'Hook Strength'),
    ('pacing_score', 'Pacing Score'),
    ('cognitive_load', 'Cognitive Load'),
    ('visual_intensity', 'Visual Intensity'),
    ('episode_duration_min', 'Episode Duration (min)'),
]

# axes.flat walks the grid row by row, which matches the order of `metrics`
for ax, (col, title) in zip(axes.flat, metrics):
    values = df[col]
    ax.hist(values, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.axvline(values.mean(), color='red', linestyle='--', linewidth=2, label='Mean')
    ax.legend()

plt.tight_layout()
plt.show()

## 🔗 Section 3: Correlation Analysis

In [None]:
# Columns included in the correlation matrix (the two drop-off targets are last)
numerical_features = [
    'pacing_score', 'hook_strength', 'visual_intensity',
    'avg_watch_percentage', 'pause_count', 'rewind_count',
    'cognitive_load', 'episode_duration_min',
    'drop_off', 'drop_off_probability',
]

# Pairwise correlations between the selected columns
correlation_matrix = df[numerical_features].corr()

# Annotated heatmap, color scale centered on zero
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    ax=ax,
)
ax.set_title('Feature Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

In [None]:
# Rank every column by its correlation with the drop-off flag
drop_off_corr = correlation_matrix['drop_off'].sort_values(ascending=False)

# Leave the target's own entry out of the chart
feature_corr = drop_off_corr.drop('drop_off')

fig, ax = plt.subplots(figsize=(10, 6))
feature_corr.plot(kind='barh', color='steelblue', ax=ax)
ax.set_title('Feature Correlations with Drop-off', fontsize=14, fontweight='bold')
ax.set_xlabel('Correlation Coefficient')
# Thin zero line separates positive from negative correlations
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

## 🔧 Section 4: Feature Engineering

In [None]:
# Create a copy for ML so the raw frame `df` stays untouched
df_ml = df.copy()

# Episode position features
df_ml['is_premiere'] = (df_ml['episode_number'] == 1).astype(int)
# Highest episode number within each (show, season), aligned to every row;
# computed once and reused for both the finale flag and the relative position.
season_last_episode = df_ml.groupby(['show_id', 'season_number'])['episode_number'].transform('max')
df_ml['is_finale'] = (df_ml['episode_number'] == season_last_episode).astype(int)
df_ml['episode_position'] = df_ml['episode_number'] / season_last_episode

# Engagement metrics
df_ml['completion_rate'] = df_ml['avg_watch_percentage'] / 100
df_ml['interaction_intensity'] = df_ml['pause_count'] + df_ml['rewind_count']
df_ml['engagement_score'] = (df_ml['avg_watch_percentage'] * df_ml['hook_strength']) / 100

# Encode categorical variables (one encoder per column so each mapping can be
# inverted later, e.g. le_retention.classes_ for report labels)
le_platform = LabelEncoder()
le_genre = LabelEncoder()
le_attention = LabelEncoder()
le_retention = LabelEncoder()

df_ml['platform_encoded'] = le_platform.fit_transform(df_ml['platform'])
df_ml['genre_encoded'] = le_genre.fit_transform(df_ml['genre'])
df_ml['attention_encoded'] = le_attention.fit_transform(df_ml['attention_required'])
df_ml['retention_risk_encoded'] = le_retention.fit_transform(df_ml['retention_risk'])

le_dialogue = LabelEncoder()
df_ml['dialogue_density_encoded'] = le_dialogue.fit_transform(df_ml['dialogue_density'])

# Content complexity (now using encoded dialogue_density)
# NOTE(review): LabelEncoder numbers classes in sorted label order, which is
# not necessarily the low-to-high order of dialogue density — confirm the
# encoded value is meaningful inside this average.
df_ml['content_complexity'] = (
    df_ml['cognitive_load'] + df_ml['dialogue_density_encoded'] + df_ml['visual_intensity']
) / 3

print(f"✓ Feature engineering completed!")
print(f"Total features: {len(df_ml.columns)}")
print(f"\nNew features created:")
# Everything present in df_ml but not in the raw frame was created in this cell
new_features = [c for c in df_ml.columns if c not in df.columns]
for feat in new_features:
    print(f"  - {feat}")

## 🤖 Section 5: Machine Learning - Binary Classification (Drop-off Prediction)

In [None]:
# Select features for modeling
# NOTE(review): 'avg_watch_percentage' and 'engagement_score' (derived from it)
# may be near-proxies for the drop-off target — confirm they are available at
# prediction time before relying on the reported scores.
feature_cols = ['pacing_score', 'hook_strength', 'visual_intensity',
                'avg_watch_percentage', 'pause_count', 'rewind_count',
                'cognitive_load', 'platform_encoded', 'genre_encoded',
                'attention_encoded', 'dialogue_density_encoded', 'season_number', 'episode_number',
                'is_premiere', 'is_finale', 'episode_position',
                'engagement_score', 'content_complexity', 'skip_intro',
                'night_watch_safe', 'episode_duration_min']

X = df_ml[feature_cols]
y = df_ml['drop_off']

# Train-test split (80/20); stratify=y keeps the drop-off rate the same in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training samples: {len(X_train):,}")
print(f"Test samples: {len(X_test):,}")
print(f"Drop-off rate in training: {y_train.mean()*100:.2f}%")
print(f"Drop-off rate in test: {y_test.mean()*100:.2f}%")

In [None]:
# Fit the drop-off classifier (fit() returns the estimator itself)
print("Training Random Forest Classifier...")
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Hard labels, plus the predicted probability taken from column 1 of predict_proba
y_pred_rf = rf_model.predict(X_test)
y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# Hold-out metrics
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_auc = roc_auc_score(y_test, y_pred_proba_rf)
rf_report = classification_report(y_test, y_pred_rf, target_names=['No Drop-off', 'Drop-off'])

print("\n" + "="*60)
print("RANDOM FOREST RESULTS")
print("="*60)
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"ROC-AUC: {rf_auc:.4f}")
print("\nClassification Report:")
print(rf_report)

In [None]:
# Diagnostic plots for the drop-off classifier: confusion matrix (left), ROC curve (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
cm_ax, roc_ax = axes
class_names = ['No Drop-off', 'Drop-off']

# Confusion matrix: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=cm_ax,
    xticklabels=class_names,
    yticklabels=class_names,
)
cm_ax.set_title('Confusion Matrix', fontsize=14, fontweight='bold')
cm_ax.set_ylabel('True Label')
cm_ax.set_xlabel('Predicted Label')

# ROC curve with the diagonal as the random-classifier reference
fpr, tpr, _ = roc_curve(y_test, y_pred_proba_rf)
auc_value = roc_auc_score(y_test, y_pred_proba_rf)
roc_ax.plot(fpr, tpr, linewidth=2, label=f'ROC curve (AUC = {auc_value:.3f})')
roc_ax.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Random Classifier')
roc_ax.set_xlabel('False Positive Rate', fontsize=12)
roc_ax.set_ylabel('True Positive Rate', fontsize=12)
roc_ax.set_title('ROC Curve', fontsize=14, fontweight='bold')
roc_ax.legend()
roc_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Pair every model input with its random-forest importance, most important first
feature_importance = (
    pd.DataFrame({
        'feature': feature_cols,
        'importance': rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

# One horizontal bar per feature; the y axis is flipped so the top row is the most important
bar_positions = list(range(len(feature_importance)))
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, feature_importance['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(feature_importance['feature'])
ax.set_xlabel('Importance', fontsize=12)
ax.set_title('Feature Importance - Random Forest', fontsize=14, fontweight='bold')
ax.invert_yaxis()
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

print("\nTop 10 Important Features:")
print(feature_importance.head(10))

## 📊 Section 6: Regression - Drop-off Probability Prediction

In [None]:
# Regression target: the continuous drop-off probability, same feature matrix X
y_reg = df_ml['drop_off_probability']

# 80/20 split (no stratification for a continuous target)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Fit the regressor (fit() returns the estimator itself)
print("Training Gradient Boosting Regressor...")
gb_model = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train_reg, y_train_reg)

# Hold-out predictions
y_pred_gb = gb_model.predict(X_test_reg)

# Hold-out metrics; MAE is the mean of the absolute errors
gb_r2 = r2_score(y_test_reg, y_pred_gb)
gb_rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_gb))
gb_mae = np.mean(np.abs(y_test_reg - y_pred_gb))

print("\n" + "="*60)
print("GRADIENT BOOSTING REGRESSION RESULTS")
print("="*60)
print(f"R² Score: {gb_r2:.4f}")
print(f"RMSE: {gb_rmse:.4f}")
print(f"MAE: {gb_mae:.4f}")

In [None]:
# Figure 1: predicted vs actual, with the identity line as the perfect-prediction reference
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test_reg, y_pred_gb, alpha=0.3, s=10)
ax.plot([0, 1], [0, 1], 'r--', linewidth=2, label='Perfect Prediction')
ax.set_xlabel('Actual Drop-off Probability', fontsize=12)
ax.set_ylabel('Predicted Drop-off Probability', fontsize=12)
ax.set_title('Predicted vs Actual Drop-off Probability', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Figure 2: residuals (actual minus predicted) against the prediction
residuals = y_test_reg - y_pred_gb
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred_gb, residuals, alpha=0.3, s=10)
ax.axhline(y=0, color='r', linestyle='--', linewidth=2)
ax.set_xlabel('Predicted Drop-off Probability', fontsize=12)
ax.set_ylabel('Residuals', fontsize=12)
ax.set_title('Residual Plot', fontsize=14, fontweight='bold')
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## 🎯 Section 7: Multi-Class Classification (Retention Risk)

In [None]:
# Multi-class target: the label-encoded retention risk, same feature matrix X
y_risk = df_ml['retention_risk_encoded']

# Stratified 80/20 split so each risk class keeps its share in both parts
X_train_risk, X_test_risk, y_train_risk, y_test_risk = train_test_split(
    X,
    y_risk,
    test_size=0.2,
    random_state=42,
    stratify=y_risk,
)

# Fit the classifier (fit() returns the estimator itself)
print("Training Random Forest for Retention Risk...")
rf_risk_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
).fit(X_train_risk, y_train_risk)

# Hold-out predictions
y_pred_risk = rf_risk_model.predict(X_test_risk)

# The encoder's classes_ give the original label for each encoded value
risk_accuracy = accuracy_score(y_test_risk, y_pred_risk)
risk_labels = le_retention.classes_
risk_report = classification_report(y_test_risk, y_pred_risk, target_names=risk_labels)

print("\n" + "="*60)
print("RETENTION RISK CLASSIFICATION RESULTS")
print("="*60)
print(f"Accuracy: {risk_accuracy:.4f}")
print("\nClassification Report:")
print(risk_report)

In [None]:
# Confusion matrix for the retention-risk model: rows are true labels, columns predicted
cm_risk = confusion_matrix(y_test_risk, y_pred_risk)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_risk,
    annot=True,
    fmt='d',
    cmap='YlOrRd',
    xticklabels=risk_labels,
    yticklabels=risk_labels,
    ax=ax,
)
ax.set_title('Retention Risk - Confusion Matrix', fontsize=14, fontweight='bold')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.tight_layout()
plt.show()

## 🔍 Section 8: Clustering Analysis

In [None]:
# Columns that define the clustering space
cluster_features = [
    'avg_watch_percentage', 'hook_strength', 'pacing_score',
    'cognitive_load', 'visual_intensity', 'drop_off_probability',
]

X_cluster = df_ml[cluster_features].copy()

# Standardize so that no column dominates the distance computation by scale alone
scaler = StandardScaler()
X_cluster_scaled = scaler.fit_transform(X_cluster)

# K-Means with a fixed seed; the label of each row is stored on df_ml
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_cluster_scaled)
df_ml['cluster'] = cluster_labels

# Rows per cluster, ordered by cluster id
cluster_sizes = df_ml['cluster'].value_counts().sort_index()
print(f"Clustering completed with {n_clusters} clusters")
print("\nCluster Distribution:")
print(cluster_sizes)

In [None]:
# Text profile of every cluster: size, mean metrics and most frequent risk label
print("\nCluster Characteristics:\n")
for i in range(n_clusters):
    cluster_data = df_ml.loc[df_ml['cluster'] == i]
    watch_mean = cluster_data['avg_watch_percentage'].mean()
    drop_off_share = cluster_data['drop_off'].mean()
    hook_mean = cluster_data['hook_strength'].mean()
    load_mean = cluster_data['cognitive_load'].mean()
    # mode() returns all most-frequent labels; take the first one
    dominant_risk = cluster_data['retention_risk'].mode()[0]

    print(f"Cluster {i} ({len(cluster_data)} episodes):")
    print(f"  Avg Watch %: {watch_mean:.1f}%")
    print(f"  Drop-off Rate: {drop_off_share*100:.1f}%")
    print(f"  Hook Strength: {hook_mean:.2f}")
    print(f"  Cognitive Load: {load_mean:.2f}")
    print(f"  Dominant Risk: {dominant_risk}")
    print()

In [None]:
# Two views of the clusters: a scatter in (watch %, drop-off probability) space
# and a bar chart of the min-max normalized mean of each clustering feature.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
scatter_ax, profile_ax = axes

# Scatter: one series per cluster so each gets its own color and legend entry
for cluster_id in range(n_clusters):
    members = df_ml[df_ml['cluster'] == cluster_id]
    scatter_ax.scatter(
        members['avg_watch_percentage'],
        members['drop_off_probability'],
        label=f'Cluster {cluster_id}',
        alpha=0.5,
        s=10,
    )

scatter_ax.set_xlabel('Average Watch Percentage', fontsize=12)
scatter_ax.set_ylabel('Drop-off Probability', fontsize=12)
scatter_ax.set_title('Clusters: Watch % vs Drop-off Probability', fontsize=14, fontweight='bold')
scatter_ax.legend()
scatter_ax.grid(alpha=0.3)

# Profiles: per-cluster feature means, rescaled per feature to [0, 1] across clusters
cluster_profiles = df_ml.groupby('cluster')[cluster_features].mean()
profile_min = cluster_profiles.min()
profile_span = cluster_profiles.max() - profile_min
cluster_profiles_normalized = (cluster_profiles - profile_min) / profile_span
# Transpose so features are on the x axis and clusters are the bar groups
cluster_profiles_normalized.T.plot(kind='bar', ax=profile_ax, width=0.8)
profile_ax.set_title('Normalized Cluster Profiles', fontsize=14, fontweight='bold')
profile_ax.set_ylabel('Normalized Value')
profile_ax.set_xlabel('Features')
profile_ax.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
profile_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 📉 Section 9: Time Series Analysis - Episode Progression

In [None]:
# Mean outcome metrics and row count for every episode number
episode_analysis = df_ml.groupby('episode_number').agg(
    drop_off=('drop_off', 'mean'),
    avg_watch_percentage=('avg_watch_percentage', 'mean'),
    drop_off_probability=('drop_off_probability', 'mean'),
    episode_count=('show_id', 'count'),
)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
rate_ax, watch_ax = axes
episode_numbers = episode_analysis.index

# Left: drop-off rate (as a percentage) by episode number
rate_ax.plot(episode_numbers, episode_analysis['drop_off']*100, marker='o', linewidth=2)
rate_ax.set_xlabel('Episode Number', fontsize=12)
rate_ax.set_ylabel('Drop-off Rate (%)', fontsize=12)
rate_ax.set_title('Drop-off Rate by Episode Number', fontsize=14, fontweight='bold')
rate_ax.grid(alpha=0.3)

# Right: mean watch percentage by episode number
watch_ax.plot(episode_numbers, episode_analysis['avg_watch_percentage'], marker='o', color='green', linewidth=2)
watch_ax.set_xlabel('Episode Number', fontsize=12)
watch_ax.set_ylabel('Average Watch Percentage', fontsize=12)
watch_ax.set_title('Watch Completion by Episode Number', fontsize=14, fontweight='bold')
watch_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("Episode-level trends:")
print(episode_analysis.head(10))

In [None]:
# Mean outcome metrics and row count for every season number
season_analysis = df_ml.groupby('season_number').agg(
    drop_off=('drop_off', 'mean'),
    avg_watch_percentage=('avg_watch_percentage', 'mean'),
    drop_off_probability=('drop_off_probability', 'mean'),
    episode_count=('show_id', 'count'),
)

print("\nSeason-level trends:")
print(season_analysis)

# Paired bars per season: drop-off rate on the left axis, watch % on a twin right axis
fig, ax = plt.subplots(figsize=(10, 6))
ax2 = ax.twinx()
x = season_analysis.index
width = 0.35
half_width = width/2

# Shift each series half a bar width off the season tick so the bars sit side by side
ax.bar(x - half_width, season_analysis['drop_off']*100, width, label='Drop-off Rate (%)', color='red', alpha=0.7)
ax2.bar(x + half_width, season_analysis['avg_watch_percentage'], width, label='Avg Watch %', color='green', alpha=0.7)

ax.set_xlabel('Season Number', fontsize=12)
ax.set_ylabel('Drop-off Rate (%)', fontsize=12)
ax2.set_ylabel('Average Watch Percentage', fontsize=12)
ax.set_title('Season-level Performance Metrics', fontsize=14, fontweight='bold')
# Each axis keeps its own legend, placed in opposite corners
ax.legend(loc='upper left')
ax2.legend(loc='upper right')
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Compare mean outcome metrics for premieres, finales and all remaining episodes
outcome_cols = ['drop_off', 'avg_watch_percentage', 'drop_off_probability']

premiere_mask = df_ml['is_premiere'] == 1
finale_mask = df_ml['is_finale'] == 1
# "Regular" means neither flag is set
regular_mask = (df_ml['is_premiere'] == 0) & (df_ml['is_finale'] == 0)

premiere_stats = df_ml.loc[premiere_mask, outcome_cols].mean()
finale_stats = df_ml.loc[finale_mask, outcome_cols].mean()
regular_stats = df_ml.loc[regular_mask, outcome_cols].mean()

# One column per episode type, one row per metric
episode_type_comparison = pd.DataFrame({
    'Premiere': premiere_stats,
    'Regular': regular_stats,
    'Finale': finale_stats,
})

print("\nPremiere vs Regular vs Finale Episodes:")
print(episode_type_comparison)

# Transpose so episode types are on the x axis and metrics are the bar groups
episode_type_comparison.T.plot(kind='bar', figsize=(12, 6), rot=0)
plt.title('Performance Metrics by Episode Type', fontsize=14, fontweight='bold')
plt.ylabel('Value')
plt.xlabel('Episode Type')
plt.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

## 💡 Section 10: Business Insights and Recommendations

In [None]:
# Rows whose retention-risk label is exactly 'high'
high_risk_episodes = df_ml[df_ml['retention_risk'] == 'high']

high_risk_total = len(high_risk_episodes)
high_risk_share = high_risk_total/len(df_ml)*100
watch_mean = high_risk_episodes['avg_watch_percentage'].mean()
load_mean = high_risk_episodes['cognitive_load'].mean()
hook_mean = high_risk_episodes['hook_strength'].mean()
pacing_mean = high_risk_episodes['pacing_score'].mean()
# Five most frequent genres among the high-risk rows
top_risk_genres = high_risk_episodes['genre'].value_counts().head(5)

print("="*60)
print("HIGH-RISK EPISODES ANALYSIS")
print("="*60)
print(f"\nTotal high-risk episodes: {high_risk_total:,}")
print(f"Percentage of dataset: {high_risk_share:.1f}%")
print(f"\nCharacteristics:")
print(f"  Average watch completion: {watch_mean:.1f}%")
print(f"  Average cognitive load: {load_mean:.2f}")
print(f"  Average hook strength: {hook_mean:.2f}")
print(f"  Average pacing score: {pacing_mean:.2f}")
print(f"\nTop genres at risk:")
print(top_risk_genres)

In [None]:
# Split the episodes on the night_watch_safe flag (1 = safe, 0 = not safe)
night_safe = df_ml[df_ml['night_watch_safe'] == 1]
not_night_safe = df_ml[df_ml['night_watch_safe'] == 0]

total_rows = len(df_ml)
safe_rate = night_safe['drop_off'].mean()
unsafe_rate = not_night_safe['drop_off'].mean()

print("="*60)
print("NIGHT-WATCH SAFETY IMPACT")
print("="*60)
print(f"\nNight-safe episodes: {len(night_safe):,} ({len(night_safe)/total_rows*100:.1f}%)")
print(f"Not night-safe episodes: {len(not_night_safe):,} ({len(not_night_safe)/total_rows*100:.1f}%)")
print(f"\nDrop-off rates:")
print(f"  Night-safe: {safe_rate*100:.2f}%")
print(f"  Not night-safe: {unsafe_rate*100:.2f}%")
# Positive difference means the not-safe group drops off more often
print(f"  Difference: {(unsafe_rate - safe_rate)*100:.2f} percentage points")

# Both metrics for both groups in one small table: columns are groups, rows are metrics
night_comparison = pd.DataFrame(
    {
        'Night-safe': [safe_rate*100, night_safe['avg_watch_percentage'].mean()],
        'Not Night-safe': [unsafe_rate*100, not_night_safe['avg_watch_percentage'].mean()],
    },
    index=['Drop-off Rate (%)', 'Avg Watch (%)'],
)

# Transpose so the groups are on the x axis and the metrics are the bar groups
night_comparison.T.plot(kind='bar', figsize=(10, 6), rot=0)
plt.title('Night-Watch Safety Impact on Viewer Behavior', fontsize=14, fontweight='bold')
plt.ylabel('Percentage')
plt.grid(axis='y', alpha=0.3)
plt.legend(title='Metrics')
plt.tight_layout()
plt.show()

In [None]:
# Per-platform outcome table, limited to the ten platforms with the highest mean watch %
print("="*60)
print("PLATFORM PERFORMANCE ANALYSIS")
print("="*60)

# Named aggregation produces the display column names directly
platform_perf = (
    df_ml.groupby('platform')
    .agg(**{
        'Drop-off Rate': ('drop_off', 'mean'),
        'Avg Watch %': ('avg_watch_percentage', 'mean'),
        'Unique Shows': ('show_id', 'nunique'),
        'Total Episodes': ('title', 'count'),
    })
    .sort_values('Avg Watch %', ascending=False)
    .head(10)
)

print("\nTop 10 Platforms by Average Watch Percentage:")
print(platform_perf)

# Two horizontal bar charts sharing the same platform order (best watch % on top)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
watch_ax, rate_ax = axes

platform_perf['Avg Watch %'].plot(kind='barh', ax=watch_ax, color='steelblue')
watch_ax.set_title('Average Watch % by Platform (Top 10)', fontsize=12, fontweight='bold')
watch_ax.set_xlabel('Average Watch Percentage')
watch_ax.invert_yaxis()

platform_perf['Drop-off Rate'].plot(kind='barh', ax=rate_ax, color='coral')
rate_ax.set_title('Drop-off Rate by Platform (Top 10)', fontsize=12, fontweight='bold')
rate_ax.set_xlabel('Drop-off Rate')
rate_ax.invert_yaxis()

plt.tight_layout()
plt.show()

In [None]:
# Per-genre outcome table, limited to the ten genres with the highest mean watch %
print("="*60)
print("GENRE PERFORMANCE ANALYSIS")
print("="*60)

# Named aggregation produces the display column names directly
genre_perf = (
    df_ml.groupby('genre')
    .agg(**{
        'Drop-off Rate': ('drop_off', 'mean'),
        'Avg Watch %': ('avg_watch_percentage', 'mean'),
        'Unique Shows': ('show_id', 'nunique'),
        'Total Episodes': ('title', 'count'),
    })
    .sort_values('Avg Watch %', ascending=False)
    .head(10)
)

print("\nTop 10 Genres by Average Watch Percentage:")
print(genre_perf)

# Two horizontal bar charts sharing the same genre order (best watch % on top)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
watch_ax, rate_ax = axes

genre_perf['Avg Watch %'].plot(kind='barh', ax=watch_ax, color='green')
watch_ax.set_title('Average Watch % by Genre (Top 10)', fontsize=12, fontweight='bold')
watch_ax.set_xlabel('Average Watch Percentage')
watch_ax.invert_yaxis()

genre_perf['Drop-off Rate'].plot(kind='barh', ax=rate_ax, color='red')
rate_ax.set_title('Drop-off Rate by Genre (Top 10)', fontsize=12, fontweight='bold')
rate_ax.set_xlabel('Drop-off Rate')
rate_ax.invert_yaxis()

plt.tight_layout()
plt.show()

## 🎯 Section 11: Final Summary and Recommendations

In [None]:
# Final summary: which analyses ran, hold-out model scores and headline findings.
# Relies on variables produced by the earlier sections of the notebook.
print("="*80)
print("COMPREHENSIVE ANALYSIS SUMMARY")
print("="*80)

# Derive the counts from notebook state instead of hard-coding them. The
# 'cluster' column is a segment label added by the clustering section, not an
# engineered feature, so it is excluded from the feature count.
engineered_feature_count = len(
    [c for c in df_ml.columns if c not in df.columns and c != 'cluster']
)

print("\n✅ ANALYSES COMPLETED:")
print("   ✓ Exploratory Data Analysis (EDA)")
print(f"   ✓ Feature Engineering ({engineered_feature_count} new features)")
print("   ✓ Binary Classification (Drop-off Prediction)")
print("   ✓ Multi-class Classification (Retention Risk)")
print("   ✓ Regression (Drop-off Probability)")
print(f"   ✓ Clustering Analysis ({n_clusters} viewer segments)")
print("   ✓ Time Series Analysis (Episode/Season trends)")
print("   ✓ Business Intelligence Insights")

print("\n📊 MODEL PERFORMANCE:")
print(f"   • Drop-off Prediction (Random Forest):")
print(f"     - Accuracy: {accuracy_score(y_test, y_pred_rf):.1%}")
print(f"     - ROC-AUC: {roc_auc_score(y_test, y_pred_proba_rf):.3f}")
print(f"   • Probability Prediction (Gradient Boosting):")
print(f"     - R² Score: {r2_score(y_test_reg, y_pred_gb):.3f}")
print(f"     - RMSE: {np.sqrt(mean_squared_error(y_test_reg, y_pred_gb)):.4f}")
print(f"   • Retention Risk (Random Forest):")
print(f"     - Accuracy: {accuracy_score(y_test_risk, y_pred_risk):.1%}")

print("\n🎯 KEY INSIGHTS:")
print(f"   • Overall drop-off rate: {df['drop_off'].mean()*100:.2f}%")
print(f"   • High-risk episodes: {len(high_risk_episodes):,} ({len(high_risk_episodes)/len(df)*100:.1f}%)")
print(f"   • Top predictor of drop-off: {feature_importance.iloc[0]['feature']}")
# Drop-off gap in percentage points (not-safe minus safe). Only claim a
# reduction when the gap is actually non-negative; otherwise report the
# direction the data shows.
night_gap_pp = (not_night_safe['drop_off'].mean() - night_safe['drop_off'].mean())*100
if night_gap_pp >= 0:
    print(f"   • Night-watch safety reduces drop-off by {night_gap_pp:.2f}pp")
else:
    print(f"   • Night-watch-safe episodes show {abs(night_gap_pp):.2f}pp higher drop-off")
print(f"   • Episode 1 (premieres) have {df_ml[df_ml['is_premiere']==1]['drop_off'].mean()*100:.1f}% drop-off rate")

# Static closing text: recommendations, next steps and the end-of-report banner.
recommendations_text = """   
   1. CONTENT OPTIMIZATION:
      • Improve hook strength in high-risk episodes
      • Balance cognitive load to avoid viewer fatigue
      • Optimize pacing for better engagement
   
   2. STRATEGIC POSITIONING:
      • Create night-watch-safe content playlists
      • Pay attention to mid-season drop-off patterns
      • Strengthen premieres and finales
   
   3. PREDICTIVE ANALYTICS:
      • Deploy ML models for real-time churn prediction
      • Implement early warning systems for at-risk viewers
      • Use clustering for personalized recommendations
   
   4. PLATFORM-SPECIFIC:
      • Benchmark against top-performing platforms
      • Genre-specific retention strategies
      • A/B test content positioning
"""

next_steps_text = """   
   • Deploy models in production environment
   • Set up real-time monitoring dashboards
   • Conduct A/B experiments on recommendations
   • Deep-dive into show-specific patterns
   • Integrate with existing analytics pipelines
"""

print("\n💡 RECOMMENDATIONS FOR OTT PLATFORMS:")
print(recommendations_text)

print("\n🚀 NEXT STEPS:")
print(next_steps_text)

# Closing banner
closing_rule = "="*80
print(closing_rule)
print("Analysis Complete!")
print(closing_rule)