In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Create larger sample data (30k entries)
np.random.seed(42)

def create_large_sample_data(n_entries=30000):
    # Calculate approximate number of users needed (assuming similar distribution)
    n_users = n_entries // 5  # Average of 5 entries per user
    
    data = {
        'user_id': [],
        'show_id': [],
        'episode_id': [],
        'episode_type': [],
        'log_date': [],
        'view_start_time': [],
        'view_end_time': [],
        'log_date_sequence': [],
        'view_percent': []
    }
    
    current_entries = 0
    user_id = 1
    
    while current_entries < n_entries:
        # Determine number of logs for this user (weighted towards single log)
        n_logs = np.random.choice([1, 2, 3, 4, 5, 10, 15, 20], 
                                p=[0.6, 0.1, 0.1, 0.05, 0.05, 0.05, 0.03, 0.02])
        
        base_date = pd.Timestamp('2024-01-01')
        for log_seq in range(n_logs):
            if current_entries >= n_entries:
                break
                
            view_start = base_date + pd.Timedelta(days=np.random.randint(0, 90))
            
            data['user_id'].append(user_id)
            data['show_id'].append(np.random.randint(1, 501))  # Increased show range
            data['episode_id'].append(np.random.randint(1, 21))
            data['episode_type'].append(np.random.choice(['a', 'b', 'c']))
            data['log_date'].append(view_start.date())
            data['view_start_time'].append(view_start)
            data['view_end_time'].append(view_start + pd.Timedelta(minutes=np.random.randint(10, 180)))
            data['log_date_sequence'].append(log_seq + 1)
            
            # Create more realistic view_percent distribution
            if log_seq == 0:  # First-time viewers
                view_percent = np.random.choice([100] * 3 + list(range(10, 100, 10)) * 2)
            else:  # Returning viewers
                view_percent = np.random.choice([100] * 6 + list(range(10, 100, 10)))
            data['view_percent'].append(view_percent)
            
            current_entries += 1
        
        user_id += 1
    
    return pd.DataFrame(data)

# Create and analyze the large dataset
df = create_large_sample_data(30000)

# 1. Basic Statistics
print("\nDataset Overview:")
print(f"Total Entries: {len(df)}")
print(f"Total Unique Users: {df['user_id'].nunique()}")
print(f"Total Unique Shows: {df['show_id'].nunique()}")

# 2. User Behavior Analysis
user_stats = df.groupby('user_id').agg({
    'log_date_sequence': 'max',
    'view_percent': ['mean', 'std', 'median'],
    'show_id': 'nunique',
    'episode_id': 'count'
}).reset_index()

user_stats.columns = ['user_id', 'total_logs', 'avg_view_percent', 
                     'std_view_percent', 'median_view_percent', 
                     'unique_shows', 'total_episodes']

print("\nUser Behavior Statistics:")
print(f"Single-log Users: {(user_stats['total_logs'] == 1).sum()}")
print(f"Single-log User Percentage: {(user_stats['total_logs'] == 1).mean()*100:.1f}%")

# 3. View Percentage Analysis by Episode Type
episode_stats = df.groupby('episode_type').agg({
    'view_percent': ['mean', 'median', 'count'],
    'user_id': 'nunique'
}).round(2)

print("\nEpisode Type Statistics:")
print(episode_stats)

# 4. Visualizations
plt.style.use('seaborn')
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Large Scale Viewing Pattern Analysis (30k entries)', fontsize=16)

# Plot 1: View Percentage Distribution
sns.histplot(data=df, x='view_percent', bins=20, ax=axes[0,0])
axes[0,0].set_title('Overall View Percentage Distribution')

# Plot 2: Episode Type Performance
sns.boxplot(data=df, x='episode_type', y='view_percent', ax=axes[0,1])
axes[0,1].set_title('View Percentage by Episode Type')

# Plot 3: Log Sequence vs View Percentage
log_seq_stats = df.groupby('log_date_sequence')['view_percent'].mean().reset_index()
sns.lineplot(data=log_seq_stats, x='log_date_sequence', y='view_percent', ax=axes[1,0])
axes[1,0].set_title('Average View Percentage by Log Sequence')

# Plot 4: Time of Day Analysis
df['hour'] = pd.to_datetime(df['view_start_time']).dt.hour
sns.histplot(data=df, x='hour', hue='episode_type', multiple="stack", ax=axes[1,1])
axes[1,1].set_title('Viewing Hours by Episode Type')

plt.tight_layout()

# 5. Advanced Analysis: User Return Patterns
print("\nUser Return Pattern Analysis:")

# Calculate time between views for returning users
user_view_patterns = df.sort_values(['user_id', 'view_start_time']).groupby('user_id').agg({
    'view_start_time': lambda x: x.diff().mean() if len(x) > 1 else pd.NaT,
    'view_percent': 'mean',
    'log_date_sequence': 'max'
}).reset_index()

# Filter for returning users
returning_users = user_view_patterns[user_view_patterns['log_date_sequence'] > 1]

print("\nReturning User Statistics:")
print(f"Average Days Between Views: {returning_users['view_start_time'].mean().days:.1f}")
print(f"Average View Percentage: {returning_users['view_percent'].mean():.1f}%")

# 6. Show Performance Analysis
show_stats = df.groupby('show_id').agg({
    'user_id': 'nunique',
    'view_percent': 'mean',
    'episode_type': lambda x: x.mode()[0]
}).sort_values('user_id', ascending=False)

print("\nTop 10 Shows by Unique Viewers:")
print(show_stats.head(10))

plt.show()

# 7. Calculate and display key metrics for business decisions
print("\nKey Business Metrics:")
retention_rate = len(returning_users) / len(user_stats) * 100
avg_views_per_return = len(df[df['log_date_sequence'] > 1]) / len(returning_users)
completion_rate = (df['view_percent'] >= 90).mean() * 100

print(f"User Retention Rate: {retention_rate:.1f}%")
print(f"Average Views per Returning User: {avg_views_per_return:.1f}")
print(f"Content Completion Rate: {completion_rate:.1f}%")


Dataset Overview:
Total Entries: 30000
Total Unique Users: 10340
Total Unique Shows: 500

User Behavior Statistics:
Single-log Users: 6113
Single-log User Percentage: 59.1%

Episode Type Statistics:
             view_percent               user_id
                     mean median  count nunique
episode_type                                   
a                   64.94   70.0  10053    5274
b                   65.58   70.0   9978    5280
c                   65.44   70.0   9969    5261


OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)