In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
import json
from datetime import datetime
import warnings

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Statistical analysis
from scipy import stats
from scipy.stats import chi2_contingency
from scipy.sparse import csr_matrix

# Machine learning utilities
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Display settings: show every column of a frame, cap printed rows at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# earlier releases only ship them as 'seaborn*'. Pick whichever name this
# installation provides so the cell does not raise on another matplotlib version.
_preferred_style = 'seaborn-v0_8'
plt.style.use(_preferred_style if _preferred_style in plt.style.available else 'seaborn')
sns.set_palette("husl")

# Suppress warnings.
# NOTE(review): this silences *all* warnings for the rest of the session,
# including pandas deprecation notices; consider narrowing it by category.
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")


In [None]:
# Load all datasets
# Every input file lives under one directory; change DATA_DIR to point the
# notebook at a different copy of the data instead of editing each path.
DATA_DIR = 'data'

print("Loading datasets...")

# Training data - user-item interactions with ratings and timestamps
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
print(f"✓ Training data loaded: {train_df.shape}")

# Test data - users for whom we need to make predictions
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
print(f"✓ Test data loaded: {test_df.shape}")

# Sample submission - expected format (user_id: space-separated item recommendations)
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
print(f"✓ Sample submission loaded: {sample_submission.shape}")

# Item metadata - product information
item_metadata = pd.read_csv(f'{DATA_DIR}/item_metadata.csv')
print(f"✓ Item metadata loaded: {item_metadata.shape}")

# ID mappings - mappings between original IDs and encoded numeric IDs.
# JSON is UTF-8 by specification; the encoding is passed explicitly so the
# result does not depend on the platform's default locale encoding.
with open(f'{DATA_DIR}/id_mappings.json', 'r', encoding='utf-8') as f:
    id_mappings = json.load(f)
print("✓ ID mappings loaded")

print("\nAll datasets loaded successfully!")


In [None]:
# Dataset overview
# The tabular datasets keyed by a human-readable label; this dict is iterated
# here and by the data-quality cells that follow.
datasets = {
    'Training Data': train_df,
    'Test Data': test_df,
    'Sample Submission': sample_submission,
    'Item Metadata': item_metadata,
}

banner = "=" * 60
print(banner)
print("DATASET OVERVIEW")
print(banner)

# Shape, deep memory footprint (object columns measured by content) and columns.
for name, df in datasets.items():
    size_mb = df.memory_usage(deep=True).sum() / 1024**2
    print(f"\n{name}:")
    print(f"  Shape: {df.shape}")
    print(f"  Memory usage: {size_mb:.2f} MB")
    print(f"  Columns: {list(df.columns)}")

# The JSON mappings are not a DataFrame, so summarise them separately:
# dict entries report their size, anything else reports its type.
print("\nID Mappings structure:")
for key, value in id_mappings.items():
    if isinstance(value, dict):
        print(f"  {key}: {len(value)} mappings")
    else:
        print(f"  {key}: {type(value)}")

# Preview the first rows of each dataset with the rich notebook renderer.
for name, df in datasets.items():
    print(f"\n{name.upper()} - First 5 rows:")
    print("-" * 40)
    display(df.head())


## 3. Data Quality Assessment

In [1]:
# Missing values analysis
def analyze_missing_values(df, name):
    """Print a missing-value report for one dataset.

    Args:
        df: DataFrame to inspect.
        name: Label printed as the report heading.

    Prints a confirmation line when no column contains nulls. Otherwise
    displays a table of the affected columns with their null count and null
    percentage (rounded to two decimals), sorted by count, highest first.
    Returns None.
    """
    print(f"\n{name}:")
    print("-" * 30)

    null_counts = df.isnull().sum()

    # Nothing to tabulate: report and leave early.
    if null_counts.sum() == 0:
        print("✓ No missing values found!")
        return

    null_share = (null_counts / len(df)) * 100
    report = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing Percentage': null_share.round(2),
    })
    report = report.sort_values('Missing Count', ascending=False)

    # Only columns that actually contain nulls are shown.
    has_nulls = report['Missing Count'] > 0
    display(report[has_nulls])

# Banner, then one missing-value report per dataset in the `datasets` dict.
print("=" * 60, "MISSING VALUES ANALYSIS", "=" * 60, sep="\n")

for name, df in datasets.items():
    analyze_missing_values(df, name)


MISSING VALUES ANALYSIS


NameError: name 'datasets' is not defined

In [None]:
# Duplicate analysis
banner = "=" * 60
print(banner)
print("DUPLICATE ANALYSIS")
print(banner)

# Count fully identical rows in each dataset and express them as a share.
for name, df in datasets.items():
    duplicates = df.duplicated().sum()
    duplicate_share = duplicates / len(df) * 100
    print(f"{name}: {duplicates} duplicate rows ({duplicate_share:.2f}%)")

# Data types analysis
print("\n" + banner)
print("DATA TYPES ANALYSIS")
print(banner)

for name, df in datasets.items():
    print(f"\n{name}:")
    print(f"  Data types: {df.dtypes.value_counts().to_dict()}")

    # Split the columns into text (object dtype) and numeric groups; a group
    # is only listed when it is non-empty.
    object_cols = df.select_dtypes(include=['object']).columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for label, cols in (("Text", object_cols), ("Numeric", numeric_cols)):
        if len(cols) > 0:
            print(f"  {label} columns: {list(cols)}")


In [None]:
# Basic statistics about training data
banner = "=" * 60
print(banner)
print("TRAINING DATA BASIC STATISTICS")
print(banner)

# The raw timestamp column is interpreted as epoch milliseconds; convert it
# once and reuse the result for both ends of the date range.
timestamps = pd.to_datetime(train_df['timestamp'], unit='ms')

print(f"Training data shape: {train_df.shape}")
print(f"Number of unique users: {train_df['user_id'].nunique():,}")
print(f"Number of unique items: {train_df['item_id'].nunique():,}")
print(f"Number of interactions: {len(train_df):,}")
print(f"Date range: {timestamps.min()} to {timestamps.max()}")

# Statistical summary of the numeric columns
print("\nStatistical summary:")
display(train_df.describe())

# How many interactions carry each rating value, ordered by rating
print("\nRating distribution:")
rating_frequency = train_df['rating'].value_counts().sort_index()
display(rating_frequency)


In [None]:
# Visualize rating distribution: histogram on the left, share-of-total pie on the right.
ratings = train_df['rating']
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, pie_ax = axes

# Histogram of raw rating values
ratings.hist(bins=20, ax=hist_ax, alpha=0.7)
hist_ax.set(title='Rating Distribution', xlabel='Rating', ylabel='Frequency')

# Pie chart of the same counts, one wedge per distinct rating
rating_counts = ratings.value_counts().sort_index()
pie_ax.pie(rating_counts.values, labels=rating_counts.index, autopct='%1.1f%%')
pie_ax.set_title('Rating Distribution (Percentage)')

plt.tight_layout()
plt.show()

# Summary statistics; mode() can return several values, the first one is reported.
print("Rating Statistics:")
print(f"Mean rating: {ratings.mean():.2f}")
print(f"Median rating: {ratings.median():.2f}")
print(f"Mode rating: {ratings.mode().iloc[0]:.1f}")
print(f"Standard deviation: {ratings.std():.2f}")


In [None]:
# User activity analysis
# One row per user: interaction count plus mean / std / min / max of their ratings,
# rounded to two decimals.
user_stats = (
    train_df.groupby('user_id')
    .agg(
        num_interactions=('item_id', 'count'),
        avg_rating=('rating', 'mean'),
        rating_std=('rating', 'std'),
        min_rating=('rating', 'min'),
        max_rating=('rating', 'max'),
    )
    .round(2)
    .reset_index()
)

banner = "=" * 60
print(banner)
print("USER BEHAVIOR ANALYSIS")
print(banner)

print("User interaction statistics:")
display(user_stats.describe())

# Spread between each user's highest and lowest rating (added after the
# summary table above so that table only covers the aggregated columns).
user_stats['rating_range'] = user_stats['max_rating'] - user_stats['min_rating']

# Visualize user activity distribution: four histograms sharing the same y-label.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (axis, values, bins, title, x-label, log-scaled y-axis?)
panels = [
    (axes[0, 0], user_stats['num_interactions'], 50,
     'Distribution of User Interactions', 'Number of Interactions', True),
    (axes[0, 1], user_stats['avg_rating'], 30,
     'Distribution of Average User Ratings', 'Average Rating', False),
    # NaN standard deviations are dropped before plotting.
    (axes[1, 0], user_stats['rating_std'].dropna(), 30,
     'Distribution of User Rating Standard Deviation', 'Rating Std Dev', False),
    (axes[1, 1], user_stats['rating_range'], 20,
     'Distribution of User Rating Range', 'Rating Range', False),
]

for ax, values, bins, title, xlabel, log_y in panels:
    ax.hist(values, bins=bins, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of Users')
    if log_y:
        ax.set_yscale('log')

plt.tight_layout()
plt.show()


In [None]:
# Item popularity analysis
# One row per item: interaction count plus mean / std / min / max rating,
# rounded to two decimals.
item_stats = (
    train_df.groupby('item_id')
    .agg(
        num_interactions=('user_id', 'count'),
        avg_rating=('rating', 'mean'),
        rating_std=('rating', 'std'),
        min_rating=('rating', 'min'),
        max_rating=('rating', 'max'),
    )
    .round(2)
    .reset_index()
)

banner = "=" * 60
print(banner)
print("ITEM POPULARITY ANALYSIS")
print(banner)

print("Item interaction statistics:")
display(item_stats.describe())

# Most popular items
print("\nTop 10 most popular items (by number of interactions):")
top_items = item_stats.nlargest(10, 'num_interactions')
display(top_items)

# Highest rated items, restricted to items with at least `min_interactions`
# ratings so a handful of perfect scores cannot dominate the ranking.
min_interactions = 10
enough_ratings = item_stats['num_interactions'] >= min_interactions
highly_rated = item_stats.loc[enough_ratings].nlargest(10, 'avg_rating')
print(f"\nTop 10 highest rated items (min {min_interactions} interactions):")
display(highly_rated)

# Visualize item popularity
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
count_ax, rating_ax = axes[0]
scatter_ax, tail_ax = axes[1]

# Item interaction distribution (log-scaled y-axis)
count_ax.hist(item_stats['num_interactions'], bins=50, alpha=0.7)
count_ax.set(title='Distribution of Item Interactions',
             xlabel='Number of Interactions', ylabel='Number of Items')
count_ax.set_yscale('log')

# Item average rating distribution
rating_ax.hist(item_stats['avg_rating'], bins=30, alpha=0.7)
rating_ax.set(title='Distribution of Item Average Ratings',
              xlabel='Average Rating', ylabel='Number of Items')

# Scatter plot: popularity vs rating (log-scaled x-axis)
scatter_ax.scatter(item_stats['num_interactions'], item_stats['avg_rating'], alpha=0.5)
scatter_ax.set(title='Item Popularity vs Average Rating',
               xlabel='Number of Interactions', ylabel='Average Rating')
scatter_ax.set_xscale('log')

# Long tail: interaction counts ordered from most to least popular item
item_counts = item_stats['num_interactions'].sort_values(ascending=False)
tail_ax.plot(range(len(item_counts)), item_counts.values)
tail_ax.set(title='Long Tail Distribution of Item Popularity',
            xlabel='Item Rank', ylabel='Number of Interactions')
tail_ax.set_yscale('log')

plt.tight_layout()
plt.show()


In [None]:
# Convert timestamp to datetime
# The timestamp column is interpreted as epoch milliseconds. The derived
# calendar columns are written back onto train_df; a later cell checks for
# the 'datetime' column before using it.
event_time = pd.to_datetime(train_df['timestamp'], unit='ms')
train_df['datetime'] = event_time
train_df['date'] = event_time.dt.date
train_df['year'] = event_time.dt.year
train_df['month'] = event_time.dt.month
train_df['day_of_week'] = event_time.dt.day_name()
train_df['hour'] = event_time.dt.hour

banner = "=" * 60
print(banner)
print("TEMPORAL ANALYSIS")
print(banner)

# Interactions over time: year, month, weekday and hour-of-day bar charts.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
year_ax, month_ax = axes[0]
dow_ax, hour_ax = axes[1]

# Interactions by year
yearly_counts = train_df['year'].value_counts().sort_index()
year_ax.bar(yearly_counts.index, yearly_counts.values)
year_ax.set(title='Interactions by Year', xlabel='Year', ylabel='Number of Interactions')

# Interactions by month
monthly_counts = train_df['month'].value_counts().sort_index()
month_ax.bar(monthly_counts.index, monthly_counts.values)
month_ax.set(title='Interactions by Month', xlabel='Month', ylabel='Number of Interactions')

# Interactions by day of week, forced into Monday..Sunday order
dow_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_counts = train_df['day_of_week'].value_counts().reindex(dow_order)
dow_positions = range(len(dow_counts))
dow_ax.bar(dow_positions, dow_counts.values)
dow_ax.set(title='Interactions by Day of Week', xlabel='Day of Week',
           ylabel='Number of Interactions')
dow_ax.set_xticks(dow_positions)
dow_ax.set_xticklabels(dow_counts.index, rotation=45)

# Interactions by hour
hourly_counts = train_df['hour'].value_counts().sort_index()
hour_ax.bar(hourly_counts.index, hourly_counts.values)
hour_ax.set(title='Interactions by Hour of Day', xlabel='Hour',
            ylabel='Number of Interactions')

plt.tight_layout()
plt.show()

# Time series plot: number of interactions per calendar date
daily_interactions = train_df.groupby('date').size()
ts_fig, ts_ax = plt.subplots(figsize=(15, 6))
ts_ax.plot(daily_interactions.index, daily_interactions.values)
ts_ax.set_title('Daily Interactions Over Time')
ts_ax.set_xlabel('Date')
ts_ax.set_ylabel('Number of Interactions')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Item metadata overview
# Every section below is guarded by a column check, so the cell runs against
# metadata files that lack some of the optional columns.
print("=" * 60)
print("ITEM METADATA ANALYSIS")
print("=" * 60)

print(f"Item metadata shape: {item_metadata.shape}")
print(f"Columns: {list(item_metadata.columns)}")

# Display basic info about metadata
print("\nMetadata column statistics:")
for col in item_metadata.columns:
    print(f"{col}: {item_metadata[col].dtype} - {item_metadata[col].nunique()} unique values")

# Category analysis (the heading is printed only when the column exists, so
# no dangling header appears for metadata without categories)
if 'main_category' in item_metadata.columns:
    print("\nTop 10 main categories:")
    category_counts = item_metadata['main_category'].value_counts().head(10)
    display(category_counts)

    # Visualize categories
    fig, ax = plt.subplots(figsize=(12, 6))
    category_counts.plot(kind='bar', ax=ax)
    ax.set_title('Top 10 Product Categories')
    ax.set_xlabel('Category')
    ax.set_ylabel('Number of Products')
    ax.tick_params(axis='x', labelrotation=45)
    plt.tight_layout()
    plt.show()

# Price analysis
if 'price' in item_metadata.columns:
    print("\nPrice analysis:")
    # Convert price to numeric. errors='coerce' turns every value that does
    # not parse as a number (e.g. the text 'None') into NaN. The new
    # 'price_numeric' column is kept on item_metadata for later cells.
    item_metadata['price_numeric'] = pd.to_numeric(item_metadata['price'], errors='coerce')
    price_stats = item_metadata['price_numeric'].describe()
    print(price_stats)

    # Price distribution: histogram and box plot of the parsable prices
    prices = item_metadata['price_numeric'].dropna()
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

    prices.hist(bins=50, alpha=0.7, ax=hist_ax)
    hist_ax.set_title('Price Distribution')
    hist_ax.set_xlabel('Price')
    hist_ax.set_ylabel('Frequency')

    prices.plot(kind='box', ax=box_ax)
    box_ax.set_title('Price Box Plot')
    box_ax.set_ylabel('Price')

    plt.tight_layout()
    plt.show()

# Rating analysis in metadata
if 'average_rating' in item_metadata.columns:
    print("\nMetadata rating analysis:")
    rating_stats = item_metadata['average_rating'].describe()
    print(rating_stats)

    # The scatter panel needs 'rating_number'; create it only when that column
    # exists instead of leaving an empty right-hand axes in the figure.
    has_rating_number = 'rating_number' in item_metadata.columns
    fig, axes = plt.subplots(1, 2 if has_rating_number else 1, figsize=(10, 4), squeeze=False)

    hist_ax = axes[0, 0]
    item_metadata['average_rating'].hist(bins=30, alpha=0.7, ax=hist_ax)
    hist_ax.set_title('Average Rating Distribution (Metadata)')
    hist_ax.set_xlabel('Average Rating')
    hist_ax.set_ylabel('Frequency')

    if has_rating_number:
        scatter_ax = axes[0, 1]
        scatter_ax.scatter(item_metadata['rating_number'], item_metadata['average_rating'], alpha=0.5)
        scatter_ax.set_title('Number of Ratings vs Average Rating')
        scatter_ax.set_xlabel('Number of Ratings')
        scatter_ax.set_ylabel('Average Rating')
        scatter_ax.set_xscale('log')

    plt.tight_layout()
    plt.show()


In [None]:
# ID mappings analysis
print("=" * 60)
print("ID MAPPINGS ANALYSIS")
print("=" * 60)

# Analyze mapping structure: dict entries report their size and up to five
# sample key -> value pairs; anything else reports its type.
for key, value in id_mappings.items():
    if isinstance(value, dict):
        print(f"\n{key}:")
        print(f"  Number of mappings: {len(value)}")
        if len(value) > 0:
            # Show sample mappings
            sample_keys = list(value.keys())[:5]
            print("  Sample mappings:")
            for sample_key in sample_keys:
                print(f"    {sample_key} -> {value[sample_key]}")
    else:
        print(f"\n{key}: {type(value)}")

# Check consistency between datasets and mappings
print("\n" + "=" * 40)
print("CONSISTENCY CHECK")
print("=" * 40)

if 'user_mapping' in id_mappings:
    # The mapping *values* are treated as the encoded ids used in train/test.
    mapped_users = set(id_mappings['user_mapping'].values())
    train_users = set(train_df['user_id'].unique())
    test_users = set(test_df['user_id'].unique())

    print(f"Mapped users: {len(mapped_users)}")
    print(f"Training users: {len(train_users)}")
    print(f"Test users: {len(test_users)}")
    print(f"Overlap train-test: {len(train_users.intersection(test_users))}")

if 'item_mapping' in id_mappings:
    item_mapping = id_mappings['item_mapping']
    mapped_items = set(item_mapping.values())
    train_items = set(train_df['item_id'].unique())

    # Metadata rows are identified by 'parent_asin', whereas train_df uses the
    # encoded item ids. Translate each ASIN through item_mapping so both sets
    # live in the same id space; the DataFrame's row index is unrelated to
    # item ids and must not be used for this comparison.
    # NOTE(review): assumes item_mapping is keyed by ASIN with the encoded id
    # as value, matching how the merge cell reverses it - confirm against the
    # JSON file.
    if 'parent_asin' in item_metadata.columns:
        metadata_asins = item_metadata['parent_asin'].dropna().unique()
        metadata_items = {item_mapping[asin] for asin in metadata_asins if asin in item_mapping}
        unmapped_asins = sum(1 for asin in metadata_asins if asin not in item_mapping)
    else:
        metadata_items = set()
        unmapped_asins = 0

    print(f"Mapped items: {len(mapped_items)}")
    print(f"Training items: {len(train_items)}")
    print(f"Metadata items: {len(metadata_items)}")
    print(f"Metadata items without an ID mapping: {unmapped_asins}")
    print(f"Items in both train and metadata: {len(train_items.intersection(metadata_items))}")


In [None]:
# Analyze relationships between features
print("=" * 60)
print("DATA RELATIONSHIPS AND CORRELATIONS")
print("=" * 60)

# Merge training data with item metadata for analysis.
# Whether the merge was performed is tracked in an explicit flag. Probing
# locals() for 'train_with_metadata' would also succeed on a variable left
# over from an earlier kernel run, silently analysing stale data.
has_metadata_merge = 'parent_asin' in item_metadata.columns
if has_metadata_merge:
    # item_mapping is reversed here so encoded item ids can be translated
    # back to the original ASINs used as the metadata key.
    reverse_item_mapping = {v: k for k, v in id_mappings.get('item_mapping', {}).items()}

    # Add original ASIN to train data (NaN where the id has no mapping)
    train_df['item_asin'] = train_df['item_id'].map(reverse_item_mapping)

    # Left merge keeps every interaction, matched to metadata or not
    train_with_metadata = train_df.merge(
        item_metadata,
        left_on='item_asin',
        right_on='parent_asin',
        how='left'
    )

    print(f"Training data with metadata: {train_with_metadata.shape}")
    print(f"Successful merges: {train_with_metadata['parent_asin'].notna().sum()}")

    # A left merge only changes the row count when the right-hand key is not
    # unique; flag it because duplicated interactions skew every statistic below.
    if len(train_with_metadata) != len(train_df):
        print(f"Warning: merge produced {len(train_with_metadata) - len(train_df)} extra rows "
              f"(duplicate parent_asin values in item metadata)")

# Correlation analysis over every numeric column of train_df.
# NOTE(review): this includes the identifier columns (user_id, item_id), whose
# correlations carry no meaning - consider excluding them.
numeric_cols = train_df.select_dtypes(include=[np.number]).columns
if len(numeric_cols) > 1:
    plt.figure(figsize=(10, 8))
    correlation_matrix = train_df[numeric_cols].corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Matrix - Training Data')
    plt.tight_layout()
    plt.show()

# Analyze rating patterns by category (if metadata available)
if 'main_category' in item_metadata.columns and has_metadata_merge:
    print("\nRating patterns by category:")
    category_ratings = train_with_metadata.groupby('main_category')['rating'].agg(['mean', 'count', 'std']).round(2)
    category_ratings = category_ratings[category_ratings['count'] >= 100]  # Filter categories with enough data
    category_ratings = category_ratings.sort_values('mean', ascending=False)
    display(category_ratings.head(10))

    # Visualize the ten best-rated categories
    plt.figure(figsize=(12, 6))
    top_categories = category_ratings.head(10)
    plt.bar(range(len(top_categories)), top_categories['mean'])
    plt.title('Average Rating by Category (Top 10)')
    plt.xlabel('Category')
    plt.ylabel('Average Rating')
    plt.xticks(range(len(top_categories)), top_categories.index, rotation=45)
    plt.tight_layout()
    plt.show()

# Price vs Rating analysis. 'price_numeric' only exists when the metadata
# cell that parses the price column has been run before this one.
if 'price_numeric' in item_metadata.columns and has_metadata_merge:
    price_rating_data = train_with_metadata[['rating', 'price_numeric']].dropna()
    if len(price_rating_data) > 0:
        plt.figure(figsize=(10, 6))
        plt.scatter(price_rating_data['price_numeric'], price_rating_data['rating'], alpha=0.5)
        plt.title('Price vs Rating')
        plt.xlabel('Price')
        plt.ylabel('Rating')
        plt.xscale('log')

        # Add correlation coefficient, placed in axes-relative coordinates
        corr = price_rating_data['price_numeric'].corr(price_rating_data['rating'])
        plt.text(0.05, 0.95, f'Correlation: {corr:.3f}', transform=plt.gca().transAxes)

        plt.tight_layout()
        plt.show()


In [None]:
# Cold start problem analysis
banner = "=" * 60
print(banner)
print("COLD START PROBLEM ANALYSIS")
print(banner)

# Users in the test set with no training history are "cold start" users.
train_users = set(train_df['user_id'].unique())
test_users = set(test_df['user_id'].unique())
shared_users = train_users & test_users
cold_start_users = test_users - train_users

print("User cold start analysis:")
print(f"Total users in training: {len(train_users):,}")
print(f"Total users in test: {len(test_users):,}")
print(f"Users appearing in both train and test: {len(shared_users):,}")
print(f"New users in test (cold start): {len(cold_start_users):,}")
print(f"Cold start user percentage: {len(cold_start_users) / len(test_users) * 100:.2f}%")

# Analyze user interaction patterns for cold start mitigation
print("\nUser interaction distribution (for warm start strategy):")
user_interaction_counts = train_df['user_id'].value_counts()
interaction_stats = user_interaction_counts.describe()
display(interaction_stats)

# Visualize user interaction distribution: log-scaled histogram and box plot
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

hist_ax.hist(user_interaction_counts, bins=50, alpha=0.7)
hist_ax.set_title('Distribution of User Interactions')
hist_ax.set_xlabel('Number of Interactions')
hist_ax.set_ylabel('Number of Users')
hist_ax.set_yscale('log')

box_ax.boxplot(user_interaction_counts)
box_ax.set_title('User Interactions Box Plot')
box_ax.set_ylabel('Number of Interactions')

plt.tight_layout()
plt.show()

# Analyze items that could be recommended to cold start users:
# per-item interaction count with mean / std rating, most popular first.
print("\nPopular items analysis (for cold start recommendations):")
popular_items = (
    train_df.groupby('item_id')
    .agg(
        interaction_count=('user_id', 'count'),
        avg_rating=('rating', 'mean'),
        rating_std=('rating', 'std'),
    )
    .round(2)
    .sort_values('interaction_count', ascending=False)
)

print("Top 20 most popular items:")
display(popular_items.head(20))

# Analyze seasonal/temporal patterns for recommendations. Requires the
# 'datetime' column; the section is skipped when that column is absent.
if 'datetime' in train_df.columns:
    print("\nTemporal patterns for cold start recommendations:")
    # "Recent" means the 30 days leading up to the latest interaction in the data.
    recent_cutoff = train_df['datetime'].max() - pd.Timedelta(days=30)
    recent_interactions = train_df[train_df['datetime'] >= recent_cutoff]
    recent_popular = recent_interactions.groupby('item_id').size().sort_values(ascending=False)

    print("Top 10 trending items (last 30 days):")
    display(recent_popular.head(10))


In [None]:
# Sparsity analysis
banner = "=" * 60
print(banner)
print("SPARSITY ANALYSIS")
print(banner)

# Calculate sparsity metrics: the share of user-item pairs with no interaction.
n_users = train_df['user_id'].nunique()
n_items = train_df['item_id'].nunique()
n_interactions = len(train_df)

# Theoretical maximum: every user interacting with every item
max_possible_interactions = n_users * n_items
sparsity = 1 - (n_interactions / max_possible_interactions)

print(f"Number of users: {n_users:,}")
print(f"Number of items: {n_items:,}")
print(f"Number of interactions: {n_interactions:,}")
print(f"Maximum possible interactions: {max_possible_interactions:,}")
print(f"Sparsity: {sparsity:.6f} ({sparsity*100:.4f}%)")
print(f"Density: {1-sparsity:.6f} ({(1-sparsity)*100:.4f}%)")

# Create user-item interaction matrix for visualization
print("\nCreating user-item matrix sample...")
# Only the first 100 distinct users and items (in order of appearance) are
# used; the full matrix would be too large to draw.
sample_users = train_df['user_id'].unique()[:100]
sample_items = train_df['item_id'].unique()[:100]

in_sample = train_df['user_id'].isin(sample_users) & train_df['item_id'].isin(sample_items)
sample_interactions = train_df.loc[in_sample]

# Users x items grid of ratings; pairs without an interaction are filled with 0
interaction_matrix = pd.pivot_table(
    sample_interactions,
    index='user_id',
    columns='item_id',
    values='rating',
    fill_value=0,
)

zero_cells = (interaction_matrix == 0).sum().sum()
print(f"Sample matrix shape: {interaction_matrix.shape}")
print(f"Sample matrix sparsity: {zero_cells / interaction_matrix.size:.4f}")

# Visualize sparsity pattern as a heat map with a rating colour bar
heat_fig, heat_ax = plt.subplots(figsize=(12, 8))
heat_image = heat_ax.imshow(interaction_matrix.values, cmap='Blues', aspect='auto')
heat_ax.set_title('User-Item Interaction Matrix (Sample 100x100)')
heat_ax.set_xlabel('Items')
heat_ax.set_ylabel('Users')
heat_fig.colorbar(heat_image, ax=heat_ax, label='Rating')
plt.tight_layout()
plt.show()

# Analyze interaction density by user segments
print("\nSparsity by user activity segments:")
user_interaction_counts = train_df.groupby('user_id').size()

# Define user segments based on activity: consecutive percentile bands of the
# per-user interaction count.
percentiles = [0, 50, 90, 95, 99, 100]
thresholds = np.percentile(user_interaction_counts, percentiles)
last_segment = len(percentiles) - 2

for i in range(len(percentiles)-1):
    lower, upper = thresholds[i], thresholds[i+1]
    at_least_lower = user_interaction_counts >= lower
    if i == last_segment:
        # The top band must include its upper bound: the 100th-percentile
        # threshold is the maximum count, so a strict '<' would drop the most
        # active users from every segment.
        mask = at_least_lower & (user_interaction_counts <= upper)
    else:
        # Lower bands are half-open [lower, upper) so no user is counted twice.
        # A band whose two thresholds coincide is therefore empty; its users
        # fall into the next band.
        mask = at_least_lower & (user_interaction_counts < upper)

    segment_users = user_interaction_counts[mask].index
    segment_data = train_df[train_df['user_id'].isin(segment_users)]

    # Sparsity within the segment: its users against the items they touched.
    segment_items = segment_data['item_id'].nunique()
    segment_interactions = len(segment_data)
    segment_max_interactions = len(segment_users) * segment_items
    # An empty segment has no possible interactions; report it as fully sparse.
    segment_sparsity = 1 - (segment_interactions / segment_max_interactions) if segment_max_interactions > 0 else 1

    print(f"  {percentiles[i]:2.0f}-{percentiles[i+1]:2.0f}th percentile: "
          f"{len(segment_users):,} users, "
          f"sparsity: {segment_sparsity:.4f}")

# Long tail analysis for items
print("\nLong tail analysis:")
# Interaction count per item, most popular first. The Series index holds the
# item ids, NOT the rank, so rank-based lookups below must be positional.
item_popularity = train_df['item_id'].value_counts().sort_values(ascending=False)

# Calculate cumulative coverage: running total of interactions by item rank
cumulative_interactions = item_popularity.cumsum()
total_interactions = cumulative_interactions.iloc[-1]

for top_fraction in (0.1, 0.2, 0.5):
    # Number of items in the top slice (at least one). The coverage of the
    # first n_top items sits at position n_top - 1; .iloc is used because a
    # plain [] lookup with an integer would be matched against item ids.
    n_top = max(1, int(len(item_popularity) * top_fraction))
    covered = cumulative_interactions.iloc[n_top - 1]
    print(f"Top {top_fraction:.0%} of items cover {covered / total_interactions * 100:.1f}% of interactions")

# Visualize long tail
plt.figure(figsize=(12, 4))

# Left: interactions per item by rank (log-scaled y-axis)
plt.subplot(1, 2, 1)
plt.plot(range(len(item_popularity)), item_popularity.values)
plt.title('Item Popularity Distribution (Long Tail)')
plt.xlabel('Item Rank')
plt.ylabel('Number of Interactions')
plt.yscale('log')

# Right: cumulative share of interactions, with an 80% reference line
plt.subplot(1, 2, 2)
coverage_pct = cumulative_interactions / total_interactions * 100
plt.plot(range(len(item_popularity)), coverage_pct.values)
plt.title('Cumulative Coverage by Item Rank')
plt.xlabel('Item Rank')
plt.ylabel('Cumulative Coverage (%)')
plt.axhline(y=80, color='r', linestyle='--', label='80% Coverage')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Test set analysis
banner = "=" * 60
print(banner)
print("TEST SET ANALYSIS")
print(banner)

print(f"Test set shape: {test_df.shape}")
print(f"Test set columns: {list(test_df.columns)}")

# Analyze test users: who already has training history and who does not
test_users = set(test_df['user_id'].unique())
train_users = set(train_df['user_id'].unique())
existing_test_users = test_users & train_users

print("\nTest user analysis:")
print(f"Number of test users: {len(test_users):,}")
print(f"Test users also in training: {len(existing_test_users):,}")
print(f"New test users (cold start): {len(test_users - train_users):,}")

# For users that exist in training, analyze their behavior
if len(existing_test_users) > 0:
    existing_user_data = train_df[train_df['user_id'].isin(existing_test_users)]

    print("\nExisting test users behavior in training:")
    # Per-user interaction count and mean / std rating, rounded to two decimals
    existing_user_stats = (
        existing_user_data.groupby('user_id')
        .agg(
            num_interactions=('item_id', 'count'),
            avg_rating=('rating', 'mean'),
            rating_std=('rating', 'std'),
        )
        .round(2)
    )

    print("Statistics for existing test users:")
    display(existing_user_stats.describe())

    # The same per-user summary over every training user, as the baseline
    all_user_stats = (
        train_df.groupby('user_id')
        .agg(
            num_interactions=('item_id', 'count'),
            avg_rating=('rating', 'mean'),
            rating_std=('rating', 'std'),
        )
        .round(2)
    )

    print("\nComparison with all training users:")
    print(f"Existing test users - Mean interactions: {existing_user_stats['num_interactions'].mean():.2f}")
    print(f"All training users - Mean interactions: {all_user_stats['num_interactions'].mean():.2f}")
    print(f"Existing test users - Mean rating: {existing_user_stats['avg_rating'].mean():.2f}")
    print(f"All training users - Mean rating: {all_user_stats['avg_rating'].mean():.2f}")

# Analyze submission format
print("\nSample submission analysis:")
print(f"Sample submission shape: {sample_submission.shape}")
print(f"Sample submission columns: {list(sample_submission.columns)}")

# Analyze prediction format
if 'predictions' in sample_submission.columns:
    # read_csv parses an empty prediction cell as NaN (a float), on which
    # .split() raises AttributeError. Normalise to strings first; an empty
    # cell then simply counts as zero recommended items.
    predictions = sample_submission['predictions'].fillna('').astype(str)

    # Parse predictions to see how many items are recommended per user
    if not predictions.empty:
        sample_predictions = predictions.iloc[0]
        prediction_items = sample_predictions.split()
        print(f"Items per user (sample): {len(prediction_items)}")
        print(f"Sample predictions: {sample_predictions}")

    # Check if all predictions have the same format (same number of items)
    prediction_lengths = predictions.map(lambda text: len(text.split())).value_counts()
    print("\nPrediction lengths distribution:")
    display(prediction_lengths)

    # Analyze what items appear in sample predictions.
    # Only the first 100 rows are inspected to keep this cheap.
    all_prediction_items = [
        item
        for pred in predictions.head(100)
        for item in pred.split()
    ]

    # Explicit dtype keeps the Series well-defined when no items were found.
    prediction_item_counts = pd.Series(all_prediction_items, dtype=object).value_counts()
    print("\nMost recommended items in sample submission:")
    display(prediction_item_counts.head(10))

    # Check if prediction items exist in training. Predictions are parsed as
    # text, so the training ids are compared in their string form.
    train_items = set(train_df['item_id'].astype(str).unique())
    prediction_items_set = set(all_prediction_items)

    print("\nPrediction items validation:")
    print(f"Unique items in sample predictions: {len(prediction_items_set)}")
    print(f"Prediction items also in training: {len(prediction_items_set.intersection(train_items))}")
    print(f"Prediction items not in training: {len(prediction_items_set - train_items)}")

# Visualize test vs train user overlap as a two-wedge pie:
# test users without training history vs. those with it.
cold_start_count = len(test_users - train_users)
existing_count = len(test_users & train_users)

labels = ['Cold Start Users', 'Existing Users']
sizes = [cold_start_count, existing_count]
colors = ['lightcoral', 'lightblue']

plt.figure(figsize=(10, 6))
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
plt.title('Test Set User Distribution')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()
