# MovieLens 32M Exploratory Data Analysis
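This notebook explores the processed MovieLens 32M ratings and movie metadata — rating-matrix sparsity, the long tail of movie popularity, per-user activity, and the genre distribution — to guide the recommendation strategy.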

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path

# Setup plotting
plt.style.use('ggplot')
%matplotlib inline

# Load processed data
data_dir = Path('../data/processed')
ratings = pd.read_parquet(data_dir / 'ratings.parquet')
movies = pd.read_parquet(data_dir / 'movies.parquet')

print(f'Loaded {len(ratings):,} ratings and {len(movies):,} movies.')

## 1. Sparsity Analysis

In [None]:
# Dimensions of the user x movie rating matrix, counting only the ids that
# actually occur in `ratings`.
n_users, n_movies = (ratings[column].nunique() for column in ('userId', 'movieId'))
actual_ratings = len(ratings)
possible_ratings = n_users * n_movies

# Sparsity is the percentage of user/movie cells without a rating.
# NOTE(review): this treats every row of `ratings` as a distinct
# (userId, movieId) cell — confirm the processed data has no duplicate pairs.
fill_ratio = actual_ratings / possible_ratings
sparsity = (1 - fill_ratio) * 100

print(
    f'Number of Users: {n_users:,}',
    f'Number of Movies: {n_movies:,}',
    f'Theoretical Max Ratings: {possible_ratings:,}',
    f'Actual Ratings: {actual_ratings:,}',
    f'Dataset Sparsity: {sparsity:.4f}%',
    sep='\n',
)

## 2. Long-tail Effect

In [None]:
# Number of ratings each movie received, most-rated movie first.
movie_counts = ratings.groupby('movieId').size().sort_values(ascending=False)

# Popularity curve: x is the movie's rank, y is its rating count on a log scale.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(movie_counts.values)
ax.set_title('Movie Popularity Distribution (Long Tail)')
ax.set_xlabel('Movie Index (Ranked by popularity)')
ax.set_ylabel('Number of Ratings')
ax.set_yscale('log')
plt.show()

# Share of all ratings that land on the most-rated 10% of movies.
# The cutoff is truncated to an integer number of movies.
top_10_percent_cutoff = int(len(movie_counts) * 0.1)
top_10_percent_ratings = movie_counts.head(top_10_percent_cutoff).sum()
percentage = (top_10_percent_ratings / len(ratings)) * 100
print(f'The top 10% of movies account for {percentage:.2f}% of all ratings.')

## 3. User Activity

In [None]:
# Number of ratings submitted by each user, most active user first.
user_counts = ratings.groupby('userId').size().sort_values(ascending=False)

# Bin and smooth in log space via `log_scale=True`. Computing 50 linear-width
# bins (and a linear-space KDE) and only afterwards switching the axis to a log
# scale squeezes almost all users into the first few bins and draws bars of
# wildly different visual widths, which misrepresents the distribution.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(user_counts, bins=50, kde=True, log_scale=True, ax=ax)
ax.set_title('User Activity Distribution')
ax.set_xlabel('Number of Ratings per User')
ax.set_ylabel('Number of Users')
plt.show()

# Summary statistics; mean vs. median indicates how skewed the activity is.
print(f'Average ratings per user: {user_counts.mean():.1f}')
print(f'Median ratings per user: {user_counts.median():.1f}')
print(f'Min ratings per user: {user_counts.min()}')
print(f'Max ratings per user: {user_counts.max()}')

## 4. Genre Distribution

In [None]:
# One row per (movie, genre) pair, then the number of movies carrying each genre.
# NOTE(review): assumes `movies['genres']` holds a list-like of genre names per
# movie; if it is a delimited string, `explode` leaves it unchanged — confirm.
genres_exploded = movies.explode('genres')
genre_counts = genres_exploded['genres'].value_counts()

# Horizontal bars, one colour per genre. The genre is assigned to `hue`
# explicitly because passing `palette` without `hue` is deprecated in
# seaborn 0.13. `dodge=False` keeps each bar at full width.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=genre_counts.values,
    y=genre_counts.index,
    hue=genre_counts.index,
    palette='magma',
    dodge=False,
    ax=ax,
)

# The legend would only repeat the y-axis labels. Remove it by hand rather than
# with `legend=False`, which older seaborn releases do not accept.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title('Distribution of Movie Genres')
ax.set_xlabel('Number of Movies')
ax.set_ylabel('Genre')
plt.show()