# MovieLens 32M Exploratory Data Analysis
This notebook explores the processed MovieLens 32M `ratings` and `movies` tables to understand user rating behavior and how movies and ratings are distributed.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Setup plotting
plt.style.use('ggplot')
%matplotlib inline

# Load processed data
data_dir = Path('../data/processed')
ratings = pd.read_parquet(data_dir / 'ratings.parquet')
movies = pd.read_parquet(data_dir / 'movies.parquet')

print(f'Loaded {len(ratings):,} ratings and {len(movies):,} movies.')

In [None]:
# Headline figures for the ratings table: mean rating plus the number of
# distinct movies and users that appear in it.
avg_rating = ratings['rating'].mean()
n_movies = ratings['movieId'].nunique()
n_users = ratings['userId'].nunique()

# Emit the three summary lines with a single call, one value per line.
print(
    f'Number of unique users: {n_users:,}',
    f'Number of unique movies: {n_movies:,}',
    f'Average rating: {avg_rating:.2f}',
    sep='\n',
)

In [None]:
# Rating distribution
# Aggregate first: the chart needs only one count per distinct rating value,
# so there is no reason to hand every row of `ratings` to the plotting layer.
# Drawing the bars directly also avoids passing `palette` without `hue`,
# which newer seaborn releases deprecate.
rating_counts = ratings['rating'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(
    rating_counts.index.astype(str),  # string labels -> evenly spaced categorical bars
    rating_counts.to_numpy(),
    # One viridis colour per bar, in ascending rating order.
    color=sns.color_palette('viridis', n_colors=len(rating_counts)),
)
ax.set(
    title='Distribution of Movie Ratings',
    xlabel='Rating',
    ylabel='Number of ratings',
)
plt.show()