In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [3]:

# Read the raw ratings and movies tables from their CSV files.
ratings = pd.read_csv(filepath_or_buffer='../data/raw/ratings.csv')
movies = pd.read_csv(filepath_or_buffer='../data/raw/movies.csv')


In [4]:

# Exploratory overview of the two raw tables: previews, schema, summary
# statistics, basic quality checks, the rating histogram, and the sparsity
# of the user x movie rating matrix.

# Distinct-value counts are computed once and reused below; each nunique()
# is a full pass over the ratings frame.
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()
n_ratings = len(ratings)

print("=== RATINGS ===")
print(ratings.head())
print(f"\nShape: {ratings.shape}")
print("\nInfo:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() emits a stray "None" line.
ratings.info()
print("\nBasic stats:")
print(ratings.describe())

print("\n=== MOVIES ===")
print(movies.head())
print(f"\nShape: {movies.shape}")
print("\nSample genres:")
# Counts each distinct value of the genres column as-is (no splitting).
print(movies['genres'].value_counts().head(10))

print("\n=== DATA QUALITY CHECKS ===")
print(f"Missing values in ratings:\n{ratings.isnull().sum()}")
print(f"\nMissing values in movies:\n{movies.isnull().sum()}")
# duplicated() with no subset compares every column, so only rows that are
# identical in all columns are counted here.
print(f"\nDuplicate ratings: {ratings.duplicated().sum()}")
print(f"\nUnique users: {n_users}")
print(f"\nUnique movies in ratings: {n_movies}")
print(f"\nUnique movies in movies.csv: {movies['movieId'].nunique()}")

print("\n=== RATING DISTRIBUTION ===")
# sort_index() orders the histogram by rating value rather than by frequency.
print(ratings['rating'].value_counts().sort_index())

print("\n=== SPARSITY ===")
# Sparsity = share of (user, movie) cells with no rating, where the matrix
# dimensions are the users and movies that actually appear in `ratings`.
# NOTE(review): treats every row as a distinct (user, movie) pair - confirm
# there are no repeated pairs, otherwise sparsity is understated.
n_cells = n_users * n_movies
# Guard the empty-frame case, where the matrix has zero cells.
sparsity = 1 - (n_ratings / n_cells) if n_cells else float('nan')
print(f"Matrix sparsity: {sparsity:.4f} ({sparsity*100:.2f}% empty)")

=== RATINGS ===
   userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510

Shape: (25000095, 4)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB
None

Basic stats:
             userId       movieId        rating     timestamp
count  2.500010e+07  2.500010e+07  2.500010e+07  2.500010e+07
mean   8.118928e+04  2.138798e+04  3.533854e+00  1.215601e+09
std    4.679172e+04  3.919886e+04  1.060744e+00  2.268758e+08
min    1.000000e+00  1.000000e+00  5.000000e-01  7.896520e+08
25%    4.051000e+04  1.196000e+03  3.000000e+00  1.011747e+09
50%    8.091400e+04 