# Exploring the ratings data (ml-32m `ratings.csv`)

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw ratings table; the path is relative to the notebook's working directory.
ratings_path = '../data/raw/ml-32m/ratings.csv'
ratings = pd.read_csv(ratings_path)

# Report the row count with thousands separators, then preview the first rows.
n_ratings = len(ratings)
print(f"total ratings: {n_ratings:,}")
ratings.head()

Total ratings: 32,000,204


Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [None]:
# Structural overview of the ratings frame: dimensions, column names, dtypes.
print(f"shape: {ratings.shape}")
print("\ncols:", list(ratings.columns))
# Header and dtype listing go out in one call, separated by a newline.
print("\ndata types:", ratings.dtypes, sep="\n")

shape: (32000204, 4)

cols: ['userId', 'movieId', 'rating', 'timestamp']

data types:
userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object


In [None]:
# How often each rating value occurs (ordered by rating value), plus the
# overall mean and standard deviation of the rating column.
rating_col = ratings['rating']
counts_by_rating = rating_col.value_counts().sort_index()

print("rating distribution:")
print(counts_by_rating)

mean_rating = rating_col.mean()
std_rating = rating_col.std()
print(f"\navg rating: {mean_rating:.2f}")
print(f"rating std dev: {std_rating:.2f}")

Rating distribution:
rating
0.5     525132
1.0     946675
1.5     531063
2.0    2028622
2.5    1685386
3.0    6054990
3.5    4290105
4.0    8367654
4.5    2974000
5.0    4596577
Name: count, dtype: int64

Average rating: 3.54
Rating std: 1.06


In [None]:
# Number of ratings submitted by each user; value_counts() returns one entry
# per distinct userId, ordered from most to fewest ratings.
user_counts = ratings['userId'].value_counts()

# len(user_counts) is the number of distinct users, so a second pass over the
# full column with nunique() is unnecessary.
print(f"unique users: {len(user_counts):,}")
print(f"average # ratings per user: {user_counts.mean():.1f}")
print(f"most active user has: {user_counts.max()} ratings")
print(f"least active user has: {user_counts.min()} ratings")

Total unique users: 200,948
Average ratings per user: 159.2
Most active user has: 33332 ratings
Least active users have: 20 ratings


In [None]:
# Number of ratings received by each movie; value_counts() returns one entry
# per distinct movieId, ordered from most to fewest ratings.
movie_counts = ratings['movieId'].value_counts()

# len(movie_counts) is the number of distinct rated movies, so a second pass
# over the full column with nunique() is unnecessary.
print(f"unique movies rated: {len(movie_counts):,}")
print(f"average ratings per movie: {movie_counts.mean():.1f}")
print(f"most rated movie has: {movie_counts.max()} ratings")
print(f"least rated movies have: {movie_counts.min()} ratings")

Total unique movies: 84,432
Average ratings per movie: 379.0
Most rated movie has: 102929 ratings
Least rated movies have: 1 ratings


In [None]:
# Pick the user at position 10 of user_counts (built with value_counts(), so
# ordered by descending rating count) and list their highest-rated titles.
sample_user = user_counts.index[10]

is_sample_user = ratings['userId'].eq(sample_user)
user_ratings = ratings.loc[is_sample_user].sort_values(by='rating', ascending=False)

n_sample_ratings = len(user_ratings)
print(f"sampling user {sample_user}'s ratings ({n_sample_ratings} total):")

# Show only the movie id and rating for the first ten rows, without the index.
top_rows = user_ratings.head(10)[['movieId', 'rating']]
print(top_rows.to_string(index=False))

Sample user 14674 ratings (6407 total):
 movieId  rating
    2918     5.0
    1196     5.0
     589     5.0
    2108     5.0
    1036     5.0
    2087     5.0
    1073     5.0
    2795     5.0
    2005     5.0
    2000     5.0


In [None]:
# Tally of the sampled user's ratings per rating value, lowest value first.
print(f"\nuser {sample_user}'s rating distribution:")
sample_rating_col = user_ratings['rating']
user_rating_dist = sample_rating_col.value_counts().sort_index()

# Walk rating values and their counts in parallel, one output line per value.
for rating, count in zip(user_rating_dist.index.tolist(), user_rating_dist.tolist()):
    print(f"{rating:.1f} stars: {count} movies")


User 14674 rating distribution:
0.5 stars: 225 movies
1.0 stars: 340 movies
1.5 stars: 750 movies
2.0 stars: 1309 movies
2.5 stars: 1482 movies
3.0 stars: 1637 movies
3.5 stars: 412 movies
4.0 stars: 113 movies
4.5 stars: 54 movies
5.0 stars: 85 movies


In [None]:
# Rank movies by mean rating, restricted to movies with enough ratings for the
# mean to be meaningful.
MIN_RATINGS = 100  # minimum number of ratings a movie needs to be ranked
TOP_N = 10         # number of movies to display

# Per-movie mean rating and rating count, via named aggregation.
movie_stats = ratings.groupby('movieId')['rating'].agg(
    avg_rating='mean',
    num_ratings='count',
)
popular_movies = movie_stats[movie_stats['num_ratings'] >= MIN_RATINGS]

# avg_ratings holds the filtered table with means rounded to 2 decimals.
avg_ratings = popular_movies.round(2)

# Sort on the unrounded means so that movies whose means only coincide after
# rounding are still ranked correctly; exact ties fall back to the rating
# count. Rounding is applied last, for display only.
top_movies = (
    popular_movies
    .sort_values(['avg_rating', 'num_ratings'], ascending=False)
    .head(TOP_N)
    .round(2)
)

print(f"top rated movies ({MIN_RATINGS}+ ratings):")
print(top_movies.to_string())

Top rated movies (100+ ratings):
         avg_rating  num_ratings
movieId                         
171011         4.45         1956
159817         4.44         2948
170705         4.43         2811
318            4.40       102929
171495         4.33          615
858            4.32        66440
202439         4.31        11670
198185         4.30         1140
179135         4.30         1163
220528         4.29          449
