In [79]:
import pandas as pd
import numpy as np

ratings='../Data/ml-latest/ratings.csv'

df_ratings=pd.read_csv(ratings, usecols=['userId','movieId','rating'],
    dtype={
        'userId':'int32',
        'movieId':'int32',
        'rating':'float32',
        }
    )

In [81]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27753444 entries, 0 to 27753443
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int32  
 1   movieId  int32  
 2   rating   float32
dtypes: float32(1), int32(2)
memory usage: 317.6 MB


In [80]:
total_users = len(df_ratings.groupby('userId').count())
total_users

283228

In [54]:
avg_ratings_per_user = df_ratings.groupby('userId')['rating'].count().mean()
avg_ratings_per_user

97.98976089934611

The average amount of rating is 98, well let's say 100. But there are some outliers as we will see. 

In [57]:
rating_count = df_ratings['rating'].value_counts()
rating_count.sort_values()

1.5     441354
0.5     442388
1.0     886233
2.5    1373419
2.0    1850627
4.5    2373550
3.5    3404360
5.0    4071135
3.0    5515668
4.0    7394710
Name: rating, dtype: int64

Wide spread in amount of ratings, with 4 stars being submitted the most and 1.5 the least.

---

## Upper bound of user ratings

Let's have a look at the most dedicated movie raters.

In [39]:
user_ratings_counts = df_ratings.groupby('userId').count()['rating']    # group ratings by userId, count n of ratings for each user
user_with_most_ratings = user_ratings_counts.idxmax()                   # find the index of the user with the most ratings

print(f'User {user_with_most_ratings} has rated the most movies, with {user_ratings_counts[user_with_most_ratings]} ratings.')

User 123100 has rated the most movies, with 23715 ratings.


In [87]:
n = 5500
users_found = df_ratings.groupby('userId')['rating'].count()
users_found = users_found[users_found > n]
print(f'{len(users_found)} users found with > {n} ratings made:')

top_dedicated = [] # we'll use this later

for user_id, num_ratings in users_found.items():
    print(f'User {user_id}: {num_ratings} ratings')
    top_dedicated.append(user_id)


11 users found with > 5500 ratings made:
User 63783: 6346 ratings
User 77609: 6398 ratings
User 111908: 6645 ratings
User 117490: 9279 ratings
User 123100: 23715 ratings
User 134596: 8381 ratings
User 141955: 5810 ratings
User 158002: 5701 ratings
User 172357: 5868 ratings
User 212343: 7884 ratings
User 242683: 7515 ratings


We have this one user who rated over 23,000 movies. It's not until in the ~6000 ratings range where the tail of the other users seem to begin.

In [73]:
n = 1000
users_with_n_ratings = user_ratings_counts[user_ratings_counts >= n].index.tolist()  # select users with => n ratings
print(f'{len(users_with_n_ratings)} users found with >= {n} ratings made.')

2782 users found with >= 1000 ratings made.


In [43]:
users_with_n_ratings = user_ratings_counts[user_ratings_counts >= n].index.tolist()  # select users with => n ratings
print(f'{len(users_with_n_ratings)} users found with >= {n} ratings made.')

68342 users found with >= 100 ratings made.


In [52]:
n = 2000
users_with_n_ratings = user_ratings_counts[user_ratings_counts >= n].index.tolist()  # select users with => n ratings
print(f'{len(users_with_n_ratings)} users found with >= {n} ratings made.')

424 users found with >= 2000 ratings made.


# Lower bound of user ratings

In [93]:
n = 50
users_with_few_ratings = user_ratings_counts[user_ratings_counts < n].index.tolist()  # select users with < n ratings
print(f'{len(users_with_few_ratings)} users found with < {n} ratings made.')

173556 users found with < 50 ratings made.


## Users who rated movies low or very low

In [86]:
df_ratings_05 = df_ratings[df_ratings['rating'] == 0.5]     # select all rows with a rating of 0.5

user_05_ratings_counts = df_ratings_05.groupby('userId').count()['rating']  # count number of 0.5 ratings for each user

user_with_most_05_ratings = user_05_ratings_counts.idxmax()

print(f'User {user_with_most_05_ratings} has the most negative ratings, with {user_05_ratings_counts.loc[user_with_most_05_ratings]} 0.5 star ratings.')

User 94843 has the most negative ratings, with 3099 0.5 star ratings.


In [83]:
user_id = 94843
user_ratings = df_ratings[df_ratings['userId'] == user_id]
print(user_ratings)


         userId  movieId  rating
9191555   94843        1     1.0
9191556   94843        2     3.0
9191557   94843        3     0.5
9191558   94843        4     0.5
9191559   94843        5     0.5
...         ...      ...     ...
9196680   94843   192113     0.5
9196681   94843   192115     0.5
9196682   94843   192117     0.5
9196683   94843   192119     0.5
9196684   94843   192121     0.5

[5130 rows x 3 columns]


In [84]:
user_ratings.value_counts()

userId  movieId  rating
94843   1        1.0       1
        65802    0.5       1
        60363    0.5       1
        60333    0.5       1
        60161    3.0       1
                          ..
        3101     2.5       1
        3100     0.5       1
        3098     0.5       1
        3097     0.5       1
        192121   0.5       1
Length: 5130, dtype: int64

In [47]:
users_with_only_05 = df_ratings.groupby('userId')['rating'].nunique() == 1
users_with_only_05 = users_with_only_05[users_with_only_05].index
users_with_only_05_05 = df_ratings[(df_ratings['userId'].isin(users_with_only_05)) & (df_ratings['rating'] == 0.5)]

print(users_with_only_05_05)


          userId  movieId  rating
22130        220      410     0.5
31974        309      303     0.5
31975        309      315     0.5
31976        309      485     0.5
31977        309      543     0.5
...          ...      ...     ...
27692816  282520     1376     0.5
27692817  282520     1396     0.5
27692818  282520     1639     0.5
27692819  282520     2699     0.5
27692820  282520     5060     0.5

[1182 rows x 3 columns]


## Users who only leaves 5 star reviews

In [91]:
users_with_only_5 = df_ratings.groupby('userId')['rating'].nunique() == 1
users_with_only_5 = users_with_only_5[users_with_only_5].index
users_with_only_5_5 = df_ratings[(df_ratings['userId'].isin(users_with_only_5)) & (df_ratings['rating'] == 5)]

print(users_with_only_5_5)

          userId  movieId  rating
938            9     2858     5.0
14087        142       50     5.0
14088        142      318     5.0
14089        142      356     5.0
14090        142      527     5.0
...          ...      ...     ...
27744187  283119     1197     5.0
27744188  283119     1907     5.0
27744189  283119     1909     5.0
27744190  283119     2006     5.0
27747952  283169     1721     5.0

[22745 rows x 3 columns]


In [94]:
remove_top = df_ratings[~df_ratings['userId'].isin(top_dedicated)]                              # remove all users with more than 6000 ratings made
remove_low_ratings = remove_top[~df_ratings['userId'].isin(users_with_only_05_05)]              # remove all users who ONLY rated movies 0.5
remove_high_ratings = remove_low_ratings[~df_ratings['userId'].isin(users_with_only_5_5)]       # remove all users who ONLY rated movies 5
remove_few_raters = remove_high_ratings[~df_ratings['userId'].isin(users_with_few_ratings)]     # remove all users with less than 50 ratings made

new_avg_ratings_per_user = remove_few_raters.groupby('userId')['rating'].count().mean()

new_avg_ratings_per_user

  remove_low_ratings = remove_top[~df_ratings['userId'].isin(users_with_only_05_05)]              # remove all users who ONLY rated movies 0.5
  remove_high_ratings = remove_low_ratings[~df_ratings['userId'].isin(users_with_only_5_5)]       # remove all users who ONLY rated movies 5
  remove_few_raters = remove_high_ratings[~df_ratings['userId'].isin(users_with_few_ratings)]     # remove all users with less than 50 ratings made


222.55275804524854

## Some ideas to get a better mean rating and make the data more manageable

- Remove users who rated >6000 movies

- Remove users who rated <50 movies

- Remove users who only gave 0.5 star ratings

- Remove users who only gave 5 star ratings

- Set upper bound for amount of ratins