In [1]:
import pandas as pd
import numpy as np

ratings='../Data/ml-latest/ratings.csv'
movies='../Data/ml-latest/movies.csv'

df=pd.read_csv(ratings, usecols=['userId','movieId','rating'],
    dtype={
        'userId':'int32',
        'movieId':'int32',
        'rating':'float32',
        }
    )

titles=pd.read_csv(movies, usecols=['movieId','title'],
    dtype={
        'movieId':'int32',
        'title':'str',
        }
    )

df_genres=pd.read_csv(movies, usecols=['movieId','genres'],
    dtype={
        'movieId':'int32',
        'genres':'str',
        }
    )

## How many movies and how many users are in the dataset?

In [2]:
print(f"Found {len(df['movieId'].value_counts())} movies and {len(df['userId'].value_counts())} users.")

Found 53889 movies and 283228 users.


## How many genres are there?

In [3]:
print(f"Found {len(df_genres['movieId'].unique())} genres.")

Found 58098 genres.


## What genres are the most popular?

In [4]:
print(df_genres['genres'].value_counts().head(10))

Drama                   8402
Comedy                  5372
(no genres listed)      4266
Documentary             4250
Comedy|Drama            2212
Drama|Romance           2028
Comedy|Romance          1506
Horror                  1459
Comedy|Drama|Romance    1001
Drama|Thriller           863
Name: genres, dtype: int64


In [5]:
remove_genres = df_genres[df_genres['genres'] == '(no genres listed)']['movieId']

In [6]:
new_df = df.drop(df[df['movieId'].isin(remove_genres)].index)
print(f"Succesfully dropped {len(df['movieId']) - len(new_df['movieId'])} rows with missing genres.")

Succesfully dropped 18389 rows with missing genres.


## How many movies have received < 20 ratings?

In [7]:
n = 20

movie_counts = df['movieId'].value_counts()
movies_below_n_occurrences = movie_counts[movie_counts < n].index.tolist()
#movie_titles = titles.set_index('movieId')['title'].loc[movies_below_n_occurrences]

print(f"Found {len(movies_below_n_occurrences)} movies with less than {n} ratings.")
#print(f"Movies with less than {n} occurrences:")
#print(movie_titles[:10].to_string())

Found 35523 movies with less than 20 ratings.


In [8]:
new_df = new_df.drop(new_df[new_df['movieId'].isin(movies_below_n_occurrences)].index)
print(f"Total rows dropped: {len(df['movieId']) - len(new_df['movieId'])}.")

Total rows dropped: 174538.


## How many users have submitted < 20 ratings?

In [9]:
n = 20
user_ratings_counts = df['userId'].value_counts()
users_with_few_ratings = user_ratings_counts[user_ratings_counts < n].index.tolist()  # select users with < n ratings
print(f"Found {len(users_with_few_ratings)} users who submitted less than {n} ratings.")

Found 108623 users who submitted less than 20 ratings.


In [10]:
new_df = new_df.drop(new_df[new_df['userId'].isin(users_with_few_ratings)].index)
print(f"Total rows dropped: {len(df['movieId']) - len(new_df['movieId'])}.")

Total rows dropped: 1381450.


## How many users have submitted > 500 ratings?

In [11]:
n = 500
user_ratings_counts = df['userId'].value_counts() # repeating code just to keep track of what I'm doing

users_with_n_ratings = user_ratings_counts[user_ratings_counts >= n].index.tolist()  # select users with => n ratings
print(f'{len(users_with_n_ratings)} users found with >= {n} ratings made.')

10288 users found with >= 500 ratings made.


In [12]:
new_df = new_df.drop(new_df[new_df['userId'].isin(users_with_n_ratings)].index)
print(f"Total rows dropped: {len(df['movieId']) - len(new_df['movieId'])}.")

Total rows dropped: 10612946.


## How many users have submitted **only** lowest possible score?

In [13]:
#TODO: Refactor this cell, it's too spaghetti

users_with_only_05 = df.groupby('userId')['rating'].nunique() == 1
users_with_only_05 = users_with_only_05[users_with_only_05].index
users_with_only_05_05 = df[(df['userId'].isin(users_with_only_05)) & (df['rating'] == 0.5)]

print(f"Found {len(users_with_only_05_05['userId'].index)} users who only submitted 0.5 star reviews.")

Found 1182 users who only submitted 0.5 star reviews.


In [14]:
#TODO: Refactoring try #1

mask = df.groupby('userId')['rating'].unique().apply(lambda x: len(x)==1 and x[0]==0.5)
print(f"Found {len(mask[mask].index)} users who only submitted 0.5 star reviews")

Found 259 users who only submitted 0.5 star reviews


Hm, something seems off. But let's just throw them all and figure out later how to do with this.

Edit: ohh, first method we get all rows, second method we only get all users.

In [15]:
new_df = new_df.drop(new_df[new_df['userId'].isin(users_with_only_05_05 )].index)
print(f"Total rows dropped: {len(df['movieId']) - len(new_df['movieId'])}.")

Total rows dropped: 10612946.


In [16]:
new_df = new_df.drop(new_df[new_df['userId'].isin(mask)].index)
print(f"Total rows dropped: {len(df['movieId']) - len(new_df['movieId'])}.")

Total rows dropped: 10612946.


No rows were dropped for the lowest possible scores. Maybe the rows were already dropped from previous operations or maybe I did something wrong with the mask. We can check that later.

## What are we left with now?

In [17]:
df.size

83260332

In [18]:
unique_names = df['movieId'].unique()
len(unique_names)

53889

In [19]:
new_df.size

51421494

In [20]:
unique_names = new_df['movieId'].unique()
len(unique_names)

18220

Hmm, idk, but seems good enough for now.

## Compare central tendencies between the reduced dataset and the original.

In [21]:
def get_rating(df, mId, get_mean=True):
    if get_mean:
        return df.loc[df['movieId'] == mId]['rating'].mean()
    else:
        return df.loc[df['movieId'] == mId]['rating']

In [22]:
def top_rated(n, df, include_index=False, include_movieId=False, include_rating=False):
    top = df.groupby('movieId')['rating'].mean().sort_values(ascending=False).head(n).index

    for i in top:
        title = titles[titles['movieId'] == i]

        print(title['title'].to_string(index=include_index))

        if include_movieId:
            print(f"movieId: {i}")

        print(f"Total ratings: {df[df['movieId'] == i]['rating'].count()}")

        if include_rating:
            print(f'Rating: {get_rating(df, i):.2f}')

        print('\n', end='')

def compare_lists(list1, list2):
    if len(list1) != len(list2):
        print("Lists have different lengths.")
        return
    
    # Check if the indices are the same
    if list1.index.equals(list2.index):
        print("Indices are the same.")
    else:
        print("Indices are different.")
    
    # Check if the values are the same
    if np.allclose(list1.values, list2.values, rtol=0.1):
        print("Values are the same.")
    else:
        print("Values are different.")
        
        # Calculate the percentage difference for each value
        diff_percent = (abs(list1 - list2) / ((list1 + list2) / 2)) * 100
        
        print("Percentage difference:")
        print(diff_percent)

In [23]:
new = new_df['rating'].value_counts(normalize=True).sort_index()
old = df['rating'].value_counts(normalize=True).sort_index()

In [24]:
compare_lists(new,old)

Indices are the same.
Values are different.
Percentage difference:
0.5    21.000230
1.0     8.053519
1.5    31.550340
2.0    12.925782
2.5    31.921156
3.0     2.604945
3.5    15.240901
4.0     5.882086
4.5     7.657666
5.0    19.157040
Name: rating, dtype: float64


In [25]:
top_rated(10, df=new_df, include_index=False, include_movieId=True, include_rating=True)

David Cross: Making America Great Again (2016)
movieId: 163653
Total ratings: 1
Rating: 5.00

Give 'em Hell, Malone (2009)
movieId: 116831
Total ratings: 1
Rating: 5.00

Svengali (1931)
movieId: 79711
Total ratings: 3
Rating: 5.00

Cluny Brown (1946)
movieId: 79809
Total ratings: 1
Rating: 5.00

Next Stop, Greenwich Village (1976)
movieId: 70595
Total ratings: 1
Rating: 5.00

McQ (1974)
movieId: 54020
Total ratings: 1
Rating: 5.00

Journey of Hope (Reise der Hoffnung) (1990)
movieId: 6778
Total ratings: 2
Rating: 5.00

Children of the Corn: Genesis (2011)
movieId: 116034
Total ratings: 1
Rating: 5.00

Wagon Master (1950)
movieId: 63793
Total ratings: 1
Rating: 5.00

Bombshell: The Hedy Lamarr Story (2017)
movieId: 183641
Total ratings: 1
Rating: 5.00



Why are there movies with totals of 1 ratings? I'm pretty sure I removed those?

## Reduce all ratings to <500 / movie

In [26]:
def random_remove_rows(df):
    # Get the counts for each movieId
    counts = df['movieId'].value_counts()

    # Determine which movieIds have more than 500 ratings
    mask = counts > 500
    movieIds_to_remove = mask.index[mask]

    def remove_rows_randomly(group):
        if len(group) > 500 and group['movieId'].iloc[0] in movieIds_to_remove:
            indices_to_remove = np.random.choice(group.index, size=len(group) - 500, replace=False)
            group.drop(indices_to_remove, inplace=True)
            return group, len(indices_to_remove)
        else:
            return group, 0

    # Apply custom function to each group
    groups = df.groupby('movieId')
    results = [remove_rows_randomly(group) for name, group in groups]

    # Combine results into final dataframe and count of removed rows
    df = pd.concat([result[0] for result in results])
    removed_rows = sum([result[1] for result in results])

    return df, removed_rows

In [27]:
new_df, removed_rows = random_remove_rows(new_df)

In [28]:
new_df.size

8787219

In [29]:
top_rated(10, df=new_df, include_index=False, include_movieId=True, include_rating=True)

Fiend Without a Face (1958)
movieId: 26019
Total ratings: 1
Rating: 5.00

Devil at 4 O'Clock, The (1961)
movieId: 6048
Total ratings: 1
Rating: 5.00

House That Dripped Blood, The (1971)
movieId: 40457
Total ratings: 1
Rating: 5.00

Honeydripper (2007)
movieId: 60436
Total ratings: 1
Rating: 5.00

Jimmy Carr: Laughing and Joking (2013)
movieId: 156787
Total ratings: 2
Rating: 5.00

Wagon Master (1950)
movieId: 63793
Total ratings: 1
Rating: 5.00

Son of Paleface (1952)
movieId: 25976
Total ratings: 1
Rating: 5.00

Mothlight (1963)
movieId: 100513
Total ratings: 1
Rating: 5.00

Gold Diggers of 1935 (1935)
movieId: 31297
Total ratings: 1
Rating: 5.00

Target (1985)
movieId: 26559
Total ratings: 1
Rating: 5.00



In [30]:
def top_rated(n, df, include_index=False, include_movieId=False, include_rating=False):
    top = df.groupby('movieId')['rating'].mean().sort_values(ascending=False).head(n).index

    for i in top:
        title = titles[titles['movieId'] == i]

        print(title['title'].to_string(index=include_index))

        if include_movieId:
            print(f"movieId: {i}")

        print(f"Total ratings: {df[df['movieId'] == i]['rating'].count()}")

        if include_rating:
            print(f'Rating: {get_rating(df, i):.2f}')

        print('\n', end='')

In [31]:
top_rated(10, df=new_df, include_index=False, include_movieId=True, include_rating=True)

Fiend Without a Face (1958)
movieId: 26019
Total ratings: 1
Rating: 5.00

Devil at 4 O'Clock, The (1961)
movieId: 6048
Total ratings: 1
Rating: 5.00

House That Dripped Blood, The (1971)
movieId: 40457
Total ratings: 1
Rating: 5.00

Honeydripper (2007)
movieId: 60436
Total ratings: 1
Rating: 5.00

Jimmy Carr: Laughing and Joking (2013)
movieId: 156787
Total ratings: 2
Rating: 5.00

Wagon Master (1950)
movieId: 63793
Total ratings: 1
Rating: 5.00

Son of Paleface (1952)
movieId: 25976
Total ratings: 1
Rating: 5.00

Mothlight (1963)
movieId: 100513
Total ratings: 1
Rating: 5.00

Gold Diggers of 1935 (1935)
movieId: 31297
Total ratings: 1
Rating: 5.00

Target (1985)
movieId: 26559
Total ratings: 1
Rating: 5.00



In [32]:
new = new_df['rating'].value_counts(normalize=True).sort_index()
old = df['rating'].value_counts(normalize=True).sort_index()

In [33]:
compare_lists(new,old)

Indices are the same.
Values are different.
Percentage difference:
0.5    24.531394
1.0    31.896178
1.5     4.025523
2.0    10.858897
2.5    11.163049
3.0     0.375243
3.5     5.392157
4.0     1.478488
4.5     0.488727
5.0     5.859574
Name: rating, dtype: float64


# We fucked up big time

In [34]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2929073 entries, 18607 to 24852543
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int32  
 1   movieId  int32  
 2   rating   float32
dtypes: float32(1), int32(2)
memory usage: 55.9 MB


Well, at least its only 56 MB.

Let's start over. I have some new ideas.