Convert ratings into unique imdbIDs version

In [40]:
import pandas as pd
import numpy as np
import scipy as sp

In [41]:
def clean_ratings(std_threshold):
    # load ratings and mapping of movieID and imdbID
    ratings = pd.read_csv('../data/raw/ratings.csv', sep=',')
    mapping = movies[['id', 'imdbID']].rename(columns={'id':'movieID'})
    # merge ratings with imdbID to get uniquely identifiable movies
    ratings_imdb = ratings.merge(mapping, how='left', on='movieID')
    
    # duplicates contains all duplicated ratings
    duplicates = pd.DataFrame()
    # select all user_ids
    for i in ratings_imdb['user_id'].unique():
        # extract all entries of this user
        ratings_user = ratings_unique[ratings_unique['user_id']==i]
        # select all multiple ratings for one movie of this user
        ratings_mltpl = ratings_user[ratings_user.duplicated(subset='imdbID', keep=False)]
        # calculate the standard deviation of multiple ratings for each movie. Set threshold to accept
        # returns a boolean array indicating for each imdbID if the ratings are acceptable or differ too much
        std_ratings = ratings_mltpl.groupby('imdbID').std()['rating']==std_threshold
        # create new column, check for each entry if the corresponding imdbID is good or not
        ratings_mltpl['std'] = ratings_mltpl['imdbID'].apply(lambda x: std_ratings[x])
        # append the multiple ratings of this user to the df of all multiple ratings 
        duplicates = duplicates.append(ratings_mltpl)
        
    # merge the 'std'-row of duplicates with the ratings to indicated which entries are okay or not
    ratings_clean = ratings_imdb.merge(duplicates['std'], how='left', left_index=True, right_index=True)
    # each entry that is no duplicate is okay
    ratings_clean['std'] = ratings_clean['std'].fillna(True)
    print('Total entries before cleaning: ', ratings_clean.shape[0])
    # drop entries which are not okay
    ratings_clean = ratings_clean[ratings_clean['std']]
    print('Total entries after cleaning: ', ratings_clean.shape[0])
    
    #save the ratings in the right format and with the imdbID as the movie identifier
    ratings_finished = ratings_clean[['user_id', 'imdbID', 'rating']]
    ratings_finished.to_csv('../data/preprocessed/ratings_clean_std_'+str(std_threshold)+'.csv')

In [42]:
clean_ratings(std_threshold=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_mltpl['std'] = ratings_mltpl['imdbID'].apply(lambda x: std_ratings[x])


Total entries before cleaning:  812818
Total entries after cleaning:  792155


# Code row for row for easy comprehension and error-checking

In [12]:
# load ratings and mapping of movieID and imdbID
ratings = pd.read_csv('../data/raw/ratings.csv', sep=',')
mapping = movies[['id', 'imdbID']].rename(columns={'id':'movieID'})

In [13]:
# merge ratings with imdbID to get uniquely identifiable movies
ratings_imdb = ratings.merge(mapping, how='left', on='movieID')

In [31]:
# gives all user_ids back
ratings_imdb['user_id'].unique()

array([1264,  213,  593, ..., 2025,  303, 1838], dtype=int64)

Example for one user

In [21]:
# extract all entries of this user
ratings_user = ratings_unique[ratings_unique['user_id']==1264]
ratings_unique[ratings_unique['user_id']==1264]

Unnamed: 0,user_id,movieID,rating,imdbID
0,1264,2363,3.5,tt0047034
268,1264,1373,3.0,tt0098382
482,1264,5299,4.0,tt0259446
789,1264,3247,3.5,tt0105417
1524,1264,3438,1.5,tt0100758
...,...,...,...,...
807165,1264,2255,3.0,tt0084938
807957,1264,3421,3.5,tt0077975
811410,1264,596,4.0,tt0032910
811718,1264,3755,4.0,tt0177971


In [22]:
# select all multiple ratings for one movie of this user
ratings_mltpl = ratings_user[ratings_user.duplicated(subset='imdbID', keep=False)]
ratings_user[ratings_user.duplicated(subset='imdbID', keep=False)]

Unnamed: 0,user_id,movieID,rating,imdbID
11038,1264,708,3.5,tt0117979
26409,1264,648,3.0,tt0317919
33197,1264,2366,4.5,tt0360717
36567,1264,5504,3.5,tt0287717
50243,1264,7438,4.0,tt0378194
74292,1264,6874,4.0,tt0378194
93512,1264,934,4.0,tt0101862
109310,1264,2529,4.5,tt0063442
114422,1264,1013,4.0,tt0120783
135844,1264,3723,5.0,tt0116477


In [23]:
# calculate the standard deviation of multiple ratings for each movie. Set threshold to accept
# returns a boolean array indicating for each imdbID if the ratings are acceptable or differ too much
std_ratings = ratings_mltpl.groupby('imdbID').std()['rating']==0
ratings_mltpl.groupby('imdbID').std()['rating']==0

imdbID
tt0039628    False
tt0054033     True
tt0055254     True
tt0063442    False
tt0091064    False
tt0093409     True
tt0097722    False
tt0101862     True
tt0107131     True
tt0108333    False
tt0109279    False
tt0114709    False
tt0116477    False
tt0117979    False
tt0119683    False
tt0120783    False
tt0145487    False
tt0167260    False
tt0287717    False
tt0317919    False
tt0360717    False
tt0378194     True
tt0381061    False
Name: rating, dtype: bool

In [24]:
# create new column, check for each entry if the corresponding imdbID is good or not
ratings_mltpl['std'] = ratings_mltpl['imdbID'].apply(lambda x: std_ratings[x])
ratings_mltpl['imdbID'].apply(lambda x: std_ratings[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_mltpl['std'] = ratings_mltpl['imdbID'].apply(lambda x: std_ratings[x])


11038     False
26409     False
33197     False
36567     False
50243      True
74292      True
93512      True
109310    False
114422    False
135844    False
206963    False
208729    False
236344    False
248495    False
286017    False
299929    False
315971     True
351890    False
355650    False
366226     True
370363    False
377426    False
380791    False
396141     True
404957     True
406546    False
407566    False
433989     True
451930    False
461366    False
478208    False
487298     True
504233    False
518191    False
524383    False
533543    False
569389    False
609939     True
611599     True
639500    False
652417    False
678487    False
689828    False
690581    False
710072    False
740031    False
742593    False
785299     True
Name: imdbID, dtype: bool

In [25]:
# merge the 'std'-row of duplicates with the ratings to indicated which entries are okay or not
ratings_clean = ratings_imdb.merge(duplicates['std'], how='left', left_index=True, right_index=True)

In [26]:
# each entry that is no duplicate is okay
ratings_clean['std'] = ratings_clean['std'].fillna(True)
ratings_clean['std'].fillna(True)

0         True
1         True
2         True
3         True
4         True
          ... 
812813    True
812814    True
812815    True
812816    True
812817    True
Name: std, Length: 812818, dtype: bool

In [27]:
print('Total entries before cleaning: ', ratings_clean.shape[0])

Total entries before cleaning:  812818


In [28]:
# drop entries which are not okay
ratings_clean = ratings_clean[ratings_clean['std']]
ratings_clean[ratings_clean['std']]

Unnamed: 0,user_id,movieID,rating,imdbID,std
0,1264,2363,3.5,tt0047034,True
1,213,8368,2.5,tt0304141,True
2,593,64032,3.0,tt0369436,True
3,609,54995,4.0,tt1077258,True
4,1590,5005,4.0,tt0052182,True
...,...,...,...,...,...
812813,99,2803,3.0,tt0107798,True
812814,333,4215,3.0,tt0093857,True
812815,49,2759,3.0,tt0144168,True
812816,322,3114,5.0,tt0114709,True


In [29]:
print('Total entries after cleaning: ', ratings_clean.shape[0])

Total entries after cleaning:  792155


In [30]:
#save the ratings in the right format and with the imdbID as the movie identifier
ratings_finished = ratings_clean[['user_id', 'imdbID', 'rating']]
ratings_clean[['user_id', 'imdbID', 'rating']]

Unnamed: 0,user_id,imdbID,rating
0,1264,tt0047034,3.5
1,213,tt0304141,2.5
2,593,tt0369436,3.0
3,609,tt1077258,4.0
4,1590,tt0052182,4.0
...,...,...,...
812813,99,tt0107798,3.0
812814,333,tt0093857,3.0
812815,49,tt0144168,3.0
812816,322,tt0114709,5.0


In [34]:
ratings_finished.merge(mapping, how='right', on='imdbID').dropna()

Unnamed: 0,user_id,imdbID,rating,movieID
0,1264.0,tt0047034,3.5,2363
1,981.0,tt0047034,3.5,2363
2,481.0,tt0047034,1.0,2363
3,98.0,tt0047034,2.5,2363
4,249.0,tt0047034,4.0,2363
...,...,...,...,...
954565,243.0,tt0047376,3.0,46901
954566,417.0,tt0043132,4.0,59832
954567,379.0,tt0081433,3.0,6353
954568,279.0,tt0295480,1.0,5402
