# Summary
Baselines are helpful in the following situations:
1. Cold start problem: 
    - New/anonymous users that the recommender system knows nothing about. The RS can recommend the most popular movies, that is, the movies that have the most ratings, or the movies with the highest averages.
    - New items cannot be recommended by collaborative filtering, but genre popularity could help.
2. Essential benchmark:
    - In a sense, the most-popular and highest-average rankings can already serve as a recommender system. Every subsequent model should beat these baselines; otherwise, there is no point in using it.
3. In theory, genre-based popularity should also be computed, but the `genres` column (pipe-separated, e.g. `Adventure|Children|Fantasy`) needs further parsing first.

In [1]:
from utils import load_movielens_data

In [2]:
# Load the MovieLens dataset through the project helper imported from `utils`.
data = load_movielens_data()

In [None]:
# Split the loaded dataset into its three tables, taken by position:
# index 0 -> movies, index 1 -> ratings, index 2 -> tags.
movies, ratings, tags = data[0], data[1], data[2]

In [5]:
# Preview the first two rows of the movies table.
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [6]:
# Preview the first two rows of the ratings table.
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [15]:
# Preview the first two rows of the tags table.
tags.head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996


In [None]:
# Popularity baseline: rank movies by the number of ratings they received.
popularity = (
    ratings.groupby(by="movieId")["rating"]
    .count()
    .sort_values(ascending=False)
)
# After reset_index() the "rating" column holds the rating *count*, not a score.
popular_movies = popularity.reset_index().merge(
    movies.loc[:, ["movieId", "title", "genres"]],
    how="inner",
    on="movieId",
)
popular_movies.head(10)

Unnamed: 0,movieId,rating,title,genres
0,356,329,Forrest Gump (1994),Comedy|Drama|Romance|War
1,318,317,"Shawshank Redemption, The (1994)",Crime|Drama
2,296,307,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,593,279,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
4,2571,278,"Matrix, The (1999)",Action|Sci-Fi|Thriller
5,260,251,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
6,480,238,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
7,110,237,Braveheart (1995),Action|Drama|War
8,589,224,Terminator 2: Judgment Day (1991),Action|Sci-Fi
9,527,220,Schindler's List (1993),Drama|War


In [None]:
# Highest-average baseline, restricted to movies with more than 20 ratings
# (presumably so that averages built on very few ratings do not top the list).
rating_counts = ratings.groupby(by="movieId")["rating"].count()
movie_id_more_than_20_rating = rating_counts.loc[rating_counts > 20].index
has_enough_ratings = ratings["movieId"].isin(movie_id_more_than_20_rating)
filtered = ratings.loc[has_enough_ratings]  # no NaN ratings

# Mean rating per movie, joined with the title for readability.
average_filtered = filtered.groupby(by="movieId")["rating"].mean()
best_movie = average_filtered.reset_index().merge(
    movies.loc[:, ["movieId", "title"]],
    how="inner",
    on="movieId",
)
best_movie.sort_values(by="rating", ascending=False).head(20)

Unnamed: 0,movieId,rating,title
101,318,4.429022,"Shawshank Redemption, The (1994)"
241,922,4.333333,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
226,898,4.310345,"Philadelphia Story, The (1940)"
147,475,4.3,In the Name of the Father (1993)
302,1204,4.3,Lawrence of Arabia (1962)
76,246,4.293103,Hoop Dreams (1994)
223,858,4.289062,"Godfather, The (1972)"
322,1235,4.288462,Harold and Maude (1971)
1234,168252,4.28,Logan (2017)
652,2959,4.272936,Fight Club (1999)


In [None]:
# filtered[filtered['rating'].isna()]

Unnamed: 0,userId,movieId,rating,timestamp


In [31]:
# Highest-median baseline, restricted to movies with more than 20 ratings.
rating_counts = ratings.groupby(by="movieId")["rating"].count()
movie_id_more_than_20_rating = rating_counts.loc[rating_counts > 20].index
has_enough_ratings = ratings["movieId"].isin(movie_id_more_than_20_rating)
filtered = ratings.loc[has_enough_ratings]

# NOTE: despite its name, `average_filtered` holds the per-movie *median* here.
average_filtered = filtered.groupby(by="movieId")["rating"].median()
best_movie = average_filtered.reset_index().merge(
    movies.loc[:, ["movieId", "title"]],
    how="inner",
    on="movieId",
)
best_movie.sort_values(by="rating", ascending=False).head(10)

Unnamed: 0,movieId,rating,title
1234,168252,4.5,Logan (2017)
1172,92259,4.5,Intouchables (2011)
305,1208,4.5,Apocalypse Now (1979)
307,1212,4.5,"Third Man, The (1949)"
313,1221,4.5,"Godfather: Part II, The (1974)"
1085,56782,4.5,There Will Be Blood (2007)
1088,58559,4.5,"Dark Knight, The (2008)"
652,2959,4.5,Fight Club (1999)
101,318,4.5,"Shawshank Redemption, The (1994)"
329,1250,4.5,"Bridge on the River Kwai, The (1957)"


In [None]:
# Top 30 most popular movies, i.e. the 30 movies with the most ratings.
num_of_ratings = ratings.groupby(by="movieId")["rating"].count()
# The "rating" column of the result is the rating count used for ranking.
(
    num_of_ratings.reset_index()
    .merge(movies.loc[:, ["movieId", "title"]], how="inner", on="movieId")
    .sort_values(by="rating", ascending=False)
    .head(30)
)

Unnamed: 0,movieId,rating,title
314,356,329,Forrest Gump (1994)
277,318,317,"Shawshank Redemption, The (1994)"
257,296,307,Pulp Fiction (1994)
510,593,279,"Silence of the Lambs, The (1991)"
1938,2571,278,"Matrix, The (1999)"
224,260,251,Star Wars: Episode IV - A New Hope (1977)
418,480,238,Jurassic Park (1993)
97,110,237,Braveheart (1995)
507,589,224,Terminator 2: Judgment Day (1991)
461,527,220,Schindler's List (1993)
