In [1]:
import pandas as pd

In [2]:
ratings = pd.read_csv('data/ratings.dat', sep='::', engine = 'python', header=None)
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
movies = pd.read_csv('data/movies.dat', sep='::', engine = 'python',
                     encoding="ISO-8859-1", header = None)
movies.columns = ['MovieID', 'Title', 'Genres']

In [18]:
movies.head(10)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
users = pd.read_csv('data/users.dat', sep='::', engine = 'python', header=None)
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## User first input a genre
## Filter movies by genre
## Define most popular movies using most watched movies and highest grossing movies
## Define Highly rated movies using weighted average ratings
## define ranking function that combines the two

In [6]:
def genres_matrix(movies):
    genres = []
    for i in range(len(movies)):
        genres.extend(movies.iloc[i]['Genres'].split('|'))
    genres = list(set(genres))
    genres.sort()
    genres_matrix = pd.DataFrame(columns = genres)
    for i in range(len(movies)):
        genres_matrix.loc[i] = [0]*len(genres)
        for genre in movies.iloc[i]['Genres'].split('|'):
            genres_matrix.loc[i][genre] = 1
    genres_matrix.index = movies['MovieID']
    return genres_matrix

In [7]:
# using genres matrix to filter movies base on user input
genres = genres_matrix(movies)
genres.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
# save genres type to a json file
genres_list = genres.columns.tolist()
import json
with open('genres.json', 'w') as f:
    json.dump(genres_list, f)

In [9]:
def subset_movies(ratings_df, genre_df, genre):
    # giving a genre, return a subset of movies that are in that genre
    genre_movies = genre_df[genre_df[genre] == 1]

    genre_index = genre_movies.index
    ratings_df = ratings_df[ratings_df['MovieID'].isin(genre_index)]
    rating_matrix = pd.pivot_table(ratings_df, values='Rating', index=['UserID'],
                                   columns=['MovieID'])
    return rating_matrix

In [10]:
subset = subset_movies(ratings, genres, 'Action')
genre_rating = subset
genre_rating

MovieID,6,9,10,15,20,21,42,44,70,71,...,3807,3827,3836,3841,3864,3877,3879,3889,3898,3946
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,1.0,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,2.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,3.0,,,,,3.0,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


In [11]:
def most_watched_movies(genre_rating):
    # return a list of most watched movies
    # input is a rating matrix
    # output is a list of movie ids
    movie_count = genre_rating.count()
    movie_count = movie_count
    # scale to 0 to 1
    movie_count = movie_count/movie_count.max()
    movie_count = movie_count.rename('popularity')
    return movie_count

In [12]:
most_watched_movies(genre_rating).sort_values(ascending = False).head(10)

MovieID
260     1.000000
1196    0.999666
1210    0.963892
480     0.893347
2028    0.886994
589     0.885657
2571    0.865931
1580    0.848546
1198    0.840522
110     0.816784
Name: popularity, dtype: float64

In [13]:
def highly_rated_movies(genre_rating):
    # return a list of highly rated movies
    # input is a rating matrix
    # output is a list of movie ids
    # using Weighted Rating (WR) = (v / (v+m)) × R + (m / (v+m)) × C
    mean = genre_rating.mean()
    v = genre_rating.count()
    m = 300
    C = mean.mean()
    wr = (v/(v+m))*mean + (m/(v+m))*C
    # scale to 0 to 1
    wr = wr/wr.max()
    wr = wr.rename('rating')
    return wr

In [14]:
highly_rated_movies(genre_rating).sort_values(ascending = False).head(10)

MovieID
858     1.000000
1198    0.994337
260     0.994217
2028    0.966971
2571    0.961912
1196    0.960672
1221    0.956970
1197    0.956433
110     0.943819
2019    0.938577
Name: rating, dtype: float64

In [15]:
def ranking(genre_rating, n):
    # return a list of movies that are ranked by most watched and highly rated
    # input is a rating matrix
    # output is a list of movie ids
    movie_count = most_watched_movies(genre_rating)
    wr = highly_rated_movies(genre_rating)
    rank = pd.merge(movie_count, wr, left_index=True, right_index=True)

    # popularity count 1/3, rating 2/3
    rank['score'] = rank['popularity']/3 + rank['rating']*2/3
    # scale
    rank['score'] = rank['score']/rank['score'].max()
    return rank.sort_values(by = 'score', ascending = False).head(n)

In [16]:
top_10_action_movie_id = ranking(genre_rating, 10)
index = top_10_action_movie_id.index

In [17]:
# return a list of movies that are ranked by most watched and highly rated
action_movie = movies[movies['MovieID'].isin(index)]
action_movie

Unnamed: 0,MovieID,Title,Genres
108,110,Braveheart (1995),Action|Drama|War
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
585,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
847,858,"Godfather, The (1972)",Action|Crime|Drama
1178,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
1179,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance
1180,1198,Raiders of the Lost Ark (1981),Action|Adventure
1192,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
1959,2028,Saving Private Ryan (1998),Action|Drama|War
2502,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
