In [1]:
import pandas as pd
import numpy as np

In [2]:
#load movies and ratings

movies = pd.read_csv('Data/movies_sml.csv')
ratings = pd.read_csv('Data/ratings_sml.csv')

In [3]:
# load movies with bag of words. Needed for content filtering

movies_bow = pd.read_csv('Data/movies_bow.csv')

In [4]:
#load ratings matrix.Needed for Collaborative filtering
ratings_matrix = pd.read_hdf('Data/ratings_mx.h5')

In [5]:
# load predictions
preds = pd.read_hdf('Data/preds_hdf2.h5')

###### test collab system first to see if any additional files are needed

In [6]:


def recommend_movies(predictions, userId, movies, original_ratings, num_recommendations):
    
    # Get and sort the user's predictions
    user_row_number = userId - 1 # User ID starts at 1, not 0
    sorted_user_predictions = preds.iloc[user_row_number].sort_values(ascending=False) # User ID starts at 1
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings[original_ratings.userId == (userId)]
    user_full = (user_data.merge(movies, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False)
                 )

#     print (f"User {0} has already rated {1} movies. format(userID, user_full.shape[0]")
#     print 'Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations)
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies[~movies['movieId'].isin(user_full['movieId'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movieId',
               right_on = 'movieId').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )
    rec_movies = recommendations['Title']
    already_rated = user_full['Title_x']
    return  already_rated, rec_movies

In [7]:
already_rated, predictions = recommend_movies(preds, 60, movies, ratings, 20)

In [8]:
predictions

423                       Schindler's List
215     Star Wars: Episode IV - A New Hope
293                           Forrest Gump
102                             Braveheart
6587                             Inception
6003                                    Up
5575                                WALL·E
1420                     Life Is Beautiful
1950                             Gladiator
8495                          Interstellar
249                 Léon: The Professional
485                                  Fargo
3110                          Finding Nemo
7820                      Django Unchained
8928                    The Imitation Game
5957                  Inglourious Basterds
5748                   Slumdog Millionaire
5059                           Ratatouille
2950                   Catch Me If You Can
9237                            Inside Out
Name: Title, dtype: object

###### test content system first to see if any additional files are needed

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Initializing and generating the count matrix 

count = CountVectorizer()
count_matrix = count.fit_transform(movies_bow['bag_of_words'])

In [11]:
# Generating the cosine similarity matrix

cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [12]:
#map index to Title. This will be used in the function and is integral to 
# how we access the recommended movies

indices = pd.Series(movies_bow.index, index = movies_bow['Title'])

#Create recommendation function

def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:26]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_bow['Title'].iloc[movie_indices]

In [13]:
get_recommendations('Rumble in the Bronx')

11663                   Kung Fu Yoga
4129                       Who Am I?
3463                     Crime Story
670                       Supercop 2
690      Around the World in 80 Days
691      Around the World in 80 Days
1143                       Mr. Magoo
1416                       Rush Hour
1595                      Black Mask
2008                   Shanghai Noon
2440                     Rush Hour 2
4071                     City Hunter
4976                     Thunderbolt
5320                     Rush Hour 3
9890                    Dragon Blade
11352                      Skiptrace
11463                Railroad Tigers
1226                    Mr. Nice Guy
3052                Shanghai Knights
4397                        Gorgeous
4649                New Police Story
6441               Shinjuku Incident
6894              Little Big Soldier
4974                    Swordsman II
0                          Toy Story
Name: Title, dtype: object

#### Combine Systems

In [31]:
# already loaded
# indices = pd.Series(movies_bow.index, index = movies_bow['Title'])

indices

Title
Toy Story                                                       0
Jumanji                                                         1
Grumpier Old Men                                                2
Waiting to Exhale                                               3
Father of the Bride Part II                                     4
Heat                                                            5
Heat                                                            6
Sabrina                                                         7
Sabrina                                                         8
Tom and Huck                                                    9
Sudden Death                                                   10
GoldenEye                                                      11
Dracula: Dead and Loving It                                    12
Balto                                                          13
Nixon                                                          14
Cutt

In [32]:
idx = indices['Jumanji']

In [36]:
sim_scores = list(enumerate(cosine_sim[idx]))



In [37]:
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

In [38]:
sim_scores = sim_scores[1:26]

In [39]:
movie_indices = [i[0] for i in sim_scores]

In [73]:
content_results = pd.DataFrame(movies_bow['movieId'].iloc[movie_indices])


content_results 

Unnamed: 0,movieId
120,137
121,48593
412,500
601,765
1120,1702
1409,2253
1430,2297
1483,2431
1520,2501
1635,2718


In [75]:
# get user predictions

user_row_number = 60 # User ID starts at 1, not 0

sorted_user_predictions = pd.DataFrame(preds.iloc[user_row_number].sort_values(ascending=False))

sorted_user_predictions.head()

Unnamed: 0_level_0,60
movieId,Unnamed: 1_level_1
296,5.001825
2858,4.363025
608,3.931749
1193,3.701406
778,3.50941


In [91]:
# Get the movies the user originally rated
user_data = ratings[ratings.userId == user_row_number]

#combine the movies the user originally rated with the movies dataset and only keeping the movies
#the user rated

user_full = (user_data.merge(movies, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False))

In [93]:
print(user_full.columns)

user_full = user_full[['userId', 'movieId', 'Title_x', 'rating', 'genres', 'Actors', 'Director', 'Plot', 'Poster']].rename(columns = {'Title_x': 'Title'})


user_full.head()

Index(['userId', 'movieId', 'Title_x', 'rating', 'genres', 'Actors',
       'Director', 'Plot', 'Poster'],
      dtype='object')


Unnamed: 0,userId,movieId,Title,rating,genres,Actors,Director,Plot,Poster
0,60,4878,Donnie Darko,5.0,Drama|Mystery|Sci-Fi|Thriller,"Jake Gyllenhaal, Holmes Osborne, Maggie Gyllen...",Richard Kelly,A troubled teenager is plagued by visions of a...,https://m.media-amazon.com/images/M/MV5BZjZlZD...
52,60,8957,Saw,5.0,Horror|Mystery|Thriller,"Leigh Whannell, Cary Elwes, Danny Glover, Ken ...",James Wan,"Two strangers, who awaken in a room with no re...",https://m.media-amazon.com/images/M/MV5BMjE4MD...
32,60,714,Dead Man,5.0,Drama|Mystery|Western,"Johnny Depp, Gary Farmer, Crispin Glover, Lanc...",Jim Jarmusch,"On the run after murdering a man, accountant W...",https://m.media-amazon.com/images/M/MV5BYTJlZm...
33,60,1466,Donnie Brasco,5.0,Crime|Drama,"Al Pacino, Johnny Depp, Michael Madsen, Bruno ...",Mike Newell,An FBI undercover agent infiltrates the mob an...,https://m.media-amazon.com/images/M/MV5BYzMzMD...
22,60,2329,American History X,5.0,Crime|Drama,"Edward Norton, Edward Furlong, Beverly D'Angel...",Tony Kaye,A former neo-nazi skinhead tries to prevent hi...,https://m.media-amazon.com/images/M/MV5BZjA0MT...


(70, 9)

In [77]:
num_recommendations = 50

## TOM, I need to figure this section out

Need to convert the original code to fit into the hybrid. I was trying to work around the missing Predictions column without having to create extra dataframes. In cell 102, I think I'm making a basic mistake with my merge. 

The last cell below shows the results using the original code in cell 78. The issue is I can't sort it without the prediction scores. I need the outcome from cell 78 below to include the prediction scores.

In [78]:
# Original Code from Collaborative Filter Function

# Recommend the highest predicted rating movies that the user hasn't seen yet.
movie_preds = (movies[~movies['movieId'].isin(user_full['movieId'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movieId',
               right_on = 'movieId').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
)

# join the predictions and the movies the user rated, removing the ones already rated from th


In [102]:
preds_test = pd.merge(sorted_user_predictions, user_full, how='left',
            left_on= 'movieId', right_on='movieId')
# .rename(columns = {user_row_number: 'Predictions'})

#             sort_values('Predictions', ascending = False).
#                        iloc[:num_recommendations, :-1]
preds_test

Unnamed: 0,movieId,60,userId,Title,rating,genres,Actors,Director,Plot,Poster
0,296,5.001825,,,,,,,,
1,2858,4.363025,,,,,,,,
2,608,3.931749,,,,,,,,
3,1193,3.701406,,,,,,,,
4,778,3.509410,,,,,,,,
5,2997,3.317179,,,,,,,,
6,7361,2.735573,,,,,,,,
7,6016,2.589245,,,,,,,,
8,2959,2.500412,,,,,,,,
9,3949,2.479702,60.0,Requiem for a Dream,5.0,Drama,"Ellen Burstyn, Jared Leto, Jennifer Connelly, ...",Darren Aronofsky,The drug-induced utopias of four Coney Island ...,https://m.media-amazon.com/images/M/MV5BOTdiNz...


In [95]:
movie_preds.shape

(50, 7)

In [52]:
# lookup movie predictions from content system from this dataframe

movie_recs = pd.merge(content_results, movie_preds, how='left', left_on='movieId', right_on='movieId').
    sort_values()

Unnamed: 0,movieId,Title,genres,Actors,Director,Plot,Poster
250,296,Pulp Fiction,Comedy|Crime|Drama|Thriller,"Tim Roth, Amanda Plummer, Laura Lovelace, John...",Quentin Tarantino,"The lives of two mob hitmen, a boxer, a gangst...",https://m.media-amazon.com/images/M/MV5BNGNhMD...
1658,2858,American Beauty,Drama|Romance,"Kevin Spacey, Annette Bening, Thora Birch, Wes...",Sam Mendes,A sexually frustrated suburban father has a mi...,https://m.media-amazon.com/images/M/MV5BNTBmZW...
485,608,Fargo,Comedy|Crime|Drama|Thriller,"William H. Macy, Steve Buscemi, Peter Stormare...","Joel Coen, Ethan Coen",Jerry Lundegaard's inept crime falls apart due...,https://m.media-amazon.com/images/M/MV5BNDJiZD...
799,1193,One Flew Over the Cuckoo's Nest,Drama,"Jack Nicholson, Louise Fletcher, Will Sampson,...",Milos Forman,A criminal pleads insanity after getting into ...,https://m.media-amazon.com/images/M/MV5BZjA0OW...
597,778,Trainspotting,Comedy|Crime|Drama,"Ewan McGregor, Ewen Bremner, Jonny Lee Miller,...",Danny Boyle,"Renton, deeply immersed in the Edinburgh drug ...",https://m.media-amazon.com/images/M/MV5BMzA5Zj...
1714,2997,Being John Malkovich,Comedy|Drama|Fantasy,"John Cusack, Cameron Diaz, Ned Bellamy, Eric W...",Spike Jonze,A puppeteer discovers a portal that leads lite...,https://m.media-amazon.com/images/M/MV5BYmUxY2...
3530,7361,Eternal Sunshine of the Spotless Mind,Drama|Romance|Sci-Fi,"Jim Carrey, Kate Winslet, Gerry Robert Byrne, ...",Michel Gondry,"When their relationship turns sour, a couple u...",https://m.media-amazon.com/images/M/MV5BMTY4Nz...
2967,6016,City of God,Action|Adventure|Crime|Drama|Thriller,"Alexandre Rodrigues, Leandro Firmino, Phellipe...","Fernando Meirelles, Kátia Lund(co-director)","In the slums of Rio, two kids' paths diverge a...",https://m.media-amazon.com/images/M/MV5BMGU5OW...
1698,2959,Fight Club,Action|Crime|Drama|Thriller,"Edward Norton, Brad Pitt, Meat Loaf, Zach Grenier",David Fincher,An insomniac office worker and a devil-may-car...,https://m.media-amazon.com/images/M/MV5BMjJmYT...
3254,6711,Lost in Translation,Comedy|Drama|Romance,"Scarlett Johansson, Bill Murray, Akiko Takeshi...",Sofia Coppola,A faded movie star and a neglected young woman...,https://m.media-amazon.com/images/M/MV5BMTI2ND...
