In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from IPython.display import HTML
import progressbar
import tests as t
import pickle


%matplotlib inline

# Load the cleaned movie and review tables. The 'Unnamed: 0' column is dropped
# (presumably an index column written by an earlier to_csv call - TODO confirm).
movies = pd.read_csv('data/movies_clean.csv').drop(columns=['Unnamed: 0'])
reviews = pd.read_csv('data/reviews_clean.csv').drop(columns=['Unnamed: 0'])


# NOTE(security): pickle.load can execute arbitrary code, so all_recs.p must
# come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open("all_recs.p", "rb") as recs_file:
    all_recs = pickle.load(recs_file)


In [2]:
# Users whose recommendation list in all_recs holds at least 10 entries.
users_with_all_recs = [user for user, movie_recs in all_recs.items() if len(movie_recs) > 9]

print("There are {} users with all reccomendations from collaborative filtering.".format(len(users_with_all_recs)))

# Every distinct user that appears in the reviews, and the subset not covered above.
users = np.unique(reviews['user_id'])
users_who_need_recs = np.setdiff1d(users, users_with_all_recs)

print("There are {} users who still need recommendations.".format(len(users_who_need_recs)))
# Scale to a percentage *before* rounding: rounding the ratio first and then
# multiplying by 100 can surface floating-point noise (e.g. 55.11000000000001).
print("Only {}% of users received all 10 of their recommendations using collaborative filtering".format(round(100 * len(users_with_all_recs) / len(users), 2)))

There are 22187 users with all reccomendations from collaborative filtering.
There are 31781 users who still need recommendations.
Only 41.11% of users received all 10 of their recommendations using collaborative filtering


### Content-Based Recommendations

In [3]:
# Sort reviews by user_id, then by rating, both in descending order, so each
# user's highest-rated movies come first within that user's rows.
ranked_reviews = reviews.sort_values(['user_id', 'rating'], ascending=[False, False])

In [4]:
# One row per movie, keeping every column from position 4 onward
# (presumably the content/feature columns - TODO confirm against movies_clean.csv).
movie_content = np.array(movies.iloc[:, 4:])

# Pairwise dot products between movie rows: entry [i, j] compares movie i with movie j.
dot_prod_movies = np.dot(movie_content, movie_content.T)

In [5]:
def find_similar_movies(movie_id):
    '''
    INPUT
    movie_id - a movie_id 
    OUTPUT
    similar_movies - an array of titles of the movies whose dot product with
                     movie_id's row equals the maximum of that row; an empty
                     array when movie_id does not appear in movies
    '''
    # positional row(s) of this movie id in the movies table
    matching_rows = np.where(movies['movie_id'] == movie_id)[0]

    # an unknown id has no similarity row to inspect, so there is nothing to recommend
    if len(matching_rows) == 0:
        return np.array([], dtype=object)
    movie_idx = matching_rows[0]

    # every movie tied for the highest dot product with this one
    similarity_row = dot_prod_movies[movie_idx]
    similar_idxs = np.where(similarity_row == np.max(similarity_row))[0]

    # pulling the movie titles based on the indices
    similar_movies = np.array(movies.iloc[similar_idxs]['movie'])

    return similar_movies
    
    
def get_movie_names(movie_ids):
    '''
    INPUT
    movie_ids - a list of movie_ids
    OUTPUT
    movies - a list of the movie names whose movie_id is in movie_ids,
             in the order the rows appear in the movies table
    
    '''
    # boolean mask over the movies table: True where the row's id was requested
    is_requested = movies['movie_id'].isin(movie_ids)

    return list(movies.loc[is_requested, 'movie'])

def make_recs():
    '''
    INPUT
    None
    OUTPUT
    recs - a dictionary with keys of the user and values of the recommendations
           (a set of movie titles per user)
    '''
    # Creating a dictionary to return with users and ratings
    recs = defaultdict(set)
    # Users for progress bar
    n_users = len(users)

    # Split the ranked reviews by user once, instead of scanning the whole
    # table for every user. groupby keeps each group's rows in their original
    # order, so each user's movie ids stay in ranked_reviews order.
    movie_ids_by_user = {
        user_id: np.array(user_reviews['movie_id'])
        for user_id, user_reviews in ranked_reviews.groupby('user_id')
    }

    # Creating the progressbar
    cnter = 0
    bar = progressbar.ProgressBar(maxval=n_users+1, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()

    for user in users:

        # Progress bar
        cnter += 1
        bar.update(cnter)

        # Pulling only the movies the user has reviewed
        movies_temp = movie_ids_by_user.get(user)
        if movies_temp is None:
            # no reviews for this user, so there is nothing to base recommendations on
            continue
        movie_names = np.array(get_movie_names(movies_temp))

        # Looking at each of the movies (in ranked_reviews order),
        # pulling the movies the user hasn't seen that are most similar
        for movie in movies_temp:
            rec_movies = find_similar_movies(movie)
            temp_recs = np.setdiff1d(rec_movies, movie_names)
            recs[user].update(temp_recs)

            # Stop once the user has at least 10 recommendations; the set can
            # hold more than 10 because a whole batch is added at once
            if len(recs[user]) > 9:
                break

    bar.finish()

    return recs

In [6]:
# Build the content-based recommendations for every user in `users`.
recs = make_recs()



# Recommendations

In [7]:
# Bucket users by how many content-based recommendations they received:
# fewer than 10, at least 10, and (a subset of the first bucket) none at all.
users_without_all_recs = []
users_with_all_recs = []
no_recs = []
for user, movie_recs in recs.items():
    n_recs = len(movie_recs)
    if n_recs > 9:
        users_with_all_recs.append(user)
    else:
        users_without_all_recs.append(user)
        if n_recs == 0:
            no_recs.append(user)

In [8]:
# Report the size of each user bucket, one line per bucket.
for template, bucket in (
    (" {} users without all 10 recommendations assigned.", users_without_all_recs),
    (" {} users with all 10 recommendations assigned.", users_with_all_recs),
    (" {} users with no recommendations at all!", no_recs),
):
    print(template.format(len(bucket)))

 2179 users without all 10 recommendations assigned.
 51789 users with all 10 recommendations assigned.
 174 users with no recommendations at all!


In [9]:
# User-by-movie rating matrix: one row per user_id, one column per movie_id,
# holding the highest rating for that pair and NaN where no rating exists.
user_items = reviews[['user_id', 'movie_id', 'rating']]
user_by_movie = (
    user_items
    .groupby(['user_id', 'movie_id'])['rating']
    .max()
    .unstack()
)

def movies_watched(user_id):
    '''
    INPUT:
    user_id - the user_id of an individual as int
    OUTPUT:
    movies - an array of the movie ids (column labels of user_by_movie)
             for which this user has a non-null rating
    '''
    # this user's row of the user-by-movie matrix
    user_ratings = user_by_movie.loc[user_id]

    # keep only the columns where a rating is present
    watched = user_ratings[user_ratings.notnull()].index.values

    return watched


# Spot-check: list the movie ids that user 189 has rated.
movies_watched(189)

array([457430], dtype=int64)

In [10]:
# Show the watched-movie titles for the first 11 users who received no recommendations.
cnter = 0
print("Some of the movie lists for users without any recommendations include:")
for cnter, user_id in enumerate(no_recs, start=1):
    print(user_id)
    print(get_movie_names(movies_watched(user_id)))
    # stop after the eleventh user has been printed
    if cnter >= 11:
        break

Some of the movie lists for users without any recommendations include:
189
['El laberinto del fauno (2006)']
797
['The 414s (2015)']
1603
['Beauty and the Beast (2017)']
2056
['Brimstone (2016)']
2438
['Baby Driver (2017)']
3322
['Rosenberg (2013)']
3925
['El laberinto del fauno (2006)']
4325
['Beauty and the Beast (2017)']
4773
['The Frozen Ground (2013)']
4869
['Beauty and the Beast (2017)']
4878
['American Made (2017)']
