In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix

In [2]:
%matplotlib inline

In [3]:
# Read movies from csv
movies = pd.read_csv('ignore/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Read the ratings
ratings = pd.read_csv('ignore/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [5]:
print('The dataset contains: ', len(ratings), ' ratings of ', len(movies), ' movies.')

The dataset contains:  20000263  ratings of  27278  movies.


In [6]:
ratings = ratings.drop('timestamp', axis=1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [7]:
def get_genres(df):
    genres = set()
    for i in df:
        temp = i.split('|')
        for genre in temp:
            if genre not in genres:
                genres.add(genre)
    return genres

In [8]:
all_genres = get_genres(movies['genres'])
all_genres.remove('(no genres listed)')
all_genres.remove('IMAX')
all_genres = list(all_genres)
all_genres

['Crime',
 'Film-Noir',
 'Action',
 'Fantasy',
 'Comedy',
 'Adventure',
 'Drama',
 'Romance',
 'Thriller',
 'Sci-Fi',
 'Documentary',
 'War',
 'Western',
 'Horror',
 'Mystery',
 'Children',
 'Musical',
 'Animation']

In [9]:
column_names = []
for genre in all_genres:
    column_names.append('Avg_{}'.format(genre))
column_names

['Avg_Crime',
 'Avg_Film-Noir',
 'Avg_Action',
 'Avg_Fantasy',
 'Avg_Comedy',
 'Avg_Adventure',
 'Avg_Drama',
 'Avg_Romance',
 'Avg_Thriller',
 'Avg_Sci-Fi',
 'Avg_Documentary',
 'Avg_War',
 'Avg_Western',
 'Avg_Horror',
 'Avg_Mystery',
 'Avg_Children',
 'Avg_Musical',
 'Avg_Animation']

In [10]:
def get_genre_ratings(ratings, movies, genres, column_names):
    genre_ratings = pd.DataFrame()
    for genre in genres:        
        genre_movies = movies[movies['genres'].str.contains(genre) ]
        avg_genre_votes_per_user = ratings[ratings['movieId'].isin(genre_movies['movieId'])].loc[:, ['userId', 'rating']].groupby(['userId'])['rating'].mean().round(2)
        
        genre_ratings = pd.concat([genre_ratings, avg_genre_votes_per_user], axis=1)
        
    # print(genre_ratings)
    genre_ratings.columns = column_names
    return genre_ratings

In [11]:
genre_ratings = get_genre_ratings(ratings, movies, all_genres, column_names)
genre_ratings.head()

Unnamed: 0_level_0,Avg_Crime,Avg_Film-Noir,Avg_Action,Avg_Fantasy,Avg_Comedy,Avg_Adventure,Avg_Drama,Avg_Romance,Avg_Thriller,Avg_Sci-Fi,Avg_Documentary,Avg_War,Avg_Western,Avg_Horror,Avg_Mystery,Avg_Children,Avg_Musical,Avg_Animation
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,3.81,,3.73,3.79,3.73,3.79,3.77,3.95,3.76,3.71,,3.67,3.38,3.74,3.61,3.61,3.67,3.65
2,5.0,5.0,4.63,2.0,3.9,4.82,3.89,3.83,4.26,4.61,,4.25,4.5,3.56,4.5,3.0,3.0,3.0
3,4.29,3.0,4.11,4.3,4.06,4.22,4.22,4.06,4.26,4.0,4.0,4.67,4.33,3.94,4.36,4.3,4.0,3.75
4,3.17,,3.54,3.67,3.55,3.83,3.75,3.5,3.46,3.0,,4.0,4.0,,2.67,3.75,4.0,4.0
5,4.14,,4.5,3.73,4.08,4.52,4.19,3.94,4.33,4.6,,4.0,5.0,3.0,3.5,4.18,4.38,4.67


In [12]:
import math

In [13]:
genre_ratings = genre_ratings.replace(math.nan, 0)
genre_ratings.head()

Unnamed: 0_level_0,Avg_Crime,Avg_Film-Noir,Avg_Action,Avg_Fantasy,Avg_Comedy,Avg_Adventure,Avg_Drama,Avg_Romance,Avg_Thriller,Avg_Sci-Fi,Avg_Documentary,Avg_War,Avg_Western,Avg_Horror,Avg_Mystery,Avg_Children,Avg_Musical,Avg_Animation
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,3.81,0.0,3.73,3.79,3.73,3.79,3.77,3.95,3.76,3.71,0.0,3.67,3.38,3.74,3.61,3.61,3.67,3.65
2,5.0,5.0,4.63,2.0,3.9,4.82,3.89,3.83,4.26,4.61,0.0,4.25,4.5,3.56,4.5,3.0,3.0,3.0
3,4.29,3.0,4.11,4.3,4.06,4.22,4.22,4.06,4.26,4.0,4.0,4.67,4.33,3.94,4.36,4.3,4.0,3.75
4,3.17,0.0,3.54,3.67,3.55,3.83,3.75,3.5,3.46,3.0,0.0,4.0,4.0,0.0,2.67,3.75,4.0,4.0
5,4.14,0.0,4.5,3.73,4.08,4.52,4.19,3.94,4.33,4.6,0.0,4.0,5.0,3.0,3.5,4.18,4.38,4.67


In [14]:
# Let's turn our dataset into a list
X = genre_ratings[column_names].values

In [57]:
# TODO: Import KMeans
from sklearn.cluster import KMeans

# TODO: Create an instance of KMeans to find two clusters
kmeans_1 = KMeans(n_clusters=18, random_state=0)

# TODO: use fit_predict to cluster the dataset
predictions = kmeans_1.fit_predict(X)

  _nan_object_mask = _nan_object_array != _nan_object_array


## Movie Level Clustering

In [16]:
ratings_title = pd.merge(ratings, movies[['movieId', 'title']], on='movieId' )
user_movie_ratings = pd.pivot_table(ratings_title, index='userId', columns= 'title', values='rating')

print('dataset dimensions: ', user_movie_ratings.shape, '\n\nSubset example:')
user_movie_ratings.iloc[:6, :10]

dataset dimensions:  (138493, 26729) 

Subset example:


title,"""Great Performances"" Cats (1998)",#chicagoGirl: The Social Network Takes on a Dictator (2013),$ (Dollars) (1971),$5 a Day (2008),$9.99 (2008),$ellebrity (Sellebrity) (2012),'71 (2014),'Hellboy': The Seeds of Creation (2004),"'Human' Factor, The (Human Factor, The) (1975)",'Neath the Arizona Skies (1934)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,
5,,,,,,,,,,
6,,,,,,,,,,


In [None]:
def get_most_rated_movies(user_movie_ratings, max_number_of_movies):
    # 1- Count
    user_movie_ratings = user_movie_ratings.append(user_movie_ratings.count(), ignore_index=True)
    # 2- sort
    user_movie_ratings_sorted = user_movie_ratings.sort_values(len(user_movie_ratings)-1, axis=1, ascending=False)
    user_movie_ratings_sorted = user_movie_ratings_sorted.drop(user_movie_ratings_sorted.tail(1).index)
    # 3- slice
    most_rated_movies = user_movie_ratings_sorted.iloc[:, :max_number_of_movies]
    return most_rated_movies

def get_users_who_rate_the_most(most_rated_movies, max_number_of_movies):
    # Get most voting users
    # 1- Count
    most_rated_movies['counts'] = pd.Series(most_rated_movies.count(axis=1))
    # 2- Sort
    most_rated_movies_users = most_rated_movies.sort_values('counts', ascending=False)
    # 3- Slice
    most_rated_movies_users_selection = most_rated_movies_users.iloc[:max_number_of_movies, :]
    most_rated_movies_users_selection = most_rated_movies_users_selection.drop(['counts'], axis=1)
    
    return most_rated_movies_users_selection

def sort_by_rating_density(user_movie_ratings, n_movies, n_users):
    most_rated_movies = get_most_rated_movies(user_movie_ratings, n_movies)
    most_rated_movies = get_users_who_rate_the_most(most_rated_movies, n_users)
    return most_rated_movies

In [None]:
def draw_movie_clusters(clustered, max_users, max_movies):
    c=1
    for cluster_id in clustered.group.unique():
        # To improve visibility, we're showing at most max_users users and max_movies movies per cluster.
        # You can change these values to see more users & movies per cluster
        d = clustered[clustered.group == cluster_id].drop(['index', 'group'], axis=1)
        n_users_in_cluster = d.shape[0]
        
        d = sort_by_rating_density(d, max_movies, max_users)
        
        d = d.reindex_axis(d.mean().sort_values(ascending=False).index, axis=1)
        d = d.reindex_axis(d.count(axis=1).sort_values(ascending=False).index)
        d = d.iloc[:max_users, :max_movies]
        n_users_in_plot = d.shape[0]
        
        # We're only selecting to show clusters that have more than 9 users, otherwise, they're less interesting
        if len(d) > 9:
            print('cluster # {}'.format(cluster_id))
            print('# of users in cluster: {}.'.format(n_users_in_cluster), '# of users in plot: {}'.format(n_users_in_plot))
            fig = plt.figure(figsize=(15,4))
            ax = plt.gca()

            ax.invert_yaxis()
            ax.xaxis.tick_top()
            labels = d.columns.str[:40]

            ax.set_yticks(np.arange(d.shape[0]) , minor=False)
            ax.set_xticks(np.arange(d.shape[1]) , minor=False)

            ax.set_xticklabels(labels, minor=False)
                        
            ax.get_yaxis().set_visible(False)

            # Heatmap
            heatmap = plt.imshow(d, vmin=0, vmax=5, aspect='auto')

            ax.set_xlabel('movies')
            ax.set_ylabel('User id')

            divider = make_axes_locatable(ax)
            cax = divider.append_axes("right", size="5%", pad=0.05)

            # Color bar
            cbar = fig.colorbar(heatmap, ticks=[5, 4, 3, 2, 1, 0], cax=cax)
            cbar.ax.set_yticklabels(['5 stars', '4 stars','3 stars','2 stars','1 stars','0 stars'])

            plt.setp(ax.get_xticklabels(), rotation=90, fontsize=9)
            plt.tick_params(axis='both', which='both', bottom='off', top='off', left='off', labelbottom='off', labelleft='off') 
            #print('cluster # {} \n(Showing at most {} users and {} movies)'.format(cluster_id, max_users, max_movies))

            plt.show()


In [None]:
n_movies = 50
n_users = 25
most_rated_movies_users_selection = sort_by_rating_density(user_movie_ratings, n_movies, n_users)

In [None]:
print('dataset dimensions: ', most_rated_movies_users_selection.shape)
most_rated_movies_users_selection.head()

In [None]:
draw_movies_heatmap(most_rated_movies_users_selection)

For performance reasons, we'll only use ratings for 15000 movies.

In [None]:
user_movie_ratings =  pd.pivot_table(ratings_title, index='userId', columns= 'title', values='rating')
most_rated_movies_1k = helper.get_most_rated_movies(user_movie_ratings, 15000)

To have sklearn run k-means clustering to a dataset with missing values like this, we will first cast it to the [sparse csr matrix](https://docs.scipy.org/doc/scipy-0.19.0/reference/generated/scipy.sparse.csr_matrix.html) type defined in the SciPi library. 

To convert from a pandas dataframe to a sparse matrix, we'll have to convert to SparseDataFrame, then use pandas' `to_coo()` method for the conversion.

Note: `to_coo()` was only added in later versions of pandas. If you run into an error with the next cell, make sure pandas is up to date.

In [None]:
sparse_ratings = csr_matrix(pd.SparseDataFrame(most_rated_movies_1k).to_coo())

In [None]:
# 20 clusters
predictions = KMeans(n_clusters=20, algorithm='full').fit_predict(sparse_ratings)

In [None]:
clustered = pd.concat([most_rated_movies_1k.reset_index(), pd.DataFrame({'group':predictions})], axis=1)

In [None]:
cluster_number = 4

# Let's filter to only see the region of the dataset with the most number of values 
n_users = 75
n_movies = 300
cluster = clustered[clustered.group == cluster_number].drop(['index', 'group'], axis=1)

cluster = sort_by_rating_density(cluster, n_movies, n_users)
helper.draw_movies_heatmap(cluster, axis_labels=False)

In [None]:
cluster.fillna('').head()

In [None]:
movie_name = 'Shakespeare in Love (1998)'

cluster[movie_name].mean()

In [None]:
# The average rating of 20 movies as rated by the users in the cluster
cluster.mean().head(20)

In [None]:
user_id = 7

# Get all this user's ratings
user_2_ratings  = cluster.loc[user_id, :]
user_2_ratings

In [None]:
user_id = 7

# Get all this user's ratings
user_2_ratings  = cluster.loc[user_id, :]

# Which movies did they not rate? (We don't want to recommend movies they've already rated)
user_2_unrated_movies =  user_2_ratings[user_2_ratings.isnull()]

# What are the ratings of these movies the user did not rate?
avg_ratings = pd.concat([user_2_unrated_movies, cluster.mean()], axis=1, join='inner').loc[:,0]

# Let's sort by rating so the highest rated movies are presented first
avg_ratings.sort_values(ascending=False)[:20]