In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix

In [2]:
%matplotlib inline

In [3]:
# Read movies from csv
movies = pd.read_csv('ignore/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Read the ratings
ratings = pd.read_csv('ignore/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [5]:
print('The dataset contains: ', len(ratings), ' ratings of ', len(movies), ' movies.')

The dataset contains:  20000263  ratings of  27278  movies.


In [6]:
ratings = ratings.drop('timestamp', axis=1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [15]:
def get_genres(df):
    genres = set()
    for i in df:
        temp = i.split('|')
        for genre in temp:
            if genre not in genres:
                genres.add(genre)
    return genres

In [35]:
all_genres = get_genres(movies['genres'])
all_genres.remove('(no genres listed)')
all_genres.remove('IMAX')
all_genres = list(all_genres)
all_genres

['Animation',
 'Action',
 'Horror',
 'Crime',
 'Western',
 'Drama',
 'Romance',
 'Adventure',
 'Mystery',
 'Documentary',
 'Fantasy',
 'War',
 'Musical',
 'Film-Noir',
 'Thriller',
 'Comedy',
 'Sci-Fi',
 'Children']

In [38]:
column_names = []
for genre in all_genres:
    column_names.append('Avg_{}'.format(genre))
column_names

['Avg_Animation',
 'Avg_Action',
 'Avg_Horror',
 'Avg_Crime',
 'Avg_Western',
 'Avg_Drama',
 'Avg_Romance',
 'Avg_Adventure',
 'Avg_Mystery',
 'Avg_Documentary',
 'Avg_Fantasy',
 'Avg_War',
 'Avg_Musical',
 'Avg_Film-Noir',
 'Avg_Thriller',
 'Avg_Comedy',
 'Avg_Sci-Fi',
 'Avg_Children']

In [39]:
def get_genre_ratings(ratings, movies, genres, column_names):
    genre_ratings = pd.DataFrame()
    for genre in genres:        
        genre_movies = movies[movies['genres'].str.contains(genre) ]
        avg_genre_votes_per_user = ratings[ratings['movieId'].isin(genre_movies['movieId'])].loc[:, ['userId', 'rating']].groupby(['userId'])['rating'].mean().round(2)
        
        genre_ratings = pd.concat([genre_ratings, avg_genre_votes_per_user], axis=1)
        
    # print(genre_ratings)
    genre_ratings.columns = column_names
    return genre_ratings

In [40]:
genre_ratings = get_genre_ratings(ratings, movies, all_genres, column_names)
genre_ratings.head()

Unnamed: 0_level_0,Avg_Animation,Avg_Action,Avg_Horror,Avg_Crime,Avg_Western,Avg_Drama,Avg_Romance,Avg_Adventure,Avg_Mystery,Avg_Documentary,Avg_Fantasy,Avg_War,Avg_Musical,Avg_Film-Noir,Avg_Thriller,Avg_Comedy,Avg_Sci-Fi,Avg_Children
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,3.65,3.73,3.74,3.81,3.38,3.77,3.95,3.79,3.61,,3.79,3.67,3.67,,3.76,3.73,3.71,3.61
2,3.0,4.63,3.56,5.0,4.5,3.89,3.83,4.82,4.5,,2.0,4.25,3.0,5.0,4.26,3.9,4.61,3.0
3,3.75,4.11,3.94,4.29,4.33,4.22,4.06,4.22,4.36,4.0,4.3,4.67,4.0,3.0,4.26,4.06,4.0,4.3
4,4.0,3.54,,3.17,4.0,3.75,3.5,3.83,2.67,,3.67,4.0,4.0,,3.46,3.55,3.0,3.75
5,4.67,4.5,3.0,4.14,5.0,4.19,3.94,4.52,3.5,,3.73,4.0,4.38,,4.33,4.08,4.6,4.18


In [50]:
import math

In [52]:
genre_ratings = genre_ratings.replace(math.nan, 0)
genre_ratings.head()

Unnamed: 0_level_0,Avg_Animation,Avg_Action,Avg_Horror,Avg_Crime,Avg_Western,Avg_Drama,Avg_Romance,Avg_Adventure,Avg_Mystery,Avg_Documentary,Avg_Fantasy,Avg_War,Avg_Musical,Avg_Film-Noir,Avg_Thriller,Avg_Comedy,Avg_Sci-Fi,Avg_Children
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,3.65,3.73,3.74,3.81,3.38,3.77,3.95,3.79,3.61,0.0,3.79,3.67,3.67,0.0,3.76,3.73,3.71,3.61
2,3.0,4.63,3.56,5.0,4.5,3.89,3.83,4.82,4.5,0.0,2.0,4.25,3.0,5.0,4.26,3.9,4.61,3.0
3,3.75,4.11,3.94,4.29,4.33,4.22,4.06,4.22,4.36,4.0,4.3,4.67,4.0,3.0,4.26,4.06,4.0,4.3
4,4.0,3.54,0.0,3.17,4.0,3.75,3.5,3.83,2.67,0.0,3.67,4.0,4.0,0.0,3.46,3.55,3.0,3.75
5,4.67,4.5,3.0,4.14,5.0,4.19,3.94,4.52,3.5,0.0,3.73,4.0,4.38,0.0,4.33,4.08,4.6,4.18


In [56]:
# Let's turn our dataset into a list
X = genre_ratings[column_names].values

In [57]:
# TODO: Import KMeans
from sklearn.cluster import KMeans

# TODO: Create an instance of KMeans to find two clusters
kmeans_1 = KMeans(n_clusters=18, random_state=0)

# TODO: use fit_predict to cluster the dataset
predictions = kmeans_1.fit_predict(X)

  _nan_object_mask = _nan_object_array != _nan_object_array
