In [None]:
import os
import codecs
import pandas as pd

# change dir for custom imports
# The project-local `scripts` package sits one level above the notebook folder.
# The guard makes this cell idempotent: an unconditional os.chdir('../') climbs one
# more directory every time the cell is re-run, which breaks both the imports below
# and every relative path used later in the notebook.
if not os.path.isdir('scripts'):
    os.chdir('../')
from scripts.dataset_downloader import initialize_dataset
from scripts.k_means import create_clsuters, clustering_errors
from scripts.helpers import get_most_rated_movies


# Name of the dataset to work with and the folder (relative to the working
# directory set above) that holds its files.
dataset = 'ml-100k'
dataset_path = os.path.join('datasets', dataset)

In [None]:
# Prepare the dataset named by `dataset` with the project's `initialize_dataset` helper.
# Run this cell if the dataset hasn't been previously initialized.
# NOTE(review): presumably this fetches/unpacks the files under `dataset_path` and is
# safe to call again when they already exist — confirm in scripts/dataset_downloader.
initialize_dataset(dataset)

In [None]:
# Locations of the raw files inside the dataset folder
ratings_path = os.path.join(dataset_path, 'u.data')
items_path = os.path.join(dataset_path, 'u.item')
genres_path = os.path.join(dataset_path, 'u.genre')

# load data
# Files are opened with the built-in open() inside `with` blocks so the handles are
# always closed. The previous codecs.open(..., 'rU', ...) form depends on the 'U' mode
# flag, which Python 3.11 removed (text mode already applies universal newlines).
with open(ratings_path, 'r', encoding='UTF-8') as ratings_file:
    ratings_df = pd.read_csv(ratings_file, sep='\t', names=('user_id', 'item_id', 'rating', 'timestamp'))

genres_df = pd.read_csv(genres_path, sep='|', names=('title', 'id'))

# Item columns: five descriptive fields followed by one flag column per genre.
# 'nan' and 'link' are read only so the positions line up, then dropped.
cols_names = ('id', 'title', 'year', 'nan', 'link') + tuple(genres_df.title.to_list())
with open(items_path, 'r', encoding='latin-1') as items_file:
    items_df = pd.read_csv(
        items_file,
        sep='|',
        # one column per name, instead of a hard-coded column count
        usecols=list(range(len(cols_names))),
        names=cols_names,
    ).drop(columns=['nan', 'link'])

# dataset stats
print(f"Total dataset users: {ratings_df.user_id.nunique()}")
print(f"Total dataset ratings: {len(ratings_df)}")

In [None]:
# Preview the first rows of the raw ratings table (user_id, item_id, rating, timestamp)
ratings_df.head()

In [None]:
# Attach the movie title to every rating, then reshape into a Users X Movies dataframe
ratings_title = (
    ratings_df
    .merge(items_df[['id', 'title']], left_on='item_id', right_on='id')
    .drop(columns=['id'])
)
user_movie_ratings = ratings_title.pivot_table(index='user_id', columns='title', values='rating')

# Report the table dimensions and display its top-left corner
print('dataset dimensions: ', user_movie_ratings.shape, '\n\nSubset example:')
user_movie_ratings.iloc[:6, :10]

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Define the plotting heatmap function
def draw_movies_heatmap(most_rated_movies_users_selection, axis_labels=True):
    """Draw a ratings table as a heatmap with a star-labelled colour bar.

    Args:
        most_rated_movies_users_selection: table to plot. It must expose
            ``shape``, ``index`` and string-valued ``columns`` (the code uses
            ``columns.str``), e.g. a users-by-movies pandas DataFrame.
        axis_labels: when true, every row/column gets a tick labelled with the
            index value / column name (column names cut to 40 characters and
            rotated); when false, both axes are hidden.

    Returns:
        None. The figure is created on the current matplotlib state.
    """
    ratings = most_rated_movies_users_selection
    fig, ax = plt.subplots(figsize=(15, 4))

    # Colour scale is fixed to the 0..5 rating range
    image = ax.imshow(ratings, interpolation='nearest', vmin=0, vmax=5, aspect='auto')

    if not axis_labels:
        for axis in (ax.get_xaxis(), ax.get_yaxis()):
            axis.set_visible(False)
    else:
        n_rows, n_cols = ratings.shape[0], ratings.shape[1]
        ax.set_yticks(np.arange(n_rows), minor=False)
        ax.set_xticks(np.arange(n_cols), minor=False)
        ax.invert_yaxis()
        # Column labels go above the heatmap
        ax.xaxis.tick_top()
        ax.set_xticklabels(ratings.columns.str[:40], minor=False)
        ax.set_yticklabels(ratings.index, minor=False)
        plt.setp(ax.get_xticklabels(), rotation=90)

    ax.grid(False)
    ax.set_ylabel('User id')

    # Carve a narrow strip off the right-hand side of the heatmap for the colour bar
    colorbar_axes = make_axes_locatable(ax).append_axes("right", size="5%", pad=0.05)
    star_ticks = [5, 4, 3, 2, 1, 0]
    cbar = fig.colorbar(image, ticks=star_ticks, cax=colorbar_axes)
    cbar.ax.set_yticklabels(['5 stars', '4 stars','3 stars','2 stars','1 stars','0 stars'])

# Print the heatmap
draw_movies_heatmap(user_movie_ratings.iloc[:6, :10])
# Show the figure only after it has been drawn: calling plt.show() first has no
# figure to display, so on non-inline backends the heatmap would never appear.
plt.show()

In [None]:
# Replace missing entries (user/movie pairs with no rating) by 0
user_movie_ratings = user_movie_ratings.fillna(0)

In [None]:
from scipy.sparse import csr_matrix
import scipy

# Store the Users X Movies table as a compressed sparse row (CSR) matrix
sparse_ratings = csr_matrix(user_movie_ratings.to_numpy())
sparse_ratings

In [None]:
from sklearn.cluster import KMeans

# 20 clusters
# `algorithm` is left at its default on purpose: the explicit 'full' alias was
# deprecated in scikit-learn 1.1 and removed in 1.3, while the default selects the
# same classic Lloyd iteration for sparse input on both older and newer releases.
# n_init is pinned to the long-standing default of 10 and random_state is fixed so a
# Restart & Run All reproduces the same user groups.
predictions = KMeans(n_clusters=20, n_init=10, random_state=42).fit_predict(sparse_ratings)

In [None]:
# Put user_id back as a regular column and append each user's cluster label as 'group'
users_flat = user_movie_ratings.reset_index()
group_column = pd.DataFrame({'group': predictions})
clustered = pd.concat([users_flat, group_column], axis=1)

In [None]:
# Preview the first rows of the ratings table with the cluster label ('group') appended
clustered.head()

In [None]:
# Persist the clustered table. DataFrame.to_csv does not create missing folders,
# so make sure the experiment output directory exists first.
clusters_csv = './output/exp-1/clusters.csv'
os.makedirs(os.path.dirname(clusters_csv), exist_ok=True)
clustered.to_csv(clusters_csv, index=False)