#  Netflix Recommendations



## Control Flow

1. Load the data into a dataframe.
2. Remove outlier movies and users.
3. Generate a training set (90% of users) and a test set (10% of users).
4. Cluster the training set according to a clustering algorithm.
5. Define the center of the cluster as the ranked list of movies.
6. Test phase for each clustering algorithm:
    i. For each user in the test set:
        I. Get their top 3 favorite movies.
        II. Assign them to a most likely cluster.
        III. Give them the ranked list of movies (defined by the cluster center in part 5) as recommendations.
    ii. Calculate precision, recall, and RMSE for the clustering method.
7. Make data visualizations for the clusters and performance.

### Importing libraries

In [14]:
import time
import pandas as pd
import numpy as np
import os
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error

### Control Flow Steps 1-2
The result of the computation will be a parquet file called `./Data/removedoutliersdf.parquet.gzip` of the dataframe which holds all of the ratings data with outlier movies and users removed.

To make this more efficient, we are only using the ratings data from `./Data/combined_data_1.txt`.

In [5]:
# Loading the ratings data and filtering out the outliers
if not os.path.isfile('./Data/removedoutliersdf.parquet.gzip'):
    # Import the rating data as a pandas dataframe. Only the first two columns
    # are read; rows whose Rating is missing are treated below as the
    # separators that introduce the next movie's block of ratings.
    df = pd.read_csv('./Data/combined_data_1.txt', header=None, names=['UserId', 'Rating'], usecols=[0, 1])

    df['Rating'] = df['Rating'].astype(float)  # Rating is temporarily a float

    # Adding the MovieId to the data frame.
    # Movies are numbered 1, 2, 3, ... in file order, so the running count of
    # separator rows seen so far is the MovieId of every rating row. This is a
    # single vectorized pass instead of growing an array with np.append once
    # per movie (which recopies the whole array on every iteration).
    is_separator = df['Rating'].isna()
    df['MovieId'] = is_separator.cumsum()

    # Keep only the real rating rows. .copy() makes the result an independent
    # frame so the column assignment below does not raise
    # SettingWithCopyWarning.
    df = df.loc[~is_separator].copy()
    df['UserId'] = df['UserId'].astype(int)

    # Removing unpopular movies and users with too few reviews:
    # any movie / user whose number of ratings is below the 70th percentile
    # of rating counts is dropped.
    f = ['count', 'mean']

    df_movie_summary = df.groupby('MovieId')['Rating'].agg(f)
    df_movie_summary.index = df_movie_summary.index.map(int)
    movie_benchmark = round(df_movie_summary['count'].quantile(0.7), 0)
    drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index

    print('Movie minimum times of review: {}'.format(movie_benchmark))

    df_cust_summary = df.groupby('UserId')['Rating'].agg(f)
    df_cust_summary.index = df_cust_summary.index.map(int)
    cust_benchmark = round(df_cust_summary['count'].quantile(0.7), 0)
    drop_cust_list = df_cust_summary[df_cust_summary['count'] < cust_benchmark].index

    print('Customer minimum times of review: {}'.format(cust_benchmark))

    print('Original Shape: {}'.format(df.shape))
    # Both drop lists were computed on the unfiltered data, so applying them
    # together is equivalent to applying them one after the other.
    keep_row = ~df['MovieId'].isin(drop_movie_list) & ~df['UserId'].isin(drop_cust_list)
    df = df.loc[keep_row].copy()

    df['Rating'] = df['Rating'].astype(int)  # no NaN left, so back to integers

    print('After Trim Shape: {}'.format(df.shape))

    print(df.describe())

    df.to_parquet('./Data/removedoutliersdf.parquet.gzip', compression='gzip')


Here we get the list of movie titles, excluding the outlier movies that were filtered out above.
The result is a parquet file titled `./Data/filteredmovietitlesdf.parquet.gzip` to regenerate the dataframe for later use, if needed.

In [6]:
# Build the title lookup for the movies that survived the outlier filter
if not os.path.isfile('./Data/filteredmovietitlesdf.parquet.gzip'):
    df = pd.read_parquet('./Data/removedoutliersdf.parquet.gzip')
    # Only columns 0 and 2 of the titles file are read, as MovieId and Name
    df_title = pd.read_csv(
        './Data/movie_titles.csv',
        encoding="ISO-8859-1",
        header=None,
        usecols=[0, 2],
        names=['MovieId', 'Name'],
    )
    # Inner-join ratings with titles, then keep one (MovieId, Name) row per movie
    df_title = (
        pd.merge(df, df_title, how='inner', on='MovieId')
        .drop_duplicates(subset=['MovieId'])
        [['MovieId', 'Name']]
    )
    df_title.to_parquet('./Data/filteredmovietitlesdf.parquet.gzip', compression='gzip')

### Generating the training set
The result is a parquet file titled `./Data/trainingusersdf.parquet.gzip`.
This should be statistically similar to the removedoutliers dataset.

In [7]:
# Build the training split and cache it as parquet
if not os.path.isfile('./Data/trainingusersdf.parquet.gzip'):
    df = pd.read_parquet('./Data/removedoutliersdf.parquet.gzip')
    # Training users are those whose UserId is NOT a multiple of 10
    is_training_user = df['UserId'] % 10 != 0
    df2 = df.loc[is_training_user]
    # Print both summaries so the split can be compared with the full data
    for heading, frame in (("Original data set statistics:", df),
                           ("Training data set statistics:", df2)):
        print(heading)
        print(frame.describe())
    df2.to_parquet('./Data/trainingusersdf.parquet.gzip', compression='gzip')

### Generating the test set
The result is a parquet file titled `./Data/testusersdf.parquet.gzip`.
This should be statistically similar to the removedoutliers dataset.

In [8]:
# Build the test split and cache it as parquet
if not os.path.isfile('./Data/testusersdf.parquet.gzip'):
    df = pd.read_parquet('./Data/removedoutliersdf.parquet.gzip')
    # Test users are those whose UserId IS a multiple of 10
    is_test_user = df['UserId'] % 10 == 0
    df2 = df.loc[is_test_user]
    # Print both summaries so the split can be compared with the full data
    for heading, frame in (("Original data set statistics:", df),
                           ("Test data set statistics:", df2)):
        print(heading)
        print(frame.describe())
    df2.to_parquet('./Data/testusersdf.parquet.gzip', compression='gzip')


Pivoting the training data into a user-item matrix. 

In [15]:
# Pivot the data frame into a user-item matrix:
# one row per UserId, one column per MovieId, each cell holding that user's Rating.
# fill_value=0 encodes "user did not rate this movie" as 0, so a 0 in this
# matrix is a missing rating, not a score.
df = pd.read_parquet('./Data/trainingusersdf.parquet.gzip')
df = pd.pivot_table(df, values='Rating', index='UserId', columns='MovieId', fill_value=0)
df.head(10)

MovieId,3,8,16,17,18,26,28,30,32,33,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0,0,0,0,0,0,0,3,0,0,...,3,0,0,0,0,0,0,0,0,0
7,0,5,0,0,0,0,4,5,0,0,...,3,0,0,5,0,0,0,0,0,0
79,0,0,0,0,0,0,0,3,0,0,...,4,0,0,0,0,0,4,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
134,0,0,0,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
169,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
183,0,0,0,0,0,0,0,4,0,0,...,0,0,0,0,0,0,3,0,0,0
188,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,3,3,0,0
195,0,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,0
199,0,0,0,0,0,0,0,5,0,0,...,0,0,0,0,0,0,0,0,0,0


## Clustering

### kmeans

Below, we use the elbow method to find an optimal number of clusters for kmeans. It takes about 2-3 hours to run; the result is the following figure:
![kmeansclusters](Figure_1.png)

The important DataFrames in this section are:

`dfkmeansclustercenters` - This is all of the cluster centers after kmeans. This also represents a ranked list of movies for the cluster, which is the recommendation list we give to the new users in the test data when they fall close to this cluster center.

`dfkmeanslabels` - This is a table associating a UserId with a cluster number after kmeans.

`dfkmeans` - This is the user-item DataFrame with an extra column associating each UserId with a cluster number.

In [10]:
# # Finding best number of clusters for kmeans This takes about 2-3hrs.
# model = KMeans()
# # k is range of number of clusters.
# visualizer = KElbowVisualizer(model, k=[(5 * i) + 2 for i in range(20)], timings=True)
# visualizer.fit(df)        # Fit data to visualizer
# visualizer.show()        # Finalize and render figure

In [5]:
# cluster df using kmeans
# random_state pins the centroid initialisation so that cluster numbers and
# centers are identical on every run; the later cells store and look up
# clusters by number, so an unseeded re-run would silently change them.
# NOTE(review): n_clusters=22 is presumably read off the elbow plot - confirm.
time_start = time.time()
kmeans = KMeans(n_clusters=22, random_state=0).fit(df)
print('Clustering with k-means took {} seconds'.format(time.time()-time_start))

KeyboardInterrupt: 

Exception ignored in: 'sklearn.cluster._k_means_common._relocate_empty_clusters_dense'
Traceback (most recent call last):
  File "<__array_function__ internals>", line 2, in where
KeyboardInterrupt: 


Clustering with k-means took 105.43892407417297 seconds


In [16]:
# cluster centers after kmeans
# each cluster is a community of users who like the same movies. The center is our ranked list of movies for the cluster.
# Rows get a default 0..k-1 index (the cluster number); columns are relabelled
# with the MovieIds of the user-item matrix the model was fitted on.
dfkmeansclustercenters = pd.DataFrame(kmeans.cluster_centers_, columns=df.columns)

In [17]:
# Display the k-means cluster centers (one row per cluster, one column per MovieId)
dfkmeansclustercenters

MovieId,3,8,16,17,18,26,28,30,32,33,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
0,0.128142,0.338645,0.003815,0.007855,0.614228,0.010099,0.551167,2.457136,0.033214,0.017953,...,2.182899,0.009425,0.084605,2.464318,0.068896,1.407989,0.10974,0.00763,0.023115,0.72868
1,0.002819,0.066132,0.007806,0.046184,0.114267,0.015178,2.94948,2.521899,0.135516,0.061145,...,1.809627,0.044016,0.001735,0.259107,0.029922,0.192975,0.144623,0.039462,0.039896,0.044883
2,0.3823,0.650813,0.18543,0.041541,1.810355,0.074052,1.075256,2.843468,0.044551,0.170379,...,2.854907,0.159542,0.528597,3.410596,0.499097,3.108368,0.33835,0.096328,0.273931,1.872366
3,0.014245,0.069052,0.008917,0.012179,0.10896,0.009678,0.466398,1.343954,0.030448,0.008482,...,0.827751,0.027186,0.01816,0.667029,0.028273,0.464659,0.047956,0.015659,0.022292,0.093845
4,0.125088,0.425956,0.001291,0.00223,0.267895,0.008566,0.292068,1.930767,0.00223,0.054447,...,2.417977,0.003638,0.094227,0.687397,0.006219,1.192912,0.111007,0.021474,0.056677,0.494954
5,0.039617,0.48224,0.298839,0.028689,1.007855,0.067281,1.372268,1.780396,0.016393,0.9375,...,2.110314,0.413593,0.115779,0.808743,0.280055,1.919057,0.17179,0.605191,0.507172,0.372951
6,0.019551,0.168858,0.004771,0.270453,0.036541,0.290469,0.4968,2.838706,0.020831,0.033516,...,2.023624,0.024438,0.012568,0.058187,0.01606,0.169091,0.421506,0.220761,0.09403,0.038869
7,0.006575,0.15631,0.251538,0.073595,0.196182,0.040297,1.134464,1.037752,0.046448,0.178367,...,0.70456,0.236479,0.009756,0.252174,0.174549,0.257688,0.070838,0.108802,0.142948,0.088865
8,0.000885,0.053614,0.023622,0.139609,0.018225,0.090065,0.577811,1.270459,0.041847,0.073697,...,0.553216,0.072901,0.006282,0.041493,0.018137,0.04804,0.065469,0.103424,0.050252,0.020349
9,0.00596,0.136424,0.390066,0.319868,0.255298,0.351656,1.921523,2.348013,0.097351,0.294702,...,1.46457,0.626159,0.010265,0.248675,0.171854,0.566887,0.306623,0.725828,0.227483,0.067881


In [52]:
# The cluster label given to a UserId
# kmeans was fitted on df, so labels_ has one entry per row of df, in df's row order.
dfkmeanslabels = pd.DataFrame(kmeans.labels_)

In [53]:
# Reindexing the labels to be UserIds and renaming the column to "cluster_number"
# The assignment is positional: row i of the labels receives the i-th UserId of df.
dfkmeanslabels.index = df.index
dfkmeanslabels.columns = ["cluster_number"]

In [54]:
# dfkmeans is the user-item matrix with an extra column labeling which cluster the UserId belongs to after kmeans.
# The index are UserIds and columns are MovieIds + cluster_number
# join() matches rows on the index, which is the UserId in both frames.
dfkmeans = df.join(dfkmeanslabels)
dfkmeans.head(10)

Unnamed: 0_level_0,3,8,16,17,18,26,28,30,32,33,...,4474,4478,4479,4485,4488,4490,4492,4493,4496,cluster_number
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,10
7,0,5,0,0,0,0,4,5,0,0,...,0,0,5,0,0,0,0,0,0,16
79,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,4,0,0,0,11
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15
134,0,0,0,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,2
169,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
183,0,0,0,0,0,0,0,4,0,0,...,0,0,0,0,0,3,0,0,0,19
188,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,3,3,0,0,14
195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,21
199,0,0,0,0,0,0,0,5,0,0,...,0,0,0,0,0,0,0,0,0,14


DBSCAN Clustering Algorithm

In [25]:
# from sklearn.cluster import DBSCAN
# import numpy as np

# X = df.to_numpy()
# clustering = DBSCAN(eps=0.5, min_samples=5).fit(X)

In [16]:
from sklearn.decomposition import PCA

# Fit an unrestricted PCA first to see how much variance each component explains
pca = PCA().fit(df)

# Keep (as 2-decimal strings) only the components explaining at least 1% of the variance
top_PCA = ["{:.2f}".format(ratio) for ratio in pca.explained_variance_ratio_ if ratio >= 0.01]
print(len(top_PCA))
print(top_PCA)
sumall = sum(pca.explained_variance_ratio_)

# Refit keeping only that many components.
# NOTE: despite its name, pca39 keeps len(top_PCA) components (7 in the recorded run).
pca39 = PCA(n_components=len(top_PCA)).fit(df)

7
['0.08', '0.05', '0.03', '0.02', '0.02', '0.01', '0.01']


In [17]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Project the training users onto the retained principal components
Xpca = pca39.transform(df)

# Try every (eps, min_samples) pair and report the silhouette score of each
# setting that produces more than 1 and fewer than 250 distinct labels.
eps_candidates = [0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
for eps in eps_candidates:
    for min_samples in range(1, 50):
        db = DBSCAN(eps=eps, min_samples=min_samples)
        y = db.fit_predict(Xpca)
        n_labels = len(set(db.labels_))
        if not (1 < n_labels < 250):
            continue
        score = silhouette_score(Xpca, db.labels_, metric='euclidean')
        print("Number of labels: {} Eps: {} min_samples: {} => {}".format(
            n_labels, eps, min_samples, str(score)))

Number of labels: 5 Eps: 0.5 min_samples: 2 => -0.3114008736685848


KeyboardInterrupt: 

In [19]:
# Performing Clustering with DBSCAN
from sklearn.cluster import DBSCAN

# Fit once and reuse the labels: fit(X).labels_ and fit_predict(X) return the
# same array, so building and fitting a second identical model only repeated
# the whole clustering.
db = DBSCAN(eps=1, min_samples=3).fit(Xpca)
dbscanlabels = db.labels_  # one label per row of Xpca; -1 marks noise points
y = dbscanlabels

In [20]:
# Display the DBSCAN label of every training user (-1 is DBSCAN's label for noise)
dbscanlabels

array([-1, -1, -1, ..., -1, -1, -1])

In [21]:
# Distinct labels DBSCAN produced, including the noise label -1
np.unique(y)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21])

In [22]:
# How many users DBSCAN left as noise (-1), how many users there are in
# total, and how many were actually placed in a cluster.
n_noise = np.count_nonzero(y == -1)
n_users = len(Xpca)
print(n_noise)
print(n_users)
print(n_users - n_noise)

129037
129120
83


In [23]:
# The DBSCAN label of each training user, one row per row of Xpca (same order)
dfdbscanlabels = pd.DataFrame(db.labels_)

In [24]:
# Reindexing the labels to be UserIds and renaming the column to "cluster_number"
# The assignment is positional: row i of the labels receives the i-th UserId of df.
dfdbscanlabels.index = df.index
dfdbscanlabels.columns = ["cluster_number"]

In [65]:
# dfdbscan is the user-item matrix with an extra column labeling which cluster the UserId belongs to after dbscan.
# The index are UserIds and columns are MovieIds + cluster_number
# join() matches rows on the index, which is the UserId in both frames.
dfdbscan = df.join(dfdbscanlabels)
dfdbscan.head(100)

Unnamed: 0_level_0,3,8,16,17,18,26,28,30,32,33,...,4474,4478,4479,4485,4488,4490,4492,4493,4496,cluster_number
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,-1
7,0,5,0,0,0,0,4,5,0,0,...,0,0,5,0,0,0,0,0,0,-1
79,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,4,0,0,0,-1
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
134,0,0,0,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1894,0,0,0,0,0,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
1897,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5,0,0,0,0,-1
1918,0,0,0,0,0,0,3,5,0,0,...,0,0,0,0,2,0,0,0,0,-1
1922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,-1


## Assigning new users to clusters per clustering algorithm

### Helper functions

In [55]:
def get3favoritemovies(testuserstransposed, user, random_state=None):
    """Return the 3 favorite movies of a test user.

    Movies rated strictly higher than the third-best rating are always
    kept. Only the remaining slots are drawn at random from the movies tied
    at that third-best rating, so ties are not biased toward the lower
    movieids and a top-rated movie is never displaced by a tied one.

    Parameters
    ----------
    testuserstransposed : DataFrame with movieids as the index and userids
        as the columns.
    user : column label of the user to look up.
    random_state : optional seed (or RandomState) forwarded to ``sample`` so
        the tie-breaking can be made reproducible.

    Returns
    -------
    Series of ratings indexed by movieid, named after the user. It has 3
    entries, or fewer when the user's column has fewer than 3 rows. When
    there is no tie to break, the rows come back in descending rating order.
    """
    # Work on this user's column only; sorting the whole frame per user is wasteful.
    ratings = testuserstransposed[user]
    top = ratings.nlargest(3, keep='all')  # top 3 plus everything tied with the 3rd
    if len(top) <= 3:
        return top

    cutoff = top.min()  # the rating value at which the tie occurs
    certain = top[top > cutoff]
    tied = top[top == cutoff]
    drawn = tied.sample(n=3 - len(certain), random_state=random_state)
    return pd.concat([certain, drawn])

In [56]:
# All of the different cluster center dataframes.
# The nth object in this list is a DataFrame for the nth clustering algorithm such that
# each row is one cluster center (indexed by cluster number) and the columns are movieids.
clustercenters = [dfkmeansclustercenters] 

# All of the classified test users for each cluster algorithm.
# The nth object in this list is a DataFrame for the nth clustering algorithm. The rows in the DataFrame are test userids
# and there is one column, which is the cluster number the test user belongs in.
# It starts empty and is appended to by the assignment loop; re-run this cell before
# re-running that loop, otherwise the results are appended a second time.
classifiedtestusers = [] 

In [57]:
# loading the test user dataset into dftestusers
# Pivoted into a user-item matrix (rows = UserId, columns = MovieId); fill_value=0 marks "not rated".
# NOTE(review): the columns are whatever MovieIds occur in the test data; the later
# lookups into the cluster centers assume every one of them also exists in the
# training matrix - confirm.
dftestusers = pd.read_parquet('./Data/testusersdf.parquet.gzip')
dftestusers = pd.pivot_table(dftestusers, values='Rating', index='UserId', columns='MovieId', fill_value=0)
dftestusers.head(10)

MovieId,3,8,16,17,18,26,28,30,32,33,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1070,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1310,0,0,0,0,0,0,3,3,0,0,...,0,0,0,4,0,0,3,0,0,5
1500,0,0,0,0,0,0,1,4,0,0,...,3,0,0,0,0,0,0,0,0,0
1830,0,0,0,0,0,0,5,5,0,0,...,2,0,0,0,0,0,0,0,0,0
1900,0,0,0,0,0,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000,0,0,0,0,3,0,4,0,0,0,...,5,0,0,5,0,4,0,0,0,4
2050,0,0,0,0,0,0,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0
2250,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2270,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2280,0,0,0,0,0,0,0,0,0,0,...,0,0,0,4,0,4,0,0,0,0


In [58]:
testuserids = dftestusers.index # List of UserIds in the test set
# After transposing, each user is a column and each movie is a row, so one user's ratings can be selected by column label.
testuserstransposed = dftestusers.T # Needs to be transposed to easily find 3 favorite movies
testuserstransposed.head(10)

UserId,1070,1310,1500,1830,1900,2000,2050,2250,2270,2280,...,2647690,2648260,2648290,2648730,2649050,2649080,2649100,2649110,2649120,2649370
MovieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,3,0,0,0,0,...,0,0,0,0,3,0,0,0,0,0
26,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28,0,3,1,5,4,4,0,0,0,0,...,0,0,4,4,0,0,5,0,0,0
30,0,3,4,5,0,0,4,0,0,0,...,0,4,3,3,4,0,0,3,1,0
32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
# HERE WE ARE LOOPING FOR EACH CLUSTER ALGORITHM
#    FOR EACH USER IN TEST SET
#        ASSIGN USER TO CLUSTER
#    STORE ALL TEST USER ASSIGNMENTS FOR THE CLUSTERING ALGORITHM

for algoindex, centers in enumerate(clustercenters):  # for each clustering algorithm
    start_time = time.time()
    testuserclassifications = []  # cluster number chosen for each test user, in testuserids order
    for user in testuserids:  # for each userid in the test set
        favs = get3favoritemovies(testuserstransposed, user)  # get 3 favorite movies of the user
        # rows = cluster centers, columns = the user's 3 favorite movie ids
        clustermatrix = centers[favs.index].to_numpy()
        # Euclidean distance from the user's 3 ratings to every cluster center, computed in one call
        dist = np.linalg.norm(clustermatrix - favs.to_numpy(), axis=1)
        # Classify the new user to the closest center (first one on ties). The stored
        # value is the row position, which equals the cluster number as long as the
        # centers DataFrame keeps its default 0..k-1 index.
        testuserclassifications.append(int(np.argmin(dist)))
    print('Assigning test users to a cluster using clustering algorithm {0} took {1} seconds'.format(algoindex, time.time()-start_time))
    dftestuserclassifications = pd.DataFrame(testuserclassifications, index=dftestusers.index, columns=["cluster_number"]) # make a dataframe with userids assigned to clusters
    classifiedtestusers.append(dftestuserclassifications) # add this assignment to the classified test users list

Assigning test users to a cluster using clustering algorithm 0 took 1267.4887969493866 seconds


In [60]:
classifiedtestusers[0].head(10)  # The first 10 test users and the kmeans cluster each one was assigned to

Unnamed: 0_level_0,cluster_number
UserId,Unnamed: 1_level_1
1070,18
1310,8
1500,13
1830,13
1900,13
2000,13
2050,17
2250,13
2270,8
2280,13


### Building movie rating prediction DataFrames

In [61]:
# The nth value in the list is a DataFrame corresponding to the nth clustering algorithm
# where the rows are test userids and the columns are movieids
# and the values of the DataFrame are predicted movie ratings
predictions = []  

In [62]:
for algonumber, centers in enumerate(clustercenters):
    start_time = time.time()
    # Cluster number assigned to each test user (index = test userids)
    assigned = classifiedtestusers[algonumber]["cluster_number"]
    # Give the user the same recommendations as their assigned cluster. This will be the cluster center point.
    # A single label lookup fetches the center row for every user at once,
    # instead of enlarging the DataFrame one .loc row at a time.
    dftestuserpredictions = centers.loc[assigned.to_numpy()]
    dftestuserpredictions.index = assigned.index  # rows are now keyed by test userid
    # Align to the test set's movie columns (a movie absent from the centers becomes NaN)
    dftestuserpredictions = dftestuserpredictions.reindex(columns=dftestusers.columns)
    # Report this loop's own counter; algoindex belongs to a different cell's loop.
    print('Giving test users recommendations for clustering algorithm {0} took {1} seconds'.format(algonumber, time.time()-start_time))
    predictions.append(dftestuserpredictions)

Giving test users recommendations for clustering algorithm 0 took 353.0375020503998 seconds


In [63]:
# These are the first 10 test users and their predicted movie ratings for kmeans (the 0th clustering algorithm)
predictions[0].head(10) 

MovieId,3,8,16,17,18,26,28,30,32,33,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
1070,0.03603,0.452753,0.328348,0.043168,0.947995,0.082597,1.437458,2.038749,0.021414,0.865058,...,2.236574,0.461591,0.097893,0.781441,0.244392,1.851462,0.227056,0.693406,0.443916,0.296397
1310,0.385529,0.617711,0.164687,0.038337,1.775378,0.076674,1.036177,2.659287,0.032937,0.204104,...,2.723002,0.161447,0.573434,3.24568,0.455724,3.139849,0.303456,0.094492,0.322894,1.884989
1500,0.064356,0.358557,0.895332,0.427157,1.132956,0.562235,2.893918,3.154173,0.352192,0.333098,...,2.458982,1.050919,0.094767,1.550919,0.825318,1.62942,0.748232,0.811174,0.48727,0.449788
1830,0.064356,0.358557,0.895332,0.427157,1.132956,0.562235,2.893918,3.154173,0.352192,0.333098,...,2.458982,1.050919,0.094767,1.550919,0.825318,1.62942,0.748232,0.811174,0.48727,0.449788
1900,0.064356,0.358557,0.895332,0.427157,1.132956,0.562235,2.893918,3.154173,0.352192,0.333098,...,2.458982,1.050919,0.094767,1.550919,0.825318,1.62942,0.748232,0.811174,0.48727,0.449788
2000,0.064356,0.358557,0.895332,0.427157,1.132956,0.562235,2.893918,3.154173,0.352192,0.333098,...,2.458982,1.050919,0.094767,1.550919,0.825318,1.62942,0.748232,0.811174,0.48727,0.449788
2050,0.006882,0.129387,0.054026,0.145905,0.436339,0.177908,2.907089,3.468685,0.183414,0.116655,...,2.920165,0.110461,0.009635,0.691672,0.094288,0.949415,0.470062,0.227116,0.14384,0.172402
2250,0.064356,0.358557,0.895332,0.427157,1.132956,0.562235,2.893918,3.154173,0.352192,0.333098,...,2.458982,1.050919,0.094767,1.550919,0.825318,1.62942,0.748232,0.811174,0.48727,0.449788
2270,0.385529,0.617711,0.164687,0.038337,1.775378,0.076674,1.036177,2.659287,0.032937,0.204104,...,2.723002,0.161447,0.573434,3.24568,0.455724,3.139849,0.303456,0.094492,0.322894,1.884989
2280,0.064356,0.358557,0.895332,0.427157,1.132956,0.562235,2.893918,3.154173,0.352192,0.333098,...,2.458982,1.050919,0.094767,1.550919,0.825318,1.62942,0.748232,0.811174,0.48727,0.449788


### Calculating RMSE for the different recommender systems

In [65]:
# The nth value in the list is the RMSE value for the recommender system built with the nth clustering algorithm
# Starts empty; nothing fills it yet (the only RMSE loop in this notebook is commented out).
RMSEvalues = []

MovieId,3,8,16,17,18,26,28,30,32,33,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
0,0.009765,0.131574,0.018331,0.235909,0.110845,0.220147,1.139284,3.519616,0.093199,0.014734,...,2.418194,0.054823,0.007538,0.395409,0.02467,0.464108,0.364913,0.141168,0.077094,0.066473
1,0.165303,0.318769,0.007103,0.005381,0.578993,0.009255,0.445329,1.783039,0.017004,0.033577,...,1.669608,0.020232,0.137107,2.312312,0.075979,1.590616,0.093844,0.00904,0.046922,0.941455
2,0.005455,0.084123,0.291703,0.376687,0.173988,0.39822,1.848981,2.387023,0.093597,0.167959,...,1.311513,0.519667,0.009762,0.19265,0.124318,0.382142,0.282228,0.599196,0.19064,0.04565
3,0.002908,0.04577,0.024198,0.065279,0.033483,0.039767,0.553367,0.990246,0.030951,0.041831,...,0.461264,0.051585,0.009473,0.133371,0.028888,0.133371,0.051679,0.04577,0.034703,0.032264
4,0.01212,0.213812,0.033615,0.004116,0.172079,0.007203,0.334782,0.510176,0.005488,0.492568,...,0.666362,0.060256,0.044477,0.219186,0.048136,0.61148,0.037503,0.124514,0.145438,0.113995
5,0.000652,0.05923,0.034442,0.255186,0.016569,0.190476,0.813829,1.764384,0.058447,0.082192,...,0.806523,0.118721,0.003523,0.022179,0.016438,0.045793,0.092629,0.204958,0.089759,0.015264
6,0.11284,0.442485,0.002554,0.002189,0.254742,0.008268,0.28964,1.814202,0.001581,0.077578,...,2.383025,0.003405,0.095696,0.59448,0.007417,1.214129,0.101411,0.026265,0.063351,0.453915
7,0.079608,0.194719,0.017405,0.019778,0.143592,0.019778,0.327433,0.865111,0.01068,0.067741,...,0.819324,0.029371,0.067642,0.392207,0.059138,0.415546,0.102551,0.040941,0.090981,0.210047
8,0.385529,0.617711,0.164687,0.038337,1.775378,0.076674,1.036177,2.659287,0.032937,0.204104,...,2.723002,0.161447,0.573434,3.24568,0.455724,3.139849,0.303456,0.094492,0.322894,1.884989
9,0.001668,0.061003,0.001906,0.024783,0.044204,0.013464,1.139402,2.468605,0.053259,0.040033,...,2.20672,0.004051,0.003455,0.08793,0.010962,0.132491,0.081139,0.026093,0.020493,0.020732


In [None]:
# for algonumber in len(predictions):
#     RMSEvalues.append(mean_squared_error(squared=False))

In [66]:
# First 10 rows of the test users' actual ratings (0 = not rated)
dftestusers.head(10)

MovieId,3,8,16,17,18,26,28,30,32,33,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1070,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1310,0,0,0,0,0,0,3,3,0,0,...,0,0,0,4,0,0,3,0,0,5
1500,0,0,0,0,0,0,1,4,0,0,...,3,0,0,0,0,0,0,0,0,0
1830,0,0,0,0,0,0,5,5,0,0,...,2,0,0,0,0,0,0,0,0,0
1900,0,0,0,0,0,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000,0,0,0,0,3,0,4,0,0,0,...,5,0,0,5,0,4,0,0,0,4
2050,0,0,0,0,0,0,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0
2250,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2270,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2280,0,0,0,0,0,0,0,0,0,0,...,0,0,0,4,0,4,0,0,0,0


In [68]:
# Restrict the kmeans predictions to the movies each test user actually rated:
# where() keeps the original value (0) wherever dftestusers == 0, i.e. unrated,
# and takes the value from predictions[0] at every other position.
df1 = dftestusers.where(dftestusers == 0, predictions[0])
df1.head(10)

MovieId,3,8,16,17,18,26,28,30,32,33,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1070,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1310,0.0,0.0,0.0,0.0,0.0,0.0,1.036177,2.659287,0.0,0.0,...,0.0,0.0,0.0,3.24568,0.0,0.0,0.303456,0.0,0.0,1.884989
1500,0.0,0.0,0.0,0.0,0.0,0.0,2.893918,3.154173,0.0,0.0,...,2.458982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1830,0.0,0.0,0.0,0.0,0.0,0.0,2.893918,3.154173,0.0,0.0,...,2.458982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1900,0.0,0.0,0.0,0.0,0.0,0.0,2.893918,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2000,0.0,0.0,0.0,0.0,1.132956,0.0,2.893918,0.0,0.0,0.0,...,2.458982,0.0,0.0,1.550919,0.0,1.62942,0.0,0.0,0.0,0.449788
2050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.468685,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2270,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2280,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.550919,0.0,1.62942,0.0,0.0,0.0,0.0
