## Clustering movies

We cluster movies based on user ratings using hierarchical and k-means clustering. Note that the resulting clusters are written to text files (`movieClusterHierarchical.txt` and `movieClusterKmeans.txt`).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Import the ratings

In [2]:
# Load the user ratings table; later cells read its 'user id', 'item id'
# and 'rating' columns.
ratingsData = pd.read_csv("ratings.csv")
ratingsData.head(n=5)

Unnamed: 0,user id,item id,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


### Import the movie data

In [3]:
# Load the movie table; its row count fixes the number of columns of the
# user-by-movie matrix built further down.
movieName = pd.read_csv("movies.csv")
numMovies = movieName.shape[0]
movieName.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Create a user-by-movie matrix that we will cluster.

Note, this will take a moment.

In [4]:
# Build a dense (users x movies) rating matrix in one vectorised scatter
# instead of scanning both id arrays once per rating row.

# Sorted unique user ids, plus -- for every rating row -- the position of
# that row's user inside the sorted id list (i.e. its matrix row).
userID, userIndex = np.unique(ratingsData['user id'].to_numpy(), return_inverse=True)
numUsers = len(userID)

# Matrix column of every rated movie = row position of its id in movieName.
# get_indexer returns -1 for ids that do not occur in movieName.
# NOTE(review): get_indexer requires movieName['movieId'] to be unique and
# raises otherwise -- confirm this holds for movies.csv.
movieIndex = pd.Index(movieName['movieId']).get_indexer(ratingsData['item id'])
hasKnownMovie = movieIndex >= 0

userMovieMatrix = np.zeros((numUsers, numMovies))
# Ratings for movies missing from movieName are skipped; when a (user, movie)
# pair occurs more than once, the last rating in file order is kept.
userMovieMatrix[userIndex[hasKnownMovie], movieIndex[hasKnownMovie]] = (
    ratingsData['rating'].to_numpy()[hasKnownMovie]
)

### Let’s now apply hierarchical clustering

In [5]:
from sklearn.cluster import AgglomerativeClustering

numberOfClusters = 10

# Euclidean distance is the estimator's default, so it is not passed
# explicitly: the `affinity` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4 (renamed to `metric`). Omitting it keeps this cell working
# on both older and current releases.
hierarchicalCluster = AgglomerativeClustering(n_clusters=numberOfClusters, linkage='single')

# Transpose so that each movie (a column of the user-by-movie matrix) is one
# sample to be clustered; the returned label array is shown as cell output.
hierarchicalCluster.fit_predict(userMovieMatrix.T)

array([0, 0, 0, ..., 0, 0, 0])

In [6]:
# Write the movie titles of every hierarchical cluster to a text file.
# Printing straight to the file handle (instead of reassigning sys.stdout)
# means the notebook's stdout can never be left pointing at a closed file if
# something inside the loop raises.
# The encoding is explicit so titles with non-ASCII characters are written
# the same way regardless of the platform's default locale.
with open('movieClusterHierarchical.txt', 'w', encoding='utf-8') as f:
    for i in range(numberOfClusters):
        # Positions of the movies assigned to cluster i; these are row
        # positions in movieName because the matrix columns were built from
        # them.
        index = np.where(hierarchicalCluster.labels_ == i)[0]
        print('\n\nCluster %d' % i, file=f)
        # .iloc selects by position, so this stays correct even if
        # movieName does not carry a default 0..n-1 index.
        for title in movieName['title'].iloc[index]:
            print(title, file=f)

### Let’s now apply k-means clustering

In [7]:
from sklearn.cluster import KMeans

# k-means starts from random centroids: a fixed random_state makes the
# clusters (and the files written from them) reproducible across runs.
# n_init is spelled out because its default changed between scikit-learn
# releases (10 restarts historically, 'auto' from 1.4 on).
kmeansCluster = KMeans(n_clusters=numberOfClusters, n_init=10, random_state=0)

# Each movie (a column of the user-by-movie matrix) is one sample.
# fit_predict fits the model and returns the cluster label of every movie in
# a single call.
y_kmeans = kmeansCluster.fit_predict(userMovieMatrix.T)

In [8]:
# Write the movie titles of every k-means cluster to a text file.
# Printing straight to the file handle (instead of reassigning sys.stdout)
# means the notebook's stdout can never be left pointing at a closed file if
# something inside the loop raises.
# The encoding is explicit so titles with non-ASCII characters are written
# the same way regardless of the platform's default locale.
with open('movieClusterKmeans.txt', 'w', encoding='utf-8') as f:
    for i in range(numberOfClusters):
        # Positions of the movies assigned to cluster i; these are row
        # positions in movieName because the matrix columns were built from
        # them.
        index = np.where(kmeansCluster.labels_ == i)[0]
        print('Cluster %d' % i, file=f)
        # .iloc selects by position, so this stays correct even if
        # movieName does not carry a default 0..n-1 index.
        for title in movieName['title'].iloc[index]:
            print(title, file=f)

### Let's see which cluster a movie of your choice resides in.

In [11]:
# Print every movie that shares a k-means cluster with the chosen title.
yourMovie = 'Karate Kid, The (1984)'

# Row position(s) in movieName whose title matches exactly.
indexOfYourMovieInDataFrame = np.where(movieName['title'] == yourMovie)[0]
if len(indexOfYourMovieInDataFrame) == 0:
    # Fail with a clear message instead of a shape-mismatch error (or silent
    # empty output) from the comparison further down.
    raise ValueError('Movie title not found in movieName: %r' % yourMovie)

# Use the first match as a scalar label: comparing labels_ against an array
# of several matches (a title listed more than once) would not broadcast.
clusterOfMovie = kmeansCluster.labels_[indexOfYourMovieInDataFrame[0]]
index = np.where(kmeansCluster.labels_ == clusterOfMovie)[0]
print('\nk-means cluster')
print('===============')
# .iloc selects by position, matching how the matrix columns were built.
for title in movieName['title'].iloc[index]:
    print(title)

# To inspect the hierarchical result instead, repeat the lookup with
# hierarchicalCluster.labels_ in place of kmeansCluster.labels_ and print it
# under a 'hierarchical cluster' heading.


k-means cluster
Heat (1995)
American President, The (1995)
Casino (1995)
Ace Ventura: When Nature Calls (1995)
Get Shorty (1995)
Pocahontas (1995)
Mr. Holland's Opus (1995)
From Dusk Till Dawn (1996)
Broken Arrow (1996)
Happy Gilmore (1996)
Birdcage, The (1996)
Desperado (1995)
Mallrats (1995)
Clerks (1994)
Legends of the Fall (1994)
Santa Clause, The (1994)
Crow, The (1994)
Maverick (1994)
Naked Gun 33 1/3: The Final Insult (1994)
Addams Family Values (1993)
Demolition Man (1993)
Hot Shots! Part Deux (1993)
Robin Hood: Men in Tights (1993)
So I Married an Axe Murderer (1993)
Nightmare Before Christmas, The (1993)
Tombstone (1993)
True Romance (1993)
Snow White and the Seven Dwarfs (1937)
Pinocchio (1940)
Primal Fear (1996)
Dragonheart (1996)
James and the Giant Peach (1996)
Truth About Cats & Dogs, The (1996)
Kingpin (1996)
Nutty Professor, The (1996)
Phenomenon (1996)
Ransom (1996)
Wizard of Oz, The (1939)
Cinderella (1950)
Mary Poppins (1964)
Alice in Wonderland (1951)
That Thing Y