## Clustering movies

We cluster movies based on user ratings, using both hierarchical and k-means clustering. Note that the resulting clusters are written to text files.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Import the ratings

In [2]:
# Read the ratings table (columns shown in the preview: user id, item id,
# rating, timestamp) and display its first five rows.
ratingsData = pd.read_csv(filepath_or_buffer='ratings.csv')
ratingsData.iloc[:5]

Unnamed: 0,user id,item id,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


### Import the movie data

In [3]:
# Read the movie catalogue (movieId, title, genres), remember how many movies
# it holds, and display its first five rows.
movieName = pd.read_csv(filepath_or_buffer='movies.csv')
numMovies = movieName.shape[0]
movieName.iloc[:5]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Create a user by movie matrix that we will cluster.

Note, this will take a moment.

In [4]:
# Build a dense (users x movies) matrix of ratings; unrated entries stay 0.
# Rows follow the sorted unique user ids, columns follow the row order of
# `movieName`, so column k corresponds to movieName.iloc[k].
userID = np.unique(ratingsData['user id'])
numUsers = len(userID)
userMovieMatrix = np.zeros((numUsers, numMovies))

# Vectorized id -> position lookup (replaces one np.where scan per rating).
# `userID` is sorted (np.unique), so searchsorted gives each rating's row.
userIndex = np.searchsorted(userID, ratingsData['user id'].to_numpy())

# get_indexer returns -1 for ids that are absent from movies.csv.
# NOTE(review): this requires movieId values to be unique - confirm for your data.
movieIndex = pd.Index(movieName['movieId']).get_indexer(ratingsData['item id'])

# Ratings that refer to a movie missing from the catalogue are skipped, as
# before. If a (user, movie) pair occurs more than once, the last rating wins.
isKnownMovie = movieIndex >= 0
userMovieMatrix[userIndex[isKnownMovie], movieIndex[isKnownMovie]] = (
    ratingsData['rating'].to_numpy()[isKnownMovie]
)

### Let’s now apply hierarchical clustering

In [5]:
from sklearn.cluster import AgglomerativeClustering

numberOfClusters = 10

# Cluster the movies (columns of the user-by-movie matrix, hence the transpose)
# with single-linkage agglomerative clustering on Euclidean distance.
# The distance is left at its default (Euclidean): the `affinity=` keyword was
# deprecated in scikit-learn 1.2 and removed in 1.4 in favour of `metric=`, so
# omitting it keeps this cell working on both old and new versions.
# NOTE(review): single linkage tends to "chain" points together; the label
# array displayed below is dominated by cluster 0. Consider linkage='ward' or
# 'average' if more balanced clusters are wanted.
hierarchicalCluster = AgglomerativeClustering(n_clusters=numberOfClusters, linkage='single')
hierarchicalCluster.fit_predict(userMovieMatrix.T)

array([0, 0, 0, ..., 0, 0, 0])

In [6]:
# Write the movie titles of every hierarchical cluster to a text file.
# Printing straight to the file handle (instead of swapping sys.stdout) means
# an error inside the loop can no longer leave the notebook's stdout pointing
# at a closed file. UTF-8 is set explicitly because titles contain non-ASCII
# characters.
with open('movieClusterHierarchical.txt', 'w', encoding='utf-8') as f:
    for i in range(numberOfClusters):
        # Positions of the movies assigned to cluster i.
        index = np.where(hierarchicalCluster.labels_ == i)[0]
        print('\n\nCluster %d' % i, file=f)
        # labels_ is positional (one entry per matrix column), so use .iloc.
        for title in movieName['title'].iloc[index]:
            print(title, file=f)

### Let’s now apply k-means clustering

In [7]:
from sklearn.cluster import KMeans

# Cluster the movies (columns of the user-by-movie matrix) with k-means.
# k-means starts from random centroids, so the seed is fixed to make the
# clusters reproducible across Restart & Run All. n_init is given explicitly
# because its default changed between scikit-learn releases.
kmeansCluster = KMeans(n_clusters=numberOfClusters, n_init=10, random_state=0)

# fit_predict fits the model and returns the training labels in one pass
# (the same values as kmeansCluster.labels_).
y_kmeans = kmeansCluster.fit_predict(userMovieMatrix.T)

In [8]:
# Write the movie titles of every k-means cluster to a text file.
# Printing straight to the file handle (instead of swapping sys.stdout) means
# an error inside the loop can no longer leave the notebook's stdout pointing
# at a closed file. UTF-8 is set explicitly because titles contain non-ASCII
# characters.
with open('movieClusterKmeans.txt', 'w', encoding='utf-8') as f:
    for i in range(numberOfClusters):
        # Positions of the movies assigned to cluster i.
        index = np.where(kmeansCluster.labels_ == i)[0]
        print('Cluster %d' % i, file=f)
        # labels_ is positional (one entry per matrix column), so use .iloc.
        for title in movieName['title'].iloc[index]:
            print(title, file=f)

### Let's see which cluster a movie of your choice belongs to.

In [9]:
# Print every movie that shares a cluster with `yourMovie`, first for the
# k-means clustering and then for the hierarchical clustering.
yourMovie = 'Lord of the Rings: The Two Towers, The (2002)'

# Positional row(s) of the chosen title in `movieName`; the title must match
# the 'title' column exactly.
indexOfYourMovieInDataFrame = np.where(movieName['title'] == yourMovie)[0]
if len(indexOfYourMovieInDataFrame) == 0:
    # Fail early with a clear message instead of an obscure comparison error
    # further down when the title is misspelled or absent.
    raise ValueError('Movie %r was not found in the movie table' % yourMovie)
movieRow = indexOfYourMovieInDataFrame[0]  # first match if the title repeats

for heading, labels in (('\nk-means cluster', kmeansCluster.labels_),
                        ('\n\nhierarchical cluster', hierarchicalCluster.labels_)):
    clusterOfMovie = labels[movieRow]
    # Positions of all movies carrying the same cluster label.
    index = np.where(labels == clusterOfMovie)[0]
    print(heading)
    print('===============')
    for title in movieName['title'].iloc[index]:
        print(title)


k-means cluster
Truman Show, The (1998)
American History X (1998)
Lock, Stock & Two Smoking Barrels (1998)
Matrix, The (1999)
Fight Club (1999)
Toy Story 2 (1999)
Green Mile, The (1999)
Gladiator (2000)
X-Men (2000)
Almost Famous (2000)
Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)
Snatch (2000)
Cast Away (2000)
Memento (2000)
Shrek (2001)
Donnie Darko (2001)
Monsters, Inc. (2001)
Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Ocean's Eleven (2001)
Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
Lord of the Rings: The Fellowship of the Ring, The (2001)
Beautiful Mind, A (2001)
Spider-Man (2002)
Bourne Identity, The (2002)
Minority Report (2002)
Harry Potter and the Chamber of Secrets (2002)
Lord of the Rings: The Two Towers, The (2002)
Catch Me If You Can (2002)
City of God (Cidade de Deus) (2002)
X2: X-Men United (2003)
Matrix Reloaded, The (2003)
Finding Nemo (2003)
Pirates of the Caribbean: The Curse of the Black Pearl 