## Clustering movies

We cluster movies based on user ratings, using both hierarchical and k-means clustering. Note that the resulting clusters are written to text files (`movieClusterHierarchical.txt` and `movieClusterKmeans.txt`).

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Import the ratings

In [3]:
# Read the user ratings table from the CSV file in the working directory.
ratings_csv = 'ratings.csv'
ratingsData = pd.read_csv(ratings_csv)

# Preview the first five rows to check which columns were loaded.
ratingsData.head(5)

Unnamed: 0,user id,item id,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


### Import the movie data

In [4]:
# Read the movie table (id, title, genres) from the CSV file in the working directory.
movies_csv = 'movies.csv'
movieName = pd.read_csv(movies_csv)

# Number of rows in the movie table; used as the number of columns of the
# user-by-movie matrix built further down.
numMovies = movieName.shape[0]

# Preview the first five rows.
movieName.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Create a user-by-movie matrix that we will cluster.

Note, this will take a moment.

In [5]:
# Build the (numUsers x numMovies) rating matrix that the clustering cells transpose
# and cluster. Entry [u, m] holds the rating user u gave to the movie stored in row m
# of movieName; pairs without a rating stay 0.
#
# The row/column lookups are done once for the whole table instead of scanning the
# user and movie id arrays again for every single rating.

# Sorted unique user ids and, for every rating, the position of its user in userID.
userID, ratingRow = np.unique(ratingsData['user id'].to_numpy(), return_inverse=True)
numUsers = len(userID)
userMovieMatrix = np.zeros((numUsers, numMovies))

# Column of every rating = row position of its movie id inside movieName, or -1 when
# the id is not listed there. NOTE: get_indexer requires the movieId values to be
# unique and raises InvalidIndexError otherwise.
ratingColumn = pd.Index(movieName['movieId']).get_indexer(ratingsData['item id'])

ratingCells = pd.DataFrame({
    'row': ratingRow,
    'column': ratingColumn,
    'rating': ratingsData['rating'].to_numpy(),
})

# Ratings for movies missing from movieName are skipped. If a user rated the same
# movie several times, only the last rating in file order is kept; NumPy does not
# guarantee which value wins when an index pair repeats in one fancy assignment.
ratingCells = ratingCells[ratingCells['column'] >= 0].drop_duplicates(
    subset=['row', 'column'], keep='last')

userMovieMatrix[ratingCells['row'].to_numpy(), ratingCells['column'].to_numpy()] = (
    ratingCells['rating'].to_numpy())

### Let’s now apply hierarchical clustering

In [8]:
from sklearn.cluster import AgglomerativeClustering

numberOfClusters = 20

# Single-linkage agglomerative clustering of the movies. userMovieMatrix is
# (users x movies), so its transpose has one row (sample) per movie.
#
# The distance is left at the estimator's default, which is Euclidean. The former
# `affinity='euclidean'` keyword was deprecated in scikit-learn 1.2 and removed in 1.4
# (renamed to `metric`), so passing it makes this cell fail on current releases.
# Naming neither keyword works on old and new versions alike.
hierarchicalCluster = AgglomerativeClustering(n_clusters=numberOfClusters, linkage='single')

# Fit and display the cluster label assigned to each movie.
hierarchicalCluster.fit_predict(userMovieMatrix.T)

array([10,  0,  0, ...,  0,  0,  0])

In [9]:
# Write the hierarchical clusters to 'movieClusterHierarchical.txt': for each cluster a
# 'Cluster <i>' header preceded by two blank lines, then one movie title per line.
#
# Output goes straight to the file through print(..., file=f) rather than by swapping
# the global sys.stdout, so an exception raised while writing cannot leave the
# notebook's stdout pointing at a closed file.
# The encoding is set explicitly so titles with non-ASCII characters are written the
# same way on every platform.
with open('movieClusterHierarchical.txt', 'w', encoding='utf-8') as f:
    for i in range(numberOfClusters):
        # Positions (rows of movieName) of the movies assigned to cluster i.
        index = np.where(hierarchicalCluster.labels_ == i)[0]
        print('\n\nCluster %d' % i, file=f)
        # `index` holds positions, not index labels, hence .iloc.
        for title in movieName['title'].iloc[index]:
            print(title, file=f)

### Let’s now apply k-means clustering

In [7]:
from sklearn.cluster import KMeans

numberOfClusters = 20

# k-means clustering of the movies. userMovieMatrix is (users x movies), so its
# transpose has one row (sample) per movie.
#
# random_state fixes the centroid initialisation so the clusters, and the files
# written from them, are the same on every run.
# n_init is set explicitly because its default differs between scikit-learn releases.
kmeansCluster = KMeans(n_clusters=numberOfClusters, n_init=10, random_state=0)

# fit_predict fits the model and returns the label of each movie in one pass, rather
# than fitting and then predicting on the same data again.
y_kmeans = kmeansCluster.fit_predict(userMovieMatrix.T)

In [8]:
# Write the k-means clusters to 'movieClusterKmeans.txt': for each cluster a
# 'Cluster <i>' header line, then one movie title per line.
#
# Output goes straight to the file through print(..., file=f) rather than by swapping
# the global sys.stdout, so an exception raised while writing cannot leave the
# notebook's stdout pointing at a closed file.
# The encoding is set explicitly so titles with non-ASCII characters are written the
# same way on every platform.
with open('movieClusterKmeans.txt', 'w', encoding='utf-8') as f:
    for i in range(numberOfClusters):
        # Positions (rows of movieName) of the movies assigned to cluster i.
        index = np.where(kmeansCluster.labels_ == i)[0]
        print('Cluster %d' % i, file=f)
        # `index` holds positions, not index labels, hence .iloc.
        for title in movieName['title'].iloc[index]:
            print(title, file=f)

### Let's see which cluster a movie of your choice resides in.

In [9]:
# Title to look up; it must match an entry of movieName['title'] exactly.
yourMovie = 'Karate Kid, The (1984)'

# Row position(s) in movieName whose title equals yourMovie.
indexOfYourMovieInDataFrame = np.where(movieName['title'] == yourMovie)[0]
if len(indexOfYourMovieInDataFrame) == 0:
    # Fail with a clear message instead of a broadcasting error or an empty listing.
    raise ValueError('%r was not found in movieName["title"]' % yourMovie)

# Cluster label of the movie. If the title occurs more than once, the first match is
# used so the comparison below is always against a single label.
clusterOfMovie = kmeansCluster.labels_[indexOfYourMovieInDataFrame[0]]

# Positions of all movies that share that k-means cluster.
index = np.where(kmeansCluster.labels_ == clusterOfMovie)[0]
print('\nk-means cluster')
print('===============')
# `index` holds positions, not index labels, hence .iloc.
for title in movieName['title'].iloc[index]:
    print(title)

# To list the movie's hierarchical cluster instead, use hierarchicalCluster.labels_ in
# place of kmeansCluster.labels_ in the two lookups above.


k-means cluster
Grumpier Old Men (1995)
Father of the Bride Part II (1995)
Copycat (1995)
Powder (1995)
Pocahontas (1995)
Indian in the Cupboard, The (1995)
From Dusk Till Dawn (1996)
Muppet Treasure Island (1996)
Rumble in the Bronx (Hont faan kui) (1995)
Bad Boys (1995)
Casper (1995)
Congo (1995)
Desperado (1995)
First Knight (1995)
Johnny Mnemonic (1995)
Judge Dredd (1995)
Mallrats (1995)
Nine Months (1995)
Species (1995)
Strange Days (1995)
Billy Madison (1995)
Tommy Boy (1995)
Naked Gun 33 1/3: The Final Insult (1994)
Reality Bites (1994)
River Wild, The (1994)
Carlito's Way (1993)
City Slickers II: The Legend of Curly's Gold (1994)
Coneheads (1993)
Dazed and Confused (1993)
Free Willy (1993)
Hot Shots! Part Deux (1993)
Kalifornia (1993)
Last Action Hero (1993)
Robin Hood: Men in Tights (1993)
Rudy (1993)
So I Married an Axe Murderer (1993)
Three Musketeers, The (1993)
Brady Bunch Movie, The (1995)
Heavy Metal (1981)
Aristocats, The (1970)
Primal Fear (1996)
Courage Under Fire (1