# Clustering Algorithms - K Means

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.cluster import KMeans
from ast import literal_eval

# Read the two TMDB tables: credits and movie metadata
dataset1 = pd.read_csv('tmdb_5000_credits.csv')
dataset2 = pd.read_csv('tmdb_5000_movies.csv')

# Relabel the credits columns so the join key is named 'id', matching the merge below
dataset1.columns = ['id', 'title', 'cast', 'crew']

# Both tables carry a title, so the merge produces title_x / title_y.
# Keep the movies-table title under its plain name and discard the credits copy.
movies = (
    dataset2.merge(dataset1, on='id')
    .rename(columns={'title_x': 'title'})
    .drop(columns=['title_y'])
)

Data Preparation

In [2]:

def extractFeature(obj):
    """Return the list of ``name`` values held in a raw feature cell.

    Parameters
    ----------
    obj : str | list | object
        Either a string containing a Python-literal list of dicts (parsed with
        ``literal_eval``), an already-parsed list, or anything else.

    Returns
    -------
    list
        The ``'name'`` entry of every dict in the list. Items that are already
        plain strings are passed through unchanged, so the function can safely
        be re-applied to a column it has already processed (for example when
        the notebook cell that overwrites the column is run a second time).
        Any input that is neither a string nor a list yields ``[]``.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``literal_eval`` when a string cannot be parsed.
    KeyError
        When a dict item has no ``'name'`` key.
    """
    if isinstance(obj, str):
        obj = literal_eval(obj)
    if not isinstance(obj, list):
        return []
    # Strings are names produced by an earlier pass; everything else is
    # expected to be a mapping with a 'name' key.
    return [item if isinstance(item, str) else item['name'] for item in obj]


def topCastNames(cast_list, top_n=5):
    """Return the ``name`` of each of the first ``top_n`` entries of ``cast_list``.

    ``cast_list`` is expected to be a list of dicts with a ``'name'`` key;
    any non-list input produces an empty list.
    """
    if not isinstance(cast_list, list):
        return []
    leadingEntries = cast_list[:top_n]
    return [entry['name'] for entry in leadingEntries]


Feature Extraction

In [3]:
# Cast: parse the stringified list first, then keep only the leading names per movie
movies['cast'] = movies['cast'].apply(literal_eval)
movies['castNames'] = movies['cast'].apply(topCastNames)

# Keywords and genres: replace each raw cell with its list of names
movies['keywords'] = movies['keywords'].apply(extractFeature)
movies['genres'] = movies['genres'].apply(extractFeature)

# Short handles on the three feature columns used by the following cells
genres, keywords, cast = (
    movies[featureColumn] for featureColumn in ('genres', 'keywords', 'castNames')
)

Handle missing values

In [4]:
# Replace missing entries with an empty list instead of dropping them.
# Dropping rows would leave a feature Series shorter than `movies`, and the
# cluster labels fitted on these features are written back onto `movies`
# column-wise, which requires exactly one label per movie row.
genres = genres.apply(lambda value: value if isinstance(value, list) else [])
keywords = keywords.apply(lambda value: value if isinstance(value, list) else [])
cast = cast.apply(lambda value: value if isinstance(value, list) else [])

Encoding the Categorical Features

In [5]:
# A separate binarizer per feature, so each one is fitted on its own label set
genreMlb, keywordsMlb, castMlb = (
    MultiLabelBinarizer(),
    MultiLabelBinarizer(),
    MultiLabelBinarizer(),
)

# Fit each binarizer on its feature lists and keep the encoded matrices
encodedGenres = genreMlb.fit_transform(genres)
encodedKeywords = keywordsMlb.fit_transform(keywords)
encodedCast = castMlb.fit_transform(cast)

Clustering

In [33]:
k = 100

# Identical estimator settings for the three feature spaces
kmeansOptions = {'n_clusters': k, 'random_state': 42, 'n_init': 'auto'}

genresKmeans = KMeans(**kmeansOptions)
keywordsKmeans = KMeans(**kmeansOptions)
castKmeans = KMeans(**kmeansOptions)

# Fit in the same order as the encodings were produced
genresKmeans.fit(encodedGenres)
keywordsKmeans.fit(encodedKeywords)
castKmeans.fit(encodedCast)

Attach each feature's K-Means cluster labels to the movies DataFrame as a new column.

In [34]:
# Store every fitted model's per-row cluster id in its own column of `movies`
for clusterColumn, fittedModel in (
    ('genresClusters', genresKmeans),
    ('keywordsClusters', keywordsKmeans),
    ('castClusters', castKmeans),
):
    movies[clusterColumn] = fittedModel.labels_

Function to recommend the movies on the basis of Genres.

In [35]:
def recommendationsOnGenres(movieTitle, movies, count=5):
    """Recommend the most popular movies from the same genre cluster as ``movieTitle``.

    Parameters
    ----------
    movieTitle : str
        Title to look up in ``movies['title']`` (exact match).
    movies : pandas.DataFrame
        Must contain the columns ``'title'``, ``'genresClusters'`` and
        ``'popularity'``.
    count : int, default 5
        Maximum number of rows to return. The default matches the keyword- and
        cast-based recommenders.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` in the selected movie's genre cluster, excluding any
        row with the same title, sorted by descending popularity and truncated
        to ``count`` rows. An empty DataFrame (after printing a message) when
        the title is not present.
    """
    selectedMovie = movies[movies['title'] == movieTitle]
    if selectedMovie.empty:
        print(f"Movie '{movieTitle}' not found.")
        return pd.DataFrame()

    # If several rows share the title, the first one determines the cluster.
    selectedCluster = selectedMovie['genresClusters'].values[0]
    clusterMovies = movies[movies['genresClusters'] == selectedCluster]
    recommendations = clusterMovies[clusterMovies['title'] != movieTitle]
    recommendations = recommendations.sort_values(by='popularity', ascending=False)
    return recommendations.head(count)


Function to recommend the movies on the basis of Keywords.

In [36]:
def recommendationsOnKeywords(movieTitle, movies, count=5):
    """Return up to ``count`` popular movies from the keyword cluster of ``movieTitle``.

    Looks the title up in ``movies['title']``; when it is absent a message is
    printed and an empty DataFrame is returned. Otherwise the rows sharing the
    first match's ``'keywordsClusters'`` value, minus rows with the same title,
    are returned ordered by descending ``'popularity'``.
    """
    titleMatches = movies['title'] == movieTitle
    if not titleMatches.any():
        print(f"Movie '{movieTitle}' not found.")
        return pd.DataFrame()

    clusterId = movies.loc[titleMatches, 'keywordsClusters'].values[0]
    inSameCluster = movies['keywordsClusters'] == clusterId
    candidates = movies[inSameCluster & ~titleMatches]
    return candidates.sort_values(by='popularity', ascending=False).head(count)


Function to recommend the movies on the basis of Cast.

In [37]:
def recommendationsOnCast(movieTitle, movies, count=5):
    """Return up to ``count`` popular movies from the cast cluster of ``movieTitle``.

    The title is matched exactly against ``movies['title']``. If there is no
    match, a message is printed and an empty DataFrame comes back. Otherwise
    the result holds the rows whose ``'castClusters'`` value equals that of the
    first matching row, without the rows carrying the same title, sorted by
    ``'popularity'`` from highest to lowest.
    """
    isSelected = movies['title'] == movieTitle
    if not isSelected.any():
        print(f"Movie '{movieTitle}' not found.")
        return pd.DataFrame()

    castCluster = movies.loc[isSelected, 'castClusters'].values[0]
    peers = movies[(movies['castClusters'] == castCluster) & ~isSelected]
    ranked = peers.sort_values(by='popularity', ascending=False)
    return ranked.head(count)


In [38]:
# Genre-cluster recommendations for one example title
moviesRecommendedOnGenres = recommendationsOnGenres('Deadpool', movies, count=5)
print("\nRecommendations based on genres:", moviesRecommendedOnGenres[['title']], sep="\n")


Recommendations based on genres:
                       title
342             Men in Black
150          Men in Black II
783                Mortdecai
531  The Man from U.N.C.L.E.
70            Wild Wild West


In [39]:
# Keyword-cluster recommendations for the same example title
moviesRecommendedOnKeywords = recommendationsOnKeywords('Deadpool', movies, count=5)
print("\nRecommendations based on keywords:", moviesRecommendedOnKeywords[['title']], sep="\n")


Recommendations based on keywords:
                                 title
94             Guardians of the Galaxy
26          Captain America: Civil War
65                     The Dark Knight
9   Batman v Superman: Dawn of Justice
16                        The Avengers


In [40]:
# Cast-cluster recommendations, shown together with the cast names they were clustered on
moviesRecommendedOnCast = recommendationsOnCast('Deadpool', movies, count=5)
print("\nRecommendations based on cast:", moviesRecommendedOnCast[['title', 'castNames']], sep="\n")


Recommendations based on cast:
                       title  \
546                  Minions   
95              Interstellar   
94   Guardians of the Galaxy   
127       Mad Max: Fury Road   
28            Jurassic World   

                                             castNames  
546  [Sandra Bullock, Jon Hamm, Michael Keaton, All...  
95   [Matthew McConaughey, Jessica Chastain, Anne H...  
94   [Chris Pratt, Zoe Saldana, Dave Bautista, Vin ...  
127  [Tom Hardy, Charlize Theron, Nicholas Hoult, H...  
28   [Chris Pratt, Bryce Dallas Howard, Irrfan Khan...  


# Evaluation

In [54]:
import pandas as pd
import numpy as np

# Evaluation settings. They get their own names so that this cell does not
# rebind `k`, which the clustering cell uses for the number of clusters.
sampleSize = 100            # number of titles drawn for the evaluation
recommendationCount = 500   # maximum recommendations requested per title
sampledMovies = movies['title'].sample(n=sampleSize, random_state=42)

precisionList = []
recallList = []
f1ScoreList = []

for title in sampledMovies:
    selectedMovie = movies[movies['title'] == title]
    if selectedMovie.empty:
        continue

    # Expected set: every other title sharing at least one genre with the sampled movie
    selectedGenres = set(selectedMovie['genres'].values[0])
    sharesGenre = movies['genres'].apply(
        lambda movieGenres: not selectedGenres.isdisjoint(movieGenres)
    )
    expectedRecommendations = movies[sharesGenre]
    expectedMovieTitles = set(expectedRecommendations['title'].values) - {title}

    # Actual set: what the genre-cluster recommender returns for this title
    actualRecommendations = recommendationsOnGenres(title, movies, count=recommendationCount)
    actualMovieTitles = set(actualRecommendations['title'].values)

    relevantRecommendations = actualMovieTitles.intersection(expectedMovieTitles)

    # Each metric falls back to 0 when its denominator would be zero
    if len(actualMovieTitles) > 0:
        precision = len(relevantRecommendations) / len(actualMovieTitles)
    else:
        precision = 0

    if len(expectedMovieTitles) > 0:
        recall = len(relevantRecommendations) / len(expectedMovieTitles)
    else:
        recall = 0

    if precision + recall > 0:
        f1Score = 2 * (precision * recall) / (precision + recall)
    else:
        f1Score = 0

    precisionList.append(precision)
    recallList.append(recall)
    f1ScoreList.append(f1Score)

# Unweighted means over the sampled titles
averagePrecision = np.mean(precisionList)
averageRecall = np.mean(recallList)
averageF1Score = np.mean(f1ScoreList)

print(f"Average Precision: {averagePrecision:.2f}")
print(f"Average Recall: {averageRecall:.4f}")
print(f"Average F1 Score: {averageF1Score:.2f}")


Average Precision: 0.99
Average Recall: 0.0555
Average F1 Score: 0.09


Number of clusters (k) = 10, recommendation list size = 100

- Average Precision: 0.94
- Average Recall: 0.0429
- Average F1 Score: 0.08

Number of clusters (k) = 50, recommendation list size = 100

- Average Precision: 0.98
- Average Recall: 0.0513
- Average F1 Score: 0.09

Number of clusters (k) = 100, recommendation list size = 200

- Average Precision: 0.99
- Average Recall: 0.0512
- Average F1 Score: 0.09

In [55]:
def _precisionRecallF1(actualTitles, expectedTitles):
    """Return (precision, recall, F1) of one recommendation set against its expected set."""
    relevant = actualTitles.intersection(expectedTitles)
    # Each metric falls back to 0 when its denominator would be zero
    precision = len(relevant) / len(actualTitles) if len(actualTitles) > 0 else 0
    recall = len(relevant) / len(expectedTitles) if len(expectedTitles) > 0 else 0
    if precision + recall > 0:
        f1Score = 2 * (precision * recall) / (precision + recall)
    else:
        f1Score = 0
    return precision, recall, f1Score


def evaluate_recommendations(feature, recommendFunction, count=200):
    """Print average precision, recall and F1 of a recommender over the sampled titles.

    Reads the notebook-level ``movies`` DataFrame and ``sampledMovies`` titles.
    For every sampled title, the expected set is every other title in
    ``movies`` that shares at least one value of ``feature`` with it, and the
    actual set is the titles returned by ``recommendFunction``.

    Parameters
    ----------
    feature : str
        Column of ``movies`` to compare on: ``'genres'``, ``'keywords'`` or
        ``'castNames'``. For any other value no title is scored, so the printed
        averages are ``nan``.
    recommendFunction : callable
        Called as ``recommendFunction(title, movies, count=count)``; its result
        must expose a ``'title'`` column.
    count : int, default 200
        Maximum number of recommendations requested per title.

    Returns
    -------
    None
        The averages are printed, not returned.
    """
    if feature in ('genres', 'keywords', 'castNames'):
        # Convert every row's list to a set once, rather than once per sampled title
        featureSets = movies[feature].apply(set)
        titlesToScore = sampledMovies
    else:
        featureSets = None
        titlesToScore = []

    precisionList = []
    recallList = []
    f1ScoreList = []

    for title in titlesToScore:
        selectedMovie = movies[movies['title'] == title]
        if selectedMovie.empty:
            continue

        # If several rows share the title, the first one supplies the feature values
        selectedFeature = set(selectedMovie[feature].values[0])
        sharesValue = featureSets.apply(lambda other: not selectedFeature.isdisjoint(other))
        expectedMovieTitles = set(movies[sharesValue]['title'].values) - {title}

        actualRecommendations = recommendFunction(title, movies, count=count)
        actualMovieTitles = set(actualRecommendations['title'].values)

        precision, recall, f1Score = _precisionRecallF1(actualMovieTitles, expectedMovieTitles)
        precisionList.append(precision)
        recallList.append(recall)
        f1ScoreList.append(f1Score)

    # Unweighted means over the scored titles
    averagePrecision = np.mean(precisionList)
    averageRecall = np.mean(recallList)
    averageF1Score = np.mean(f1ScoreList)

    print(f"Feature - {feature}")
    print(f"Average Precision: {averagePrecision:.2f}")
    print(f"Average Recall: {averageRecall:.4f}")
    print(f"Average F1 Score: {averageF1Score:.2f}\n")


# Score each recommender against the feature it was clustered on
for featureName, recommender in (
    ('genres', recommendationsOnGenres),
    ('keywords', recommendationsOnKeywords),
    ('castNames', recommendationsOnCast),
):
    evaluate_recommendations(featureName, recommender)


Feature - genres
Average Precision: 0.99
Average Recall: 0.0512
Average F1 Score: 0.09

Feature - keywords
Average Precision: 0.26
Average Recall: 0.1331
Average F1 Score: 0.15

Feature - castNames
Average Precision: 0.03
Average Recall: 0.0552
Average F1 Score: 0.02

