In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# One tuple per movie: (movie_id, title, genre, director).
movie_rows = [
    (1, "Inception", "Sci-Fi Thriller", "Christopher Nolan"),
    (2, "Interstellar", "Sci-Fi Drama", "Christopher Nolan"),
    (3, "The Matrix", "Sci-Fi Action", "Lana Wachowski"),
    (4, "The Dark Knight", "Action Crime Drama", "Christopher Nolan"),
    (5, "Memento", "Mystery Thriller", "Christopher Nolan"),
    (6, "Dunkirk", "War Drama", "Christopher Nolan"),
]
column_names = ["movie_id", "title", "genre", "director"]

# Pivot the row tuples into the column-oriented mapping stored in `data`.
data = {name: list(values) for name, values in zip(column_names, zip(*movie_rows))}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,movie_id,title,genre,director
0,1,Inception,Sci-Fi Thriller,Christopher Nolan
1,2,Interstellar,Sci-Fi Drama,Christopher Nolan
2,3,The Matrix,Sci-Fi Action,Lana Wachowski
3,4,The Dark Knight,Action Crime Drama,Christopher Nolan
4,5,Memento,Mystery Thriller,Christopher Nolan


# Cosine Similarity as a measure of similarity
Larger values indicate greater similarity (1 means the two vectors point in the same direction, 0 means they share no terms).

In [17]:
# Turn each movie's genre string into a TF-IDF vector (English stop words removed).
tfidf_genre = TfidfVectorizer(stop_words="english")
genre_matrix = tfidf_genre.fit_transform(df["genre"])
# Rows are movies, columns are the terms learned from the genre strings.
# NOTE(review): 8 columns for 7 distinct genre words suggests "Sci-Fi" is split
# into two tokens, which would weight that genre twice — confirm this is intended.
genre_matrix.shape

(6, 8)

In [18]:
# Vectorize the director names with their own vectorizer, so the director
# vocabulary is learned separately from the genre vocabulary.
tfidf_director = TfidfVectorizer(stop_words="english")
director_matrix = tfidf_director.fit_transform(df["director"])
# Rows are movies, columns are the terms learned from the director strings.
director_matrix.shape

(6, 4)

In [22]:
# Pairwise cosine similarity of every movie against every other movie.
# With a single argument the matrix is compared against itself.
genre_sim = cosine_similarity(genre_matrix)
director_sim = cosine_similarity(director_matrix)

# Label rows and columns with the movie titles so the printout is readable.
movie_titles = df["title"]
for heading, sim in (
    ("Genre Similarity Matrix:\n", genre_sim),
    ("Director Similarity Matrix:\n", director_sim),
):
    print(heading, pd.DataFrame(sim, index=movie_titles, columns=movie_titles))

Genre Similarity Matrix:
 title            Inception  Interstellar  The Matrix  The Dark Knight  \
title                                                                   
Inception         1.000000      0.625954    0.587727         0.000000   
Interstellar      0.625954      1.000000    0.625954         0.272489   
The Matrix        0.587727      0.625954    1.000000         0.358939   
The Dark Knight   0.000000      0.272489    0.358939         1.000000   
Memento           0.407137      0.000000    0.000000         0.000000   
Dunkirk           0.000000      0.328635    0.000000         0.268648   

title             Memento   Dunkirk  
title                                
Inception        0.407137  0.000000  
Interstellar     0.000000  0.328635  
The Matrix       0.000000  0.000000  
The Dark Knight  0.000000  0.268648  
Memento          1.000000  0.000000  
Dunkirk          0.000000  1.000000  
Director Similarity Matrix:
 title            Inception  Interstellar  The Matrix  Th

In [33]:
# Relative importance of each feature in the blended score.
genre_weight, director_weight = 0.7, 0.3

# Weighted sum of the two similarity matrices, element by element.
combined_sim = genre_weight * genre_sim + director_weight * director_sim

In [39]:
def recommend_movies(movie_title, df, combined_sim, top_n=3):
    """Return the titles of the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Title to look up in ``df["title"]``; the first matching row is used.
    df : pandas.DataFrame
        Movie table with a ``"title"`` column. Row *positions* must line up
        with the rows of ``combined_sim``.
    combined_sim : 2-D array-like
        Similarity matrix where ``combined_sim[i][j]`` scores row ``i`` against
        row ``j``; larger means more similar.
    top_n : int, default 3
        Maximum number of titles to return.

    Returns
    -------
    list of str
        Up to ``top_n`` titles, most similar first. The queried movie itself
        is never included.

    Raises
    ------
    IndexError
        If ``movie_title`` does not occur in ``df["title"]``.
    """
    # Locate the movie by row position rather than by index label, so a
    # DataFrame with a non-default index still lines up with the matrix.
    positions = (df["title"] == movie_title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"movie title {movie_title!r} not found in df['title']")
    idx = int(positions[0])

    scores = combined_sim[idx]

    # Rank every *other* movie, dropping the query explicitly. Slicing off the
    # first sorted entry is unsafe: a tie at the top can put another movie
    # first and leave the query itself in the results.
    candidates = [pos for pos in range(len(scores)) if pos != idx]
    candidates.sort(key=lambda pos: scores[pos], reverse=True)

    titles = df["title"]
    return [titles.iloc[pos] for pos in candidates[:max(top_n, 0)]]

# Example: the movies closest to "Inception" under the combined score.
inception_recommendations = recommend_movies("Inception", df, combined_sim)
print(inception_recommendations)

['Interstellar', 'Memento', 'The Matrix']


# Euclidean Distance as a measure of similarity
A smaller distance means the movies are more similar.
(In this example each distance $d$ is converted to a similarity score $1 / (1 + d)$, so larger values mean more similar.)

In [72]:
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import euclidean_distances

In [73]:
# Vectorize genre. This re-fits a fresh vectorizer and rebinds `tfidf_genre` and
# `genre_matrix`; from here on `genre_matrix` is a dense array, not a sparse matrix.
tfidf_genre = TfidfVectorizer(stop_words="english")
genre_matrix = tfidf_genre.fit_transform(df["genre"]).toarray()  # Convert sparse matrix to array

# Vectorize director (likewise rebinds `tfidf_director` and `director_matrix` as dense)
tfidf_director = TfidfVectorizer(stop_words="english")
director_matrix = tfidf_director.fit_transform(df["director"]).toarray()

In [92]:
# Rescale every row before measuring distances.
genre_matrix_norm = normalize(genre_matrix)
director_matrix_norm = normalize(director_matrix)

# Pairwise Euclidean distance of every movie against every other movie.
# With a single argument the matrix is compared against itself.
genre_dist = euclidean_distances(genre_matrix_norm)
director_dist = euclidean_distances(director_matrix_norm)

# Map a distance d to a similarity score: d = 0 gives 1, larger d gives a smaller score.
genre_sim = 1.0 / (1.0 + genre_dist)
director_sim = 1.0 / (1.0 + director_dist)

# Label rows and columns with the movie titles so the printout is readable.
movie_titles = df["title"]
for heading, sim in (
    ("Genre Similarity:\n", genre_sim),
    ("Director Similarity:\n", director_sim),
):
    print(heading, pd.DataFrame(sim, index=movie_titles, columns=movie_titles))

(6, 8)
Genre Similarity:
 title            Inception  Interstellar  The Matrix  The Dark Knight  \
title                                                                   
Inception         1.000000      0.536215    0.524097         0.414214   
Interstellar      0.536215      1.000000    0.536215         0.453259   
The Matrix        0.524097      0.536215    1.000000         0.468975   
The Dark Knight   0.414214      0.453259    0.468975         1.000000   
Memento           0.478718      0.414214    0.414214         0.414214   
Dunkirk           0.414214      0.463228    0.414214         0.452607   

title             Memento   Dunkirk  
title                                
Inception        0.478718  0.414214  
Interstellar     0.414214  0.463228  
The Matrix       0.414214  0.414214  
The Dark Knight  0.414214  0.452607  
Memento          1.000000  0.414214  
Dunkirk          0.414214  1.000000  
Director Similarity:
 title            Inception  Interstellar  The Matrix  The Dark 

In [77]:
# Relative importance of each feature in the blended score.
genre_weight, director_weight = 0.7, 0.3

# Blend the distance-derived similarity matrices, element by element.
combined_sim = genre_weight * genre_sim + director_weight * director_sim

In [78]:
# Same query as before, now ranked with the distance-derived similarity.
euclidean_recommendations = recommend_movies("Inception", df, combined_sim)
print(euclidean_recommendations)

['Interstellar', 'Memento', 'The Dark Knight']


# Dot Product as a measure of similarity
Larger values indicate greater similarity.

In [71]:
import numpy as np
# Pairwise dot products between the dense TF-IDF rows of every movie.
# The printed diagonal is 1.0 (each row dotted with itself), and the values
# match the cosine-similarity matrix printed earlier in the notebook.
genre_sim = np.dot(genre_matrix, genre_matrix.T)
director_sim = np.dot(director_matrix, director_matrix.T)

# Label rows and columns with the movie titles so the printout is readable.
movie_titles = df["title"]
for heading, sim in (
    ("Genre Similarity Matrix:\n", genre_sim),
    ("Director Similarity Matrix:\n", director_sim),
):
    print(heading, pd.DataFrame(sim, index=movie_titles, columns=movie_titles))

# Spot check: the same score for a single pair (rows 0 and 1), computed directly.
np.dot(genre_matrix[0], genre_matrix[1])

Genre Similarity Matrix:
 title            Inception  Interstellar  The Matrix  The Dark Knight  \
title                                                                   
Inception         1.000000      0.625954    0.587727         0.000000   
Interstellar      0.625954      1.000000    0.625954         0.272489   
The Matrix        0.587727      0.625954    1.000000         0.358939   
The Dark Knight   0.000000      0.272489    0.358939         1.000000   
Memento           0.407137      0.000000    0.000000         0.000000   
Dunkirk           0.000000      0.328635    0.000000         0.268648   

title             Memento   Dunkirk  
title                                
Inception        0.407137  0.000000  
Interstellar     0.000000  0.328635  
The Matrix       0.000000  0.000000  
The Dark Knight  0.000000  0.268648  
Memento          1.000000  0.000000  
Dunkirk          0.000000  1.000000  
Director Similarity Matrix:
 title            Inception  Interstellar  The Matrix  Th

0.6259538619624666

In [60]:
# Relative importance of each feature in the blended score.
genre_weight, director_weight = 0.7, 0.3

# Final similarity score: weighted sum of the dot-product matrices.
combined_sim = genre_weight * genre_sim + director_weight * director_sim

In [61]:
# Same query once more, ranked with the dot-product similarity.
dot_product_recommendations = recommend_movies("Inception", df, combined_sim)
print(dot_product_recommendations)

['Interstellar', 'Memento', 'The Matrix']
