In [578]:
import pandas as pd
import seaborn as snas
import matplotlib.pyplot as plt
import numpy as np
from ast import literal_eval # use literal eval instead of eval
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score

In [389]:
credits = pd.read_csv('../dataset/credits.csv')
keywords = pd.read_csv('../dataset/keywords.csv')
links = pd.read_csv('../dataset/links.csv')
md = pd.read_csv('../dataset/movies_metadata.csv')
ratings = pd.read_csv('../dataset/ratings.csv')

  md = pd.read_csv('../dataset/movies_metadata.csv')


In [26]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def embed_texts(sentences):
    embeddings = model.encode(sentences, normalize_embeddings=True)
    return embeddings

In [391]:
eval_columns = ['belongs_to_collection', 'production_companies', 'production_countries', 'spoken_languages', 'genres']

for eval_column in eval_columns:
    md[eval_column] = md[eval_column].fillna('[]').apply(literal_eval)
    md[eval_column] = md[eval_column].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

bad_data = md[md['imdb_id'] == '0'].index
md.drop(bad_data, inplace=True)

cols_to_float = ['revenue', 'vote_count', 'vote_average', 'budget', 'popularity']
md[cols_to_float] = md[cols_to_float].astype(float)

md['id'] = md['id'].astype(int)

In [392]:
eval_columns = ['cast', 'crew']

for eval_column in eval_columns:
    credits[eval_column] = credits[eval_column].fillna('[]').apply(literal_eval)

credits['id'] = credits['id'].astype(int)

In [393]:
eval_columns = ['keywords']

for eval_column in eval_columns:
    keywords[eval_column] = keywords[eval_column].fillna('[]').apply(literal_eval)

keywords['id'] = keywords['id'].astype(int)

### Genres Encoder

In [420]:
all_genres = md['genres'].explode().unique()

def encode_genres(genres_df):
    genres_df = genres_df.copy()

    genres_df = genres_df.apply(
        lambda genres_list: np.array([genre in genres_list for genre in all_genres], dtype=int)
    )

    return genres_df

### Top K Credtis

In [427]:
def get_top_k_casts(top_k, cast_df):
    def process_cast(cast_list_str):
        cast_list = json.loads(cast_list_str) if isinstance(cast_list_str, str) else cast_list_str
        ids = [int(i['id']) for i in cast_list]
        return (ids + [0] * top_k)[:top_k]
    return cast_df.apply(process_cast)

In [429]:
def get_top_k_directors(top_k, crew_df):
    def process_directors(crew_list_str):
        crew_list = json.loads(crew_list_str) if isinstance(crew_list_str, str) else crew_list_str
        ids = [int(d['id']) for d in crew_list if d.get('job') == 'Director']
        return (ids + [0] * top_k)[:top_k]
    return crew_df.apply(process_directors)

### Top K Keywords

In [356]:
def get_top_k_keywords(top_k, keyword_df):
    keyword_df = keyword_df.copy()
    # Add underscore between same group keyword to they treated as one token
    keyword_df = keyword_df.apply(lambda x: " ".join(['_'.join(i['name'].split()) for i in x][:top_k]))
    return keyword_df

### Movie Textual Data

In [425]:
def get_movie_textual_data(overview_df, tagline_df):
    result_df = overview_df.fillna(' ') + ' ' + tagline_df.fillna(' ')
    return result_df

## Movie Latent vector

In [467]:
def get_movie_latent_vector(top_k, movie_dataframe):
    # Genres
    genres_df = encode_genres(movie_dataframe['genres'])

    # Credits
    cast_df = get_top_k_casts(top_k, movie_dataframe['cast'])
    director_df = get_top_k_directors(1, movie_dataframe['crew'])

    # Keywords (Textual)
    keywords_df = get_top_k_keywords(10, movie_dataframe['keywords'])
    keywords_embedding_df = keywords_df.apply(embed_texts)

    # Movie Textual Data (Textual)
    movie_textual_df = get_movie_textual_data(movie_dataframe['overview'], movie_dataframe['tagline'])
    movie_textual_embedding_df = movie_textual_df.apply(embed_texts)

    result = pd.concat([genres_df, cast_df, director_df, keywords_embedding_df, movie_textual_embedding_df], axis=1)
    result = result.rename({0: 'overview_embedding', 'keywords': 'keywords_embedding'}, axis=1)
    return result

In [546]:
movie_latent_datas = get_movie_latent_vector(3, df.sample(100))
movie_latent_datas

Unnamed: 0,genres,cast,crew,keywords_embedding,overview_embedding
27289,"[0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[13578, 10160, 34981]",[15705],"[-0.0012443794, 0.05501268, 0.01243257, 0.0286...","[0.006677093, -0.054407287, -0.05573156, 0.029..."
33871,"[0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[1238990, 17124, 1299249]",[583175],"[-0.11883843, 0.04829872, -0.0025480906, -0.01...","[-0.076529264, 0.111708134, -0.050451986, 0.02..."
35659,"[0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, ...","[79455, 138554, 128479]",[565681],"[-0.010180261, 0.037788138, -0.04850083, -0.04...","[-0.07389996, 0.034996036, -0.07015445, 0.0639..."
15935,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, ...","[16896, 14500, 8231]",[67451],"[-0.010666075, 0.048292287, -0.038215708, 0.08...","[0.049407076, 0.023529641, -0.029362736, 0.089..."
14870,"[0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[19536, 40462, 5657]",[16837],"[-0.018039033, -0.06284508, 0.0760691, 0.03363...","[-0.032726604, 0.0119959265, 0.020066652, -0.0..."
...,...,...,...,...,...
14595,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[1951, 28641, 1073162]",[29471],"[-0.021537993, 0.044938304, -0.0009304776, 0.0...","[0.014717803, 0.11948516, 0.06547682, -0.00943..."
19612,"[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[78216, 146287, 35137]",[71506],"[-0.12019701, -0.0018522454, -0.0019567779, 0....","[0.062389, 0.04296044, 0.022625083, 0.1564795,..."
13241,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[2922, 78848, 45380]",[30129],"[0.006877668, 0.021917293, -0.017348241, 0.071...","[-0.0071391216, 0.038150106, -0.014431906, -0...."
34760,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[57674, 117525, 155621]",[130050],"[-0.11883843, 0.04829872, -0.0025480906, -0.01...","[0.0387425, 0.05333087, 0.0047938246, -0.03891..."


## Merge Datasets

In [471]:
df = pd.merge(md, credits).merge(keywords)

## Get Similarities

### Genres Similarity

In [534]:
genres1 = np.expand_dims(movie_latent_datas.iloc[5, 0], axis=0)
genres2 = np.expand_dims(movie_latent_datas.iloc[6, 0], axis=0)

In [536]:
cosine_similarity(genres1, genres2)

array([[0.5]])

### Cast Similarity

In [572]:
cast1 = movie_latent_datas.loc[26318, 'cast']
cast2 = movie_latent_datas.loc[15888, 'cast']

In [603]:
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)

    # Remove 0 IDS, because they aren't valid
    intersection = s1.intersection(s2) - {0}
    
    return float(len(intersection) / len(s1.union(s2)))

In [605]:
jaccard_similarity(cast1, cast2)

0.2

### Directors Similarity

In [613]:
crew1 = movie_latent_datas.loc[35198, 'crew']
crew2 = movie_latent_datas.loc[23906, 'crew']

In [615]:
crew1, crew2

([80570], [80570])

In [617]:
jaccard_similarity(crew1, crew2)

1.0

### Keywords Similarity

In [626]:
keyword1 = np.expand_dims(movie_latent_datas.iloc[0, 3], axis=0)
keyword2 = np.expand_dims(movie_latent_datas.iloc[1, 3], axis=0)

In [628]:
cosine_similarity(keyword1, keyword2)

array([[0.18156236]], dtype=float32)

### Overview Similarity

In [632]:
overview1 = np.expand_dims(movie_latent_datas.iloc[0, 4], axis=0)
overview2 = np.expand_dims(movie_latent_datas.iloc[1, 4], axis=0)

In [634]:
cosine_similarity(overview1, overview2)

array([[0.27793163]], dtype=float32)

## All similarities together

In [816]:
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)

    # Remove 0 IDS, because they aren't valid
    intersection = s1.intersection(s2) - {0}
    
    return float(len(intersection) / len(s1.union(s2)))

def get_similarity(row1, row2):
    # Genres
    genres1 = np.expand_dims(row1['genres'], axis=0)
    genres2 = np.expand_dims(row2['genres'], axis=0)
    genres_similarity = cosine_similarity(genres1, genres2)[0, 0]

    # Cast
    cast1 = row1['cast']
    cast2 = row2['cast']   
    cast_similarity = jaccard_similarity(cast1, cast2)
        
    # Crew
    crew1 = row1['crew']
    crew2 = row2['crew']
    crew_similarity = jaccard_similarity(crew1, crew2)

    # Keyword
    keyword1 = np.expand_dims(row1['keywords_embedding'], axis=0)
    keyword2 = np.expand_dims(row2['keywords_embedding'], axis=0)
    keyword_similarity = cosine_similarity(keyword1, keyword2)[0, 0]

    # Overview
    overview1 = np.expand_dims(row1['overview_embedding'], axis=0)
    overview2 = np.expand_dims(row2['overview_embedding'], axis=0)
    overview_similarity = cosine_similarity(overview1, overview2)[0, 0]

    # Weights:
    genre_w = 0.25
    cast_w = 0.10
    crew_w = 0.10
    keyword_w = 0.20
    overview_w = 0.35

    final_similarity = genre_w * genres_similarity + cast_w * cast_similarity + crew_w * crew_similarity + keyword_w * keyword_similarity + overview_w * overview_similarity

    return final_similarity

In [741]:
def create_similarity_matrix(latent_data):
    # Get the number of items
    n_items = len(latent_data)
    
    # Initialize the similarity matrix
    similarity_matrix = np.zeros((n_items, n_items))
    
    # Compute similarities
    for i in range(n_items):
        for j in range(i, n_items):  # Use symmetry to reduce computation
            similarity = get_similarity(latent_data.iloc[i], latent_data.iloc[j])
            similarity_matrix[i, j] = similarity
            similarity_matrix[j, i] = similarity  # Symmetric matrix
    
    return similarity_matrix

In [744]:
def get_recommendations(movie_identifier, similarity_matrix, movie_dataframe, top_k=5):
    # Ensure movie_dataframe has the same index as the similarity matrix
    movie_dataframe = movie_dataframe.reset_index(drop=True)
    
    # Find the movie index
    if isinstance(movie_identifier, str):
        if 'title' not in movie_dataframe.columns:
            raise ValueError("Movie DataFrame must contain a 'title' column.")
        try:
            movie_idx = movie_dataframe[movie_dataframe['title'] == movie_identifier].index[0]
        except IndexError:
            raise ValueError(f"Movie title '{movie_identifier}' not found in the dataset.")
    else:
        movie_idx = movie_identifier
        if movie_idx >= len(movie_dataframe) or movie_idx < 0:
            raise ValueError(f"Invalid movie index: {movie_idx}")
    
    # Get similarity scores for the movie
    sim_scores = similarity_matrix[movie_idx]
    
    # Get indices of top-k similar movies (excluding the movie itself)
    # Use argsort to sort indices by similarity score in descending order
    top_indices = np.argsort(sim_scores)[::-1][1:top_k+1]
    top_scores = sim_scores[top_indices]
    
    # Get movie titles and their similarity scores
    recommendations = []
    for idx, score in zip(top_indices, top_scores):
        movie_title = movie_dataframe.iloc[idx]['title']
        recommendations.append((movie_title, score))
    
    return recommendations

In [826]:
real_world_df = df[df['vote_count'] > 425]
movie_dataframe = real_world_df
movie_latent_datas = get_movie_latent_vector(5, movie_dataframe)
sim_matrix = create_similarity_matrix(movie_latent_datas)

In [828]:
movie_latent_datas.to_csv('movie_latent_datas.csv', index=False)

In [834]:
movie_title = "The Dark Knight"
try:
    recommendations = get_recommendations(movie_title, sim_matrix, movie_dataframe, top_k=10)
    print(f"Recommendations for '{movie_title}':")
    for title, score in recommendations:
        print(f"- {title} (Similarity: {score:.4f})")
except ValueError as e:
    print(e)

Recommendations for 'The Dark Knight':
- The Dark Knight Rises (Similarity: 0.7764)
- Batman Begins (Similarity: 0.7090)
- Batman: The Killing Joke (Similarity: 0.5486)
- Batman & Robin (Similarity: 0.4888)
- Brick Mansions (Similarity: 0.4843)
- Training Day (Similarity: 0.4704)
- Batman Forever (Similarity: 0.4655)
- American Psycho (Similarity: 0.4504)
- Heat (Similarity: 0.4441)
- Dead Man Down (Similarity: 0.4423)


In [794]:
real_world_df = df[df['vote_count'] > 425]

In [814]:
real_world_df

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,[],30000000.0,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,[English],Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,[],65000000.0,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
5,False,[],60000000.0,"[Action, Crime, Drama, Thriller]",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,"[English, Español]",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0,"[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'credit_id': '52fe4292c3a36847f802916d', 'de...","[{'id': 642, 'name': 'robbery'}, {'id': 703, '..."
9,False,[],58000000.0,"[Adventure, Action, Thriller]",http://www.mgm.com/view/movie/757/Goldeneye/,710,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,...,"[English, Pусский, Español]",Released,No limits. No fears. No substitutes.,GoldenEye,False,6.6,1194.0,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de...","[{'id': 701, 'name': 'cuba'}, {'id': 769, 'nam..."
15,False,[],52000000.0,"[Drama, Crime]",,524,tt0112641,en,Casino,The life of the gambling paradise – Las Vegas ...,...,[English],Released,No one stays at the top forever.,Casino,False,7.8,1343.0,"[{'cast_id': 4, 'character': 'Sam 'Ace' Rothst...","[{'credit_id': '52fe424dc3a36847f80139cd', 'de...","[{'id': 383, 'name': 'poker'}, {'id': 726, 'na..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45777,False,[],0.0,"[Science Fiction, Thriller]",,406990,tt1536537,en,What Happened to Monday,In a world where families are limited to one c...,...,[English],Released,Seven sisters. One identity.,What Happened to Monday,False,7.3,598.0,"[{'cast_id': 0, 'character': 'Monday / Tuesday...","[{'credit_id': '5814111e92514152d502abf9', 'de...","[{'id': 3713, 'name': 'chase'}, {'id': 3864, '..."
45785,False,[],30000000.0,"[Action, Thriller]",http://www.atomicblonde.com/,341013,tt2406566,en,Atomic Blonde,An undercover MI6 agent is sent to Berlin duri...,...,"[svenska, English, Deutsch, Pусский]",Released,,Atomic Blonde,False,6.1,748.0,"[{'cast_id': 0, 'character': 'Lorraine Brought...","[{'credit_id': '555da697925141757e0010d8', 'de...","[{'id': 220, 'name': 'berlin'}, {'id': 470, 'n..."
45831,False,[],100000000.0,"[Action, Drama, History, Thriller, War]",http://www.dunkirkmovie.com/,374720,tt5013056,en,Dunkirk,The miraculous evacuation of Allied soldiers f...,...,"[English, Français, Deutsch]",Released,The event that shaped our world,Dunkirk,False,7.5,2712.0,"[{'cast_id': 56, 'character': 'Tommy', 'credit...","[{'credit_id': '598138b5925141519b008a5e', 'de...","[{'id': 254, 'name': 'france'}, {'id': 966, 'n..."
46001,False,[],260000000.0,"[Action, Science Fiction, Thriller, Adventure]",http://www.transformersmovie.com/,335988,tt3371366,en,Transformers: The Last Knight,"Autobots and Decepticons are at war, with huma...",...,[English],Released,"For one world to live, the other must die.",Transformers: The Last Knight,False,6.2,1440.0,"[{'cast_id': 2, 'character': 'Cade Yeager', 'c...","[{'credit_id': '5553e38bc3a368208f000502', 'de...","[{'id': 10466, 'name': 'knight'}, {'id': 10607..."
