In [1]:
import warnings
import re
from PIL import Image
from wordcloud import WordCloud
import pandas as pd
import numpy as np
import umap
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# display options for pandas dataframe
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', -1)

In [3]:
# movie titles
mv_title = pd.read_csv('movie.csv')
# movie tags
mv_tags = pd.read_csv('genome_scores.csv')
# movie tag names
mv_tags_names = pd.read_csv('genome_tags.csv')

In [4]:
# clean title in movies.csv file
def movie_title_clean(title):
  
  # search title for (year) pattern
  s = re.search('\(([^)]+)', title)

  # if pattern exists, remove from string
  year = 9999
  if s:

    title = title[:s.span()[0]].strip()
    year = s.group(1)
  
    # check if year is actual year
    if str(year).isdigit():
      year = int(year)
    else:
      year = 9999  
  
  # if ', The' or ', A' is a the end of the string, move it to the front
  # e.g. change "Illusionist, The" to "The Illusionist"
  if title[-5:] == ', The':
    title = 'The ' + title[:-5]
  elif title[-4:] == ', An':
    title = 'An ' + title[:-4]
  elif title[-3:] == ', A':
    title = 'A ' + title[:-3]
    
  return title, year

In [5]:
# clean title and extract release year
mv_title['title'] = mv_title['title'].str.strip()
mv_title['year'] = mv_title['title'].map(movie_title_clean)
mv_title['title'] = mv_title['year'].apply(lambda x: x[0])
mv_title['Release Year'] = mv_title['year'].apply(lambda x: x[1])

In [6]:
# join dataframes to get tag description and movie title name all in one table
mv_tags_denorm = mv_tags.merge(mv_tags_names, on = 'tagId').merge(mv_title, on = 'movieId')

# for each movie, compute the relevance rank of tags so we can eventually rank order tags for each movie
mv_tags_denorm['relevance_rank'] = mv_tags_denorm.groupby("movieId")["relevance"].rank(method = "first", ascending = False).astype('int64')


Unnamed: 0,movieId,title,tag,relevance,relevance_rank
1035,1,Toy Story,toys,0.99925,1
243,1,Toy Story,computer animation,0.9985,2
785,1,Toy Story,pixar animation,0.996,3
588,1,Toy Story,kids and family,0.99075,4
63,1,Toy Story,animation,0.98575,5
587,1,Toy Story,kids,0.97925,6
784,1,Toy Story,pixar,0.96675,7
203,1,Toy Story,children,0.96425,8
185,1,Toy Story,cartoon,0.9565,9
535,1,Toy Story,imdb top 250,0.942,10


In [7]:
# flatten tags table to get a list of top 100 tags for each movie
mv_tags_list = mv_tags_denorm[mv_tags_denorm.relevance_rank <= 100].groupby(['movieId','title'])['tag'].apply(lambda x: ','.join(x)).reset_index()
mv_tags_list['tag_list'] = mv_tags_list.tag.map(lambda x: x.split(','))

In [8]:
# compute Jaccard Index to get most similar movies to target movie

pd.reset_option('display.max_colwidth')

target_movie = 'Toy Story'

target_tag_list = mv_tags_list[mv_tags_list.title == target_movie].tag_list.values[0]
mv_tags_list_sim = mv_tags_list[['movieId','title','tag_list','tag']]
mv_tags_list_sim['jaccard_sim'] = mv_tags_list_sim.tag_list.map(lambda x: len(set(x).intersection(set(target_tag_list))) / len(set(x).union(set(target_tag_list))))
#print(f'Movies most similar to {target_movie} based on tags:')
#text = ','.join(mv_tags_list_sim.sort_values(by = 'jaccard_sim', ascending = False).head(25)['tag'].values)
#mv_tags_list_sim.sort_values(by = 'jaccard_sim', ascending = False).head(10)

Movies most similar to Toy Story based on tags:


Unnamed: 0,movieId,title,tag_list,tag,jaccard_sim
0,1,Toy Story,"[3d, action, adventure, affectionate, animal m...","3d,action,adventure,affectionate,animal movie,...",1.0
4331,4886,"Monsters, Inc.","[adventure, affectionate, allegory, alone in t...","adventure,affectionate,allegory,alone in the w...",0.724138
2064,2355,A Bug's Life,"[3d, action, adventure, alter ego, animal movi...","3d,action,adventure,alter ego,animal movie,ani...",0.680672
2769,3114,Toy Story 2,"[action, adventure, animal movie, animals, ani...","action,adventure,animal movie,animals,animated...",0.652893
5445,6377,Finding Nemo,"[adventure, affectionate, animal movie, animal...","adventure,affectionate,animal movie,animals,an...",0.612903
4602,5218,Ice Age,"[3d, action, adventure, animal movie, animals,...","3d,action,adventure,animal movie,animals,anima...",0.612903
7994,50872,Ratatouille,"[allegory, animal movie, animals, animated, an...","allegory,animal movie,animals,animated,animati...",0.574803
7766,45517,Cars,"[action, adventure, alter ego, animal movie, a...","action,adventure,alter ego,animal movie,animal...",0.574803
3809,4306,Shrek,"[adventure, animal movie, animals, animated, a...","adventure,animal movie,animals,animated,animat...",0.550388
554,588,Aladdin,"[action, adventure, affectionate, animal movie...","action,adventure,affectionate,animal movie,ani...",0.550388


In [9]:

# Create a dictionary to map movie IDs to their genres
movie_genre_dict = dict(zip(mv_title['movieId'], mv_title['genres']))

# Function to get genres of similar movies
def get_similar_movie_genres(similar_movies):
    similar_movie_genres = []
    # Get the genres of the first movie in the list
    first_movie_id = similar_movies[0]
    if first_movie_id in movie_genre_dict:
        first_movie_genres = set(movie_genre_dict[first_movie_id].split('|'))
    for movie_id in similar_movies[1:]:
        if movie_id in movie_genre_dict:
            genres = set(movie_genre_dict[movie_id].split('|'))
            # Find the intersection of genres
            common_genres = first_movie_genres.intersection(genres)
            if common_genres:
                similar_movie_genres.append((movie_id, '|'.join(common_genres)))
    return similar_movie_genres

# Get the most similar movies to 'Toy Story' based on tags
similar_movies = mv_tags_list_sim.sort_values(by='jaccard_sim', ascending=False).head(10)['movieId'].tolist()

# Get the genres of similar movies with intersecting genres
similar_movie_genres = get_similar_movie_genres(similar_movies)



Genres of movies most similar to Toy Story based on tags with intersecting genres:
Movie ID: 4886, Genres: Adventure|Children|Animation|Comedy|Fantasy
Movie ID: 2355, Genres: Children|Adventure|Animation|Comedy
Movie ID: 3114, Genres: Adventure|Children|Animation|Comedy|Fantasy
Movie ID: 6377, Genres: Children|Adventure|Animation|Comedy
Movie ID: 5218, Genres: Children|Adventure|Animation|Comedy
Movie ID: 50872, Genres: Children|Animation
Movie ID: 45517, Genres: Children|Animation|Comedy
Movie ID: 4306, Genres: Adventure|Children|Animation|Comedy|Fantasy
Movie ID: 588, Genres: Children|Adventure|Animation|Comedy


In [10]:
# corpus of movie tags
mv_tags_corpus = mv_tags_list.tag.values
stop_words = stopwords.words('english')

# tokenize document and clean
def word_tokenize_clean(doc):
  
  # split into lower case word tokens
  tokens = word_tokenize(doc.lower())
  
  # remove tokens that are not alphabetic (including punctuation) and not a stop word
  tokens = [word for word in tokens if word.isalpha() and not word in stop_words]
  
  return tokens

# preprocess corpus of movie tags before feeding it into Doc2Vec model
mv_tags_doc = [TaggedDocument(words=word_tokenize_clean(D), tags=[str(i)]) for i, D in enumerate(mv_tags_corpus)]

In [13]:
# instantiate Doc2Vec model

max_epochs = 50
vec_size = 20
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm=0) # paragraph vector distributed bag-of-words (PV-DBOW)
  
model.build_vocab(mv_tags_doc)

In [17]:
# train Doc2Vec model

for epoch in range(max_epochs):
  
  model.train(mv_tags_doc,
              total_examples=model.corpus_count,
              epochs=model.epochs)
  # decrease the learning rate
  model.alpha -= 0.0002
  # fix the learning rate, no decay
  model.min_alpha = model.alpha

In [14]:
# List the document vectors
mv_tags_vectors = model.dv.vectors
mv_tags_vectors.shape

(10381, 20)

In [15]:
from collections import Counter

# History of movies the user watched and liked
user_movies = ['Money Train', 'Good Will Hunting', 'Gravity', 'Interstellar', 'Toy Story', 'Monsters, Inc']

# Create a dictionary to map movies to their genres
movie_to_genres = {}
for index, row in mv_tags_list.iterrows():
    movie_to_genres[row['title']] = set(mv_title[mv_title['movieId'] == row['movieId']]['genres'].values[0].split('|'))

# Create a list of all genres in the user's watchlist
user_watchlist_genres = []
for mv in user_movies:
    if mv in movie_to_genres:
        genres = movie_to_genres[mv]
        user_watchlist_genres.extend(genres)
        print(f"{mv} (Genres: {' | '.join(genres)})")

# Count the frequency of each genre in the user's watchlist
genre_counts = Counter(user_watchlist_genres)

# Create a set of unique genres for accurate counting
unique_genres = set(user_watchlist_genres)

# Sort genres by frequency in descending order
sorted_genres = sorted(unique_genres, key=lambda genre: genre_counts[genre], reverse=True)

# Print genres in descending order of frequency
print('\nGenres in User\'s Watchlist by Frequency (Descending Order):')
for genre in sorted_genres:
    count = genre_counts[genre]
    print(f"{genre}: {count}")

# Compute user vector as an average of movie vectors seen by that user
user_movie_vector = np.zeros(shape=mv_tags_vectors.shape[1])
for mv in user_movies:
    if mv in movie_to_genres:
        mv_genres = movie_to_genres[mv]
        mv_index = mv_tags_list[mv_tags_list["title"] == mv].index.values
        if len(mv_index) > 0:
            mv_index = mv_index[0]
            user_movie_vector += mv_tags_vectors[mv_index]

user_movie_vector /= len(user_movies)

# Find movies similar to user vector to generate movie recommendations
print('\nMovie Recommendations with Intersecting Genres:')
recommendations = []
sims = model.docvecs.most_similar(positive=[user_movie_vector], topn=10)
for i, j in sims:
    movie_sim = mv_tags_list.loc[int(i), "title"].strip()
    if movie_sim not in user_movies:
        mv_index = mv_tags_list[mv_tags_list["title"] == movie_sim].index.values
        if len(mv_index) > 0:
            mv_index = mv_index[0]
            if mv_index in mv_tags_list.index:
                movie_genres = movie_to_genres.get(movie_sim, set())
                intersecting_genres = set(user_watchlist_genres).intersection(movie_genres)
                if intersecting_genres:
                    movie_id = mv_tags_list.loc[int(i), "movieId"]
                    recommendations.append({"movieId": movie_id, "genre": ', '.join(intersecting_genres)})
                    print(f"{movie_sim} is of Genre: {' | '.join(intersecting_genres)} with a Movie ID: {movie_id}")

# Print the first recommended movie and its genre
if recommendations:
    first_recommendation = recommendations[0]
    print(f"The movie '{first_recommendation['movieId']}' was recommended for you because '{first_recommendation['genre']}' is the most watched genre.")

# Return the list of recommendations
recommendations


Money Train (Genres: Crime | Drama | Thriller | Action | Comedy)
Good Will Hunting (Genres: Drama | Romance)
Gravity (Genres: Action | IMAX | Sci-Fi)
Interstellar (Genres: IMAX | Sci-Fi)
Toy Story (Genres: Adventure | Children | Animation | Comedy | Fantasy)

Genres in User's Watchlist by Frequency (Descending Order):
IMAX: 2
Drama: 2
Action: 2
Comedy: 2
Sci-Fi: 2
Crime: 1
Adventure: 1
Romance: 1
Children: 1
Thriller: 1
Animation: 1
Fantasy: 1

Movie Recommendations with Intersecting Genres:
Changing Lanes is of Genre: Thriller | Drama with a Movie ID: 5293
You Will Meet a Tall Dark Stranger is of Genre: Comedy | Romance with a Movie ID: 80864
Love Is All There Is is of Genre: Drama | Comedy with a Movie ID: 1045
Dragonslayer is of Genre: Adventure | Action | Fantasy with a Movie ID: 5039
Tears of the Sun is of Genre: Thriller | Action | Drama with a Movie ID: 6213
Joy Ride is of Genre: Adventure | Thriller with a Movie ID: 4821
Thursday is of Genre: Crime | Action | Thriller with a Mo

[{'movieId': 5293, 'genre': 'Thriller, Drama'},
 {'movieId': 80864, 'genre': 'Comedy, Romance'},
 {'movieId': 1045, 'genre': 'Drama, Comedy'},
 {'movieId': 5039, 'genre': 'Adventure, Action, Fantasy'},
 {'movieId': 6213, 'genre': 'Thriller, Action, Drama'},
 {'movieId': 4821, 'genre': 'Adventure, Thriller'},
 {'movieId': 27022, 'genre': 'Crime, Action, Thriller'},
 {'movieId': 26055, 'genre': 'Drama'},
 {'movieId': 2430, 'genre': 'Children, Adventure, Drama'},
 {'movieId': 107718, 'genre': 'Crime, Drama'}]