In [4]:
import warnings
import re
from PIL import Image
from wordcloud import WordCloud
import pandas as pd
import numpy as np
import umap
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

def create_movie_recommendations():
    # Suppress warnings
    warnings.filterwarnings('ignore')

    # Display options for pandas dataframe
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', -1)

    # Load movie titles
    mv_title = pd.read_csv('movie.csv')

    # Load movie tags
    mv_tags = pd.read_csv('genome_scores.csv')

    # Load movie tag names
    mv_tags_names = pd.read_csv('genome_tags.csv')

    # Clean title in movies.csv file
    def movie_title_clean(title):
        # Search title for (year) pattern
        s = re.search('\(([^)]+)', title)
        year = 9999

        if s:
            title = title[:s.span()[0]].strip()
            year = s.group(1)
            if str(year).isdigit():
                year = int(year)
            else:
                year = 9999

        # If ', The' or ', A' is at the end of the string, move it to the front
        if title[-5:] == ', The':
            title = 'The ' + title[:-5]
        elif title[-4:] == ', An':
            title = 'An ' + title[:-4]
        elif title[-3:] == ', A':
            title = 'A ' + title[:-3]

        return title, year

    # Clean title and extract release year
    mv_title['title'] = mv_title['title'].str.strip()
    mv_title['year'] = mv_title['title'].map(movie_title_clean)
    mv_title['title'] = mv_title['year'].apply(lambda x: x[0])
    mv_title['Release Year'] = mv_title['year'].apply(lambda x: x[1])

    # Join dataframes to get tag description and movie title name all in one table
    mv_tags_denorm = mv_tags.merge(mv_tags_names, on='tagId').merge(mv_title, on='movieId')

    # For each movie, compute the relevance rank of tags to eventually rank order tags for each movie
    mv_tags_denorm['relevance_rank'] = mv_tags_denorm.groupby("movieId")["relevance"].rank(method="first", ascending=False).astype('int64')

    # Flatten tags table to get a list of the top 100 tags for each movie
    mv_tags_list = mv_tags_denorm[mv_tags_denorm.relevance_rank <= 100].groupby(['movieId', 'title'])['tag'].apply(lambda x: ','.join(x)).reset_index()
    mv_tags_list['tag_list'] = mv_tags_list.tag.map(lambda x: x.split(','))

    target_movie = 'Toy Story'

    target_tag_list = mv_tags_list[mv_tags_list.title == target_movie].tag_list.values[0]
    mv_tags_list_sim = mv_tags_list[['movieId','title','tag_list','tag']]
    mv_tags_list_sim['jaccard_sim'] = mv_tags_list_sim.tag_list.map(lambda x: len(set(x).intersection(set(target_tag_list))) / len(set(x).union(set(target_tag_list))))


    # Create a dictionary to map movie IDs to their genres
    movie_genre_dict = dict(zip(mv_title['movieId'], mv_title['genres']))

    # Function to get genres of similar movies
    def get_similar_movie_genres(similar_movies):
        similar_movie_genres = []
        first_movie_id = similar_movies[0]
        if first_movie_id in movie_genre_dict:
            first_movie_genres = set(movie_genre_dict[first_movie_id].split('|'))
        for movie_id in similar_movies[1:]:
            if movie_id in movie_genre_dict:
                genres = set(movie_genre_dict[movie_id].split('|'))
                common_genres = first_movie_genres.intersection(genres)
                if common_genres:
                    similar_movie_genres.append((movie_id, '|'.join(common_genres)))
        return similar_movie_genres

    # Get the most similar movies to 'Toy Story' based on tags
    similar_movies = mv_tags_list_sim.sort_values(by='jaccard_sim', ascending=False).head(10)['movieId'].tolist()

    # Get the genres of similar movies with intersecting genres
    similar_movie_genres = get_similar_movie_genres(similar_movies)

    # Create a corpus of movie tags
    mv_tags_corpus = mv_tags_list.tag.values
    stop_words = stopwords.words('english')

    # Tokenize document and clean
    def word_tokenize_clean(doc):
        tokens = word_tokenize(doc.lower())
        tokens = [word for word in tokens if word.isalpha() and not word in stop_words]
        return tokens

    # Preprocess corpus of movie tags before feeding it into the Doc2Vec model
    mv_tags_doc = [TaggedDocument(words=word_tokenize_clean(D), tags=[str(i)]) for i, D in enumerate(mv_tags_corpus)]

    # Instantiate Doc2Vec model
    max_epochs = 50
    vec_size = 20
    alpha = 0.025

    model = Doc2Vec(vector_size=vec_size,
                    alpha=alpha,
                    min_alpha=0.00025,
                    min_count=1,
                    dm=0)  # paragraph vector distributed bag-of-words (PV-DBOW)

    model.build_vocab(mv_tags_doc)

    # Train Doc2Vec model
    for epoch in range(max_epochs):
        model.train(mv_tags_doc,
                    total_examples=model.corpus_count,
                    epochs=model.epochs)
        model.alpha -= 0.0002
        model.min_alpha = model.alpha

    # List the document vectors
    mv_tags_vectors = model.dv.vectors

    # History of movies the user watched and liked
    user_movies = ['Money Train', 'Good Will Hunting', 'Gravity', 'Interstellar', 'Toy Story', 'Monsters, Inc']

    # Create a dictionary to map movies to their genres
    movie_to_genres = {}
    for index, row in mv_tags_list.iterrows():
        movie_to_genres[row['title']] = set(mv_title[mv_title['movieId'] == row['movieId']]['genres'].values[0].split('|'))

    # Create a list of all genres in the user's watchlist
    user_watchlist_genres = []
    for mv in user_movies:
        if mv in movie_to_genres:
            genres = movie_to_genres[mv]
            user_watchlist_genres.extend(genres)
            print(f"{mv} (Genres: {' | '.join(genres)})")

    # Count the frequency of each genre in the user's watchlist
    genre_counts = Counter(user_watchlist_genres)

    # Create a set of unique genres for accurate counting
    unique_genres = set(user_watchlist_genres)

    # Sort genres by frequency in descending order
    sorted_genres = sorted(unique_genres, key=lambda genre: genre_counts[genre], reverse=True)

    # Print genres in descending order of frequency
    print('\nGenres in User\'s Watchlist by Frequency (Descending Order):')
    for genre in sorted_genres:
        count = genre_counts[genre]
        print(f"{genre}: {count}")

    # Compute user vector as an average of movie vectors seen by that user
    user_movie_vector = np.zeros(shape=mv_tags_vectors.shape[1])
    for mv in user_movies:
        if mv in movie_to_genres:
            mv_genres = movie_to_genres[mv]
            mv_index = mv_tags_list[mv_tags_list["title"] == mv].index.values
            if len(mv_index) > 0:
                mv_index = mv_index[0]
                user_movie_vector += mv_tags_vectors[mv_index]

    user_movie_vector /= len(user_movies)

    # Find movies similar to user vector to generate movie recommendations
    print('\nMovie Recommendations with Intersecting Genres:')
    recommendations = []
    sims = model.docvecs.most_similar(positive=[user_movie_vector], topn=10)
    for i, j in sims:
        movie_sim = mv_tags_list.loc[int(i), "title"].strip()
        if movie_sim not in user_movies:
            mv_index = mv_tags_list[mv_tags_list["title"] == movie_sim].index.values
            if len(mv_index) > 0:
                mv_index = mv_index[0]
                if mv_index in mv_tags_list.index:
                    movie_genres = movie_to_genres.get(movie_sim, set())
                    intersecting_genres = set(user_watchlist_genres).intersection(movie_genres)
                    if intersecting_genres:
                        movie_id = mv_tags_list.loc[int(i), "movieId"]
                        recommendations.append({"movieId": movie_id, "genre": ', '.join(intersecting_genres)})
                        print(f"{movie_sim} is of Genre: {' | '.join(intersecting_genres)} with a Movie ID: {movie_id}")

    # Print the first recommended movie and its genre
    if recommendations:
        first_recommendation = recommendations[0]
        print(f"The movie '{first_recommendation['movieId']}' was recommended for you because '{first_recommendation['genre']}' is the most watched genre.")

    # Return the list of recommendations
    return recommendations

# Call the function to generate movie recommendations
movie_recommendations = create_movie_recommendations()


  from .autonotebook import tqdm as notebook_tqdm


Money Train (Genres: Thriller | Action | Crime | Drama | Comedy)
Good Will Hunting (Genres: Drama | Romance)
Gravity (Genres: IMAX | Sci-Fi | Action)
Interstellar (Genres: IMAX | Sci-Fi)
Toy Story (Genres: Adventure | Children | Fantasy | Animation | Comedy)

Genres in User's Watchlist by Frequency (Descending Order):
IMAX: 2
Action: 2
Drama: 2
Sci-Fi: 2
Comedy: 2
Adventure: 1
Thriller: 1
Animation: 1
Children: 1
Fantasy: 1
Crime: 1
Romance: 1

Movie Recommendations with Intersecting Genres:
Cast Away is of Genre: Drama with a Movie ID: 4022
Starman is of Genre: Drama | Adventure | Romance | Sci-Fi with a Movie ID: 3699
Star Trek: First Contact is of Genre: Thriller | Adventure | Action | Sci-Fi with a Movie ID: 1356
Rush is of Genre: Drama | Action with a Movie ID: 104913
The Curious Case of Benjamin Button is of Genre: Fantasy | Drama | Romance with a Movie ID: 64957
Star Trek II: The Wrath of Khan is of Genre: Thriller | Adventure | Action | Sci-Fi with a Movie ID: 1374
Moon is of G