In [None]:
from math import sqrt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# Mount Google Drive at /content/gdrive so the CSV files read by the
# following cells (under /content/gdrive/MyDrive/...) are accessible.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Folder on the mounted Drive that holds the dataset; kept in one place so
# the notebook can be pointed at another location by editing a single line.
DATA_DIR = '/content/gdrive/MyDrive/Taplix'

# Parsing options shared by both CSV files.
# NOTE(review): the files are decoded as latin-1; if they are actually UTF-8,
# accented characters in titles will be garbled — confirm the real encoding.
CSV_READ_OPTIONS = {'sep': ',', 'encoding': 'latin-1'}

# Reading ratings file
ratings = pd.read_csv(
    DATA_DIR + '/ratings.csv',
    usecols=['userId', 'movieId', 'rating', 'timestamp'],
    **CSV_READ_OPTIONS,
)

# Reading movies file
movies = pd.read_csv(
    DATA_DIR + '/movies.csv',
    usecols=['movieId', 'title', 'genres'],
    **CSV_READ_OPTIONS,
)


In [None]:
# Work on independent copies: a plain assignment would only alias the raw
# frames, so later in-place edits (e.g. rewriting the `genres` column of
# df_movies) would silently modify `movies` / `ratings` as well.
df_movies = movies.copy()
df_ratings = ratings.copy()

In [None]:
# Inspect the last 50 rows of the movies table (movieId, title, pipe-separated genres).
df_movies.tail(50)

Unnamed: 0,movieId,title,genres
9692,184471,Tomb Raider (2018),Action|Adventure|Fantasy
9693,184641,Fullmetal Alchemist 2018 (2017),Action|Adventure|Fantasy
9694,184721,First Reformed (2017),Drama|Thriller
9695,184791,Fred Armisen: Standup for Drummers (2018),Comedy
9696,184931,Death Wish (2018),Action|Crime|Drama|Thriller
9697,184987,A Wrinkle in Time (2018),Adventure|Children|Fantasy|Sci-Fi
9698,184997,"Love, Simon (2018)",Comedy|Drama
9699,185029,A Quiet Place (2018),Drama|Horror|Thriller
9700,185031,Alpha (2018),Adventure|Thriller
9701,185033,I Kill Giants (2018),Drama|Fantasy|Thriller


In [None]:
# Count how many movies carry each genre label and plot the distribution.

# Each movie's genres are stored as one pipe-separated string. Missing values
# are turned into empty strings first so they do not become a bogus "nan" genre.
generlist = df_movies['genres'].fillna('').astype(str).str.split('|')

geners_count = {}
for generlist_movie in generlist:
    for gener in generlist_movie:
        if gener:  # skip the empty token produced by a blank genre string
            geners_count[gener] = geners_count.get(gener, 0) + 1

# The placeholder is not a real genre. Pop with a default so the cell can be
# re-run even when the placeholder is absent (e.g. already blanked out).
geners_count.pop("(no genres listed)", None)

fig, ax = plt.subplots(figsize=(20, 7))
ax.bar(list(geners_count.keys()), list(geners_count.values()), color='g')
ax.set(title='Number of movies per genre', xlabel='Genre', ylabel='Number of movies')
plt.show()

NameError: ignored

In [None]:
# Preview the first five rating rows (userId, movieId, rating, timestamp).
df_ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Define a TF-IDF vectorizer for the genre strings. The token pattern keeps
# hyphens inside a token so hyphenated genre names are treated as one term.
# A raw string is used so `\-` reaches the regex engine unchanged without
# triggering Python's invalid-escape-sequence warning.
tfidf_movies_genres = TfidfVectorizer(token_pattern=r'[a-zA-Z0-9\-]+')

# Blank out missing values and the "(no genres listed)" placeholder so those
# movies contribute no genre tokens (the vectorizer cannot process NaN).
df_movies['genres'] = (
    df_movies['genres']
    .fillna("")
    .replace(to_replace="(no genres listed)", value="")
)

# Construct the TF-IDF matrix (one row per movie, one column per genre term)
# by fitting and transforming the genre strings.
tfidf_movies_genres_matrix = tfidf_movies_genres.fit_transform(df_movies['genres'])

# Pairwise similarity between all movies. TfidfVectorizer L2-normalises rows
# by default, so the plain dot product computed by linear_kernel equals the
# cosine similarity. Row/column i corresponds to the i-th row of df_movies.
cosine_sim_movies = linear_kernel(tfidf_movies_genres_matrix, tfidf_movies_genres_matrix)

In [None]:
def get_recommendations_based_on_genres(movie_title, cosine_sim_movies=cosine_sim_movies, top_n=2):
    """
    Recommend the movies whose genres are most similar to those of a given movie.

    :param movie_title: exact title, as stored in ``df_movies['title']``, of the
        movie to base the recommendation on; if several rows share the title,
        the first one is used
    :param cosine_sim_movies: square matrix of pairwise movie similarities whose
        row/column order matches the row order of ``df_movies``
    :param top_n: number of titles to return (defaults to 2)
    :return: pandas Series with the titles of the ``top_n`` most similar movies,
        most similar first; movies with equal scores keep their ``df_movies`` order
    :raises IndexError: if no row of ``df_movies`` has the given title
    :raises ValueError: if ``top_n`` is negative
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Locate the movie by row *position*: the similarity matrix is positional,
    # so index labels are only safe to use when df_movies has a default index.
    matching_positions = np.flatnonzero((df_movies['title'] == movie_title).to_numpy())
    if matching_positions.size == 0:
        raise IndexError(f"No movie titled {movie_title!r} found in df_movies")
    query_position = int(matching_positions[0])

    # Similarity of every movie to the query movie.
    scores = np.asarray(cosine_sim_movies[query_position]).ravel()

    # Rank by descending similarity; a stable sort keeps ties in row order.
    ranked_positions = np.argsort(-scores, kind='stable')

    # Remove the query movie explicitly. Many movies share an identical genre
    # set and therefore tie with the query at the top score, so the query is
    # not guaranteed to be the first entry and "skip the first" is unreliable.
    ranked_positions = ranked_positions[ranked_positions != query_position]

    # Return the titles of the top_n most similar movies.
    return df_movies['title'].iloc[ranked_positions[:top_n]]


In [None]:
# Example query: titles whose genres are most similar to "Love, Simon (2018)".
get_recommendations_based_on_genres("Love, Simon (2018)")


67                   Big Bully (1996)
74    Antonia's Line (Antonia) (1995)
Name: title, dtype: object

In [None]:
import pickle

In [None]:
# Serialise the TF-IDF genre matrix to the file `genre_pkl` (relative to the
# current working directory) so it can be reloaded without recomputing it.
with open('genre_pkl', mode='wb') as pkl_file:
    pickle.dump(tfidf_movies_genres_matrix, pkl_file)