<a href="https://colab.research.google.com/github/supernova276/HybridMovieRec/blob/main/Hyrbridrec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
!pip install fuzzywuzzy #used to calculate the similarity between strings

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [30]:
!pip install surprise #module for recommender systems ,provides ready touse pred algo like svd

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [31]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, SVDpp, KNNBasic
import re 
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from fuzzywuzzy import fuzz

from surprise import Dataset
from surprise.model_selection import cross_validate,train_test_split, GridSearchCV
from surprise import NormalPredictor
from surprise import Reader

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from IPython.display import display
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [32]:
# Directory that holds the two CSV files. '/content' is the location used by the
# original session; change this single constant to run the notebook elsewhere.
DATA_DIR = '/content'

movies = pd.read_csv(DATA_DIR + '/movies.csv')
ratings = pd.read_csv(DATA_DIR + '/ratings.csv')


In [33]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [34]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [35]:

# Distinct rating values that occur in the ratings table.
ratings_array = ratings.rating.unique()
# Lower and upper bound of the observed rating scale.
min_rating, max_rating = ratings_array.min(), ratings_array.max()
print( ratings_array )

[4.  5.  3.  2.  1.  4.5 3.5 2.5 0.5 1.5]


In [36]:
#converting a series to a dict
# title -> movieId. If the same title appears on more than one row, the dict can
# only keep one of those rows.
movie_map = pd.Series(movies.movieId.values, index=movies.title).to_dict()
# movieId -> title, built straight from the movies table instead of by inverting
# movie_map, so every movieId keeps an entry even when titles repeat.
reverse_movie_map = pd.Series(movies.title.values, index=movies.movieId).to_dict()
# movieId -> row label of that movie in the movies frame.
movieId_to_index_map = pd.Series(movies.index.values, index=movies.movieId).to_dict()
# All distinct movieIds.
movieId_all_array = movies['movieId'].unique()

In [37]:
def get_movieId( movie_name, min_ratio=60, title_to_id=None ):
    """
    Return the movieId that corresponds to a movie name.

    An exact title match is returned immediately. Otherwise every known title
    is compared with the query (case-insensitively) using fuzz.ratio and the
    best-scoring title at or above `min_ratio` is chosen.

    Parameters
    ----------
    movie_name: string, the name of the movie w/ or w/o the year

    min_ratio: int optional (default=60), minimum fuzz.ratio score a title
               must reach to be considered a match

    title_to_id: dict optional, mapping {title: movieId} to search;
                 defaults to the notebook-level `movie_map`

    Return
    ------
    the movieId, or None when no title is similar enough (a message is
    printed in that case)
    """
    if title_to_id is None:
        title_to_id = movie_map

    # Exact title: no fuzzy matching needed.
    if movie_name in title_to_id:
        return title_to_id[movie_name]

    query = movie_name.lower()  # lower-case once instead of once per title
    best = None                 # (title, movie_id, ratio) of the best candidate so far
    for title, movie_id in title_to_id.items():
        ratio = fuzz.ratio(title.lower(), query)
        # '>=' keeps the LAST of several equally good titles, which is what
        # the previous "sort ascending, then reverse" selection produced.
        if ratio >= min_ratio and (best is None or ratio >= best[2]):
            best = (title, movie_id, ratio)

    if best is None:
        print("Oh! This movie does not exist in the database.")
        return None

    print( "The matched item might be:", best[0], ", ratio=", best[2] )
    return best[1]

# Content-based filtering with pairwise TF-IDF similarity

In [38]:
# NLTK resources shared by every tokenizer() call; filled on first use.
_TOKENIZER_CACHE = {}


def tokenizer(text):
    """
    Split a '|'-separated string into lemmatized, lower-cased tokens.

    Tokens that appear in NLTK's English stop-word list are dropped. The
    stop-word check is made on the token as it appears in `text`, before
    lemmatizing and lower-casing.

    Parameters
    ----------
    text: string, tokens separated by '|'

    Return
    ------
    list of strings, the processed tokens
    """
    # The stop-word list used to be rebuilt for every single token and the
    # lemmatizer for every document; build both once and reuse them.
    if not _TOKENIZER_CACHE:
        _TOKENIZER_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _TOKENIZER_CACHE['lemmatizer'] = WordNetLemmatizer()
    stop_words = _TOKENIZER_CACHE['stop_words']
    lemmatizer = _TOKENIZER_CACHE['lemmatizer']

    return [lemmatizer.lemmatize(word).lower()
            for word in text.split('|')
            if word not in stop_words]

In [39]:
# A custom tokenizer replaces the regex-based token extraction, so the default
# token_pattern is never used; passing None says so explicitly and avoids
# scikit-learn's "token_pattern will not be used" warning.
tfid = TfidfVectorizer(analyzer='word', tokenizer=tokenizer, token_pattern=None)

In [40]:
tfidf_matrix = tfid.fit_transform(movies['genres'])

In [41]:
cos_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)

In [42]:
print(tfidf_matrix.shape)
print(cos_sim.shape)
print(movies.shape)

(9742, 20)
(9742, 9742)
(9742, 3)


# SVD matrix-factorization model for collaborative filtering

In [43]:


# Before modelling, the ratings are converted into a Dataset object from the
# Surprise library. A Reader describing the rating scale is needed to parse the
# DataFrame, and the columns must be in this order: user ID, item ID, rating.

features=['userId','movieId','rating']
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(ratings[features], reader)

# Hyperparameter tuning with GridSearchCV.
# Every combination of the values in the parameter grid is evaluated, and the
# best combination is reported for each of the chosen metrics.
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}

# GridSearchCV arguments used below:
#   SVD        - the algorithm class being tuned
#   param_grid - dictionary with all of the parameter values to try
#   measures   - evaluation metrics: root mean squared error and mean absolute error
#   cv         - number of cross-validation folds for each parameter combination
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

In [44]:
gs.fit(data)

In [45]:
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.8932350459425479
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [46]:
print(gs.best_score['mae'])
print(gs.best_params['mae'])

0.6915270728932857
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [47]:
best_params = gs.best_params['rmse']
model_svd = gs.best_estimator['rmse']
model_svd.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2d14aab4d0>

In [48]:
def get_rating_from_prediction( prediction, ratings_array ):
    """
    Snap a continuous prediction onto the discrete rating scale.

    Parameters
    ----------
    prediction: float, the value predicted by the model

    ratings_array: 1D array of the allowed (discrete) rating values

    Return
    ------
    the entry of ratings_array closest to the prediction; when two entries
    are equally close, the one that comes first in ratings_array
    """
    # Absolute distance of every allowed rating to the prediction, in one step.
    distances = np.abs( np.asarray(ratings_array) - prediction )
    nearest = np.argmin( distances )
    return ratings_array[ nearest ]

In [49]:
prediction = model_svd.predict(1,1)
print("rating", ratings[(ratings.userId ==1 ) & (ratings.movieId ==1 ) ]['rating']  )
print("prediction",prediction.est)

rating 0    4.0
Name: rating, dtype: float64
prediction 4.3808155512545355


# Item-based movie recommendation

In [50]:
def make_recommendation_item_based( similarity_matrix ,movieId_all_array, ratings_data, id_to_movie_map, 
                                   movieId_to_index_map, fav_movie_list, n_recommendations, userId=-99):
    """
    Return the top n movies most similar to the user's favourite movie.

    Only the first resolvable entry of fav_movie_list is used as the seed for
    the similarity lookup; every resolved favourite is excluded from the
    results, as is every movie the user has already rated.

    Parameters
    ----------
    similarity_matrix: 2d array, the pairwise similarity matrix

    movieId_all_array: 1d array, the array of all movie Id

    ratings_data: ratings data (needs the columns 'userId' and 'movieId')

    id_to_movie_map: the map from movieId to movie title
                     (accepted for interface compatibility; not used here)

    movieId_to_index_map: the map from movieId to the index of the movie dataframe

    fav_movie_list: list, user's list of favorite movies

    n_recommendations: int, top n recommendations

    userId: int optional (default=-99), the user Id
            if userId = -99, a new user id (largest existing id + 1) is used
            if userId = -1, the user with the largest existing id is chosen

    Return
    ------
    list of at most n movie titles, most similar first; an empty list when no
    favourite title could be resolved or n_recommendations is not positive

    Notes
    -----
    Titles are read from the notebook-level `movies` dataframe via
    `movies.loc[...]`, exactly as before.
    """
    if userId == -99:
        userId = np.amax( ratings_data['userId'].unique() ) + 1
    elif userId == -1:
        userId = np.amax( ratings_data['userId'].unique() )

    # Resolve the favourite titles. get_movieId returns None for a title it
    # cannot match; such entries used to crash the index lookup further down.
    resolved = ( get_movieId(movie_name) for movie_name in fav_movie_list )
    movieId_list = [ movie_id for movie_id in resolved if movie_id is not None ]

    # Nothing to seed the search with, or nothing requested. (A non-positive
    # n used to return every candidate, because the counter never equalled it.)
    if not movieId_list or n_recommendations <= 0:
        return []

    # Movies that must not be recommended: already rated by the user, or
    # given as favourites. A set keeps the membership tests cheap.
    excluded = set( ratings_data.loc[ ratings_data.userId == userId, 'movieId' ] )
    excluded.update( movieId_list )

    # Dataframe indices of all remaining candidates, computed once up front
    # instead of being rebuilt on every step of the ranking loop.
    candidate_rows = { movieId_to_index_map[movie_id]
                       for movie_id in movieId_all_array
                       if movie_id not in excluded }

    # Rank every movie by similarity to the first favourite, best first.
    # sorted() is stable, so equal scores keep their original index order.
    seed_row = movieId_to_index_map[ movieId_list[0] ]
    scores = similarity_matrix[seed_row]
    ranked_rows = sorted( range(len(scores)), key=lambda row: scores[row], reverse=True )

    # Walk down the ranking and keep the first n rows that are candidates.
    topn_movieIndex = []
    for row in ranked_rows:
        if row in candidate_rows:
            topn_movieIndex.append( row )
            if len(topn_movieIndex) == n_recommendations:
                break

    return [ movies.loc[row].title for row in topn_movieIndex ]

# User-based movie recommendation

In [51]:
def make_recommendation_user_based(best_model_params, movieId_all_array, ratings_data, id_to_movie_map,
                        fav_movie_list, n_recommendations, userId=-99 ):
    """
    Return the top n movies an SVD model predicts the user will rate highest.

    The user's favourite movies are added to the ratings with the highest
    observed rating, an SVD model is fitted on the combined data, and every
    movie the user has not rated is scored with it.

    Parameters
    ----------
    best_model_params: dict, keyword arguments forwarded to SVD(...),
                       e.g. {'n_epochs': ..., 'lr_all': ..., 'reg_all': ...}

    movieId_all_array: the array of all movie Id

    ratings_data: ratings data; the columns 'userId', 'movieId' and 'rating'
                  are used, any other column is ignored

    id_to_movie_map: the map from movieId to movie title

    fav_movie_list: list, user's list of favorite movies

    n_recommendations: int, top n recommendations

    userId: int optional (default=-99), the user Id
            if userId = -99, a new user id (largest existing id + 1) is used
            if userId = -1, the user with the largest existing id is chosen

    Return
    ------
    list of at most n movie titles, highest predicted rating first

    Raises
    ------
    KeyError if id_to_movie_map has no entry for a recommended movieId
    """
    # Resolve the favourite titles; get_movieId returns None for a title it
    # cannot match, and such entries must not end up in the training data.
    resolved = ( get_movieId(movie_name) for movie_name in fav_movie_list )
    movieId_list = [ movie_id for movie_id in resolved if movie_id is not None ]

    if userId == -99:
        userId = np.amax( ratings_data['userId'].unique() ) + 1
    elif userId == -1:
        userId = np.amax( ratings_data['userId'].unique() )

    # Rating scale of the data that was passed in (previously this read the
    # notebook-level `ratings` variable instead of the argument).
    ratings_array = ratings_data['rating'].unique()
    max_rating = np.amax( ratings_array )
    min_rating = np.amin( ratings_array )

    # Dataset.load_from_df is given exactly these three columns, in this order.
    rating_columns = ['userId', 'movieId', 'rating']
    train_data = ratings_data[rating_columns]

    # Add one row per favourite movie, rated with the highest observed rating.
    if movieId_list:
        user_rows = [ [userId, movie_id, max_rating] for movie_id in movieId_list ]
        favourites = pd.DataFrame(user_rows, columns=rating_columns)
        train_data = pd.concat([train_data, favourites], ignore_index=True, sort=False)

    # Candidates: every movie the user has no rating for (favourites included
    # in "rated"). A set keeps the membership tests cheap.
    movieId_user_exist = set( train_data.loc[ train_data.userId == userId, 'movieId' ] )
    movieId_input = [ movie_id for movie_id in movieId_all_array
                      if movie_id not in movieId_user_exist ]

    reader = Reader(rating_scale=(min_rating, max_rating))
    data = Dataset.load_from_df(train_data, reader)

    model = SVD(**best_model_params)
    model.fit(data.build_full_trainset())

    # Predicted rating for every candidate, then rank best first. sorted() is
    # stable, so equal estimates keep the candidate order.
    estimates = [ model.predict(userId, movie_id).est for movie_id in movieId_input ]
    ranked = sorted( range(len(estimates)), key=lambda k: estimates[k], reverse=True )

    # max(..., 0): a negative n must yield nothing rather than slice off the tail.
    topn_movieIds = [ movieId_input[k] for k in ranked[:max(n_recommendations, 0)] ]

    return [ id_to_movie_map[movie_id] for movie_id in topn_movieIds ]

# Make a movie recommendation

In [52]:
def findMovie(currMovie, n_recommendations=10):
    """
    Return user-based recommendations for a single liked movie.

    Thin wrapper around make_recommendation_user_based that supplies the
    notebook-level tuned parameters (best_params), the ratings restricted to
    the `features` columns, and the movieId -> title map (reverse_movie_map).

    Parameters
    ----------
    currMovie: string, title of the movie the user likes

    n_recommendations: int optional (default=10), number of titles to return

    Return
    ------
    list of recommended movie titles
    """
    return make_recommendation_user_based(
        best_model_params = best_params,
        movieId_all_array = movieId_all_array,
        ratings_data = ratings[features],
        id_to_movie_map = reverse_movie_map,
        fav_movie_list = [currMovie],
        n_recommendations = n_recommendations)

In [57]:
# Number of leading rows of `movies` to generate recommendations for.
# Every title triggers a findMovie call, so this is the cost knob of the cell.
N_SEED_MOVIES = 700

# One column of recommendations per seed title. The columns are collected first
# and the frame is built in a single step, because inserting hundreds of
# columns one at a time fragments the DataFrame. Wrapping each list in a Series
# pads a shorter recommendation list with NaN instead of failing on a length
# mismatch.
recommendations_by_title = {
    title: pd.Series(findMovie(title), dtype=object)
    for title in movies['title'][:N_SEED_MOVIES]
}
resultDataframe = pd.DataFrame(recommendations_by_title)
display(resultDataframe)

  after removing the cwd from sys.path.


Unnamed: 0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,North by Northwest (1959),"Apartment, The (1960)",Some Like It Hot (1959),Charade (1963),Casablanca (1942),"Maltese Falcon, The (1941)",My Fair Lady (1964),Sabrina (1954),Roman Holiday (1953),"Little Princess, The (1939)"
0,"Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)",Lawrence of Arabia (1962),"Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)",Lawrence of Arabia (1962),...,"Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Streetcar Named Desire, A (1951)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)"
1,Lawrence of Arabia (1962),Rear Window (1954),"Shawshank Redemption, The (1994)",Dr. Strangelove or: How I Learned to Stop Worr...,Dr. Strangelove or: How I Learned to Stop Worr...,"Philadelphia Story, The (1940)",Dr. Strangelove or: How I Learned to Stop Worr...,Dr. Strangelove or: How I Learned to Stop Worr...,Dr. Strangelove or: How I Learned to Stop Worr...,"Shawshank Redemption, The (1994)",...,Lawrence of Arabia (1962),Dr. Strangelove or: How I Learned to Stop Worr...,Dr. Strangelove or: How I Learned to Stop Worr...,Lawrence of Arabia (1962),Dr. Strangelove or: How I Learned to Stop Worr...,Lawrence of Arabia (1962),"Godfather, The (1972)",Rear Window (1954),Dr. Strangelove or: How I Learned to Stop Worr...,Lawrence of Arabia (1962)
2,Dr. Strangelove or: How I Learned to Stop Worr...,Dr. Strangelove or: How I Learned to Stop Worr...,Dr. Strangelove or: How I Learned to Stop Worr...,Lawrence of Arabia (1962),"Godfather, The (1972)",Dr. Strangelove or: How I Learned to Stop Worr...,"Streetcar Named Desire, A (1951)","Godfather, The (1972)","Godfather, The (1972)",Dr. Strangelove or: How I Learned to Stop Worr...,...,Fight Club (1999),Rear Window (1954),Rosemary's Baby (1968),Ghost in the Shell (Kôkaku kidôtai) (1995),Rear Window (1954),"Shawshank Redemption, The (1994)",Fight Club (1999),"Godfather, The (1972)",Lawrence of Arabia (1962),Rear Window (1954)
3,"Philadelphia Story, The (1940)","Godfather, The (1972)","Godfather, The (1972)","Godfather, The (1972)",Fight Club (1999),Hoop Dreams (1994),"Godfather, The (1972)","Boondock Saints, The (2000)",Fight Club (1999),"Boondock Saints, The (2000)",...,"Godfather, The (1972)","Godfather, The (1972)",Rear Window (1954),Dr. Strangelove or: How I Learned to Stop Worr...,"Godfather, The (1972)",Dr. Strangelove or: How I Learned to Stop Worr...,Rear Window (1954),Dr. Strangelove or: How I Learned to Stop Worr...,Fight Club (1999),"Streetcar Named Desire, A (1951)"
4,"Godfather, The (1972)",Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),Fight Club (1999),Fight Club (1999),Rear Window (1954),"Godfather, The (1972)",Lawrence of Arabia (1962),Fight Club (1999),Rear Window (1954),Fight Club (1999),...,Dr. Strangelove or: How I Learned to Stop Worr...,Fight Club (1999),"Philadelphia Story, The (1940)","Godfather, The (1972)",Fight Club (1999),"Godfather, The (1972)",Brazil (1985),Fight Club (1999),"Godfather, The (1972)","Godfather, The (1972)"
5,Fight Club (1999),Fight Club (1999),"Streetcar Named Desire, A (1951)","Philadelphia Story, The (1940)",Pulp Fiction (1994),Lawrence of Arabia (1962),Fight Club (1999),Lawrence of Arabia (1962),Ghost in the Shell (Kôkaku kidôtai) (1995),"Godfather, The (1972)",...,Rear Window (1954),"Hustler, The (1961)","Godfather, The (1972)",Fight Club (1999),Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),Fight Club (1999),Cool Hand Luke (1967),Pulp Fiction (1994),His Girl Friday (1940),Dr. Strangelove or: How I Learned to Stop Worr...
6,Rear Window (1954),In the Name of the Father (1993),Rear Window (1954),Pulp Fiction (1994),"Departed, The (2006)",Fight Club (1999),Hoop Dreams (1994),Ghost in the Shell (Kôkaku kidôtai) (1995),"Streetcar Named Desire, A (1951)","Three Billboards Outside Ebbing, Missouri (2017)",...,"Boot, Das (Boat, The) (1981)",Pulp Fiction (1994),Fight Club (1999),Monty Python's And Now for Something Completel...,"Streetcar Named Desire, A (1951)",Rear Window (1954),Dr. Strangelove or: How I Learned to Stop Worr...,"Philadelphia Story, The (1940)","Philadelphia Story, The (1940)",Fight Club (1999)
7,"Streetcar Named Desire, A (1951)",Pulp Fiction (1994),Pulp Fiction (1994),"Manchurian Candidate, The (1962)",North by Northwest (1959),Cool Hand Luke (1967),Pulp Fiction (1994),"Departed, The (2006)","Departed, The (2006)",Rear Window (1954),...,"Philadelphia Story, The (1940)",Rosemary's Baby (1968),Cool Hand Luke (1967),Pulp Fiction (1994),Pulp Fiction (1994),"Departed, The (2006)","Departed, The (2006)","Usual Suspects, The (1995)",Rear Window (1954),"Day of the Doctor, The (2013)"
8,"Departed, The (2006)",Brazil (1985),"Usual Suspects, The (1995)","Departed, The (2006)",Apocalypse Now (1979),Pulp Fiction (1994),Cool Hand Luke (1967),Pulp Fiction (1994),Pulp Fiction (1994),Brazil (1985),...,Pulp Fiction (1994),"Departed, The (2006)",Pulp Fiction (1994),Rear Window (1954),"Departed, The (2006)",Pulp Fiction (1994),Pulp Fiction (1994),"Godfather: Part II, The (1974)","Streetcar Named Desire, A (1951)",Pulp Fiction (1994)
9,Pulp Fiction (1994),Star Wars: Episode IV - A New Hope (1977),Apocalypse Now (1979),Brazil (1985),Lawrence of Arabia (1962),Rear Window (1954),Ghost in the Shell (Kôkaku kidôtai) (1995),Kiss Kiss Bang Bang (2005),Hoop Dreams (1994),"Departed, The (2006)",...,Hoop Dreams (1994),"Usual Suspects, The (1995)","Usual Suspects, The (1995)","Philadelphia Story, The (1940)",Reservoir Dogs (1992),Ghost in the Shell (Kôkaku kidôtai) (1995),Lawrence of Arabia (1962),"Departed, The (2006)",Pulp Fiction (1994),Goodfellas (1990)


In [54]:
resultDataframe.to_csv('movie_similarity.csv',index=False)
! ls

movies.csv  movie_similarity.csv  ratings.csv  sample_data


In [55]:
resultDataframe.head()

Unnamed: 0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995)
0,"Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)"
1,Lawrence of Arabia (1962),Lawrence of Arabia (1962),Dr. Strangelove or: How I Learned to Stop Worr...,"Godfather, The (1972)",Dr. Strangelove or: How I Learned to Stop Worr...,Dr. Strangelove or: How I Learned to Stop Worr...,"Conversation, The (1974)",Dr. Strangelove or: How I Learned to Stop Worr...,Rear Window (1954),Dr. Strangelove or: How I Learned to Stop Worr...
2,Dr. Strangelove or: How I Learned to Stop Worr...,Dr. Strangelove or: How I Learned to Stop Worr...,"Godfather, The (1972)",Dr. Strangelove or: How I Learned to Stop Worr...,Hoop Dreams (1994),"Godfather, The (1972)",Dr. Strangelove or: How I Learned to Stop Worr...,"Godfather, The (1972)",Dr. Strangelove or: How I Learned to Stop Worr...,"Godfather, The (1972)"
3,"Godfather, The (1972)","Godfather, The (1972)",Fight Club (1999),Fight Club (1999),"Godfather, The (1972)",Lawrence of Arabia (1962),Cool Hand Luke (1967),Fight Club (1999),"Godfather, The (1972)",Fight Club (1999)
4,"Boot, Das (Boat, The) (1981)",Fight Club (1999),"Manchurian Candidate, The (1962)",Lawrence of Arabia (1962),Ghost in the Shell (Kôkaku kidôtai) (1995),His Girl Friday (1940),"Godfather, The (1972)",Rear Window (1954),Fight Club (1999),Rear Window (1954)


In [56]:
movieLiked = 'Heat (1995)'
# Recommendations stored for the liked movie: its column of resultDataframe.
# (Previously this name was just an alias of the title string, and the
# commented-out sort_values call could not have worked on a str.)
# NOTE(review): the column holds recommended titles, not numeric scores, despite
# the variable name - confirm this is the intended lookup.
similarityScores = resultDataframe[movieLiked]
similarityScores