In [140]:
#importing the libraries we're gonna use
from datetime import datetime
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import warnings
warnings.filterwarnings("ignore")

Load the `movies` and `ratings` tables into pandas DataFrames below:

In [2]:
# Read the movie catalogue and discard the stray "Unnamed: 0" column
# (presumably an index column written by an earlier export — TODO confirm).
movies = pd.read_csv("Data/movies.csv").drop(columns="Unnamed: 0")

In [3]:
# Load the sampled ratings table; later cells read its `userId`, `movieId`
# and `rating` columns.
ratings_sample = pd.read_csv("Data/ratings_sample.csv")

### User-Item matrix / DataFrame

To relate a new user, who has just entered a movie that they like, to the existing users, we need a user-item matrix.

Below I'm creating a DataFrame whose rows represent the user IDs.
The column names represent the movie IDs.
Each value is the rating that the user has given to the movie (0 means the user has not rated it).

In [4]:
# Create a user-item matrix
# Encode every userId / movieId as a dense 0-based position. `pd.factorize`
# numbers values in order of first appearance, which is the same order
# `Series.unique()` produces, so the row/column order of the matrix is the
# order in which users and movies first occur in `ratings_sample`.
# NOTE(review): assumes `userId` and `movieId` contain no missing values — confirm.
user_positions, unique_users = pd.factorize(ratings_sample['userId'].to_numpy())
item_positions, unique_items = pd.factorize(ratings_sample['movieId'].to_numpy())

# Create an empty user-item matrix (0 stands for "no rating recorded")
user_item_matrix = np.zeros((len(unique_users), len(unique_items)))

# Populate the user-item matrix with a single vectorised assignment instead
# of scanning the unique-ID arrays once per rating row.
# If the same (user, movie) pair occurs more than once, keep the last rating
# so the result is deterministic (NumPy does not guarantee which value wins
# when an index is repeated in a fancy assignment).
rating_cells = pd.DataFrame({
    'user_pos': user_positions,
    'item_pos': item_positions,
    'rating': ratings_sample['rating'].to_numpy(),
}).drop_duplicates(subset=['user_pos', 'item_pos'], keep='last')

user_item_matrix[
    rating_cells['user_pos'].to_numpy(),
    rating_cells['item_pos'].to_numpy(),
] = rating_cells['rating'].to_numpy()

# Print the user-item matrix
print(user_item_matrix)

[[4.  3.5 4.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]]


In [167]:
# User-item matrix as a DataFrame: one row per user, one column per movie.
# NOTE(review): the labels are deliberately passed wrapped in a list, exactly
# as before; this appears to make every row/column label a 1-tuple, which
# `get_top_recommendations` relies on when it reads `column[0]` — confirm
# before flattening the labels.
df = pd.DataFrame(
    user_item_matrix,
    index=[unique_users],
    columns=[unique_items],
)

Let's look at the user-item matrix, in which the columns are movie IDs and the rows are user IDs.

In [6]:
df

Unnamed: 0,356,4167,4306,4979,5574,6156,6213,6333,6383,6595,...,120905,182415,34229,35826,70978,72027,81665,3291,5457,6246
3,4.0,3.5,4.0,4.0,4.0,3.0,3.0,4.0,4.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# df.to_csv("user_item.csv")

### Search for the movie

The function below finds the titles of the movies whose titles are similar to the search text:

In [154]:
def find_similar_movies(movie_title, top_n=10, threshold=80):
    """Return the movie titles that best fuzzy-match ``movie_title``.

    Titles are taken from the module-level ``movies`` DataFrame and scored
    with ``fuzz.partial_ratio``.

    Parameters
    ----------
    movie_title : str
        The (possibly partial or misspelled) title to search for.
    top_n : int, default 10
        Maximum number of titles to return.
    threshold : int, default 80
        Minimum similarity score (inclusive) a title needs to be returned.

    Returns
    -------
    list of str
        Up to ``top_n`` titles, best match first. Empty if nothing reaches
        ``threshold``.
    """
    # Missing titles cannot be fuzzy-matched, so leave them out.
    titles = movies['title'].dropna().tolist()

    # `process.extract` returns only 5 results unless `limit` is given, which
    # silently capped `top_n`; ask for `top_n` candidates explicitly.
    similarity_scores = process.extract(
        movie_title, titles, scorer=fuzz.partial_ratio, limit=top_n
    )

    # Candidates arrive best-first, so filtering on the threshold keeps the order.
    return [name for name, score in similarity_scores if score >= threshold]

In [158]:
similar = find_similar_movies('Love actua',top_n=2)
#this gives me a list of similar movie titles to the one searched.

In [159]:
similar

['Love Actually', 'Love']

## Recommend based on similar users data

The function `get_top_recommendations` takes a title and passes it to `find_similar_movies`, which searches the movies DataFrame for the title most similar to the search text.

Then it takes that movie's `movieId` and collects the `userId`s of the people who have rated the same movie.

After collecting the user IDs, it iterates over those users' rows in the user-item DataFrame and looks for other movies that they rated higher than `3.5` (rating threshold = 3.5).

Then we add the movies those users liked to the `top_movies` DataFrame.

Finally, the function returns the top 10 movies based on the ratings those users have given.


In [164]:
def get_top_recommendations(title, top_n=10, min_rating=3.5):
    """Recommend movies rated highly by users who rated the searched movie.

    The search text is resolved to a catalogue title with
    ``find_similar_movies``; every user who rated that movie is looked up in
    the module-level user-item DataFrame ``df``, and each of their ratings
    strictly above ``min_rating`` becomes one candidate row.

    Relies on the module-level ``movies``, ``ratings_sample`` and ``df``.

    Parameters
    ----------
    title : str
        Search text for the movie the user likes.
    top_n : int, default 10
        Number of rows to return.
    min_rating : float, default 3.5
        Only ratings strictly greater than this value are considered.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieID`` and ``rating``, sorted by ``rating`` descending,
        at most ``top_n`` rows. The same movie can appear more than once if
        several users rated it highly. Empty if no title matches the search.
    """
    similar = find_similar_movies(title, top_n=2)
    if not similar:
        # Nothing matched the search text: return an empty, well-formed result
        # instead of failing on `similar[0]`.
        return pd.DataFrame(columns=['movieID', 'rating'])

    # movieId of the first catalogue entry whose title equals the best match.
    movie_id = movies.loc[movies['title'] == similar[0], 'movieId'].iloc[0]
    user_ids = ratings_sample.loc[ratings_sample['movieId'] == movie_id, 'userId']

    # `get_level_values(0)` yields the bare IDs whether `df` is labelled with
    # plain IDs or with 1-tuples (single-level MultiIndex).
    row_user_ids = df.index.get_level_values(0)
    col_movie_ids = df.columns.get_level_values(0).to_numpy()

    # Keep only the rows of users who rated the searched movie.
    neighbour_ratings = df.to_numpy()[row_user_ids.isin(user_ids)]

    # Positions of every rating above the threshold, in row-major order
    # (user by user, then movie by movie).
    row_pos, col_pos = np.nonzero(neighbour_ratings > min_rating)
    top_movies = pd.DataFrame({
        'movieID': col_movie_ids[col_pos],
        'rating': neighbour_ratings[row_pos, col_pos],
    })

    # Drop movies that are missing from the catalogue; they could not be
    # displayed afterwards.
    top_movies = top_movies[top_movies['movieID'].isin(movies['movieId'])]
    top_movies = top_movies.reset_index(drop=True)

    return top_movies.sort_values(by='rating', ascending=False).head(top_n)

The function below takes the top-10 movies list and displays each movie's title along with some information about it.

In [168]:
def get_movie_info(top_movies_df, movies):
    """Print title, overview and genre for every movie in ``top_movies_df``.

    Parameters
    ----------
    top_movies_df : pandas.DataFrame
        Recommendations; must have a ``movieID`` column.
    movies : pandas.DataFrame
        Movie catalogue with ``movieId``, ``title``, ``overview`` and
        ``genre`` columns.

    Returns
    -------
    None
        The information is written to standard output. IDs that are not in
        ``movies`` are skipped.
    """
    # Iterate over the frame that was passed in, not a global of the same
    # purpose, so the function works for any recommendation list.
    for movie_id in top_movies_df['movieID'].values:
        matches = movies[movies['movieId'] == movie_id]
        if matches.empty:
            continue
        info = matches.iloc[0]  # first catalogue entry for this ID

        print("\n---------------------------------------------")
        print("Movie Title: ", info['title'])
        print("Overview: ", info['overview'])
        print("Genre: ", info['genre'])
        
            

In [169]:
top10 = get_top_recommendations('Love actua')
get_movie_info(top10,movies)
   



---------------------------------------------
Movie Title:  Point Break
Overview:  In the coastal town of Los Angeles, a gang of bank robbers call themselves The Ex-Presidents commit their crimes while wearing masks of Reagan, Carter, Nixon and Johnson. The F.B.I. believes that the members of the gang could be surfers and send young agent Johnny Utah undercover at the beach to mix with the surfers and gather information.
Genre:  ['Action', 'Thriller', 'Crime']

---------------------------------------------
Movie Title:  The Thomas Crown Affair
Overview:  A very rich and successful playboy amuses himself by stealing artwork, but may have met his match in a seductive detective.
Genre:  ['Drama', 'Crime', 'Romance']

---------------------------------------------
Movie Title:  Romeo Must Die
Overview:  Two warring gang families (one African-American, the other Chinese) maneuver for bragging rights to the Oakland, California, docks. Hang SIng and Trish O'Day uncover a trail of deceit that 