In [1]:
import pandas as pd

In [2]:
# Load the movie-to-movie similarity matrix; the cells below look rows up by
# movieId and iterate over its columns as candidate movies.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
movie_similarity_df = pd.read_pickle('../similarity_df.pkl')

In [3]:
# Load the ratings table; the cells below use its userId, movieId and rating columns.
ratings_df = pd.read_csv('../ratings.csv')

In [4]:
# User whose recommendations are being computed. Defined once so the three
# views below can never be built from different users by accident.
TARGET_USER_ID = 345

# Filter the ratings a single time instead of rebuilding the same mask per view.
target_user_ratings = ratings_df[ratings_df['userId'] == TARGET_USER_ID]

# movieIds of every movie this user has rated.
movie_watched = target_user_ratings['movieId']
# Rating lookup keyed by movieId (single 'rating' column).
user_movie_rating = target_user_ratings[['movieId', 'rating']].set_index('movieId')
# The ratings themselves, in the same row order as movie_watched.
rating_watched = target_user_ratings['rating']

In [5]:
# Inspect the selected user's ratings.
rating_watched

52654    2.5
52655    3.5
52656    3.5
52657    4.0
52658    3.5
        ... 
52711    5.0
52712    3.0
52713    3.5
52714    4.0
52715    4.0
Name: rating, Length: 62, dtype: float64

In [6]:
# Inspect the movieId -> rating lookup table for the selected user.
user_movie_rating

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
11.0,2.5
32.0,3.5
344.0,3.5
412.0,4.0
468.0,3.5
...,...
27751.0,5.0
33085.0,3.0
35836.0,3.5
38164.0,4.0


In [7]:
# Check how the full ratings table is indexed.
ratings_df.index

RangeIndex(start=0, stop=100836, step=1)

In [8]:
# Average of all ratings given by this user; the prediction loop subtracts it
# from each rating to mean-centre them.
user_rating_mean = rating_watched.mean()
user_rating_mean

3.903225806451613

In [9]:
# From the movie similarity matrix, keep only the rows for movies this user has rated.
watched_similarity_df = movie_similarity_df.loc[movie_watched]
# Each row label is now a movie the user has already watched.
# Only rows were selected, so the columns still cover every movie in the matrix
# (including the watched ones, which the prediction loop skips explicitly).
# Each remaining column is a candidate movie whose rating we want to predict for this user.

In [10]:
# Preview the first rows of the watched-movie slice of the similarity matrix.
watched_similarity_df.head()

Unnamed: 0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,193565.0,193567.0,193571.0,193573.0,193579.0,193581.0,193583.0,193585.0,193587.0,193609.0
11.0,0.034637,0.076767,-0.015869,-0.04295,-0.008352,0.063936,0.138569,-0.104668,-0.002774,0.035865,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32.0,0.054685,-0.060976,0.063233,-0.04635,-0.09662,0.1525,-0.010147,-0.04667,0.005956,-0.110872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
344.0,-0.10624,-0.011964,0.049935,0.038785,0.024667,0.015339,-0.033992,-0.003444,0.062695,0.055366,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
412.0,-0.011264,-0.028539,-0.034964,-0.045839,-0.112018,-0.036734,-0.048277,0.174216,-0.016629,0.035738,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
468.0,-0.027879,0.047287,0.142115,0.018095,0.091756,-0.019304,-0.023075,-0.026579,0.02368,0.121686,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Plain Python list of the watched movieIds, used for membership checks in the prediction loop.
watched_movie_id_list = movie_watched.tolist()

In [12]:
# Item-based collaborative filtering: for every movie the user has not rated,
# estimate how far above or below their own average they would rate it, using
# the movies they have rated as neighbours weighted by similarity.
#
# The stored score is a mean-centred deviation, not an absolute rating; add
# user_rating_mean to a score to express it on the original rating scale.
MIN_SIMILARITY = 0.3   # a candidate needs at least one watched movie more similar than this
MAX_NEIGHBOURS = 15    # number of most-similar watched movies used per prediction

# Set membership is O(1); scanning the list once per matrix column is not.
watched_movie_ids = set(watched_movie_id_list)

recommended_movie = {}
for target_movie_id, watched_similarities in watched_similarity_df.items():
    # Never recommend something the user has already rated.
    if target_movie_id in watched_movie_ids:
        continue

    # Skip candidates that have no strongly similar watched movie.
    if not (watched_similarities > MIN_SIMILARITY).any():
        continue

    # Keep the most similar watched movies as neighbours.
    neighbour_similarities = watched_similarities.sort_values(ascending=False).iloc[:MAX_NEIGHBOURS]

    # Original guard kept: more than one neighbour is required (with a single
    # neighbour the weighted mean collapses to that one movie's deviation).
    if len(neighbour_similarities) <= 1:
        continue

    neighbour_ratings = user_movie_rating.loc[neighbour_similarities.index, 'rating']
    rating_deviations = neighbour_ratings - user_rating_mean

    # Normalise by the sum of ABSOLUTE similarities. The neighbour set can
    # contain negative similarities, so a signed sum may approach zero or turn
    # negative, which would blow up or sign-flip the score. The best neighbour
    # exceeds MIN_SIMILARITY, so this total is always strictly positive.
    weight_total = neighbour_similarities.abs().sum()
    pred = (neighbour_similarities * rating_deviations).sum() / weight_total

    recommended_movie[target_movie_id] = pred

# (movieId, score) pairs, best candidates first.
sorted_recommended_movie = sorted(recommended_movie.items(), key=lambda item: item[1], reverse=True)

In [13]:
sorted_recommended_movie

[(6158.0, 1.096774193548387),
 (63239.0, 1.096774193548387),
 (1123.0, 0.9078952373917679),
 (5767.0, 0.9078952373917679),
 (5828.0, 0.9078952373917679),
 (6342.0, 0.9078952373917679),
 (6692.0, 0.9078952373917679),
 (7349.0, 0.9078952373917679),
 (8092.0, 0.9078952373917679),
 (8236.0, 0.9078952373917679),
 (8427.0, 0.9078952373917679),
 (8494.0, 0.9078952373917679),
 (8844.0, 0.9078952373917679),
 (26819.0, 0.9078952373917679),
 (2969.0, 0.7849419968087215),
 (4083.0, 0.7849419968087215),
 (1564.0, 0.720716104505432),
 (2749.0, 0.7086880745216039),
 (25850.0, 0.6487431543493412),
 (6643.0, 0.6441668908717367),
 (7834.0, 0.6314765059244334),
 (101973.0, 0.596774193548387),
 (47644.0, 0.5961290577757803),
 (2132.0, 0.5883888796002681),
 (26564.0, 0.5767147200916913),
 (172547.0, 0.567576006985226),
 (67267.0, 0.562672028017363),
 (25825.0, 0.5461991730690822),
 (6429.0, 0.5433415447265079),
 (7243.0, 0.5433415447265079),
 (7302.0, 0.5433415447265079),
 (25797.0, 0.5433415447265079),
 (