In [173]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Load MovieLens dataset (assuming you have 'ratings.csv', 'movies.csv', and 'tags.csv')
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')
tags = pd.read_csv('tags.csv')

In [174]:
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [175]:
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [176]:
tags.head(3)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992


In [177]:

user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
user_item_matrix #User-item interaction matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [178]:
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)
user_similarity_df #Cosine similarity matrix (two userID array dot product divided by their magnitudes )

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.027283,0.059720,0.194395,0.129080,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.027283,1.000000,0.000000,0.003726,0.016614,0.025333,0.027585,0.027257,0.000000,0.067445,...,0.202671,0.016866,0.011997,0.000000,0.000000,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.059720,0.000000,1.000000,0.002251,0.005020,0.003936,0.000000,0.004941,0.000000,0.000000,...,0.005048,0.004892,0.024992,0.000000,0.010694,0.012993,0.019247,0.021128,0.000000,0.032119
4,0.194395,0.003726,0.002251,1.000000,0.128659,0.088491,0.115120,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
5,0.129080,0.016614,0.005020,0.128659,1.000000,0.300349,0.108342,0.429075,0.000000,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.164191,0.028429,0.012993,0.200395,0.106435,0.102123,0.200035,0.099388,0.075898,0.088963,...,0.178084,0.116534,0.300669,0.066032,0.148141,1.000000,0.153063,0.262558,0.069622,0.201104
607,0.269389,0.012948,0.019247,0.131746,0.152866,0.162182,0.186114,0.185142,0.011844,0.010451,...,0.092525,0.199910,0.203540,0.137834,0.118780,0.153063,1.000000,0.283081,0.149190,0.139114
608,0.291097,0.046211,0.021128,0.149858,0.135535,0.178809,0.323541,0.187233,0.100435,0.077424,...,0.158355,0.197514,0.232771,0.155306,0.178142,0.262558,0.283081,1.000000,0.121993,0.322055
609,0.093572,0.027565,0.000000,0.032198,0.261232,0.214234,0.090840,0.423993,0.000000,0.021766,...,0.035653,0.335231,0.061941,0.236601,0.097610,0.069622,0.149190,0.121993,1.000000,0.053225


In [179]:
#Group all tags in a movie together
movie_tags = tags.groupby('movieId')['tag'].apply(' '.join).reset_index()

#Merge movie data with tags
movie_data = pd.merge(movies, movie_tags, on='movieId', how='left')
movie_data['tag'] = movie_data['tag'].fillna('')

movie_data

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar pixar fun
1,2,Jumanji (1995),Adventure|Children|Fantasy,fantasy magic board game Robin Williams game
2,3,Grumpier Old Men (1995),Comedy|Romance,moldy old
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,
9739,193585,Flint (2017),Drama,
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,


In [180]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movie_data['genres'] + ' ' + movie_data['tag'])
tfidf_vectorizer.vocabulary_

{'adventure': 36,
 'animation': 75,
 'children': 274,
 'comedy': 317,
 'fantasy': 530,
 'pixar': 1142,
 'fun': 587,
 'magic': 925,
 'board': 177,
 'game': 595,
 'robin': 1263,
 'williams': 1651,
 'romance': 1271,
 'moldy': 992,
 'old': 1081,
 'drama': 453,
 'pregnancy': 1172,
 'remake': 1235,
 'action': 22,
 'crime': 363,
 'thriller': 1516,
 'politics': 1156,
 'president': 1175,
 'horror': 725,
 'mafia': 923,
 'jane': 811,
 'austen': 110,
 'hollywood': 713,
 'mystery': 1025,
 'serial': 1334,
 'killer': 853,
 'sci': 1311,
 'fi': 547,
 'alcoholism': 49,
 'shakespeare': 1344,
 'netflix': 1048,
 'queue': 1201,
 'kidnapping': 851,
 'high': 696,
 'school': 1309,
 'teacher': 1487,
 'time': 1524,
 'travel': 1547,
 'brad': 194,
 'pitt': 1141,
 'bruce': 211,
 'willis': 1652,
 'mindfuck': 980,
 'post': 1163,
 'apocalyptic': 84,
 'twist': 1564,
 'ending': 491,
 'animal': 73,
 'movie': 1012,
 'pigs': 1139,
 'villain': 1604,
 'nonexistent': 1063,
 'needed': 1040,
 'good': 627,
 'story': 1431,
 'deat

In [181]:
print(tfidf_matrix)

  (0, 587)	0.40432512831075756
  (0, 1142)	0.8416771249174413
  (0, 530)	0.17300098313924464
  (0, 317)	0.09576778805506665
  (0, 274)	0.18038846066758712
  (0, 75)	0.18490545901063682
  (0, 36)	0.14923127385752047
  (1, 1651)	0.3105613900424147
  (1, 1263)	0.33165713426090987
  (1, 595)	0.6850036759600161
  (1, 177)	0.35778658940156977
  (1, 925)	0.3055276791202499
  (1, 530)	0.2657634761752139
  (1, 274)	0.13855604603797952
  (1, 36)	0.11462426795143797
  (2, 1081)	0.6852356240039252
  (2, 992)	0.6852356240039252
  (2, 1271)	0.2026249156281861
  (2, 317)	0.14088088145157518
  (3, 453)	0.4666201177032598
  (3, 1271)	0.7261829999003466
  (3, 317)	0.5048999073185984
  (4, 1235)	0.6728033494323147
  (4, 1172)	0.7199029622585924
  (4, 317)	0.1705150372370789
  :	:
  (9732, 1236)	0.5511943752355779
  (9732, 619)	0.5511943752355779
  (9732, 76)	0.4424903666267339
  (9732, 547)	0.19139804035832575
  (9732, 1311)	0.19139804035832575
  (9732, 22)	0.15512521431325352
  (9732, 317)	0.22664539528

In [182]:
#Content based filtering
cosine_sim_content = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim_content[0] #the problem lies in that some users have 0 similarity, this is the similarity with the first user with cosine similarity

array([1.        , 0.08807678, 0.01349185, ..., 0.        , 0.15084122,
       0.09576779])

In [183]:
#Collaborative filtering
#SVD factorization = used to simplify and represent the movie data
#Movie data is represented by a product of smaller matrices
#SVD factorization is known as A ≈ U x E x V^T, A is the input matrix
#Minimizing sum of squared errors of both LHS and RHS is SVD factorization
#Sum of Squared Errors is proportional to the square of standard deviation

R = user_item_matrix.values
U, sigma, Vt = np.linalg.svd(R, full_matrices=False)
sigma_matrix = np.diag(sigma)
predicted_ratings = np.dot(np.dot(U, sigma_matrix), Vt)

print('Standard Deviation of original matrix: ', np.std(R))
print('Standard Deviation of product: ', np.std(predicted_ratings))

Standard Deviation of original matrix:  0.4726144124098266
Standard Deviation of product:  0.4726144124098265


In [184]:
predicted_ratings[0] #the problem lies in that some users have 0 similarity, this is the similarity with the first user with SVD

array([ 4.00000000e+00,  3.67284328e-15,  4.00000000e+00, ...,
        1.71710519e-16,  1.92066415e-16, -6.87817858e-16])

In [188]:
def hybrid_recommendations(user_id, top_n=5):
    user_ratings = user_item_matrix.loc[user_id]
    collaborative_recs = pd.Series(predicted_ratings[user_id-1], index=user_item_matrix.columns).sort_values(ascending=False)
    #Collaborative recommendation ratings
    #Index of collaborative_recs is the movies that are most likely to be favored by a user
    
    content_recs = pd.Series((cosine_sim_content[user_item_matrix.columns.get_loc(item)] for item in collaborative_recs.index), index=collaborative_recs.index)
    content_recs = content_recs.sort_index()
    #Content-based Recommendation ratings
    #Find the cosine similarity of all items of a particular user, then sort index
        
    hybrid_recs = collaborative_recs + content_recs
    rating_index = user_ratings[user_ratings > 0].index
    hybrid_recs = hybrid_recs[~user_ratings.index.isin(rating_index)]  # Exclude items already rated
    return hybrid_recs

In [189]:
user_weights = [0.2, 0.8] #Can be adjusted as the size of the dataset or userbase grows
user_id = 3

In [190]:
recommended_movies = hybrid_recommendations(user_id)
print("Hybrid Recommended movies for user", user_id)
print(recommended_movies)

Hybrid Recommended movies for user 3
movieId
1    [0.9999999999999993, 0.08807678004317167, 0.01...
2    [0.08807678004312482, 0.9999999999999525, -4.7...
3    [0.013491850395764221, -1.0120376758848693e-13...
4    [0.0483531473131253, 1.4981722459839197e-14, 0...
5    [0.016329847946269484, -5.2886080931235924e-14...
dtype: object
