# Recommendation systems

In this notebook, recommendations on movies are given, based on the user's ratings.

In the first section, tried to produce movie recommendations based on the genre of the movies the user rated high in his past. This can be called a naive approach of *'Content-based filtering'*.

In the second section, matrix factorization (including SVD) or nearest neighbours are attempted to produce movie recommendations. Here, the recommendations are influenced by the ratings from other users as well. Owing to this collaboration, these can be called naive approaches of *'Collaborative filtering'*.

In [1]:
import pandas as pd

In [3]:
links_df = pd.read_csv('ml-latest-small/links.csv')
links_df

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [4]:
movies_df = pd.read_csv('ml-latest-small/movies.csv')
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [26]:
movie_genres = [
    'Action', 
    'Adventure', 
    'Animation',
    "Children's",
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Fantasy',
    'Film-Noir',
    'Horror',
    'Musical',
    'Mystery',
    'Romance',
    'Sci-Fi',
    'Thriller',
    'War',
    'Western'
]

In [31]:
movies_genres = []
for movieId, genres in zip(movies_df.movieId, 
                            movies_df.genres):
    genres_split = genres.split('|')
    for g in genres_split:
        movies_genres.append(
            {
                'movieId': movieId,
                'genre': g
            }
        )
movies_genres_df = pd.DataFrame(movies_genres)

Unnamed: 0,movieId,genre
0,1,Adventure
1,1,Animation
2,1,Children
3,1,Comedy
4,1,Fantasy
...,...,...
22079,193583,Fantasy
22080,193585,Drama
22081,193587,Action
22082,193587,Animation


In [34]:
for i in movies_genres_df.groupby(['genre']):
    print(i)

('(no genres listed)',        movieId               genre
19492   114335  (no genres listed)
19881   122888  (no genres listed)
19888   122896  (no genres listed)
20100   129250  (no genres listed)
20229   132084  (no genres listed)
20373   134861  (no genres listed)
20664   141131  (no genres listed)
20699   141866  (no genres listed)
20733   142456  (no genres listed)
20772   143410  (no genres listed)
20871   147250  (no genres listed)
20948   149330  (no genres listed)
21027   152037  (no genres listed)
21084   155589  (no genres listed)
21101   156605  (no genres listed)
21199   159161  (no genres listed)
21218   159779  (no genres listed)
21274   161008  (no genres listed)
21409   165489  (no genres listed)
21428   166024  (no genres listed)
21476   167570  (no genres listed)
21536   169034  (no genres listed)
21613   171495  (no genres listed)
21614   171631  (no genres listed)
21617   171749  (no genres listed)
21634   171891  (no genres listed)
21649   172497  (no genres liste

In [None]:
movies_set = set(movies_genres_df.movieId)
movies_set

In [38]:
def get_genres_of_movie(movieId):
    for i in movies_genres_df.groupby(['movieId']):
        if i[0] == movieId:
            return list(i[1].genre)
        
def get_movies_of_genre(genre):
    for i in movies_genres_df.groupby(['genre']):
        if i[0] == genre:
            return list(i[1].movieId)

In [37]:
get_genres_of_movie(45)

['Comedy', 'Drama', 'Thriller']

In [39]:
get_movies_of_genre('Drama')

[4,
 11,
 14,
 16,
 17,
 20,
 22,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 34,
 36,
 40,
 41,
 42,
 43,
 45,
 46,
 48,
 49,
 52,
 53,
 55,
 57,
 58,
 61,
 62,
 72,
 73,
 74,
 75,
 78,
 79,
 80,
 81,
 82,
 83,
 85,
 86,
 92,
 94,
 96,
 97,
 100,
 105,
 106,
 110,
 111,
 113,
 117,
 121,
 123,
 140,
 145,
 147,
 148,
 149,
 150,
 151,
 152,
 154,
 155,
 156,
 159,
 161,
 166,
 168,
 169,
 171,
 175,
 178,
 179,
 184,
 191,
 193,
 194,
 195,
 198,
 199,
 201,
 202,
 205,
 207,
 209,
 211,
 213,
 214,
 215,
 217,
 218,
 219,
 222,
 224,
 225,
 229,
 230,
 232,
 233,
 235,
 241,
 242,
 247,
 249,
 253,
 254,
 259,
 261,
 262,
 263,
 265,
 266,
 269,
 270,
 271,
 272,
 273,
 277,
 279,
 280,
 281,
 282,
 283,
 285,
 290,
 291,
 292,
 293,
 296,
 298,
 299,
 300,
 301,
 302,
 304,
 306,
 307,
 308,
 314,
 315,
 317,
 318,
 319,
 321,
 322,
 324,
 326,
 329,
 331,
 334,
 336,
 337,
 340,
 341,
 345,
 346,
 347,
 349,
 350,
 351,
 352,
 354,
 356,
 358,
 359,
 361,
 364,
 365,
 366,
 369,
 371,

In [5]:
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [91]:
movie_avg_rating = {}
for i in ratings_df.groupby(['movieId']):
    movieId = i[0]
    avg_rating = i[1].mean().rating
    movie_avg_rating[movieId] = avg_rating

# if movies are not rate, give a 0 rating
for movie in movies_set:
    if movie in movie_avg_rating.keys():
        continue
    movie_avg_rating[movie] = 0

In [93]:
len(movie_avg_rating), len(movies_set)

(9742, 9742)

In [6]:
tags_df = pd.read_csv('ml-latest-small/tags.csv')
tags_df

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [95]:
def get_recommendations_for_user(userId, consider_avg=True):
    for i in ratings_df.groupby(['userId']):
        if i[0] == userId:
            print('User', i[0])
            user_df = i[1]
            for r in user_df.groupby(['rating']):
                if r[0] == 5:
                    print('Rating', r[0])
                    
                    # find the movies rated by the user
                    user_rated_movies = (list(r[1].movieId))
                    print('Movies rated by user')
                    print(user_rated_movies)
                    print()
                    
                    # find the genres user prefer from the movies he rated
                    user_genres = []
                    for movie in user_rated_movies:
                        genres = get_genre_of_movie(movie)
                        for genre in genres:
                            user_genres.append(genre)
                    
                    print('Users genre preferences')
                    user_genre_prefs = []
                    for g in movie_genres:
                        count = user_genres.count(g)
                        user_genre_prefs.append(count)
                        print(g, '----', count)
                    print()
                    
                    # find the user's favourite genre
                    user_fav_genre = movie_genres[user_genre_prefs.index(max(user_genre_prefs))]
                    print('User Favourite genre:', user_fav_genre)
                    print()

                    # get movies of the genre user likes
                    movies = get_movies_of_genre(user_fav_genre)
                    recommend_movies = []
                    for movie in movies:
                        if consider_avg:
                            if movie_avg_rating[movie] < 4:
                                continue
                        if movie not in user_rated_movies:
                            recommend_movies.append(movie)
                    print('Recommendations:', recommend_movies[:10])

In [97]:
get_recommendations_for_user(100, consider_avg=False)

User 100
Rating 5.0
Movies rated by user
[1101, 1958, 2423, 4041, 5620]

Users genre preferences
Action ---- 1
Adventure ---- 0
Animation ---- 0
Children's ---- 0
Comedy ---- 3
Crime ---- 0
Documentary ---- 0
Drama ---- 2
Fantasy ---- 0
Film-Noir ---- 0
Horror ---- 0
Musical ---- 0
Mystery ---- 0
Romance ---- 3
Sci-Fi ---- 0
Thriller ---- 0
War ---- 0
Western ---- 0

User Favourite genre: Comedy

Recommendations: [1, 3, 4, 5, 7, 11, 12, 18, 19, 20]


# Matrix factorization

Approaches:
1. Find nearest neigbours 
2. Check cosine similarity between the matrices

In [98]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [258]:
def get_movies_rated_by_user(userId):
    user_rated_movies = []
    for i in ratings_df.groupby(['userId']):
        if i[0] == userId:
            user_df = i[1]
            for r in user_df.groupby(['rating']):
                if r[0] >= 4:                   
                    # find the movies rated by the user
                    user_rated_movies = (list(r[1].movieId))
    return user_rated_movies

In [297]:
def get_recommendations(u1, u2):
    recommend_movies = []
    u1_movies = get_movies_rated_by_user(u1)
    u2_movies = get_movies_rated_by_user(u2)
    for movie in u2_movies:
        if movie not in u1_movies:
            recommend_movies.append(movie)
            
    return recommend_movies[:10]

In [268]:
'''
here, all ratings are taken into consideration
'''

user_rows = []
for user in list(set(ratings_df.userId)):
    # print('User', user)
    user_row = [0] * 5000
    for userId, movieId, rating in zip(ratings_df.userId,
                                      ratings_df.movieId,
                                      ratings_df.rating):
        if userId == user:
            if movieId in range(0,5000):
                user_row[movieId] = rating
    # print(user_row)
    user_rows.append(user_row)

In [269]:
user_rows = np.asarray(user_rows)
user_rows.shape

(610, 5000)

In [270]:
'''
here, only ratings above 4.0 are taken into consideration
these values are given a 1 (user likes the movie)
or 0 (user does not)
'''

user_rows_binary = []
for user in list(set(ratings_df.userId)):
    # print('User', user)
    user_row = [0] * 5000
    for userId, movieId, rating in zip(ratings_df.userId,
                                      ratings_df.movieId,
                                      ratings_df.rating):
        if userId == user:
            if movieId in range(0,5000):
                if rating >= 4.0:
                    user_row[movieId] = 1
    # print(user_row)
    user_rows_binary.append(user_row)

In [271]:
user_rows_binary = np.asarray(user_rows_binary)
user_rows_binary.shape

(610, 5000)

### Nearest neighbours

In [174]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [275]:
'''
get Nearest neighbours
'''
nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(user_rows)
distances, indices_v1 = nbrs.kneighbors(user_rows)
indices_v1

array([[  0, 278, 211, 246,  75],
       [  1, 597, 441, 549, 290],
       [  2, 597, 441, 290, 305],
       ...,
       [607, 424, 218, 433, 297],
       [608,  80, 339,  53, 129],
       [609, 297, 278,  62, 433]], dtype=int64)

In [291]:
def check_santity(u1, u2):
    print('Checking sanity for (%d, %d)' % (u1, u2))
    user_0 = np.where(user_rows[u1]>=4.0)[0]
    user_1 = np.where(user_rows[u2]>=4.0)[0]

    count = 0
    for _ in user_0:
        if _ in user_1:
            count += 1

    print('# of concurrences', count)
    return count

In [292]:
for ind in range(5):
    check_santity(indices_v1[ind][0], indices_v1[ind][1])

Checking sanity for (0, 278)
# of concurrences 18
Checking sanity for (1, 597)
# of concurrences 0
Checking sanity for (2, 597)
# of concurrences 0
Checking sanity for (3, 581)
# of concurrences 3
Checking sanity for (4, 564)
# of concurrences 7


In [293]:
indices_v1[0][1]

278

In [329]:
for ind in range(10):
    print('Recommendations for User %d:' %(ind), get_recommendations(ind, indices_v1[ind][1]))

Recommendations for User 0: [50, 318, 527, 1635, 54272]
Recommendations for User 1: [21, 45, 104, 110, 232, 246, 281, 322, 348, 356]
Recommendations for User 2: [21, 45, 50, 101, 104, 110, 232, 246, 260, 281]
Recommendations for User 3: [318, 527, 3147, 4896, 5816, 5995, 7361]
Recommendations for User 4: [1265, 1307, 7451, 58655, 60756, 67734, 69406, 88163, 88405, 94677]
Recommendations for User 5: [260, 1213, 2959, 4993]
Recommendations for User 6: [32, 902, 910, 914, 1028, 1136, 1197, 1207, 2161, 3000]
Recommendations for User 7: [48780, 79132]
Recommendations for User 8: [111, 246, 260, 296, 608, 750, 800, 858, 898, 899]
Recommendations for User 9: [3, 24, 50, 104, 110, 187, 260, 293, 318, 333]


Here it is seen preferences are less correlated

In [300]:
'''
get Nearest neighbours
'''

nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(user_rows_binary)
distances, indices_v2 = nbrs.kneighbors(user_rows_binary)
indices_v2

array([[  0, 514,  24, 398,  75],
       [  1, 441, 305, 280, 213],
       [  2, 305, 441, 213, 280],
       ...,
       [607, 253, 416,  61, 552],
       [608,  53, 549, 441, 305],
       [609, 433, 399, 278, 297]], dtype=int64)

In [301]:
def check_santity_binary(u1, u2):
    print('Checking sanity for (%d, %d)' % (u1, u2))
    user_0 = np.where(user_rows_binary[u1]==1.0)[0]
    user_1 = np.where(user_rows_binary[u2]==1.0)[0]

    count = 0
    for _ in user_0:
        if _ in user_1:
            count += 1

    print('# of concurrences', count)

In [302]:
for ind in range(5):
    check_santity_binary(indices_v2[ind][0], indices_v2[ind][1])

Checking sanity for (0, 514)
# of concurrences 7
Checking sanity for (1, 441)
# of concurrences 0
Checking sanity for (2, 305)
# of concurrences 0
Checking sanity for (3, 204)
# of concurrences 3
Checking sanity for (4, 144)
# of concurrences 6


In [328]:
for ind in range(10):
    print('Recommendations for User %d:' %(ind), get_recommendations(ind, indices_v2[ind][1]))

Recommendations for User 0: [260, 593, 1196, 1197, 1198, 1200, 1270, 1302, 1387, 2398]
Recommendations for User 1: [296, 589, 1241, 2683, 3254, 4402, 4571, 4993, 5481, 7153]
Recommendations for User 2: [32, 50, 70, 163, 260, 296, 318, 357, 431, 527]
Recommendations for User 3: [47, 1136, 1222, 1653, 2329, 2710, 2959, 3266, 3949, 4878]
Recommendations for User 4: [590, 2329, 4226, 4306, 6377, 6539, 8360, 8873]
Recommendations for User 5: [2058, 3534, 4387, 5785, 31184, 59315, 68358, 71535, 72998]
Recommendations for User 6: [32, 902, 910, 914, 1028, 1136, 1197, 1207, 2161, 3000]
Recommendations for User 7: [1265, 1307, 7451, 58655, 60756, 67734, 69406, 88163, 88405, 94677]
Recommendations for User 8: [377]
Recommendations for User 9: [260, 296, 589, 1241, 1517, 2571, 2683, 2716, 3253, 3254]


### Cosine similarity

In [325]:
from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity(user_rows)

In [326]:
def get_recom_from_sim(user, sim):
    indices = sim[user].argsort()[-5:][::-1]
    print('Recommendations for User %d:' %(user), get_recommendations(user, indices[1]))

In [327]:
for ind in range(10):
    get_recom_from_sim(user=ind, sim=sim)

Recommendations for User 0: [82, 524, 838, 904, 910, 915, 919, 1220, 1231, 1265]
Recommendations for User 1: [2028]
Recommendations for User 2: [293, 778, 1527, 1676, 1732, 2174, 2571, 2959, 4027, 8874]
Recommendations for User 3: [308, 1193, 1217, 2291, 2959, 4973, 6350, 6377, 7361, 8254]
Recommendations for User 4: [32, 110, 111, 296, 318, 441, 471, 529, 541, 555]
Recommendations for User 5: [367, 541, 1035, 1101, 1198, 1387, 2006, 2724]
Recommendations for User 6: [32, 902, 910, 914, 1028, 1136, 1197, 1207, 2161, 3000]
Recommendations for User 7: [48780, 79132]
Recommendations for User 8: [377]
Recommendations for User 9: [2490]


### SVD + Cosine similarity

In [308]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [320]:
svd = TruncatedSVD(n_components=10)
normalizer = Normalizer(copy=True)
lsa = make_pipeline(svd, normalizer)
X_lsa = lsa.fit_transform(user_rows)

In [323]:
sim = cosine_similarity(X_lsa)

In [324]:
for ind in range(10):
    get_recom_from_sim(user=ind, sim=sim)

Recommendations for User 0: [293, 778, 1527, 1676, 1732, 2174, 2571, 2959, 4027, 8874]
Recommendations for User 1: [2289, 6377]
Recommendations for User 2: [1, 14, 36, 62, 661, 1073]
Recommendations for User 3: [2502, 2628, 2879, 3615]
Recommendations for User 4: [1, 24, 48, 153, 158, 317, 364, 480, 553, 588]
Recommendations for User 5: [367, 541, 1035, 1101, 1198, 1387, 2006, 2724]
Recommendations for User 6: [1193, 3567]
Recommendations for User 7: [318, 527, 1198, 2571, 2959, 4993, 7153, 79132, 91529, 92259]
Recommendations for User 8: [104, 231, 356, 953, 1304, 1923, 2706, 3481, 3616, 3617]
Recommendations for User 9: [225, 1376, 1639]


In [330]:
X_lsa = lsa.fit_transform(user_rows_binary)
sim = cosine_similarity(X_lsa)
for ind in range(10):
    get_recom_from_sim(user=ind, sim=sim)

Recommendations for User 0: [110, 590, 593, 595]
Recommendations for User 1: [541, 1199, 1211, 1273, 1288, 1300, 1394, 2020, 2076]
Recommendations for User 2: [111, 260, 527, 541, 750, 903, 912, 924, 968, 1097]
Recommendations for User 3: [296, 2858, 2959, 3535, 4262, 6874, 7361, 7438, 51255, 68157]
Recommendations for User 4: [367, 541, 1035, 1101, 1198, 1387, 2006, 2724]
Recommendations for User 5: [1639]
Recommendations for User 6: [32, 902, 910, 914, 1028, 1136, 1197, 1207, 2161, 3000]
Recommendations for User 7: [318, 527, 1198, 2571, 2959, 4993, 7153, 79132, 91529, 92259]
Recommendations for User 8: [104, 231, 356, 953, 1304, 1923, 2706, 3481, 3616, 3617]
Recommendations for User 9: [1220, 106782]
