# Recommendation Models

In [298]:
import pandas as pd 
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from scipy.sparse import csr_matrix

In [299]:
#Ignore warnings so it doesnt clutter
import warnings
warnings.filterwarnings("ignore")

In [300]:
content_df_path = '../data/prepared-data/content-based-features.csv'
content_df = pd.read_csv(content_df_path)

print(content_df.head())

   UserID  MovieID  Rating  Avg_Rating  Rating_Count    Decade  Genre_Mystery  \
0       1     1193       5    0.847681      0.503064  0.666667              0   
1       1      661       3    0.616190      0.152903  0.888889              0   
2       1      914       3    0.788522      0.185293  0.555556              0   
3       1     3408       4    0.715970      0.383426  1.000000              0   
4       1     2355       5    0.713594      0.496644  0.888889              0   

   Genre_Thriller  Genre_Action  Genre_Western  ...  Genre_Animation  \
0               0             0              0  ...                0   
1               0             0              0  ...                1   
2               0             0              0  ...                0   
3               0             0              0  ...                0   
4               0             0              0  ...                1   

   Genre_Horror  Genre_Fantasy  Genre_Romance  Genre_Documentary  \
0           

In [301]:
print(content_df.columns)

Index(['UserID', 'MovieID', 'Rating', 'Avg_Rating', 'Rating_Count', 'Decade',
       'Genre_Mystery', 'Genre_Thriller', 'Genre_Action', 'Genre_Western',
       'Genre_War', 'Genre_Musical', 'Genre_Children's', 'Genre_Drama',
       'Genre_Animation', 'Genre_Horror', 'Genre_Fantasy', 'Genre_Romance',
       'Genre_Documentary', 'Genre_Film-Noir', 'Genre_Adventure',
       'Genre_Comedy', 'Genre_Crime', 'Genre_Sci-Fi'],
      dtype='object')


## Cosine similarity-based content-based movie recommender

With using a movie - item matrix. I think its good. Turned out much better than the one using tf-idf.

In [302]:
# Drop duplicates and keep the first row for each MovieID
movie_features = content_df.drop_duplicates(subset='MovieID')

In [303]:
print(movie_features.head())

   UserID  MovieID  Rating  Avg_Rating  Rating_Count    Decade  Genre_Mystery  \
0       1     1193       5    0.847681      0.503064  0.666667              0   
1       1      661       3    0.616190      0.152903  0.888889              0   
2       1      914       3    0.788522      0.185293  0.555556              0   
3       1     3408       4    0.715970      0.383426  1.000000              0   
4       1     2355       5    0.713594      0.496644  0.888889              0   

   Genre_Thriller  Genre_Action  Genre_Western  ...  Genre_Animation  \
0               0             0              0  ...                0   
1               0             0              0  ...                1   
2               0             0              0  ...                0   
3               0             0              0  ...                0   
4               0             0              0  ...                1   

   Genre_Horror  Genre_Fantasy  Genre_Romance  Genre_Documentary  \
0           

In [304]:
features_to_keep = [
    'MovieID', 'Avg_Rating', 'Rating_Count', 'Decade',
    'Genre_Mystery', 'Genre_Thriller', 'Genre_Action', 'Genre_Western',
    'Genre_War', 'Genre_Musical', "Genre_Children's", 'Genre_Drama',
    'Genre_Animation', 'Genre_Horror', 'Genre_Fantasy', 'Genre_Romance',
    'Genre_Documentary', 'Genre_Film-Noir', 'Genre_Adventure',
    'Genre_Comedy', 'Genre_Crime', 'Genre_Sci-Fi'
]
movie_features = movie_features[features_to_keep]
movie_features = movie_features.sort_values('MovieID', ascending=True)
movie_features = movie_features.reset_index(drop=True) # reset index

print(movie_features)

      MovieID  Avg_Rating  Rating_Count    Decade  Genre_Mystery  \
0           1    0.786712      0.605778  0.888889              0   
1           2    0.550285      0.204260  0.888889              0   
2           3    0.504184      0.139189  0.888889              0   
3           4    0.432353      0.049314  0.888889              0   
4           5    0.501689      0.086081  0.888889              0   
...       ...         ...           ...       ...            ...   
3701     3948    0.658933      0.251240  1.000000              0   
3702     3949    0.778783      0.088416  1.000000              0   
3703     3950    0.666667      0.015465  1.000000              0   
3704     3951    0.725000      0.011380  1.000000              0   
3705     3952    0.695232      0.112927  1.000000              0   

      Genre_Thriller  Genre_Action  Genre_Western  Genre_War  Genre_Musical  \
0                  0             0              0          0              0   
1                  0     

In [305]:
X = movie_features.drop('MovieID', axis=1)

# Compute similarity
similarity_matrix = cosine_similarity(X)

In [306]:
print(similarity_matrix.shape)

(3706, 3706)


In [307]:
print(similarity_matrix)

[[1.         0.5281038  0.59371583 ... 0.41636992 0.42213373 0.36812398]
 [0.5281038  1.         0.3079409  ... 0.39596942 0.39923773 0.34048716]
 [0.59371583 0.3079409  1.         ... 0.44840283 0.45151611 0.38350801]
 ...
 [0.41636992 0.39596942 0.44840283 ... 1.         0.99944499 0.84324733]
 [0.42213373 0.39923773 0.45151611 ... 0.99944499 1.         0.84309625]
 [0.36812398 0.34048716 0.38350801 ... 0.84324733 0.84309625 1.        ]]


In [308]:
# Original movies dataframe for titles since i got rid of those
movies_column_names = ["MovieID", "Title", "Genres"]

movies_path = '../data/ml-1m-(raw)/movies.dat'
movies_og = pd.read_csv(movies_path, header=None, sep='::', engine="python", names=movies_column_names, encoding="latin-1")

movies_og = movies_og.merge(movie_features, on='MovieID')

print(movies_og)

      MovieID                               Title  \
0           1                    Toy Story (1995)   
1           2                      Jumanji (1995)   
2           3             Grumpier Old Men (1995)   
3           4            Waiting to Exhale (1995)   
4           5  Father of the Bride Part II (1995)   
...       ...                                 ...   
3701     3948             Meet the Parents (2000)   
3702     3949          Requiem for a Dream (2000)   
3703     3950                    Tigerland (2000)   
3704     3951             Two Family House (2000)   
3705     3952               Contender, The (2000)   

                            Genres  Avg_Rating  Rating_Count    Decade  \
0      Animation|Children's|Comedy    0.786712      0.605778  0.888889   
1     Adventure|Children's|Fantasy    0.550285      0.204260  0.888889   
2                   Comedy|Romance    0.504184      0.139189  0.888889   
3                     Comedy|Drama    0.432353      0.049314  0.888

In [309]:
def get_title_from_id(movie_id, movies_og):
    title = movies_og[movies_og['MovieID'] == movie_id]['Title']
    return title.values[0] if not title.empty else None

def get_random_movie(movie_features, movies_og):
    random_row = movie_features.sample(1)
    movie_id = random_row['MovieID'].values[0]
    title = get_title_from_id(movie_id, movies_og)
    return movie_id, title


def recommend_movies(movie_id, movie_features, similarity_matrix, movies_og, top_n=5):
    movie_idx = movie_features[movie_features['MovieID'] == movie_id].index[0]
    similarity_scores = similarity_matrix[movie_idx]

    similar_indices = np.argsort(similarity_scores)[::-1][1:top_n+1]
    similar_movies = movie_features.iloc[similar_indices].copy()
    similar_movies['Title'] = similar_movies['MovieID'].apply(lambda x: get_title_from_id(x, movies_og))

    return similar_movies[['MovieID', 'Title', 'Avg_Rating']]


In [310]:
#Chooses a random movie and recommends 5 that are similar

random_id, random_name = get_random_movie(movie_features, movies_og)
random_rec = recommend_movies(random_id, movie_features, similarity_matrix, movies_og, 5)

print(random_id, random_name)
print(random_rec)

2313 Elephant Man, The (1980)
      MovieID                     Title  Avg_Rating
1096     1185       My Left Foot (1989)    0.739514
1776     1956    Ordinary People (1980)    0.747535
1024     1096    Sophie's Choice (1982)    0.744186
2534     2739  Color Purple, The (1985)    0.713889
2853     3068       Verdict, The (1982)    0.716359


In [311]:
# Some outputs :)
# They are allright
'''
1320 Alien³ (1992)
      MovieID                       Title  Avg_Rating
2096     2288           Thing, The (1982)    0.696356
1124     1214                Alien (1979)    0.789896
1553     1690  Alien: Resurrection (1997)    0.489730
2621     2826    13th Warrior, The (1999)    0.539667
3455     3697           Predator 2 (1990)    0.466952
-----------------------------------------------------
2273 Rush Hour (1998)
      MovieID                              Title  Avg_Rating
2296     2490                     Payback (1999)    0.626455
1877     2058             Negotiator, The (1998)    0.652065
3043     3267                Mariachi, El (1992)    0.671843
159       165  Die Hard: With a Vengeance (1995)    0.640303
981      1047    Long Kiss Goodnight, The (1996)    0.620968
------------------------------------------------------------
177 Lord of Illusions (1995)
      MovieID                                              Title  Avg_Rating
1819     1999                           Exorcist III, The (1990)    0.437008
2135     2328                                    Vampires (1998)    0.462428
2134     2327          Tales from the Darkside: The Movie (1990)    0.462500
790       842  Tales from the Crypt Presents: Bordello of Blo...    0.397351
3594     3839                       Phantasm IV: Oblivion (1998)    0.409574
----------------------------------------------------------------------------
2716 Ghostbusters (1984)
      MovieID                            Title  Avg_Rating
1186     1278        Young Frankenstein (1974)    0.812657
1823     2003                  Gremlins (1984)    0.567010
2512     2717           Ghostbusters II (1989)    0.476549
3040     3264  Buffy the Vampire Slayer (1992)    0.522949
2660     2867              Fright Night (1985)    0.608660
'''

'\n1320 Alien³ (1992)\n      MovieID                       Title  Avg_Rating\n2096     2288           Thing, The (1982)    0.696356\n1124     1214                Alien (1979)    0.789896\n1553     1690  Alien: Resurrection (1997)    0.489730\n2621     2826    13th Warrior, The (1999)    0.539667\n3455     3697           Predator 2 (1990)    0.466952\n-----------------------------------------------------\n2273 Rush Hour (1998)\n      MovieID                              Title  Avg_Rating\n2296     2490                     Payback (1999)    0.626455\n1877     2058             Negotiator, The (1998)    0.652065\n3043     3267                Mariachi, El (1992)    0.671843\n159       165  Die Hard: With a Vengeance (1995)    0.640303\n981      1047    Long Kiss Goodnight, The (1996)    0.620968\n------------------------------------------------------------\n177 Lord of Illusions (1995)\n      MovieID                                              Title  Avg_Rating\n1819     1999              

In [312]:
# Save them to use in deployment
movie_features.to_pickle('../app/data/movie_features.pkl')
movies_og.to_pickle('../app/data/movies_og.pkl')

#import pickle
#with open('../app/data/similarity_matrix.pkl', 'wb') as f:
#    pickle.dump(similarity_matrix, f)


## TF-IDF (Term Frequency - Inverse Document Frequency)

TF-IDF (Term Frequency – Inverse Document Frequency) is a way to numerically represent text — in our case, movie titles or plot summaries — as vectors. It captures:

* TF (term frequency): how often a word appears in a document (e.g., a movie title).

* IDF (inverse document frequency): how unique that word is across all documents (common words get downweighted).

It wasnt very successful, normal movie item matrix is better in this case

In [313]:
titles = movies_og['Title']

# Initialize vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)

# Learn vocab & transform titles into TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(titles)

In [314]:
print(tfidf_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6838 stored elements and shape (3706, 500)>
  Coords	Values
  (0, 436)	0.8599881329707616
  (0, 70)	0.510314031895522
  (1, 70)	1.0
  (2, 70)	0.3452435409242046
  (2, 364)	0.709877127714642
  (2, 334)	0.6139066386656256
  (3, 70)	0.4151252140296577
  (3, 473)	0.909764286327415
  (4, 70)	0.29030832137039253
  (4, 200)	0.6362228404357442
  (4, 120)	0.5630165241766855
  (4, 274)	0.44040205421294576
  (5, 70)	0.405263589381106
  (5, 256)	0.9141998813836841
  (6, 70)	1.0
  (7, 70)	0.4308321981876823
  (7, 461)	0.9024320567249202
  (8, 70)	0.47770388295406585
  (8, 158)	0.8785209162055324
  (9, 70)	1.0
  (10, 70)	0.49571585051526307
  (10, 88)	0.8684847698998119
  (11, 70)	0.35347878933765625
  (11, 173)	0.7268101439677717
  (11, 156)	0.5888970708994332
  :	:
  (3693, 65)	0.33633958861754976
  (3693, 329)	0.47591650082282183
  (3693, 422)	0.5389755159653536
  (3694, 272)	0.5910201621884316
  (3694, 61)	0.4462911527846371
  (3694, 

In [315]:
#Combine tf-idf with feature matrix

# Drop MovieID for similarity — we'll keep it elsewhere
numerical_features = movie_features.drop('MovieID', axis=1).values

# Stack horizontally: TF-IDF (sparse) + other features (dense)
combined_features = hstack([tfidf_matrix, csr_matrix(numerical_features)])
similarity_matrix = cosine_similarity(combined_features)

In [316]:
#Check so everyting is aligned hopefully
assert similarity_matrix.shape[0] == len(movie_features)

In [317]:
def get_title_from_id(movie_id, movies_og):
    title = movies_og[movies_og['MovieID'] == movie_id]['Title']
    return title.values[0] if not title.empty else None

def get_random_movie(movie_features, movies_og):
    random_row = movie_features.sample(1)
    movie_id = random_row['MovieID'].values[0]
    title = get_title_from_id(movie_id, movies_og)
    return movie_id, title


def recommend_movies(movie_id, movie_features, similarity_matrix, movies_og, top_n=5):
    movie_idx = movie_features[movie_features['MovieID'] == movie_id].index[0]
    similarity_scores = similarity_matrix[movie_idx]

    similar_indices = np.argsort(similarity_scores)[::-1][1:top_n+1]
    similar_movies = movie_features.iloc[similar_indices].copy()
    similar_movies['Title'] = similar_movies['MovieID'].apply(lambda x: get_title_from_id(x, movies_og))

    return similar_movies[['MovieID', 'Title', 'Avg_Rating']]

In [318]:
#Chooses a random movie and recommends 5 that are similar

random_id, random_name = get_random_movie(movie_features, movies_og)
random_rec = recommend_movies(random_id, movie_features, similarity_matrix, movies_og, 5)

print(random_id, random_name)
print(random_rec)

3553 Gossip (2000)
      MovieID                       Title  Avg_Rating
3280     3513  Rules of Engagement (2000)    0.586905
3705     3952       Contender, The (2000)    0.695232
3277     3510            Frequency (2000)    0.689708
3699     3946           Get Carter (2000)    0.315000
3178     3409    Final Destination (2000)    0.568083


In [319]:
# Outputs with if-idf :(
# Previour results were much much better. 
# It doesnt even find the similarly named ones as good as it did previourly.
# The problem is probably that the titles are too short to work with. 
# If movies had a description that might work better.
'''
1320 Alien³ (1992)
      MovieID                       Title  Avg_Rating
972      1037  Lawnmower Man, The (1992)    0.415041
2096     2288          Thing, The (1982)    0.696356
1124     1214               Alien (1979)    0.789896
3032     3256       Patriot Games (1992)    0.687500
805       861            Supercop (1992)    0.628205
-----------------------------------------------------
2273 Rush Hour (1998)
      MovieID                            Title  Avg_Rating
1877     2058           Negotiator, The (1998)    0.652065
2160     2353        Enemy of the State (1998)    0.671812
1623     1792            U.S. Marshalls (1998)    0.542969
1608     1769  Replacement Killers, The (1998)    0.534483
2141     2334                Siege, The (1998)    0.531088
------------------------------------------------------------
177 Lord of Illusions (1995)
      MovieID                                              Title  Avg_Rating
3593     3838              Phantasm III: Lord of the Dead (1994)    0.404762
182       188                               Prophecy, The (1995)    0.555263
146       152                              Addiction, The (1995)    0.500000
214       220                                Castle Freak (1995)    0.205357
318       328  Tales From the Crypt Presents: Demon Knight (1...    0.527778
----------------------------------------------------------------------------
2716 Ghostbusters (1984)
      MovieID                      Title  Avg_Rating
1823     2003            Gremlins (1984)    0.567010
1963     2144     Sixteen Candles (1984)    0.677677
1186     1278  Young Frankenstein (1974)    0.812657
3542     3785         Scary Movie (2000)    0.502855
3292     3525      Bachelor Party (1984)    0.534722
'''

'\n1320 Alien³ (1992)\n      MovieID                       Title  Avg_Rating\n972      1037  Lawnmower Man, The (1992)    0.415041\n2096     2288          Thing, The (1982)    0.696356\n1124     1214               Alien (1979)    0.789896\n3032     3256       Patriot Games (1992)    0.687500\n805       861            Supercop (1992)    0.628205\n-----------------------------------------------------\n2273 Rush Hour (1998)\n      MovieID                            Title  Avg_Rating\n1877     2058           Negotiator, The (1998)    0.652065\n2160     2353        Enemy of the State (1998)    0.671812\n1623     1792            U.S. Marshalls (1998)    0.542969\n1608     1769  Replacement Killers, The (1998)    0.534483\n2141     2334                Siege, The (1998)    0.531088\n------------------------------------------------------------\n177 Lord of Illusions (1995)\n      MovieID                                              Title  Avg_Rating\n3593     3838              Phantasm III: Lor

## Downweighted TF-IDF

In [320]:
#Scaling TF-IDF to 0.3 of its weight
tfidf_matrix = tfidf_matrix * 0.2

# Combine again
combined_features = hstack([tfidf_matrix, csr_matrix(numerical_features)])
similarity_matrix = cosine_similarity(combined_features)

In [321]:
random_id, random_name = get_random_movie(movie_features, movies_og)
random_rec = recommend_movies(random_id, movie_features, similarity_matrix, movies_og, 5)

print(random_id, random_name)
print(random_rec)

1566 Hercules (1997)
      MovieID                          Title  Avg_Rating
104       107  Muppet Treasure Island (1996)    0.537109
965      1030           Pete's Dragon (1977)    0.519350
574       588                 Aladdin (1992)    0.697076
1897     2078        Jungle Book, The (1967)    0.704066
1921     2102        Steamboat Willie (1940)    0.596447


In [322]:
# Outputs with downweighted (0.2) if-idf :o
# I think it only got better becase the original movie item vector is doing the heavy lifting now.
# Not really satisfied.
'''
1320 Alien³ (1992)
      MovieID                       Title  Avg_Rating
2096     2288           Thing, The (1982)    0.696356
1124     1214                Alien (1979)    0.789896
1553     1690  Alien: Resurrection (1997)    0.489730
2621     2826    13th Warrior, The (1999)    0.539667
3455     3697           Predator 2 (1990)    0.466952
-----------------------------------------------------
2273 Rush Hour (1998)
      MovieID                              Title  Avg_Rating
1877     2058             Negotiator, The (1998)    0.652065
2296     2490                     Payback (1999)    0.626455
3043     3267                Mariachi, El (1992)    0.671843
159       165  Die Hard: With a Vengeance (1995)    0.640303
981      1047    Long Kiss Goodnight, The (1996)    0.620968
------------------------------------------------------------
177 Lord of Illusions (1995)
      MovieID                                      Title  Avg_Rating
3593     3838      Phantasm III: Lord of the Dead (1994)    0.404762
1819     1999                   Exorcist III, The (1990)    0.437008
2135     2328                            Vampires (1998)    0.462428
2134     2327  Tales from the Darkside: The Movie (1990)    0.462500
146       152                      Addiction, The (1995)    0.500000
----------------------------------------------------------------------------
2716 Ghostbusters (1984)
      MovieID                            Title  Avg_Rating
1186     1278        Young Frankenstein (1974)    0.812657
1823     2003                  Gremlins (1984)    0.567010
2512     2717           Ghostbusters II (1989)    0.476549
3040     3264  Buffy the Vampire Slayer (1992)    0.522949
3542     3785               Scary Movie (2000)    0.502855
'''

'\n1320 Alien³ (1992)\n      MovieID                       Title  Avg_Rating\n2096     2288           Thing, The (1982)    0.696356\n1124     1214                Alien (1979)    0.789896\n1553     1690  Alien: Resurrection (1997)    0.489730\n2621     2826    13th Warrior, The (1999)    0.539667\n3455     3697           Predator 2 (1990)    0.466952\n-----------------------------------------------------\n2273 Rush Hour (1998)\n      MovieID                              Title  Avg_Rating\n1877     2058             Negotiator, The (1998)    0.652065\n2296     2490                     Payback (1999)    0.626455\n3043     3267                Mariachi, El (1992)    0.671843\n159       165  Die Hard: With a Vengeance (1995)    0.640303\n981      1047    Long Kiss Goodnight, The (1996)    0.620968\n------------------------------------------------------------\n177 Lord of Illusions (1995)\n      MovieID                                      Title  Avg_Rating\n3593     3838      Phantasm III: Lo