# CS6460 Recommendation System

Datasets are from MovieLens (https://grouplens.org/datasets/movielens/) 

In [1]:
%matplotlib inline
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

import warnings; warnings.simplefilter('ignore')

> ## Part 1 - Content Based Filtering Method Using NLP

This project focues on personalization so this algorithm will computes similarity among movies based on Movie Overviews and Taglines to the movies that users like. This is Content Based Filtering because this part is using movie metadata to develop. 

This part will only use part of the MovieLens to speed up computation. 

In [2]:
movie_metadata = pd.read_csv('movies.csv')
movie_links = pd.read_csv('links.csv')

In [3]:
movie_metadata.head(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
movie_links.head(5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


Preprocess Datasets

In [5]:
movie_links = movie_links[movie_links['tmdbId'].notnull()]['tmdbId'].astype('int')
movie_metadata = movie_metadata.drop([19730, 29503, 35587])
movie_metadata['id'] = movie_metadata['id'].astype('int')
# pmm  (processed_movie_metadata)
pmm = movie_metadata[movie_metadata['id'].isin(movie_links)]

In [6]:
pmm['tagline'] = pmm['tagline'].fillna('')
pmm['overview'] = pmm['overview'].fillna('')
pmm['description'] = pmm['overview'] + pmm['tagline']
pmm['description'] = pmm['description'].fillna('')

Applying the NLP method TF-IDF. 
Ref1 https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html & 
Ref2 https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency/#:~:text=Using%20scikit%2Dlearn-,What%20is%20TF%2DIDF%3F,%2C%20relative%20to%20a%20corpus).

In [7]:
vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.0, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(pmm['description'])
print(tfidf_matrix.shape)

(45463, 1104516)


Applying Cosine Similarity to calculate the similarity between two movies. 
https://goodboychan.github.io/python/datacamp/natural_language_processing/2020/07/17/04-TF-IDF-and-similarity-scores.html

In [8]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [9]:
pmm = pmm.reset_index()
titles = pmm['title']
indices = pd.Series(pmm.index, index=pmm['title'])

In [10]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:50]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

Use qualitative evaluation by observating how close the recommendations are. 

In [66]:
get_recommendations('Ip Man').head(10)

43259        The Legend Is Born: Ip Man
11838                 Dragon Tiger Gate
24338                    The Odd Couple
38900             Bruce Lee: The Legend
15571                          Ip Man 2
20816                   The Grandmaster
42407                 Kung Fu Wing Chun
39429    Young Dragons: Kung Fu Kids II
31636       Young Dragons: Kung Fu Kids
30655     The Incredible Kung Fu Master
Name: title, dtype: object

> ## Part 2 - Collaborative Based Filtering Method Using SVD

In [21]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD

import warnings; warnings.simplefilter('ignore')

In [22]:
reader = Reader()

In [32]:
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [33]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [68]:
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5)

{'test_rmse': array([0.90886465, 0.89052397, 0.89661052, 0.88935198, 0.89698521]),
 'test_mae': array([0.69934374, 0.68541644, 0.68921528, 0.6850171 , 0.69231221]),
 'fit_time': (0.48992109298706055,
  0.42191505432128906,
  0.40769100189208984,
  0.40878915786743164,
  0.43720316886901855),
 'test_time': (0.04843592643737793,
  0.04628300666809082,
  0.046096086502075195,
  0.04620790481567383,
  0.12246870994567871)}

In [69]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x467b74c10>

In [70]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [71]:
svd.predict(1, 302, 3)

Prediction(uid=1, iid=302, r_ui=3, est=2.7550630587345912, details={'was_impossible': False})

> ## Part 3 Hybrid NLP & SVD 

In [72]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [73]:
id_map = pd.read_csv('links.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(pmm[['title', 'id']], on='id').set_index('title')
#id_map = id_map.set_index('tmdbId')

In [74]:
indices_map = id_map.set_index('id')

In [77]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = pmm.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [79]:
hybrid(5, 'Toy Story')

Unnamed: 0,title,vote_count,vote_average,id,est
15348,Toy Story 3,4710.0,7.6,10193,4.490617
1071,Rebel Without a Cause,351.0,7.6,221,4.384847
1199,Manhattan,600.0,7.8,696,4.115904
2997,Toy Story 2,3914.0,7.3,863,4.064339
11606,Factory Girl,83.0,6.2,12271,4.0196
1932,Condorman,37.0,5.6,19379,4.008047
10301,The 40 Year Old Virgin,2020.0,6.2,6957,3.865786
8327,The Champ,13.0,6.8,42816,3.800043
43424,Andy Kaufman Plays Carnegie Hall,0.0,0.0,364123,3.800043
34587,Lost and Love,6.0,5.8,329206,3.800043


In [80]:
hybrid(500, 'Ip Man')

Unnamed: 0,title,vote_count,vote_average,id,est
2808,Drunken Master,178.0,7.2,11230,3.376841
7502,Master of the Flying Guillotine,27.0,5.9,49636,3.018792
43259,The Legend Is Born: Ip Man,227.0,6.6,44249,2.964582
19298,I Am Bruce Lee,35.0,7.7,84383,2.964582
29039,Kung Fu Jungle,117.0,6.5,290864,2.964582
34519,"Bruce Lee, My Brother",17.0,7.0,53807,2.964582
24484,Crippled Avengers,21.0,7.0,40081,2.964582
20891,Descendant of the Sun,2.0,4.3,64988,2.964582
15112,City of Life and Death,55.0,7.6,21345,2.964582
19893,The 36 Crazy Fists,3.0,5.0,46114,2.964582


> ## Part 4 - Ultimate Collaborative Based Filtering Method Using NCF Deep Learning

In [89]:
from tqdm import tqdm
import torch, torch.nn as nn, torch.utils.data as data, torchvision as tv, torch.nn.functional as F
import lightning as L
from torch.utils.data import Dataset, DataLoader

In [111]:
ratings = pd.read_csv('ratings_small.csv', 
                      parse_dates=['timestamp'])

In [112]:
rand_userIds = np.random.choice(ratings['userId'].unique(), 
                                size=int(len(ratings['userId'].unique())*0.3), 
                                replace=False)

ratings = ratings.loc[ratings['userId'].isin(rand_userIds)]

print('There are {} rows of data from {} users'.format(len(ratings), len(rand_userIds)))

There are 27310 rows of data from 201 users


In [113]:
rand_userIds = np.random.choice(ratings['userId'].unique(), 
                                size=int(len(ratings['userId'].unique())*0.3), 
                                replace=False)

ratings = ratings.loc[ratings['userId'].isin(rand_userIds)]

print('There are {} rows of data from {} users'.format(len(ratings), len(rand_userIds)))

There are 7277 rows of data from 60 users


In [114]:
ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
8515,56,6936,4.0,1467006226
18921,125,61323,5.0,1269735674
43164,308,648,5.0,854376177
81502,554,4179,4.0,1012752793
65315,464,165,4.0,829912558


In [115]:
ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'] \
                                .rank(method='first', ascending=False)

train_ratings = ratings[ratings['rank_latest'] != 1]
test_ratings = ratings[ratings['rank_latest'] == 1]

# drop columns that we no longer need
train_ratings = train_ratings[['userId', 'movieId', 'rating']]
test_ratings = test_ratings[['userId', 'movieId', 'rating']]

In [116]:
train_ratings.sample(5)

Unnamed: 0,userId,movieId,rating
56387,407,342,4.0
51029,378,61240,3.0
4589,23,6669,4.0
4455,23,4007,4.0
56576,407,3911,4.0


In [117]:
# Get a list of all movie IDs
all_movieIds = ratings['movieId'].unique()

# Placeholders that will hold the training data
users, items, labels = [], [], []

# This is the set of items that each user has interaction with
user_item_set = set(zip(train_ratings['userId'], train_ratings['movieId']))

# 4:1 ratio of negative to positive samples
num_negatives = 4

In [118]:
for (u, i) in tqdm(user_item_set):
    users.append(u)
    items.append(i)
    labels.append(1) # items that the user has interacted with are positive
    for _ in range(num_negatives):
        # randomly select an item
        negative_item = np.random.choice(all_movieIds) 
        # check that the user has not interacted with this item
        while (u, negative_item) in user_item_set:
            negative_item = np.random.choice(all_movieIds)
        users.append(u)
        items.append(negative_item)
        labels.append(0) # items not interacted with are negative


  0%|                                                                                                                                          | 0/7217 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7217/7217 [00:00<00:00, 40471.71it/s][A


In [119]:
class MovieLensTrainDataset(Dataset):
    """MovieLens PyTorch Dataset for Training
    
    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings
        all_movieIds (list): List containing all movieIds
    
    """

    def __init__(self, ratings, all_movieIds):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds)

    def __len__(self):
        return len(self.users)
  
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds):
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['userId'], ratings['movieId']))

        num_negatives = 4
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = np.random.choice(all_movieIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [120]:
class NCF(L.LightningModule):
    """ Neural Collaborative Filtering (NCF)
    
        Args:
            num_users (int): Number of unique users
            num_items (int): Number of unique items
            ratings (pd.DataFrame): Dataframe containing the movie ratings for training
            all_movieIds (list): List containing all movieIds (train + test)
    """
    
    def __init__(self, num_users, num_items, ratings, all_movieIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_movieIds = all_movieIds
        
    def forward(self, user_input, item_input):
        
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layer
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred
    
    def training_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        return DataLoader(MovieLensTrainDataset(self.ratings, self.all_movieIds),
                          batch_size=512, num_workers=1)

In [121]:
num_users = ratings['userId'].max()+1
num_items = ratings['movieId'].max()+1

all_movieIds = ratings['movieId'].unique()

model = NCF(num_users, num_items, train_ratings, all_movieIds)

In [122]:
trainer = L.Trainer(max_epochs=5, accelerator='auto', reload_dataloaders_every_n_epochs=1)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [123]:
trainer.fit(model)


  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 5.0 K 
1 | item_embedding | Embedding | 1.3 M 
2 | fc1            | Linear    | 1.1 K 
3 | fc2            | Linear    | 2.1 K 
4 | output         | Linear    | 33    
---------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.172     Total estimated model params size (MB)
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Applications/Xcode_13.4.0_fb.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Applications/Xcode_13.4.0_fb.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get at

Training: 0it [00:00, ?it/s]

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Applications/Xcode_13.4.0_fb.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Applications/Xcode_13.4.0_fb.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'MovieLensTrainDataset' on <module '__main__' (built-in)>


RuntimeError: DataLoader worker (pid(s) 26114) exited unexpectedly

In [None]:
# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_movieIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    predicted_labels = np.squeeze(model(torch.tensor([u]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)
        
print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

In [None]:
def hybrid(userId, title):
    idx = indices[title]
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:1000]
    movie_indices = [i[0] for i in sim_scores]
    
    
    interacted_items = user_interacted_items[userId]
    not_interacted_items = set(movie_indices) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    predicted_labels = np.squeeze(model(torch.tensor([userId]*100), 
                                    torch.tensor(test_items)).detach().numpy())
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    # print(top10_items)
    movies = pmm.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'id']]
    return movies
    # return top10_items
    
    # movies = pmm.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'id']]
    # movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    # movies = movies.sort_values('est', ascending=False)
    # return movies.head(200)

In [None]:
for (userId,i) in tqdm(test_user_item_set):
    # interacted_items = user_interacted_items[u]
    # not_interacted_items = set(all_movieIds) - set(interacted_items)
    # selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    # test_items = selected_not_interacted + [i]
    if i > 45465:
        continue 
    title = pmm.iloc[i]['title']
    top10_items = hybrid(userId, title)
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)
        
print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

In [None]:
hybrid(60, "The Matrix")