### Decision Support Systems: Recommender Systems 

__Case Description__

The focus of this project is to design, implement, test, compare, and document recommender algorithms for recommending, e.g. books or articles. The algorithms must be optimized with regard to recommendation accuracy and explain-ability/interpret-ability, i.e. results must be shown such that end-users readily can see the basis for the recommendation. To quantify the quality of the algorithms, comparative evaluations should be carried out via objective metrics and the algorithms should be assessed against state of the art.


Inspiration for source code:
- https://www.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/
- https://towardsdatascience.com/how-does-collaborative-filtering-work-da56ea94e331
- https://medium.com/grabngoinfo/recommendation-system-user-based-collaborative-filtering-a2e76e3e15c4

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)


### Data Preprocessing

In [12]:
def load_movielens_data():
    #main data
    columns = ['user_id', 'item_id', 'rating', 'timestamp']
    data = pd.read_csv('ml-100k/u.data', sep='\t', names=columns)
    
    #movie features 
    movies = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1', header=None, names=['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'action', 'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'film-noir', 'horror', 'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western'])
    movies['genres'] = movies[movies.columns[6:]].apply(lambda x: '|'.join(x.index[x.astype(bool)]), axis=1)
    movies = movies[['movie_id', 'title', 'genres']]
    
    return data, movies

data, movies = load_movielens_data()

display(data.head(5))
display(movies.head(5))

#splitting dataset into training and testing 
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),animation|children|comedy
1,2,GoldenEye (1995),action|adventure|thriller
2,3,Four Rooms (1995),thriller
3,4,Get Shorty (1995),action|comedy|drama
4,5,Copycat (1995),crime|drama|thriller


In [15]:
"""
User-Item matrix
"""
def create_user_item_matrix(data):
    #rows = users | columns = movies | values = ratings | if no ratings, then fill with 0
    user_item_matrix = data.pivot_table(index='user_id', columns='item_id', values='rating', fill_value=0)
    return user_item_matrix

#create user-item matrices
user_item_train = create_user_item_matrix(train_data)
user_item_test = create_user_item_matrix(test_data)

#check if sets are disjoint
assert(np.all(user_item_train * user_item_test) == 0)

### Similarity matrices

In [17]:
"""
Similarity matrix
"""
def compute_similarity_matrices(user_item_matrix, kind='user'):
    #for user-based
    if kind == 'user':
        return cosine_similarity(user_item_matrix) 
    
    #for item-based
    elif kind == 'item':
        return cosine_similarity(user_item_matrix.T)


user_similarity = compute_similarity_matrices(user_item_train, kind='user')
item_similarity = compute_similarity_matrices(user_item_train, kind='item')


### Predictions

In [18]:
"""
Function to calculate the predictions:
The function uses weighted sum approachbytaking the 
"""

def predict(ratings, similarity, kind='user'):
    #normalize rating matrix
    #mean_rating = ratings.mean(axis=1)
    #normalized_ratings = ratings.subtract(mean_rating, axis=0)

    #for user-based
    if kind == 'user':
        user_predictions = np.dot(similarity, ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
        #user_predictions += mean_rating[:, np.newaxis] #adding the mean back

        return user_predictions
    
    #for item-based
    elif kind == 'item':
        item_predictions = np.dot(ratings, similarity) / np.array([np.abs(similarity).sum(axis=1)])
        #item_predictions += mean_rating[:, np.newaxis] #adding the mean back

        return item_predictions


user_prediction = predict(user_item_train, user_similarity, kind='user')
item_prediction = predict(user_item_train, item_similarity, kind='item')


In [25]:
"""
Calculate RMSE
"""
def rmse(prediction, ground_truth):
    #flatten matrices
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()

    #RMSE is just MSE squared -> Setting squared=False for MSE function yields RMSE
    return mean_squared_error(prediction, ground_truth, squared=False)

#calculate RMSE
print("User-based CF RMSE:", rmse(user_prediction, user_item_test.values))
print("Item-based CF RMSE:", rmse(item_prediction, user_item_test.values))

User-based CF RMSE: 3.3117861175435794
Item-based CF RMSE: 3.395080832677687


### Verification

In [29]:
"""
Function to get-top k movies  
"""
def get_top_k_movies(prediction, user_id, movies, k=5):
    #sorting the predictions according to the similairty
    user_predictions = prediction[user_id - 1].argsort()[-k:][::-1]
    
    return movies.iloc[user_predictions]


def get_predicted_rating(user_id, movie_id, prediction, movie_data, user_item_matrix):
    movie_index = np.where(user_item_matrix.columns == movie_id)[0][0]
    predicted_rating = prediction[user_id - 1, movie_index]
    
    title = movie_data.loc[movie_data['movie_id'] == movie_id]['title'].values[0]
    genres = movie_data.loc[movie_data['movie_id'] == movie_id]['genres'].values[0]
    
    return {"title": title, "genres": genres, "predicted_rating": predicted_rating}


#number of movies to be recommended
top_k = 5 

#user-based
user_id = 67
print(f"Top {top_k} recommendations for User {user_id}:")
display(get_top_k_movies(user_prediction, user_id, movies, k=top_k))

#item-based
movie_id = 9
print(f"Top {top_k} recommendations for Movie {movie_id}:")
display(get_top_k_movies(item_prediction, movie_id, movies, k=top_k))


Top 5 recommendations for User 67:


Unnamed: 0,movie_id,title,genres
49,50,Star Wars (1977),action|adventure|romance|sci-fi|war
6,7,Twelve Monkeys (1995),drama|sci-fi
99,100,Fargo (1996),crime|drama|thriller
116,117,"Rock, The (1996)",action|adventure|thriller
120,121,Independence Day (ID4) (1996),action|sci-fi|war


Top 5 recommendations for Movie 9:


Unnamed: 0,movie_id,title,genres
1600,1601,Office Killer (1997),thriller
1358,1359,Boys in Venice (1996),drama
1629,1630,"Silence of the Palace, The (Saimt el Qusur) (1...",drama
1482,1483,"Man in the Iron Mask, The (1998)",action|drama|romance
1251,1252,"Contempt (Mépris, Le) (1963)",drama


In [30]:
"""
Functions to see user movie history 
"""
def get_user_movie_history(user_id, ratings_data, movie_data):
    user_ratings = ratings_data[ratings_data['user_id'] == user_id]
    user_movie_history = user_ratings.merge(movie_data, left_on='item_id', right_on='movie_id')[['title', 'genres', 'rating']]
    user_movie_history = user_movie_history.sort_values(by='rating', ascending=False)
    return user_movie_history

#get history for user 1
movie_history = get_user_movie_history(user_id, data, movies)
display(movie_history)


"""
Function to see data for a single movie 
"""
def get_movie_info(movie_id, movie_data):
    movie_info = movie_data[movie_data['movie_id'] == movie_id].iloc[0]
    return movie_info

movie_info = get_movie_info(movie_id, movies)
print("\nMovie information:")
display(movie_info)

Unnamed: 0,title,genres,rating
0,"Shawshank Redemption, The (1994)",drama,5
13,Twelve Monkeys (1995),drama|sci-fi,5
28,"Rock, The (1996)",action|adventure|thriller,5
4,Beavis and Butt-head Do America (1996),animation|comedy,5
5,Mission: Impossible (1996),action|adventure|mystery,5
7,Live Nude Girls (1995),comedy,5
20,Rumble in the Bronx (1995),action|adventure|crime,4
18,"Birdcage, The (1996)",comedy,4
17,"Frighteners, The (1996)",comedy|horror,4
16,Bulletproof (1996),action,4



Movie information:


movie_id                          9
title       Dead Man Walking (1995)
genres                        drama
Name: 8, dtype: object

In [None]:
"""
Function to get predicted rating
"""
def get_predicted_rating(user_id, movie_id, prediction, movie_data, user_item_matrix):
    movie_index = np.where(user_item_matrix.columns == movie_id)[0][0]
    predicted_rating = prediction[user_id - 1, movie_index]
    
    title = movie_data.loc[movie_data['movie_id'] == movie_id]['title'].values[0]
    genres = movie_data.loc[movie_data['movie_id'] == movie_id]['genres'].values[0]
    
    return {"title": title, "genres": genres, "predicted_rating": predicted_rating}

predicted_rating_user_based = get_predicted_rating(user_id, 7, user_prediction, movies, user_item_train)
print("Predicted rating (User-based):")
display(predicted_rating_user_based)
