In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

import opendatasets as od
import csv

In [2]:
download = od.download(
    "https://www.kaggle.com/datasets/bandikarthik/movie-recommendation-system?select=links.csv")

Skipping, found downloaded files in ".\movie-recommendation-system" (use force=True to force download)


In [3]:
#links contains movie ID, imdb ID, tmdb ID
links = pd.read_csv('movie-recommendation-system/links.csv')

#movies contains movie ID, Title, Genre
movies= pd.read_csv('movie-recommendation-system/movies.csv')

#ratings contains user ID, movie ID, rating, timestamp
ratings = pd.read_csv('movie-recommendation-system/ratings.csv')

#tags contains user ID, movie ID, tag, timestamp
tags = pd.read_csv('movie-recommendation-system/tags.csv')

## Data Cleaning

Check for sparsity

In [4]:
links_sparsity = 1.0 - (links.count().sum() / float(links.size))
movies_sparsity = 1.0 - (movies.count().sum() / float(movies.size))
ratings_sparsity = 1.0 - (ratings.count().sum() / float(ratings.size))
tags_sparsity = 1.0 - (tags.count().sum() / float(tags.size))

In [5]:
print("Links Sparsity: ", links_sparsity)
print("Movies Sparsity: ", movies_sparsity)
print("Ratings Sparsity: ", ratings_sparsity)
print("Tags Sparsity: ", tags_sparsity)

Links Sparsity:  0.0028843155597131354
Movies Sparsity:  0.0
Ratings Sparsity:  0.0
Tags Sparsity:  6.814379704067619e-06


Replacing Null values

In [6]:
#tmdbId is numeric so we can fill it with 0
links = links.fillna(0)

#tag is categorical data, so we will drop the rows with null vals
tags = tags.dropna()

## Sampling

Create Dataset for Content-Based System

In [7]:
#We want to combine tags and genres for our TF-IDF 
#Merging tags and movies datasets
CB_df = movies.merge(tags, how = 'left', on = 'movieId')

#Replace NaNs with genre
CB_df['tag'] = CB_df['tag'].fillna("")

#Grouping by Title & concatenate tags
agg_functions = {'movieId': 'first', 'tag': ' '.join, 'genres': 'first'}
combined_tags = CB_df.groupby('title', as_index = False).aggregate(agg_functions)

#Combine tag and genres
combined_tags["newtag"] = combined_tags[['tag', 'genres']].agg(' '.join, axis = 1)

#Drop unnecessary columns
combined_tags = combined_tags.drop('tag', axis = 1)
combined_tags = combined_tags.drop('genres', axis = 1)

In [8]:
#Get top 10 users with most reviews
count = ratings.groupby('userId').size()
top_10 = count.sort_values(ascending = False).head(10)
top_10 = pd.DataFrame(top_10)

#Merge top_10 with ratings to get the ratings for the top 10 users
sampled_df = top_10.merge(ratings, how = 'left', right_on = 'userId', left_on = top_10.index)

In [9]:
#We only need the movies that have ratings, so we merge ratings with combined_tags
merged = sampled_df.merge(movies, how = 'left', left_on = 'movieId', right_on = 'movieId')

merged = merged.merge(combined_tags, how = 'left', left_on = 'title', right_on = 'title')

#Dropping unnecessary columns
merged = merged.drop('timestamp', axis = 1)
merged = merged.drop('movieId_y', axis = 1)

agg_functions = {'title': 'first', 'newtag': 'first'}
CB_merged = merged.groupby('movieId_x', as_index = False).aggregate(agg_functions)

#We want to sample CB_merged to reduce rows
sampled_data = CB_merged[:2500]

# Content-Based Recommender System

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [11]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### CB Pt 1) Preprocessing: Stop word removal + stemming

In [12]:
def preprocess_text(text):
    words = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w.lower() in stop_words]
    
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(w) for w in words]
    preprocessed_text = ' '.join(stemmed_words)
    
    return preprocessed_text

In [13]:
preprocessed_tags = []
for i in sampled_data['newtag']:
    preprocessed_text = preprocess_text(i)
    preprocessed_tags.append(preprocessed_text)

### CB Pt 2) TF-IDF Vectorization

In [14]:
X = np.array(preprocessed_tags)

In [15]:
vectorizer = TfidfVectorizer()

tfidf_vectors = vectorizer.fit_transform(X)

### CB Pt 3) Recommender Algorithm 

In [16]:
#Calculate cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_vectors, tfidf_vectors)

In [17]:
#Construct Ratings matrix
ratings_matrix = merged.pivot_table(index = 'userId', columns = 'movieId_x', values = 'rating', fill_value = 0)

#Limit Ratings matrix to first 2500 movies
ratings_matrix = ratings_matrix.iloc[:, :2500]
ratings_matrix = np.array(ratings_matrix)

In [18]:
#Split ratings matrix into train/test sets
train_matrix, test_matrix = train_test_split(ratings_matrix, test_size = 0.2)

In [19]:
#Calculate the predicted ratings based on cosine similarity matrix
predicted_ratings = np.dot(cosine_sim, train_matrix.T) / np.sum(cosine_sim, axis = 1, keepdims = True)

In [20]:
#Flatten predicted ratings and actual ratings
predicted_ratings_flat = predicted_ratings.flatten()
test_ratings_flat = test_matrix.flatten()

In [21]:
#Filter out zero ratings
nonzero_indices = np.nonzero(test_ratings_flat)
predicted_ratings_nonzero = predicted_ratings_flat[nonzero_indices]
test_ratings_nonzero = test_ratings_flat[nonzero_indices]

In [22]:
mae = mean_absolute_error(test_ratings_nonzero, predicted_ratings_nonzero)

In [23]:
mae

1.988123742312418

# Proposed Hybrid System 

In [24]:
import itertools
from scipy import sparse

### Hybrid Pt 1) Calculate Similarity Matrix

Item-Rating similarity

In [25]:
#Calculate Item-Rating similarity using cosine similarity

#We want to subtract the user means from all non-zero values
matrix = ratings_matrix.copy()
RI_sim = cosine_similarity(matrix.T)

Item-Demographic similarity

In [26]:
#Calculate Item-Demographic similarity using vector similarity

demo_matrix = merged.groupby('movieId_x')['genres'].first().reset_index()
demo_matrix = demo_matrix.head(2500)

preprocessed_tags = []
for i in demo_matrix['genres']:
    preprocessed_text = preprocess_text(i)
    preprocessed_tags.append(preprocessed_text)
    
X = np.array(preprocessed_tags)
vectorizer = TfidfVectorizer()
tfidf_vectors = vectorizer.fit_transform(X)

DI_sim = cosine_similarity(tfidf_vectors, tfidf_vectors)

Item-Feature similarity

In [27]:
#Calculate Item-Feature similarity using vector similarity

feature_matrix = sampled_df.movieId.unique()
feature_matrix = pd.DataFrame(feature_matrix)
feature_matrix = feature_matrix.merge(tags, how = 'left', left_on = feature_matrix[0], right_on = 'movieId')
feature_matrix.dropna(subset = ['movieId'], inplace = True)
feature_matrix['tag'].fillna(" ", inplace = True)
feature_matrix = feature_matrix.groupby('movieId')['tag'].agg(' '.join).reset_index()
feature_matrix = feature_matrix.head(2500)

preprocessed_tags = []
for i in feature_matrix['tag']:
    preprocessed_text = preprocess_text(i)
    preprocessed_tags.append(preprocessed_text)
    
X = np.array(preprocessed_tags)
vectorizer = TfidfVectorizer()
tfidf_vectors = vectorizer.fit_transform(X)

FI_sim = cosine_similarity(tfidf_vectors, tfidf_vectors)

Candidate Itemset similarity matrices

In [28]:
#Function to find candidate neighbors based on similarity to target item
def get_candidates(feature_similarities, target, k):
    #Get similarity values for target item
    target_item_similarities = feature_similarities[target]
    
    #Sort values in descending order & exclude target
    sorted_sim = np.argsort(target_item_similarities)[::-1]
    sorted_sim = sorted_sim[sorted_sim != target]
    
    top_k = sorted_sim[:k]
    
    return top_k

In [29]:
#Function to find RD_sim, DD_sim, FD_sim
#Turns item similarity matrices into sparse matrices that only contain the candidate movies
def get_candidate_sims(sim_matr, candidates):
    rows = []
    cols = []
    data = []

    for i in candidates:
        for j in candidates:
            rows.append(i)
            cols.append(j)
            data.append(sim_matr[i, j])

    output = sparse.coo_matrix((data, (rows, cols)), shape = sim_matr.shape)
    return output

In [30]:
top20 = get_candidates(FI_sim, 1, 20)

In [31]:
#Calculate RD_sim, FD_sim, DD_sim
RD_sim = get_candidate_sims(RI_sim, top20)
RD_sim = RD_sim.toarray()

FD_sim = get_candidate_sims(FI_sim, top20)
FD_sim = FD_sim.toarray()

DD_sim = get_candidate_sims(DI_sim, top20)
DD_sim = DD_sim.toarray()

### Hybrid Pt 2)  Calculating BoostedSim

In [32]:
#Split ratings matrix into train/test sets
train_matrix, test_matrix = train_test_split(ratings_matrix, test_size = 0.2)

In [33]:
#Prints a list of the similarity matrices associated with each matrix in linear_combination
def print_index_better():
    matrices = ['FI_sim', 'DI_sim', 'RI_sim', 'FD_sim', 'DD_sim', 'RD_sim']

    combinations = []
    for r in range(1, len(matrices) + 1):
        combinations.extend(list(itertools.combinations(matrices, r)))

    for i, combination in enumerate(combinations, start=1):
        print("Index:", i)
        print("Matrices:", combination)
        print()

In [34]:
#Calculate MAE for each matrix in linear_combinations
def fmax(matrices, training_data):
    mae_list = []

    for i in matrices:
        predicted_ratings = np.dot(i, training_data.T)
        
        absolute_diff = np.abs(predicted_ratings - training_data.T)
        
        mae = np.mean(absolute_diff)
        mae_list.append(mae)
    
    lowest_mae = mae_list.index(min(mae_list)) + 1
   
    return lowest_mae, mae_list

In [35]:
def combine_matrices(matrices):
    combined_matrix = np.zeros_like(matrices[0])
    for matrix in matrices:
        combined_matrix += matrix
    return combined_matrix

In [36]:
#Find optimal weights for Boosted_sim
def find_optimal_weights(α, β, FD_sim, DD_sim):
    matrices = []
    
    for i in range(len(α)):
        matrices.append(α[i]*FD_sim + β[i]*DD_sim)
    
    fmax_index, mae_list = fmax(matrices, train_matrix)
    
    #Find the index of the smallest MAE in mae_list
    least_index = mae_list.index(min(mae_list))
    least_α = α[least_index]
    least_β = β[least_index]
    
    return least_α, least_β

In [37]:
#Perform linear combination on matrices combinations
similarity_matrices = [FI_sim, DI_sim, RI_sim, FD_sim, DD_sim, RD_sim]

all_combinations = []
linear_combinations = []

for i in range(1, len(similarity_matrices) + 1):
    combinations = itertools.combinations(similarity_matrices, i)
    all_combinations.extend(combinations)
    
for combination in all_combinations:
    linear_combination = combine_matrices(combination)
    linear_combinations.append(linear_combination)

In [38]:
fmax_index, mae_list = fmax(linear_combinations, train_matrix)

In [39]:
print_index_better()

Index: 1
Matrices: ('FI_sim',)

Index: 2
Matrices: ('DI_sim',)

Index: 3
Matrices: ('RI_sim',)

Index: 4
Matrices: ('FD_sim',)

Index: 5
Matrices: ('DD_sim',)

Index: 6
Matrices: ('RD_sim',)

Index: 7
Matrices: ('FI_sim', 'DI_sim')

Index: 8
Matrices: ('FI_sim', 'RI_sim')

Index: 9
Matrices: ('FI_sim', 'FD_sim')

Index: 10
Matrices: ('FI_sim', 'DD_sim')

Index: 11
Matrices: ('FI_sim', 'RD_sim')

Index: 12
Matrices: ('DI_sim', 'RI_sim')

Index: 13
Matrices: ('DI_sim', 'FD_sim')

Index: 14
Matrices: ('DI_sim', 'DD_sim')

Index: 15
Matrices: ('DI_sim', 'RD_sim')

Index: 16
Matrices: ('RI_sim', 'FD_sim')

Index: 17
Matrices: ('RI_sim', 'DD_sim')

Index: 18
Matrices: ('RI_sim', 'RD_sim')

Index: 19
Matrices: ('FD_sim', 'DD_sim')

Index: 20
Matrices: ('FD_sim', 'RD_sim')

Index: 21
Matrices: ('DD_sim', 'RD_sim')

Index: 22
Matrices: ('FI_sim', 'DI_sim', 'RI_sim')

Index: 23
Matrices: ('FI_sim', 'DI_sim', 'FD_sim')

Index: 24
Matrices: ('FI_sim', 'DI_sim', 'DD_sim')

Index: 25
Matrices: ('FI_

In [40]:
mae_list

[47.81938324842951,
 662.5179770280289,
 2564.7794489109,
 1.669476575992371,
 1.6894403249922278,
 1.8267525103617972,
 711.8570852764583,
 2614.118557159329,
 47.89003482442188,
 47.909998573421746,
 48.04731075879131,
 3228.927000938929,
 662.5886286040211,
 662.608592353021,
 662.7459045383906,
 2564.8501004868926,
 2564.870064235892,
 2565.0073764212616,
 1.7600919009845988,
 1.8974040863541684,
 1.9173678353540247,
 3278.266109187359,
 711.9277368524507,
 711.9477006014505,
 712.0850127868201,
 2614.1892087353217,
 2614.2091724843217,
 2614.346484669691,
 47.98065014941411,
 48.117962334783684,
 48.13792608378354,
 3228.9976525149214,
 3229.0176162639214,
 3229.1549284492908,
 662.6792439290133,
 662.8165561143828,
 662.8365198633827,
 2564.9407158118843,
 2565.078027997254,
 2565.0979917462537,
 1.9880194113463956,
 3278.336760763351,
 3278.3567245123504,
 3278.49403669772,
 712.0183521774429,
 712.1556643628124,
 712.1756281118123,
 2614.2798240603142,
 2614.4171362456837,
 261

We see that the combination of FD_sim and DD_sim has a low MAE

In [41]:
#Find the combination of weights that produces lowest MAE
α = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
β = [.9, .8, .7, .6, .5, .4, .3, .2, .1]

In [42]:
optimal_α, optimal_β = find_optimal_weights(α, β, FD_sim, DD_sim)

In [43]:
Boosted_sim = optimal_α*FD_sim + optimal_β*DD_sim

## Hybrid Pt 3) Predict ratings

In [44]:
def get_predictions(target_item, user_index ,boostedsim):
    top10 = get_candidates(FI_sim, target_item, 10)
    
    #Calculate RD_sim, FD_sim, DD_sim based on target_item
    RD_sim = get_candidate_sims(RI_sim, top10)
    RD_sim = RD_sim.toarray()

    FD_sim = get_candidate_sims(FI_sim, top10)
    FD_sim = FD_sim.toarray()

    DD_sim = get_candidate_sims(DI_sim, top10)
    DD_sim = DD_sim.toarray()
    
    
    #Generate linear combinations of every possible combination of similarity matrices
    similarity_matrices = [FI_sim, DI_sim, RI_sim, FD_sim, DD_sim, RD_sim]

    all_combinations = []
    linear_combinations = []
  
    for i in range(1, len(similarity_matrices) + 1):
        combinations = itertools.combinations(similarity_matrices, i)
        all_combinations.extend(combinations)

    for combination in all_combinations:
        linear_combination = combine_matrices(combination)
        linear_combinations.append(linear_combination)
        
        
    #We want to predict the ratings for user in index 0
    numerator = []
    denominator = []
    for i in top10:
        numerator.append(boostedsim * ratings_matrix[user_index, i])
        denominator.append(abs(Boosted_sim))
    numerator = np.sum(numerator)
    denominator = np.sum(denominator)
    rating = numerator / denominator
    
    return rating

In [45]:
#Get all the ratings for user 0
user_predict = []
for i in range(100):
    user_predict.append(get_predictions(target_item = i, user_index = 0, boostedsim = Boosted_sim))
    

In [46]:
validation_set = ratings_matrix[0, :100]

In [47]:
mae = mean_absolute_error(user_predict, validation_set)

In [48]:
mae

1.3504999999999998