In [1]:
import pandas as pd
import numpy as np
import json
import itertools as it
from scipy.spatial.distance import cosine
from surprise import Reader, Dataset, SVD
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split

In [3]:
# Load the ratings table and drop the rating_date column in the same expression.
# NOTE(review): the CSV path is an absolute path on one developer's machine.
all_ratings = pd.read_csv('/Users/irenebonafonte/Downloads/data_for_model/ratings.csv').drop('rating_date', axis=1)
all_ratings.shape, all_ratings.userId.nunique(), all_ratings.movieId.nunique()

In [None]:
# Restrict the ratings to the movie ids listed in content_based_index.txt
content_ids = pd.read_csv('content_based_index.txt')['movieId'].unique()
all_ratings = all_ratings.loc[all_ratings['movieId'].isin(content_ids)]

In [4]:
class CollaborativeFiltering:
    """Item-based collaborative filtering on a ratings table.

    `fit` builds item-item similarities (cosine, Pearson, or the correlation of
    SVD item factors) and `predict` returns a similarity-weighted average of the
    ratings the user has already given.

    The input frame must provide the columns `userId`, `movieId` and `rating`.
    The numpy-based code reads them by position (0, 1, 2), so they are expected
    in that order when `scale_rate=False` or when the SVD method is used.
    """

    def __init__(self, DataFrame, sim_method='CosineSim', scale_rate=True, balance_ncommon=True, N='all'):
        """Store the ratings and the configuration of the similarity computation.

        Parameters
        ----------
        DataFrame : pandas.DataFrame
            Ratings table (userId, movieId, rating).
        sim_method : {'CosineSim', 'SVDsim', 'pearsonCor'}
            Similarity measure between items.
        scale_rate : bool
            Subtract each user's mean rating before computing similarities.
        balance_ncommon : bool
            Shrink similarities that are supported by fewer than 50 common users.
        N : int or 'all'
            Number of most similar items used when predicting a rating.

        Raises
        ------
        ValueError
            If `sim_method` is not one of the supported names.
        """
        if sim_method == 'CosineSim':
            self.sim_method = cosine
        elif sim_method == 'SVDsim':
            self.sim_method = 'svd_similarity'
        elif sim_method == 'pearsonCor':
            self.sim_method = pearsonr
        else:
            # Previously an unknown name left `self.sim_method` unset and the
            # failure only surfaced later as an AttributeError inside fit().
            raise ValueError(
                "Unknown sim_method {!r}; expected 'CosineSim', 'SVDsim' or 'pearsonCor'".format(sim_method))

        self.df = DataFrame
        self.sim = {}
        self.scale_rate = scale_rate
        self.balance_ncommon = balance_ncommon

        # Select the number of similar items to be taken into account when
        # predicting ratings. By default, all items are used.
        self.N = N

    def similarity(self, items):
        """Return the similarity between the two item ids in `items`.

        Returns NaN when fewer than `self.min_common` users rated both items or
        when the similarity is not strictly positive.
        """
        # Find users that have rated both items
        users = np.intersect1d(self.df_np[self.df_np[:, 1] == items[0], 0],
                               self.df_np[self.df_np[:, 1] == items[1], 0])

        if len(users) < self.min_common:
            return np.nan

        user_bools = np.isin(self.df_np[:, 0], users, assume_unique=False)

        # Both vectors are taken in table row order.
        # NOTE(review): they only line up user-by-user if the common users appear
        # in the same relative order for both items (e.g. table sorted by userId,
        # one rating per user/movie) - confirm for the input data.
        sim = self.sim_method(self.df_np[(user_bools & (self.df_np[:, 1] == items[0])), 2],
                              self.df_np[(user_bools & (self.df_np[:, 1] == items[1])), 2])

        if self.sim_method == cosine:
            # scipy returns the cosine distance
            sim = 1 - sim
        elif self.sim_method == pearsonr:
            # pearsonr returns (correlation, p-value)
            sim = sim[0]

        # Balance the similarity score by considering the number of users taken into account
        if self.balance_ncommon:
            sim = sim * min(50, len(users)) / 50

        # we do not want negative similarities (would result in negative ratings)
        if sim <= 0:
            sim = np.nan

        return sim

    def svd_similarity(self):
        """Fit an unbiased SVD and store the correlation matrix of its item factors.

        Sets `self.sim_mat` (square matrix, rows/columns ordered by ascending
        movie id) and `self.allItems_sorted` (the matching ids).
        """
        reader = Reader(rating_scale=(0, 5))
        trainset = Dataset.load_from_df(self.df, reader)
        trainset = trainset.build_full_trainset()
        svd = SVD(verbose=True, biased=False, n_epochs=20)
        svd.fit(trainset)
        latent_factors = svd.qi
        self.sim_mat = np.corrcoef(latent_factors)

        # Sort IDs
        # NOTE(review): this assumes the rows of `svd.qi` follow the same order as
        # `self.allItems` (order of first appearance in the table) - confirm
        # against the Surprise trainset inner-id assignment.
        order = np.argsort(self.allItems)
        self.sim_mat = self.sim_mat[order, :]
        self.sim_mat = self.sim_mat[:, order]
        self.allItems_sorted = self.allItems[order]

        return

    def fit(self, min_common=2):
        """Prepare data structures for estimation. Similarity matrix for items.

        Parameters
        ----------
        min_common : int
            Minimum number of common users required for a similarity to be computed.

        For the SVD method `self.sim_mat` is a square matrix; otherwise it is an
        array of rows (movie_i, movie_j, similarity) holding only the valid pairs.
        The DataFrame passed to the constructor is not modified.
        """
        # Minimum of common users required for the similarity to be computed
        self.min_common = min_common
        self.allItems = self.df['movieId'].unique()

        if self.sim_method == 'svd_similarity':
            self.df_np = self.df.to_numpy()
            self.svd_similarity()
            return

        if self.scale_rate:
            # For the computation of the adjusted cosine similarity
            # we subtract the user mean rating from the film ratings
            # so that it does not influence the calculation of similarities.
            # The scaled table is built as a new frame so the caller's DataFrame
            # is left untouched (it used to gain a 'scaledRating' column).
            user_mean = self.df.groupby('userId')['rating'].transform('mean')
            scaled = self.df[['userId', 'movieId']].assign(
                scaledRating=self.df['rating'] - user_mean,
                rating=self.df['rating'],
            )
            # Similarity metric sits in the third column
            self.df_np = scaled.to_numpy()
        else:
            self.df_np = self.df.to_numpy()

        # One row per unordered item pair; reshape keeps a (0, 2) array when
        # there are fewer than two items.
        pairs = np.array(list(it.combinations(self.allItems, 2)), dtype=float).reshape(-1, 2)
        if pairs.shape[0] == 0:
            # apply_along_axis cannot iterate over zero rows
            self.sim_mat = np.empty((0, 3))
        else:
            sims = np.apply_along_axis(self.similarity, axis=1, arr=pairs)
            self.sim_mat = np.column_stack((pairs, sims))[~np.isnan(sims), :]

        # Back to normal column order
        if self.scale_rate:
            self.df = self.df[['userId', 'movieId', 'rating']]
            self.df_np = self.df.to_numpy()

        return

    def svd_predict(self, user_id, movie_id, user_ratings):
        """Predict a rating from the SVD similarity matrix.

        `user_ratings` holds (movie_id, rating) rows sorted by movie id, which
        matches the column order of `self.sim_mat`. The result is capped at 5.
        """
        # Get similarity between movie to predict (row) and rated movies (columns)
        movie_sim = self.sim_mat[(self.allItems_sorted == movie_id), np.isin(self.allItems_sorted, user_ratings[:, 0])].copy()
        rating = np.sum((user_ratings[:, 1] * movie_sim)) / np.sum(movie_sim)

        return min(rating, 5)

    def predict(self, user_id, movie_id):
        """Predict the rating `user_id` would give to `movie_id`.

        Returns the similarity-weighted mean of the user's ratings over the items
        similar to `movie_id` (capped at 5), or the user's mean rating when no
        rated item has a stored similarity with `movie_id`.
        """
        # user_u ratings
        user_ratings = self.df_np[self.df_np[:, 0] == user_id, 1:3]
        user_ratings = user_ratings[np.argsort(user_ratings[:, 0]), :]  # sort by movie_id

        if self.sim_method == 'svd_similarity':
            return self.svd_predict(user_id, movie_id, user_ratings)

        # movie_i similarities (boolean indexing returns a copy, safe to edit)
        movie_sim = self.sim_mat[(self.sim_mat[:, 0] == movie_id) | (self.sim_mat[:, 1] == movie_id), :]
        # put movie_j in the second column whichever side movie_i was stored on
        swapped = movie_sim[:, 1] == movie_id
        movie_sim[swapped, 1] = movie_sim[swapped, 0]

        # Select top N most similar items to do the prediction
        if self.N != 'all':
            N = min(movie_sim.shape[0], self.N)
            movie_sim = movie_sim[np.argsort(-movie_sim[:, 2]), ]  # sort by similarity
            movie_sim = movie_sim[0:N, :]  # Keep top N

        movie_sim = movie_sim[np.argsort(movie_sim[:, 1]), 1:3]  # sort by movie_id

        # subset to movies with user_u rating and similarity to movie_i
        to_weight = np.intersect1d(user_ratings[:, 0], movie_sim[:, 0])
        if len(to_weight) == 0:
            return np.mean(user_ratings[:, 1])  # Assign user mean

        user_ratings = user_ratings[np.isin(user_ratings[:, 0], to_weight), :]
        movie_sim = movie_sim[np.isin(movie_sim[:, 0], to_weight), :]
        rating = np.sum((user_ratings[:, 1] * movie_sim[:, 1])) / np.sum(movie_sim[:, 1])

        return min(rating, 5)  # Limit to 5

    def rmse_evaluate(self, data_test):
        """RMSE-based predictive performance evaluation.

        Parameters
        ----------
        data_test : pandas.DataFrame
            Held-out ratings with columns userId, movieId and rating.

        Returns
        -------
        float
            Root mean squared error over the test rows whose user is present in
            the training data; NaN when no such row exists.
        """
        # Exclude from testing users for which we do not have any ratings
        data_test = data_test.loc[data_test.userId.isin(self.df.userId.unique()), :]
        print('Evaluating performance with n='+str(data_test.userId.nunique())+' users.')

        if len(data_test) == 0:
            # Nothing to score (apply_along_axis would raise on zero rows)
            return np.nan

        to_estimate = data_test[['userId', 'movieId']].to_numpy()
        # 1-D target: an (n, 1) column would broadcast against the (n,) predictions
        # into an (n, n) matrix and average the error over all row pairs.
        y = data_test['rating'].to_numpy()

        # Predict rating for each user-movie pair
        y_hat = np.apply_along_axis(lambda x: self.predict(x[0], x[1]), axis=1, arr=to_estimate)

        rmse = np.sqrt(np.mean(np.power(y_hat - y, 2)))
        return rmse
    

# Fit model

In [5]:
# Build the recommender with the SVD-based similarity and fit it on the filtered ratings
recomend = CollaborativeFiltering(DataFrame=all_ratings, sim_method='SVDsim')
recomend.fit()

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


In [6]:
# Grab the fitted similarity matrix (same array object as recomend.sim_mat, not a copy)
# and the movie ids as a plain list, in the same order as the matrix rows/columns.
similarity, model_movie_index = recomend.sim_mat, recomend.allItems_sorted.tolist()

# Upload to database

In [7]:
# Load the deployment settings from config_prod.json and echo them
with open('config_prod.json', 'r') as config_file:
    config = json.load(config_file)
print(config)


def encode_matrix(matrix, id_list, num_digits=2):
    '''Encode similarity matrix as list of strings
       Every string will correspond to a row of the matrix, with each value coded as a fixed number of digits
       Input:
           - matrix: 2-dimensional square array of float values between 0 and 1
           - id_list: list of id values, one per matrix row
           - num_digits: number of digits for each value (default 2)
       Returns:
           - matrix_out: list of dicts {'movieId': id, 'similarities': row string}
       Raises:
           - ValueError: if the matrix is not square, its size differs from len(id_list),
             or any value is NaN or outside [0, 1]
    '''
    if not (matrix.shape[0] == matrix.shape[1] and matrix.shape[0] == len(id_list)):
        raise ValueError('[encode_matrix] The matrix dimensions must match with the size of the id_list')

    scale = 10**num_digits
    cell_format = '0' + str(num_digits) + 'd'

    matrix_out = []
    for i in range(len(matrix)):
        cells = []
        for j in range(len(matrix)):
            x = matrix[i, j]
            # Written as a positive range test so that NaN (for which every
            # comparison is False) is rejected here instead of failing in int().
            if not (0 <= x <= 1):
                raise ValueError('[encode_matrix] The matrix values must be between 0 and 1 - error in value [{},{}] '.format(i,j))
            # Truncate to an n-digit integer; a value of exactly 1 would need
            # n+1 digits, so it is stored as the largest n-digit code (0.99...).
            x_int = min(int(x * scale), scale - 1)
            cells.append(format(x_int, cell_format))
        # Join once per row instead of growing a string cell by cell
        matrix_out.append({'movieId': id_list[i], 'similarities': ''.join(cells)})
    return matrix_out


{'dir_data_raw': '../data/raw', 'dir_data_input': '../data/input', 'db_url': 'cluster0.egjki.mongodb.net', 'db_name': 'gmam', 'db_user': 'getmeamovie_rw'}


In [8]:
# Zero out negative similarities in place so every value fits the [0, 1] range
# required by encode_matrix. `similarity` was assigned from recomend.sim_mat
# without copying, so the model's matrix is changed as well.
np.putmask(similarity, similarity < 0, 0)

In [9]:
# Turn the similarity matrix into one fixed-width digit string per movie
f_sim_matrix = encode_matrix(matrix=similarity, id_list=model_movie_index, num_digits=2)

In [12]:
# Connect to MongoDB and replace the contents of the 'similarity_CF' collection
# with the encoded similarity matrix.

db_url = config['db_url']
db_name = config['db_name']
db_user = config['db_user']

import pymongo
import ssl

# Close a connection left open by a previous run of this cell
if 'conn' in globals():
    conn.close()
    print("Closing connection")

# Read from db_credentials.txt password required to connect to MongoDB.
with open("db_credentials.txt", 'r') as f:
    [db_password] = f.read().splitlines()

# Connect
# NOTE(review): ssl.CERT_NONE turns off TLS certificate verification - confirm
# this is really needed for the production cluster.
conn = pymongo.MongoClient("mongodb+srv://{}:{}@{}".format(db_user, db_password, db_url), ssl_cert_reqs=ssl.CERT_NONE)

# MongoClient connects lazily, so constructing it proves nothing; a ping forces
# a round trip to the server. On failure stop here instead of carrying on with
# an unusable connection.
try:
    conn.admin.command('ping')
except pymongo.errors.ConnectionFailure as e:
    print("Could not connect to MongoDB: %s" % e)
    conn.close()
    raise
print("Connected successfully to MongoDB")

try:
    # Open database and collection
    db = conn[db_name]
    col_similarity = db['similarity_CF']

    # NOTE(review): delete + insert is not atomic; if the insert fails the
    # collection is left empty.
    col_similarity.delete_many({})  # Delete previous data in the collection

    col_similarity.insert_many(f_sim_matrix)  # Insert formatted similarity matrix
finally:
    # Close connection to MongoDB even when the upload fails
    conn.close()


Closing connection
Connected successfully to MongoDB


In [13]:
# Re-open a connection to MongoDB for the inspection cells below.

# Close previous connection
if 'conn' in globals():
    conn.close()
    print("Closing connection")

# Read from db_credentials.txt password required to connect to MongoDB.
with open("db_credentials.txt", 'r') as f:
    [db_password] = f.read().splitlines()

# Connect
# NOTE(review): ssl.CERT_NONE turns off TLS certificate verification - confirm
# this is really needed for the production cluster.
conn = pymongo.MongoClient("mongodb+srv://{}:{}@{}".format(db_user, db_password, db_url), ssl_cert_reqs=ssl.CERT_NONE)

# MongoClient connects lazily; ping the server so a failure is reported here and
# stops the cell, instead of being printed and followed by use of a dead client.
try:
    conn.admin.command('ping')
except pymongo.errors.ConnectionFailure as e:
    print("Could not connect to MongoDB: %s" % e)
    conn.close()
    raise
print("Connected successfully to MongoDB")

db = conn[db_name]



Closing connection
Connected successfully to MongoDB


In [14]:
# Show the names of the collections currently stored in the database
db.list_collection_names()

['similarity', 'similarity_CF', 'movies', 'similarity_content_based']

In [15]:
# Remove the "similarity" collection from the database (destructive), then
# list the remaining collection names to confirm it is gone.
db.drop_collection("similarity")
db.list_collection_names()

['similarity_CF', 'movies', 'similarity_content_based']