# Collaborative-based Filtering

### Importing and Exploring

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

In [2]:
# Read the pipe-delimited u.user file; column names are supplied explicitly via `names`
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

users = pd.read_csv(
    'movielens/u.user',
    names=u_cols,
    sep='|',
    encoding='latin-1',
)

# Preview the first rows
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [3]:
# Column names for the pipe-delimited u.item file: movie metadata followed by the genre-named columns
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Read the file and keep only the movie ID and title;
# collaborative filtering here does not use the other movie attributes
movies = pd.read_csv(
    'movielens/u.item',
    names=i_cols,
    sep='|',
    encoding='latin-1',
)[['movie_id', 'title']]

# Preview the first rows
movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
# Read the tab-delimited u.data file; column names are supplied explicitly via `names`
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

# The timestamp column is dropped straight away, leaving user_id, movie_id and rating
ratings = pd.read_csv(
    'movielens/u.data',
    names=r_cols,
    sep='\t',
    encoding='latin-1',
).drop(columns='timestamp')

# Preview the first rows
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


## Baseline Model

The goal is to split the data so that, for each user, about 75% of their ratings are in the training dataset and 25% are in the testing dataset. To do this, use `user_id` as `y` and stratify the split on it. <br> 
Stratifying ensures that each value of the given variable appears in the training and testing datasets in the same proportion as in the full dataset.

In [5]:
# X is a copy of the full ratings dataframe; y is the user_id of every rating row
# and serves only as the stratification label
X = ratings.copy()
y = ratings['user_id']

# 75/25 split, stratified on user_id, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=123,
)

In [6]:
# Root mean squared error (RMSE) helper
def rmse(y_true, y_pred):
    """Return the square root of the mean squared error between y_true and y_pred."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [7]:
# Baseline model: a constant prediction, used as a yardstick for the models that follow
def baseline(user_id, movie_id):
    """Return 3.0 for every (user_id, movie_id) pair; both arguments are ignored."""
    constant_rating = 3.0
    return constant_rating

In [8]:
# RMSE of a model on the testing set
def score(cf_model):
    """Return the RMSE of cf_model on X_test.

    cf_model is called as cf_model(user_id, movie_id) once for every row of the
    module-level X_test dataframe; its return values are compared against the
    'rating' column of X_test using rmse().
    """
    # Ratings the users actually gave in the test data
    actual = np.array(X_test['rating'])

    # One prediction per (user, movie) pair of the test data, in row order
    predicted = np.array([
        cf_model(user, movie)
        for user, movie in zip(X_test['user_id'], X_test['movie_id'])
    ])

    return rmse(actual, predicted)

In [9]:
# RMSE of the constant baseline model on the testing set
score(baseline)

1.2411285187280163

## User-based: Mean Collab Filter

Returns the mean of the ratings of the movie from all users. <br>
In other words, ratings of all users are given equal weights.

In [10]:
# Pivot the training ratings into a matrix with one row per user_id and one column per movie_id
r_matrix = X_train.pivot_table(
    index='user_id',
    columns='movie_id',
    values='rating',
)

# Preview the first rows
r_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1670,1671,1672,1673,1674,1676,1677,1678,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,4.0,3.0,,5.0,4.0,,5.0,3.0,...,,,,,,,,,,
2,,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [11]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, movie_id):
    """Predict the rating as the mean of the movie's column in r_matrix.

    user_id is not used: every user's rating carries the same weight.
    Returns 3.0 when movie_id is not a column of r_matrix.
    """
    # It is possible for a movie to exist in the test data but not in the training data
    if movie_id not in r_matrix:
        return 3.0

    return r_matrix[movie_id].mean()

# RMSE of the mean-rating filter on the testing set
score(cf_user_mean)

1.0249357705533904

## User-based: Weighted mean

Assign a weight to the ratings of each user. The similarity function chosen here is cosine similarity.

In [12]:
# Dummy ratings matrix: same as r_matrix but with every missing rating imputed to 0
r_matrix_dummy = r_matrix.fillna(0)

# Pairwise cosine similarity between the user rows (one row and one column per user)
sim_values = cosine_similarity(r_matrix_dummy, r_matrix_dummy)
print(sim_values.shape)

# Wrap the raw array in a dataframe labelled by user_id on both axes
cosine_sim = pd.DataFrame(sim_values, index=r_matrix.index, columns=r_matrix.index)

cosine_sim.head()

(943, 943)


user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.106685,0.045783,0.048213,0.283924,0.310471,0.327659,0.295866,0.077645,0.285125,...,0.292789,0.087778,0.226892,0.154691,0.134548,0.110568,0.254568,0.134698,0.130574,0.264547
2,0.106685,1.0,0.096208,0.159209,0.0,0.174844,0.089415,0.115771,0.148136,0.098831,...,0.062747,0.222434,0.254607,0.380234,0.148746,0.139122,0.163425,0.077417,0.088402,0.03923
3,0.045783,0.096208,1.0,0.212349,0.028603,0.04269,0.074308,0.023804,0.083428,0.068582,...,0.033866,0.060235,0.168076,0.065622,0.117721,0.0,0.10443,0.120873,0.116315,0.037183
4,0.048213,0.159209,0.212349,1.0,0.013001,0.052199,0.094573,0.156285,0.135434,0.037884,...,0.039095,0.050703,0.07085,0.193329,0.101676,0.040212,0.186246,0.217055,0.164163,0.037558
5,0.283924,0.0,0.028603,0.013001,1.0,0.197938,0.275318,0.169758,0.0,0.132165,...,0.195447,0.107394,0.049301,0.060563,0.136559,0.080164,0.183999,0.116627,0.142776,0.256035


In [13]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, movie_id):
    """Predict a rating as the similarity-weighted mean of the movie's training ratings.

    Each rating of movie_id found in r_matrix is weighted by the entry of
    cosine_sim that relates user_id to the user who gave that rating.

    Returns:
        3.0 when movie_id is not a column of r_matrix;
        the unweighted mean of the movie's ratings when the weights sum to 0
        (the weighted mean would otherwise be 0/0 = NaN);
        the weighted mean in every other case.
    """
    # It is possible for a movie to exist in the test data but not in the training data
    if movie_id not in r_matrix:
        return 3.0

    # Ratings of the movie, restricted to the users who actually rated it.
    # (Named movie_ratings so the module-level `ratings` dataframe is not shadowed.)
    movie_ratings = r_matrix[movie_id].dropna()

    # Similarity of user_id to exactly those users, in the same order as movie_ratings
    sim_scores = cosine_sim[user_id].loc[movie_ratings.index]

    total_sim = sim_scores.sum()
    if total_sim == 0:
        # No similarity information at all: fall back to equal weights instead of dividing by zero
        return movie_ratings.mean()

    # Final weighted mean
    return np.dot(sim_scores, movie_ratings) / total_sim

# RMSE of the similarity-weighted filter on the testing set
score(cf_user_wmean)

1.0189167985727243

#### Remarks:
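1. The cell above took a long time to run for very little improvement in RMSE (1.0189 versus 1.0249 for the plain mean).
2. If ratings can be negative, take the modulus (absolute value) of the cosine similarities when summing the weights.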

## User-based: User demographics filter

Instead of calculating similarity with all users, a demographic filter is applied to the users, and only the users who pass the filter are considered.

### Using gender as a demographic filter with mean

In [14]:
# Attach each user's demographic columns to their training ratings.
# No join key is given, so pandas joins on the column(s) the two frames have in common.
merged_df = X_train.merge(users)

# Preview the first rows
merged_df.head()

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,89,724,4,43,F,administrator,68106
1,89,187,5,43,F,administrator,68106
2,89,221,1,43,F,administrator,68106
3,89,402,4,43,F,administrator,68106
4,89,815,4,43,F,administrator,68106


In [15]:
# Mean training rating for every (movie_id, sex) pair, as a Series indexed by those two keys
gender_mean = (
    merged_df
    .groupby(['movie_id', 'sex'])['rating']
    .mean()
)

In [16]:
# Set the index of the users dataframe to the user_id.
# Guarded so that re-running this cell is a no-op: once user_id is the index it is no
# longer a column, and a second set_index('user_id') would raise a KeyError.
if users.index.name != 'user_id':
    users = users.set_index('user_id')

In [17]:
# Gender Based Collaborative Filter using Mean Ratings
def cf_gender(user_id, movie_id):
    """Predict a rating as the mean rating given to the movie by users of the same sex.

    Looks up the 'sex' of user_id in the users dataframe (indexed by user_id) and
    returns the matching entry of gender_mean for movie_id.

    Returns 3.0 when movie_id is not a column of r_matrix, or when gender_mean has
    no entry for that movie and sex.
    """
    # Default to a rating of 3.0 when the movie is absent from the training data
    if movie_id not in r_matrix:
        return 3.0

    # Identify the gender with a single .loc lookup (row label, column label)
    # rather than materialising the whole row first and then indexing into it
    gender = users.loc[user_id, 'sex']

    # Mean ratings of this movie keyed by sex; looked up once and reused below
    movie_means = gender_mean[movie_id]

    # Default to 3.0 when nobody of that gender has rated the movie
    if gender not in movie_means:
        return 3.0

    return movie_means[gender]

# RMSE of the gender-based mean filter on the testing set
score(cf_gender)

1.0344238303635338

#### Remarks:
1. The gender filter performed worse than the plain mean filter (RMSE 1.0344 versus 1.0249).
2. Experiment with other demographic filters or a combination of features.
3. Experiment with the weighted-mean model, restricted to the users who pass the demographic filter.
4. Experiment with an item-based filter, swapping the roles of users and movies.

## Model-based: k-NN

1. Find the k-nearest neighbors of u who have rated movie m
2. Output the average rating of the k users for the movie m

Note: This section uses scikit-surprise, a popular Python package for building recommender engines.

In [18]:
# pip3 install scikit-surprise
from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import KFold, cross_validate

In [19]:
#Define a Reader object
#The Reader object helps in parsing the file or dataframe containing ratings
reader = Reader(rating_scale=(1, 5))

#Create the dataset to be used for building the filter
# the dataframe must have user column, item column and rating column in that order
data = Dataset.load_from_df(ratings, reader)

#Define the algorithm object; in this case kNN
# default k = 40
knn = KNNBasic()

# Fix the fold assignment so the reported RMSE is reproducible across runs
# (5 folds; same seed value as the train/test split earlier in the notebook)
cv_folds = KFold(n_splits=5, random_state=123)

#Evaluate the performance in terms of RMSE
cross_validate(knn, data, measures=['RMSE'], cv=cv_folds, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9843  0.9654  0.9851  0.9811  0.9780  0.9788  0.0072  
Fit time          0.21    0.26    0.22    0.22    0.23    0.23    0.02    
Test time         2.13    2.19    2.11    2.06    2.11    2.12    0.04    


{'test_rmse': array([0.98426351, 0.96539946, 0.98514685, 0.98110881, 0.97800599]),
 'fit_time': (0.20784831047058105,
  0.2574176788330078,
  0.22255158424377441,
  0.22209930419921875,
  0.22631359100341797),
 'test_time': (2.127811908721924,
  2.1941769123077393,
  2.114354372024536,
  2.057927131652832,
  2.1104962825775146)}

## Model-based: SVD

In [20]:
#Import SVD
from surprise import SVD
from surprise.model_selection import KFold, cross_validate

#Define a Reader object
#The Reader object helps in parsing the file or dataframe containing ratings
reader = Reader(rating_scale=(1, 5))

#Create the dataset to be used for building the filter
# the dataframe must have user column, item column and rating column in that order
data = Dataset.load_from_df(ratings, reader)

#Define the SVD algorithm object
# SVD is randomly initialised, so seed it to make the reported RMSE reproducible
svd = SVD(random_state=123)

# Fix the fold assignment as well
# (5 folds; same seed value as the train/test split earlier in the notebook)
cv_folds = KFold(n_splits=5, random_state=123)

#Evaluate the performance in terms of RMSE
cross_validate(svd, data, measures=['RMSE'], cv=cv_folds, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9382  0.9321  0.9335  0.9367  0.9391  0.9359  0.0027  
Fit time          3.26    3.20    3.28    3.10    3.01    3.17    0.10    
Test time         0.13    0.09    0.13    0.09    0.13    0.11    0.02    


{'test_rmse': array([0.93824682, 0.93208277, 0.93347613, 0.93668606, 0.93913641]),
 'fit_time': (3.25650691986084,
  3.197284698486328,
  3.2789018154144287,
  3.103548288345337,
  3.0123043060302734),
 'test_time': (0.13078546524047852,
  0.09395122528076172,
  0.13298797607421875,
  0.08985328674316406,
  0.12640881538391113)}