**USER BASED COLLABORATIVE FILTERING**
Open in Google Colab: https://colab.research.google.com/drive/1NImjLB1uaIKhi94JgiKsWnovLvDVYCvJ?usp=sharing

In [1]:

import os

# Download and unpack the `recsys` archive only when neither the archive
# (recsys.zip) nor the extracted folder (recsys) is already present locally.
# The `!` lines are IPython shell escapes, so this cell only runs inside a notebook.
if not (os.path.exists("recsys.zip") or os.path.exists("recsys")):
    !wget https://github.com/nzhinusoftcm/review-on-collaborative-filtering/raw/master/recsys.zip    
    !unzip recsys.zip


In [2]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

from recsys.datasets import ml100k
from recsys.preprocessing import ids_encoder

import pandas as pd
import numpy as np
import zipfile



### Load MovieLens ratings

In [3]:
# Load the ratings and movies tables through the project's ml100k dataset loader.
ratings, movies = ml100k.load()

### userids and itemids encoding

In [4]:
# Encode the ids: ids_encoder returns the re-encoded ratings together with the
# user encoder and the item encoder (used later to map ids back and forth).
# presumably the encoded ids are contiguous integers starting at 0 — confirm in recsys.preprocessing
ratings, uencoder, iencoder = ids_encoder(ratings)

### Transform rating dataframe to matrix

In [5]:
def ratingsmatrix(ratings):
    """
    Build the sparse user-item rating matrix from a ratings dataframe.

    :param ratings : dataframe with `userid`, `itemid` and `rating` columns
    :return : scipy CSR matrix with one row per distinct userid and one column
        per distinct itemid; each cell holds the summed rating of that
        (user, item) pair and pairs without any rating are 0
    """
    # aggfunc is given as the string 'sum': passing the builtin `sum` is
    # deprecated in recent pandas versions and triggers a FutureWarning.
    table = pd.crosstab(ratings.userid, ratings.itemid, values=ratings.rating, aggfunc='sum')
    return csr_matrix(table.fillna(0).values)

# Build the user-item matrix from the encoded ratings and print it
# (the output lists the stored (row, column) entries with their values).
R = ratingsmatrix(ratings)
print(R)

  (0, 0)	5.0
  (0, 1)	3.0
  (0, 2)	4.0
  (0, 3)	3.0
  (0, 4)	3.0
  (0, 5)	5.0
  (0, 6)	4.0
  (0, 7)	1.0
  (0, 8)	5.0
  (0, 9)	3.0
  (0, 10)	2.0
  (0, 11)	5.0
  (0, 12)	5.0
  (0, 13)	5.0
  (0, 14)	5.0
  (0, 15)	5.0
  (0, 16)	3.0
  (0, 17)	4.0
  (0, 18)	5.0
  (0, 19)	4.0
  (0, 20)	1.0
  (0, 21)	4.0
  (0, 22)	4.0
  (0, 23)	3.0
  (0, 24)	4.0
  :	:
  (942, 738)	4.0
  (942, 755)	2.0
  (942, 762)	4.0
  (942, 764)	3.0
  (942, 784)	2.0
  (942, 793)	3.0
  (942, 795)	3.0
  (942, 807)	4.0
  (942, 815)	4.0
  (942, 823)	4.0
  (942, 824)	3.0
  (942, 830)	2.0
  (942, 839)	4.0
  (942, 927)	5.0
  (942, 940)	1.0
  (942, 942)	5.0
  (942, 1010)	2.0
  (942, 1027)	2.0
  (942, 1043)	3.0
  (942, 1046)	2.0
  (942, 1066)	2.0
  (942, 1073)	4.0
  (942, 1187)	3.0
  (942, 1227)	3.0
  (942, 1329)	3.0


In [6]:
def model_CF(rating_matrix, metric, k=20):
    """
    Create and fit the nearest neighbors model used to find similar users.

    :param rating_matrix : user-item rating matrix, one row per user
    :param metric : name of the metric forwarded to NearestNeighbors (e.g. 'cosine')
    :param k : number of neighbors wanted for each user (default 20)
    :return model : the fitted NearestNeighbors model
    """
    # k + 1 neighbors are requested because NN() discards the first column of
    # the kneighbors result (presumably the queried user itself), leaving k.
    model = NearestNeighbors(metric=metric, n_neighbors=k + 1, algorithm='brute')
    model.fit(rating_matrix)
    return model

In [7]:
def NN(rating_matrix, model):
    """
    Query the fitted model for the nearest neighbors of every row of the rating matrix.

    :param rating_matrix : rating matrix of shape (nb_users, nb_items)
    :param model : fitted nearest neighbors model exposing `kneighbors`
    :return
        - similarities : first array returned by `model.kneighbors`, without its first column
        - neighbors : second array returned by `model.kneighbors`, without its first column

    NOTE(review): for a scikit-learn NearestNeighbors model the first array holds
    distances (smaller means closer), not similarities — confirm that using these
    values as weights downstream is intended.
    """
    result = model.kneighbors(rating_matrix)
    distances, indices = result
    # column 0 is dropped — presumably each row's own entry (the user itself); verify
    return distances[:, 1:], indices[:, 1:]

# Fit the neighbors model on the rating matrix with the cosine metric, then
# look up the neighbors of every user. The two resulting module-level arrays
# are read by candidateItems() (neighbors) and predict() (both) below.
model = model_CF(rating_matrix=R, metric='cosine')
similarities, neighbors = NN(R, model)


In [8]:
def candidateItems(userid):
    """
    Select candidate items for an active user: the items most often rated by
    the user's neighbors that the user has not rated yet.

    Reads the module-level `neighbors` array and `ratings` dataframe.

    :param userid : active user (used as a row index into `neighbors`)
    :return candidates : at most 30 item ids, the most frequently rated first
    """
    # ratings given by the neighbors of the active user
    neighbor_ratings = ratings.loc[ratings.userid.isin(neighbors[userid])]

    # number of neighbor ratings per item, most rated items first
    per_item = neighbor_ratings.groupby('itemid')['rating'].count()
    ranked = per_item.reset_index(name='count').sort_values(['count'], ascending=False)

    # items the active user has already rated
    seen = ratings.loc[ratings.userid == userid].itemid.to_list()

    # assume_unique=True keeps the input order (setdiff1d only sorts its result
    # when assume_unique is False), so the frequency ranking is preserved
    unseen = np.setdiff1d(ranked.itemid, seen, assume_unique=True)
    return unseen[:30]

In [9]:
# user mean ratings: one row per userid holding that user's average rating
mean = ratings.groupby(by='userid', as_index=False)['rating'].mean()
# attach the user mean to every rating row; the '_mean' suffix names that column 'rating_mean'
rating_mean = pd.merge(ratings, mean, suffixes=('','_mean'), on='userid')

# each rating normalized (mean-centered) by subtracting the mean rating of its user
rating_mean['norm_rating'] = rating_mean['rating'] - rating_mean['rating_mean']

# `mean` is rebound here: from the (userid, rating) dataframe to a plain array of the mean ratings (column 1)
# NOTE(review): predict() indexes this array by userid — assumes userids are 0..n-1 in sorted order; confirm
mean = mean.to_numpy()[:, 1]

# array version of the frame; predict() reads column 0 as userid, column 1 as itemid and column 4 as norm_rating
# NOTE(review): that layout assumes `ratings` has exactly the columns userid, itemid, rating in this order — confirm
np_ratings = rating_mean.to_numpy()

In [10]:
def predict(userid, itemid):
    """
    Predict the rating that `userid` would give to `itemid`.

    The prediction is the user's mean rating plus the weighted average of the
    mean-centered ratings given to the item by the user's neighbors, the
    weights being the user's row of the `similarities` array.

    Reads the module-level arrays `similarities`, `neighbors`, `mean` and `np_ratings`.

    NOTE(review): `similarities` is the first array returned by kneighbors (see NN);
    if those values are distances, nearer neighbors receive smaller weights — confirm intended.

    :param
        - userid : user id for which we want to make the prediction
        - itemid : item id on which we want to make the prediction

    :return
        - r_hat : predicted rating of user userid on item itemid; the user's mean
          rating when no neighbor rated the item or the weighted sum is zero
    """
    weights_row = similarities[userid]
    neighbor_ids = neighbors[userid]
    # mean rating of the active user, used as the baseline of the prediction
    baseline = mean[userid]

    # rows of every user who rated the item, then only those coming from neighbors
    item_rows = np_ratings[np_ratings[:, 1].astype('int') == itemid]
    neighbor_rows = item_rows[np.isin(item_rows[:, 0], neighbor_ids)]

    # mean-centered ratings (column 4) of the neighbors who rated the item
    deviations = neighbor_rows[:, 4]
    # position of each of those neighbors in the neighbor list, to pick the matching weight
    positions = [np.where(neighbor_ids == uid)[0][0] for uid in neighbor_rows[:, 0].astype('int')]
    weights = weights_row[positions]

    num = np.dot(deviations, weights)
    den = np.sum(np.abs(weights))

    # nothing to add to the baseline (also protects the division below)
    if num == 0 or den == 0:
        return baseline

    return baseline + num / den

def predictionUser2user(userid, pred_path):
    """
    Predict the active user's rating on each of its candidate items and append
    the results to the file `pred_path`, one 'userid,itemid,prediction' line per item.

    :param
        - userid : id of the active user
        - pred_path : where to save predictions (lines are appended)
    """
    # active user candidate items
    candidates = candidateItems(userid)

    lines = [
        '{},{},{}\n'.format(userid, itemid, predict(userid, itemid))
        for itemid in candidates
    ]

    # nothing to save: leave the file untouched, as before
    if not lines:
        return

    # the file is opened once per user instead of once per candidate item
    with open(pred_path, 'a') as file:
        file.writelines(lines)

import sys

def userCF():
    """
    Make predictions for each user in the database.

    Any existing 'predictions.csv' in the working directory is removed first,
    then predictionUser2user is called for every distinct userid of the
    module-level `ratings` dataframe while a progress status is written to stdout.
    """
    users = ratings.userid.unique()
    total = len(users)

    def _progress(count):
        sys.stdout.write('\rRating predictions. Progress status : %.1f%%' % (float(count/total)*100.0))
        sys.stdout.flush()

    predictions_file = 'predictions.csv'
    if os.path.exists(predictions_file):
        os.remove(predictions_file)

    # counting from 1 reports the number of users already processed, so the
    # status reaches 100% after the last user instead of stopping just below
    for count, userid in enumerate(users, start=1):
        # predicting ratings for the current user
        predictionUser2user(userid, predictions_file)
        _progress(count)

# Run the prediction loop over every user; this (re)creates predictions.csv.
userCF()

def Recommendation_userCF(userid):
    """
    Return the saved predictions of one user, best predicted rating first,
    joined with the movies table.

    Reads 'predictions.csv' from the working directory and the module-level
    `uencoder`, `iencoder` and `movies`.

    :param userid : raw (non-encoded) id of the user
    :return : dataframe with the columns userid, itemid, predicted_rating
        followed by the other columns of `movies`; ids are the raw ones
    """
    # the predictions file stores encoded ids, so encode the requested user first
    encoded_uid = uencoder.transform([userid])[0]

    all_predictions = pd.read_csv(
        'predictions.csv', sep=',', names=['userid', 'itemid', 'predicted_rating']
    )
    is_user = all_predictions.userid == encoded_uid
    ranked = all_predictions[is_user].sort_values(by=['predicted_rating'], ascending=False)

    # back to raw ids so that they can be matched against the movies table
    ranked = ranked.assign(
        userid=uencoder.inverse_transform(ranked.userid.tolist()),
        itemid=iencoder.inverse_transform(ranked.itemid.tolist()),
    )

    return pd.merge(ranked, movies, on='itemid', how='inner')

Rating predictions. Progress status : 99.9%

In [11]:
# Display the recommendation list of the user whose raw id is 212.
Recommendation_userCF(212)

Unnamed: 0,userid,itemid,predicted_rating,title
0,212,483,4.871495,Casablanca (1942)
1,212,357,4.764547,One Flew Over the Cuckoo's Nest (1975)
2,212,50,4.660002,Star Wars (1977)
3,212,98,4.613636,"Silence of the Lambs, The (1991)"
4,212,64,4.550733,"Shawshank Redemption, The (1994)"
5,212,194,4.522336,"Sting, The (1973)"
6,212,174,4.5213,Raiders of the Lost Ark (1981)
7,212,134,4.414819,Citizen Kane (1941)
8,212,187,4.344531,"Godfather: Part II, The (1974)"
9,212,196,4.303696,Dead Poets Society (1989)


#### Evaluation with Mean Absolute Error (MAE)

In [12]:

from recsys.preprocessing import train_test_split, get_examples


# Split the ratings into examples and labels, the labels being the 'rating' column.
# assumes each example is a (userid, itemid) pair, as unpacked in evaluation() — TODO confirm
raw_examples, raw_labels = get_examples(ratings, labels_column='rating')

# training testing split
# NOTE(review): the rating matrix, neighbors and user means above were all built from the
# full `ratings`, which also contains the rows placed in the test split — confirm that
# evaluating on ratings the model has already seen is intended.
(x_training, x_testing), (y_train, y_test) = train_test_split(examples=raw_examples, labels=raw_labels)

def evaluation(x_testing, y_test):
    """
    Compute the Mean Absolute Error (MAE) of predict() on a test set.

    :param x_testing : array whose rows are (userid, itemid) pairs
    :param y_test : true ratings, one per row of x_testing
    :return mae : sum of |true rating - predicted rating| divided by the number of rows
    """
    n_examples = x_testing.shape[0]
    print('evaluation the model on {} test data ...'.format(n_examples))

    # one prediction per (userid, itemid) pair
    preds = np.array([predict(u, i) for (u, i) in x_testing])
    absolute_errors = np.absolute(y_test - preds)
    mae = np.sum(absolute_errors) / n_examples

    print('\nMAE :', mae)
    return mae

# Measure the MAE of predict() on the test split.
evaluation(x_testing, y_test)

evaluation the model on 10000 test data ...

MAE : 0.7505910931068639


0.7505910931068639

In [13]:

from recsys.memories.UserToUser import UserToUser

# loading ml100k ratings
ratings, movies = ml100k.load()

# preparing dataset
ratings, uencoder, iencoder = ids_encoder(ratings)


raw_examples, raw_labels = get_examples(ratings, labels_column='rating')

# training testing split
(x_training, x_testing), (y_train, y_test) = train_test_split(examples=raw_examples, labels=raw_labels)

# user-based CF
usertouser = UserToUser(ratings, movies, metric='cosine')

# evaluating ser-based CF on the ml100k testing data
usertouser.evaluate(x_testing, y_test)

Normalize users ratings ...
Initialize the similarity model ...
Compute nearest neighbors ...
User to user recommendation model created with success ...
Evaluate the model on 10000 test data ...

MAE : 0.7505910931068639


0.7505910931068639

In [14]:
from recsys.datasets import ml1m
from recsys.preprocessing import ids_encoder, get_examples, train_test_split
from recsys.memories.UserToUser import UserToUser

# loading ml1m ratings (this rebinds ratings, movies and everything derived from them)
ratings, movies = ml1m.load()

# preparing dataset
ratings, uencoder, iencoder = ids_encoder(ratings)


raw_examples, raw_labels = get_examples(ratings, labels_column='rating')

# training testing split
(x_training, x_testing), (y_train, y_test) = train_test_split(examples=raw_examples, labels=raw_labels)

# create the user-based CF
usertouser = UserToUser(ratings, movies, k=20, metric='cosine')

# evaluating the user-based CF on the ml1m test data
print("==========================")
usertouser.evaluate(x_testing, y_test)

Normalize users ratings ...
Initialize the similarity model ...
Compute nearest neighbors ...
User to user recommendation model created with success ...
Evaluate the model on 100021 test data ...

MAE : 0.732267005840993


0.732267005840993