**ITEM-BASED COLLABORATIVE FILTERING**
Open in Google Colab: https://colab.research.google.com/drive/1ZBEn06A74zOyfakIppsHCpQ6UEoYzUR3?usp=sharing


In [1]:
import os

# Fetch and unpack the `recsys` helper package only when neither the archive
# nor the extracted folder is already present in the working directory.
if not (os.path.exists("recsys.zip") or os.path.exists("recsys")):
    !wget https://github.com/nzhinusoftcm/review-on-collaborative-filtering/raw/master/recsys.zip     
    !unzip recsys.zip


### Import requirements

In [2]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

from recsys.datasets import ml1m, ml100k
from recsys.preprocessing import ids_encoder

import pandas as pd
import numpy as np
import os
import sys



### Load ratings

In [3]:
# Load the MovieLens 100k ratings and the movie table through the recsys helper.
ratings, movies = ml100k.load()

Download data 100.2%
Successfully downloaded ml-100k.zip 4924029 bytes.
Unzipping the ml-100k.zip zip file ...


### Encoding user ids and item ids

In [4]:
# Create the user and item id encoders. `ratings` is rebound to the encoder's output
# (presumably with ids remapped to contiguous integers starting at 0 — confirm in recsys.preprocessing).
ratings, uencoder, iencoder = ids_encoder(ratings)

In [5]:
def normalize():
    """
    Mean-center every rating by the mean rating of the user who gave it.

    Reads the module-level `ratings` DataFrame (columns `userid` and `rating` are used).

    :return : (per-user mean ratings as a 1-D array, ratings DataFrame extended with
              the `rating_mean` and `norm_rating` columns)
    """
    # One row per user: userid, mean of that user's ratings
    user_means = ratings.groupby(by='userid', as_index=False)['rating'].mean()

    # Attach each user's mean to every one of their ratings (column `rating_mean`),
    # then subtract it to obtain the centered rating
    centered = pd.merge(ratings, user_means, suffixes=('', '_mean'), on='userid').assign(
        norm_rating=lambda frame: frame['rating'] - frame['rating_mean']
    )

    # Column 1 of the means table holds the mean rating itself
    return user_means.to_numpy()[:, 1], centered

In [6]:
# Per-user mean ratings, plus the ratings frame extended with `rating_mean` and `norm_rating`.
mean, normalized_ratings = normalize()
# Array form used for positional indexing in later cells.
# Column order: userid, itemid, rating, rating_mean, norm_rating.
np_ratings = normalized_ratings.to_numpy()
normalized_ratings.head()

Unnamed: 0,userid,itemid,rating,rating_mean,norm_rating
0,0,0,5,3.610294,1.389706
1,0,1,3,3.610294,-0.610294
2,0,2,4,3.610294,0.389706
3,0,3,3,3.610294,-0.610294
4,0,4,3,3.610294,-0.610294


In [7]:
def movie_matrix(ratings):
    """
    Build the sparse item x user matrix of normalized ratings.

    :param ratings : DataFrame with the columns `itemid`, `userid` and `norm_rating`
    :return : scipy CSR matrix with one row per item and one column per user;
              (item, user) pairs without a rating are stored as 0
    """
    # The string 'sum' is used instead of the Python builtin `sum`: pandas deprecated
    # passing builtin callables as `aggfunc` (FutureWarning since pandas 2.1).
    item_user_table = pd.crosstab(
        ratings.itemid,
        ratings.userid,
        values=ratings.norm_rating,
        aggfunc='sum',
    )
    # Missing (item, user) combinations come back as NaN; treat them as "no rating" = 0
    return csr_matrix(item_user_table.fillna(0).values)

In [8]:
# Sparse item x user matrix of mean-centered ratings; printing it lists the stored
# (item, user) entries with their values.
R = movie_matrix(normalized_ratings)
print(R)

  (0, 0)	1.3897058823529411
  (0, 1)	0.2903225806451615
  (0, 4)	1.125714285714286
  (0, 5)	0.3649289099526065
  (0, 9)	-0.2065217391304346
  (0, 12)	-0.09748427672955984
  (0, 14)	-1.875
  (0, 15)	0.6714285714285717
  (0, 16)	0.9642857142857144
  (0, 17)	1.1191335740072201
  (0, 19)	-0.10416666666666652
  (0, 20)	2.329608938547486
  (0, 22)	1.3642384105960264
  (0, 24)	0.9487179487179489
  (0, 25)	0.05607476635513997
  (0, 37)	1.28099173553719
  (0, 40)	0.23076923076923084
  (0, 41)	1.2732240437158469
  (0, 42)	1.2895927601809953
  (0, 43)	0.3509933774834435
  (0, 44)	1.3958333333333335
  (0, 48)	-0.6837209302325582
  (0, 53)	0.3076923076923075
  (0, 55)	0.35828877005347604
  (0, 56)	1.5
  :	:
  (1661, 781)	1.2112068965517242
  (1662, 781)	-0.7887931034482758
  (1663, 781)	1.2112068965517242
  (1663, 838)	-2.192982456140351
  (1663, 869)	0.5464684014869889
  (1663, 879)	0.5733695652173911
  (1664, 781)	-0.7887931034482758
  (1665, 781)	-0.7887931034482758
  (1666, 781)	0.2112068965517

In [9]:
def itemCF_model(rating_matrix, k=20, metric="cosine"):
    """
    Fit a brute-force nearest-neighbors model on the item representations.

    :param rating_matrix : item representations, one row per item
    :param k : number of neighbors wanted for each item
    :param metric : metric name forwarded to sklearn's NearestNeighbors
    :return knn : the fitted NearestNeighbors model
    """
    # k + 1 neighbors are requested because `NN` discards the first returned column
    knn = NearestNeighbors(n_neighbors=k + 1, metric=metric, algorithm='brute')
    knn.fit(rating_matrix)
    return knn

In [10]:
def NN(rating_matrix, model):
    """
    Compute the nearest neighbors of each item and their similarities.

    :param rating_matrix : items representations
    :param model : fitted nearest neighbors model (exposes `kneighbors`)
    :return similarities, neighbors : both without their first column, which is expected
             to be the queried item itself
    """
    # `kneighbors` returns DISTANCES (smaller = closer), not similarities.
    distances, neighbors = model.kneighbors(rating_matrix)

    # For the cosine metric the distance is 1 - cosine similarity, so convert it back;
    # otherwise the most distant neighbors would receive the largest weights downstream.
    metric = getattr(model, 'effective_metric_', getattr(model, 'metric', None))
    if metric == 'cosine':
        similarities = 1.0 - distances
    else:
        # No generic distance -> similarity conversion exists for other metrics:
        # the raw distances are returned unchanged, as before.
        similarities = distances

    return similarities[:, 1:], neighbors[:, 1:]

In [11]:
def similarities_file(similarities, neighbors, dataset_name):
    """
    Save the similarity and neighbor arrays as .npy files under
    recsys/weights/item2item/<dataset_name>/ (the directory is created if needed).

    :param similarities : array written to similarities.npy
    :param neighbors : array written to neighbors.npy
    :param dataset_name : name of the sub-directory receiving the two files
    """
    target_dir = os.path.join('recsys/weights/item2item', dataset_name)
    os.makedirs(target_dir, exist_ok=True)

    # Written in this order: similarities first, then neighbors
    outputs = (
        (os.path.join(target_dir, 'similarities.npy'), similarities),
        (os.path.join(target_dir, 'neighbors.npy'), neighbors),
    )
    try:
        for path, array in outputs:
            np.save(path, array)
    except ValueError as error:
        # A ValueError is reported on stdout, not raised
        print(f"An error occured when saving similarities, due to : \n ValueError : {error}")

        
def loading_similarity(dataset_name, k=20):
    """
    Load the saved similarity and neighbor arrays of a dataset from
    recsys/weights/item2item/<dataset_name>/ and keep the first k columns of each.

    :param dataset_name : name of the sub-directory holding the .npy files
    :param k : number of leading columns (neighbors per item) to keep
    :return similarities, neighbors : the two arrays truncated to k columns
    """
    weights_dir = os.path.join('recsys/weights/item2item', dataset_name)

    stored_similarities = np.load(os.path.join(weights_dir, 'similarities.npy'))
    stored_neighbors = np.load(os.path.join(weights_dir, 'neighbors.npy'))

    return stored_similarities[:, :k], stored_neighbors[:, :k]


def cosine(x, y):
    """
    Cosine similarity between two vectors.

    :param x : first vector
    :param y : second vector, same length as x
    :return : dot(x, y) / (||x|| * ||y||), or 0.0 when either vector has zero norm
    """
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    # A zero-norm vector would give 0/0 = NaN (plus a RuntimeWarning). NaN sorts after
    # every number, so once rankings are flipped it would end up as the "best" neighbor.
    if denominator == 0:
        return 0.0
    return np.dot(x, y) / denominator


def adjusted_cosine(np_ratings, nb_items, dataset_name):
    """
    Compute the adjusted cosine similarity between every pair of items, rank the neighbors
    of each item by decreasing similarity and save both arrays with `similarities_file`.

    :param np_ratings : 2-D array of ratings; column 0 is read as the user id, column 1 as
        the item id and column 4 as the mean-centered rating
    :param nb_items : side length of the similarity matrix; item ids index directly into it
    :param dataset_name : forwarded to `similarities_file` to choose the output directory
    :return similarities, neighbors : (nb_items, nb_items) arrays; row i of `neighbors` lists
        item ids by decreasing similarity to item i and row i of `similarities` holds the
        matching values. Pairs with fewer than two common raters keep the value -1.
    """
    similarities = np.zeros(shape=(nb_items, nb_items))
    similarities.fill(-1)

    def _progress(count):
        sys.stdout.write('\rComputing similarities. Progress status : %.1f%%' % (float(count / nb_items)*100.0))
        sys.stdout.flush()

    # Items are taken from the `np_ratings` argument itself, not from the module-level
    # `ratings` frame, so the function only depends on what it is given.
    item_column = np_ratings[:, 1]
    items = np.unique(item_column).astype('int')  # sorted, unique item ids

    # Iterate over positions, not item ids, so that each pair is visited exactly once
    # even when the ids are not the contiguous range 0..n-1.
    for position, i in enumerate(items[:-1]):
        rated_i = item_column == i  # invariant over the inner loop
        for j in items[position + 1:]:
            scores = np_ratings[rated_i | (item_column == j), :]

            # Keep only the users appearing more than once, i.e. those who rated both items
            vals, count = np.unique(scores[:, 0], return_counts=True)
            scores = scores[np.isin(scores[:, 0], vals[count > 1]), :]

            # Two rows per common rater: more than 2 rows means at least two common raters
            if scores.shape[0] > 2:
                rows_i = scores[scores[:, 1].astype('int') == i]
                rows_j = scores[scores[:, 1].astype('int') == j]

                # Order both sides by user id so x[n] and y[n] belong to the same user,
                # whatever the row order of `np_ratings`
                x = rows_i[np.argsort(rows_i[:, 0], kind='stable'), 4]
                y = rows_j[np.argsort(rows_j[:, 0], kind='stable'), 4]
                w = cosine(x, y)

                # An undefined similarity (zero-norm vector -> NaN) must not reach the
                # ranking below, where NaN would be placed ahead of every real value
                if not np.isfinite(w):
                    w = 0.0

                similarities[i, j] = w
                similarities[j, i] = w
        _progress(i)
    _progress(nb_items)

    # Sorting neighbors based on their similarities in decreasing order
    neighbors = np.flip(np.argsort(similarities), axis=1)

    # Sorting similarities in the same (decreasing) order
    similarities = np.flip(np.sort(similarities), axis=1)

    # Saving similarities and neighbors
    similarities_file(similarities, neighbors, dataset_name=dataset_name)

    return similarities, neighbors

In [12]:
# The adjusted cosine metric is selected. In that case the similarities and neighbors
# are read from the saved .npy files (recsys/weights/item2item/ml100k) and are not
# recomputed here.

metric = 'adjusted_cosine'

if metric == 'adjusted_cosine':
    similarities, neighbors = loading_similarity('ml100k')
else:
    # NOTE(review): k=21 gives 22 requested neighbors and 21 kept after `NN` drops the first
    # column, whereas `loading_similarity` keeps 20 by default — confirm this is intended.
    model = itemCF_model(R, k=21, metric=metric)
    similarities, neighbors = NN(R, model)

In [13]:
def candidate_movies(userid):
    """
    Build the set of candidate items for a user: every neighbor of an item the user
    has rated, minus the items the user has already rated.

    :param userid : user id for which we wish to find candidate items
    :return : I_u, candidates
    """
    # Items already rated by the user (column 0 = user id, column 1 = item id)
    rated_rows = np_ratings[np_ratings[:, 0] == userid]
    I_u = rated_rows[:, 1].astype('int')

    # Union of the neighborhoods of all rated items
    pool = set()
    for rated_item in I_u:
        pool.update(neighbors[rated_item])

    # Remove the items the user already knows; both inputs hold unique values
    candidates = np.setdiff1d(list(pool), I_u, assume_unique=True)

    return I_u, candidates

In [14]:
# Map raw user id 1 to its encoded id, then collect that user's rated items and candidates.
test_user = uencoder.transform([1])[0]
i_u, u_candidates = candidate_movies(test_user)

In [15]:
# Size of the rated-item set and of the candidate set for the test user.
print('number of items purchased by user 1 : ', len(i_u))
print('number of candidate items for user 1 : ', len(u_candidates))

number of items purchased by user 1 :  272
number of candidate items for user 1 :  893


In [16]:
def similarity_Iu(c, I_u):
    """
    Similarity between a candidate item c and the set of items I_u: the sum, over the
    items i of I_u that have c among their neighbors, of the similarity between i and c.

    :param c : itemid of a candidate item
    :param I_u : set of items already purchased by a given user
    :return w : similarity between c and I_u (0 when c is a neighbor of no item of I_u)
    """
    total = 0
    for rated_item in I_u:
        # Positions at which c appears in the neighborhood of the rated item
        is_c = neighbors[rated_item] == c
        if is_c.any():
            total = total + similarities[rated_item][is_c][0]
    return total

In [17]:
def candidateRanking(candidates, I_u):
    """
    Rank candidate items by their similarity with the items of I_u.

    :param candidates : list of candidate items (encoded ids)
    :param I_u : list of items purchased by the user
    :return ranked_candidates : list of (raw item id, similarity) pairs sorted by
            decreasing similarity with I_u
    """
    # Score every candidate against the user's items
    scores = [similarity_Iu(candidate, I_u) for candidate in candidates]

    # Pair each score with the raw (decoded) item id
    raw_ids = iencoder.inverse_transform(candidates)
    ranked_candidates = list(zip(raw_ids, scores))

    # Highest similarity first
    ranked_candidates.sort(key=lambda pair: pair[1], reverse=True)
    return ranked_candidates

In [18]:
def topRecommendation(userid, N=40):
    """
    Produce the top-N recommendation list of a given user.

    :param userid : user for which we produce top-N recommendation
    :param N : length of the top-N recommendation list
    :return topn : DataFrame holding the best-ranked candidates (`itemid`, `similarity_Iu`)
            joined with the `movies` table on `itemid`
    """
    # Candidate items of the user, ranked by similarity with what the user already rated
    rated_items, candidate_items = candidate_movies(userid)
    ranking = candidateRanking(candidate_items, rated_items)

    # Keep the N best candidates and attach the movie information
    best = pd.DataFrame(ranking[:N], columns=['itemid','similarity_Iu'])
    return best.merge(movies, on='itemid', how='inner')

In [19]:
# Display the recommendation list of the test user (default N=40).
topRecommendation(test_user)

Unnamed: 0,itemid,similarity_Iu,title
0,1356,52.867173,Ed's Next Move (1996)
1,1189,50.362199,Prefontaine (1997)
2,1516,31.133267,"Wedding Gift, The (1994)"
3,1550,31.031738,Destiny Turns on the Radio (1995)
4,1554,27.364494,Safe Passage (1994)
5,1600,27.287712,Guantanamera (1994)
6,1223,26.63185,King of the Hill (1993)
7,1388,26.624397,Gabbeh (1996)
8,766,26.590175,Man of the Year (1995)
9,691,26.461802,Dark City (1998)


In [20]:
def predict(userid, itemid):
    """
    Predict the rating of a user on an item as the similarity-weighted average of the
    ratings the user gave to the neighbors of that item.

    :param userid : id of the active user
    :param itemid : id of the item for which we are making prediction
    :return r_hat : predicted rating; the user's mean rating when the weighted sum or the
            sum of absolute weights is zero
    """
    # Neighborhood of the target item and the matching similarities
    neighbor_ids = neighbors[itemid]
    neighbor_sims = similarities[itemid]

    # All ratings of the active user (column 0 = user id)
    user_rows = np_ratings[np_ratings[:, 0].astype('int') == userid]

    # Among them, the ratings given to neighbors of the target item
    rated_neighbors = user_rows[np.isin(user_rows[:, 1], neighbor_ids)]
    scores = rated_neighbors[:, 2]

    # Position of each rated neighbor inside the neighborhood, to pick its similarity
    positions = [
        np.flatnonzero(neighbor_ids == iid)[0]
        for iid in rated_neighbors[:, 1].astype('int')
    ]
    weights = neighbor_sims[positions]

    numerator = np.dot(scores, weights)
    denominator = np.sum(np.abs(weights))

    if numerator != 0 and denominator != 0:
        return numerator / denominator

    # Nothing usable in the neighborhood: fall back to the user's mean rating
    return mean[userid]

In [21]:
def topNPrediction(userid):
    """
    Predict a rating for every item of the user's top-N recommendation list.

    :param userid : id of the active user
    :return topn, topn_predict : the recommendation list, and the same list extended with
            a `prediction` column and sorted by decreasing prediction
    """
    # Top-N recommendation for the active user
    topn = topRecommendation(userid)

    # One (itemid, predicted rating) row per recommended movie
    predictions = pd.DataFrame(
        [(itemid, predict(userid, itemid)) for itemid in topn.itemid.to_list()],
        columns=['itemid','prediction'],
    )

    # Attach the predictions to the recommendation list, best prediction first
    merged = topn.merge(predictions, on='itemid', how='inner')
    topn_predict = merged.sort_values(by=['prediction'], ascending=False)

    return topn, topn_predict

In [22]:
# Recommendation list of the test user, and the same list with predicted ratings.
topn, topn_predict = topNPrediction(userid=test_user)

In [23]:
# Recommended items sorted by decreasing predicted rating.
topn_predict

Unnamed: 0,itemid,similarity_Iu,title,prediction
37,1301,19.738174,Stripes (1981),5.0
36,992,19.770985,Head Above Water (1996),5.0
7,1388,26.624397,Gabbeh (1996),4.666667
18,359,22.973658,"Assignment, The (1997)",4.6
4,1554,27.364494,Safe Passage (1994),4.5
14,1538,24.492453,All Over Me (1997),4.5
27,1448,20.846909,My Favorite Season (1993),4.490052
29,1375,20.627152,"Cement Garden, The (1993)",4.333333
26,1466,21.063269,Margaret's Museum (1995),4.271915
39,817,19.377984,Frisk (1995),4.0


In [24]:
from recsys.preprocessing import train_test_split, get_examples


# Examples and labels built from the encoded ratings, with `rating` as the label column.
raw_examples, raw_labels = get_examples(ratings, labels_column='rating')

# training testing split
# NOTE(review): no seed is passed here, so the split may differ between runs — confirm in recsys.preprocessing.
(x_training, x_testing), (y_train, y_testing) = train_test_split(examples=raw_examples, labels=raw_labels)

def evaluate(x_testing, y_testing):
    """
    Compute the mean absolute error (MAE) of `predict` on a test set.

    :param x_testing : array of (userid, itemid) pairs
    :param y_testing : true ratings, aligned with x_testing
    :return mae : mean absolute difference between true and predicted ratings
    """
    n_examples = x_testing.shape[0]
    print('Evaluate the model on {} test data ...'.format(n_examples))

    # One prediction per (user, item) pair, in test-set order
    preds = [predict(u, i) for (u, i) in x_testing]

    absolute_errors = np.absolute(y_testing - np.array(preds))
    mae = np.sum(absolute_errors) / n_examples
    print('\nMAE :', mae)
    return mae

In [25]:
# MAE of the notebook's `predict` function on the held-out examples.
evaluate(x_testing, y_testing)

Evaluate the model on 10000 test data ...

MAE : 0.672389703640273


0.672389703640273

### Summary

In [26]:
from recsys.memories.ItemToItem import ItemToItem
from recsys.preprocessing import ids_encoder, train_test_split, get_examples
from recsys.datasets import ml100k

# Load the MovieLens 100k data again.
# This rebinds the module-level `ratings` and `movies` used by earlier cells.
ratings, movies = ml100k.load()

# Encode the ids; `ratings`, `uencoder` and `iencoder` are rebound as well
ratings, uencoder, iencoder = ids_encoder(ratings)


# Examples and labels, with `rating` as the label column
raw_examples, raw_labels = get_examples(ratings, labels_column='rating')

# Split the examples and labels into training and testing parts
(x_training, x_testing), (y_train, y_testing) = train_test_split(examples=raw_examples, labels=raw_labels)

In [27]:
# Build the packaged item-based CF model: 20 neighbors, metric 'cosine' (not the
# 'adjusted_cosine' selected in the step-by-step cells above).
item2item = ItemToItem(ratings, movies, k=20, metric='cosine', dataset_name='ml100k')

Normalize ratings ...
Create the similarity model ...
Compute nearest neighbors ...
Item to item recommendation model created with success ...


In [28]:
# Evaluate the packaged item-based model on the testing examples (prints and returns the MAE)
item2item.evaluate(x_testing, y_testing)

Evaluate the model on 10000 test data ...

MAE : 0.507794195659005


0.507794195659005

#### Evaluate the Item-based CF on the ML-1M dataset

In [29]:
from recsys.memories.ItemToItem import ItemToItem
from recsys.preprocessing import ids_encoder, train_test_split, get_examples
from recsys.datasets import ml1m

# Load the MovieLens 1M data; this rebinds the module-level `ratings` and `movies`
ratings, movies = ml1m.load()

# Encode the ids; `ratings`, `uencoder` and `iencoder` are rebound as well
ratings, uencoder, iencoder = ids_encoder(ratings)


# Examples and labels, with `rating` as the label column
raw_examples, raw_labels = get_examples(ratings, labels_column='rating')

# Split the examples and labels into training and testing parts
(x_training, x_testing), (y_train, y_testing) = train_test_split(examples=raw_examples, labels=raw_labels)

# Packaged item-based CF model: 20 neighbors, metric 'cosine'
item2item = ItemToItem(ratings, movies, k=20, metric='cosine', dataset_name='ml1m')

# Evaluate the item-based model on the testing examples (prints and returns the MAE)
print("=========================")
item2item.evaluate(x_testing, y_testing)

Download data 100.1%
Successfully downloaded ml-1m.zip 5917549 bytes.
Unzipping the ml-1m.zip zip file ...
Normalize ratings ...
Create the similarity model ...
Compute nearest neighbors ...
Item to item recommendation model created with success ...
Evaluate the model on 100021 test data ...

MAE : 0.42514728655396045


0.42514728655396045