# About

Implementation of **User-Based Collaborative Filtering** that uses a neighborhood strategy to estimate ratings. The similarity measure is naïve: it only considers which titles two users have both watched, not the ratings they gave. However, this is just a demo notebook, and the similarity calculation can be replaced by any other approach.

In [188]:
import itertools
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

## Load the dataset

This notebook assumes you are working with the MovieLens 100k dataset.

In [175]:
# Relative path to the tab-separated ratings file (`u.data`) of the MovieLens 100k dataset.
ratings_filepath = Path("../datasets/ml-100k/u.data")

In [176]:
# The file has no header row, so the column names are supplied explicitly.
ratings_df = pd.read_csv(
    ratings_filepath,
    sep="\t",
    names=["userId", "itemId", "rating", "timestamp"],
)
ratings_df.head()

Unnamed: 0,userId,itemId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


## Turn dataframe into a Pivot Table

Pivot Tables allow us to place users into rows, items into columns and represent ratings as values.

In [177]:
# Users as rows, items as columns, ratings as cell values; unrated pairs stay NaN.
ratings_pt = ratings_df.pivot_table(
    index="userId",
    columns="itemId",
    values="rating",
)
ratings_pt.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


## Finding similar users

To find similar users, we convert the ratings pivot table into binary vectors: 1 if the user rated (watched) the item, 0 otherwise.

In [178]:
# 1 where a rating exists, 0 where the cell is NaN.
has_rating = ratings_pt.notna()
movies_watched_df = has_rating.astype(int)
movies_watched_df.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [179]:
# Pairwise cosine similarity between the users' binary "watched" vectors (the rows of
# movies_watched_df). The result is a square NumPy array with one row/column per user.
similarities = cosine_similarity(movies_watched_df)
similarities

array([[1.        , 0.13860925, 0.06600984, ..., 0.12927192, 0.18418988,
        0.38827462],
       [0.13860925, 1.        , 0.15554275, ..., 0.18953563, 0.17146357,
        0.08818445],
       [0.06600984, 0.15554275, 1.        , ..., 0.11605177, 0.13779456,
        0.02099803],
       ...,
       [0.12927192, 0.18953563, 0.11605177, ..., 1.        , 0.09594782,
        0.08224396],
       [0.18418988, 0.17146357, 0.13779456, ..., 0.09594782, 1.        ,
        0.16492459],
       [0.38827462, 0.08818445, 0.02099803, ..., 0.08224396, 0.16492459,
        1.        ]])

In [180]:
# Wrap the array in a DataFrame. Rows and columns get default 0..n-1 labels, i.e. they are
# positions in the pivot table's row order, not userIds.
similarities = pd.DataFrame(similarities)
similarities.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
0,1.0,0.138609,0.06601,0.086638,0.366679,0.400724,0.437957,0.268391,0.064636,0.344189,...,0.353942,0.145638,0.244238,0.182154,0.210042,0.12993,0.304808,0.129272,0.18419,0.388275
1,0.138609,1.0,0.155543,0.181467,0.048002,0.279778,0.113874,0.099204,0.162459,0.149801,...,0.144418,0.284708,0.351701,0.40161,0.293294,0.199572,0.208719,0.189536,0.171464,0.088184
2,0.06601,0.155543,1.0,0.361111,0.010287,0.093683,0.094903,0.124015,0.058026,0.080257,...,0.030949,0.043581,0.182717,0.129099,0.130946,0.038881,0.197334,0.116052,0.137795,0.020998
3,0.086638,0.181467,0.361111,1.0,0.030861,0.084315,0.122018,0.212598,0.087039,0.060193,...,0.046424,0.032686,0.137038,0.193649,0.137493,0.029161,0.197334,0.174078,0.183726,0.062994
4,0.366679,0.048002,0.010287,0.030861,1.0,0.218569,0.384086,0.226351,0.064466,0.195047,...,0.33238,0.072627,0.095154,0.059761,0.152753,0.075593,0.20462,0.096699,0.144583,0.344095


In [181]:
# Zero the self-similarity diagonal so a user never ranks as their own nearest neighbour.
# This builds a new frame instead of writing through `similarities.values`: that array is
# read-only when pandas Copy-on-Write is active, so `np.fill_diagonal` on it would raise.
# The cell is also idempotent: re-running it gives the same result.
similarities = similarities.mask(np.eye(len(similarities), dtype=bool), 0.0)

In [182]:
def get_similarity(uid_1, uid_2, df, sim):
    """Return the similarity score between two users.

    Parameters
    ----------
    uid_1, uid_2 : hashable
        User ids; each must be a label in ``df.index``.
    df : pandas.DataFrame
        Frame whose row order matches the row/column order of ``sim``.
    sim : pandas.DataFrame or numpy.ndarray
        Square similarity matrix; entry ``[i, j]`` is the similarity between
        the users at positions ``i`` and ``j`` of ``df.index``.

    Returns
    -------
    scalar
        The entry of ``sim`` at (position of ``uid_1``, position of ``uid_2``).

    Raises
    ------
    KeyError
        If either user id is not present in ``df.index``.
    """
    pos_1 = df.index.get_loc(uid_1)
    pos_2 = df.index.get_loc(uid_2)
    # Look up by position. Chained ``sim[pos_1][pos_2]`` on a DataFrame is label-based
    # and column-first, so it returns the transposed entry and breaks as soon as
    # ``sim`` carries labels other than 0..n-1.
    return np.asarray(sim)[pos_1, pos_2]

def get_most_similar_users(uid, df, sim, N):
    """Return the ids of the ``N`` users most similar to ``uid``.

    Parameters
    ----------
    uid : hashable
        Target user id; must be a label in ``df.index``.
    df : pandas.DataFrame
        Frame whose row order matches the row/column order of ``sim``.
    sim : pandas.DataFrame or numpy.ndarray
        Square similarity matrix aligned by position with ``df.index``.
    N : int
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.Index
        User ids taken from ``df.index``, ordered from most to least similar.
        The target user is never included, and ties keep the row order of ``df``.

    Raises
    ------
    KeyError
        If ``uid`` is not present in ``df.index``.
    """
    user_pos = df.index.get_loc(uid)
    # Work purely by position so the result does not depend on how ``sim`` is labelled.
    scores = np.asarray(sim)[user_pos]
    # Negate for a descending sort; "stable" makes tie order deterministic.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the user explicitly instead of relying on the caller having zeroed the diagonal.
    ranked = ranked[ranked != user_pos]
    return df.index[ranked[:N]]

In [183]:
# Sanity check: fetch the 10 most similar users to user 1, then look up the
# similarity score between user 1 and the first user in that list.
uid_1 = 1
user_ids = get_most_similar_users(uid_1, ratings_pt, similarities, 10)
sim = get_similarity(uid_1, user_ids[0], ratings_pt, similarities)

# Displays (target user id, first neighbour's id, their similarity).
uid_1, user_ids[0], sim

(1, 916, 0.5278586163659504)

## Recommendation

In [193]:
def estimate(uid, iid, df, sim, n_neighbors=10):
    """Estimate the rating user ``uid`` would give item ``iid``.

    The estimate is the unweighted mean of the ratings that the user's
    ``n_neighbors`` most similar users gave to the item; neighbours who did
    not rate the item are ignored.

    Parameters
    ----------
    uid : hashable
        Target user id (a label in ``df.index``).
    iid : hashable
        Item id (a label in ``df.columns``).
    df : pandas.DataFrame
        User-by-item ratings table with NaN for missing ratings.
    sim : pandas.DataFrame or numpy.ndarray
        User-user similarity matrix, passed through to ``get_most_similar_users``.
    n_neighbors : int, default 10
        Size of the neighbourhood used for the estimate.

    Returns
    -------
    float
        Mean neighbour rating, or ``0.0`` when no neighbour rated the item.
    """
    neighbor_uids = get_most_similar_users(uid, df, sim, n_neighbors)
    # A single .loc call selects rows and column together; Series.mean skips NaN.
    mean_rating = df.loc[neighbor_uids, iid].mean()
    if pd.isna(mean_rating):
        # No neighbour rated this item (or there are no neighbours at all).
        return 0.0
    return float(mean_rating)

In [194]:
def get_recommendation(uid, df, sim, n_neighbors=10, top_n=10):
    """Recommend the unrated items with the highest estimated rating for ``uid``.

    Each unrated item is scored with the unweighted mean rating given by the
    user's ``n_neighbors`` most similar users (NaN ratings are skipped). Items
    that no neighbour rated score ``0.0``.

    Parameters
    ----------
    uid : hashable
        Target user id (a label in ``df.index``).
    df : pandas.DataFrame
        User-by-item ratings table with NaN for missing ratings.
    sim : pandas.DataFrame or numpy.ndarray
        User-user similarity matrix, passed through to ``get_most_similar_users``.
    n_neighbors : int, default 10
        Size of the neighbourhood used for scoring.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of (item id, float)
        Up to ``top_n`` pairs sorted by estimated rating, highest first.
        Ties keep the column order of ``df``.
    """
    # The neighbourhood depends only on the user, so resolve it once rather than
    # once per candidate item.
    neighbor_uids = get_most_similar_users(uid, df, sim, n_neighbors)

    user_ratings = df.loc[uid]
    not_rated_iids = user_ratings.index[user_ratings.isna().to_numpy()]

    # Column-wise mean over the neighbours' rows scores every candidate in one pass.
    scores = df.loc[neighbor_uids, not_rated_iids].mean(axis=0).fillna(0.0)

    predictions = [(iid, float(score)) for iid, score in scores.items()]
    # list.sort is stable, so equally scored items stay in column order.
    predictions.sort(key=lambda pair: pair[1], reverse=True)
    return predictions[:top_n]

In [195]:
# Item ids (column labels) for which user 1 has no rating.
not_rated_iids = ratings_pt.columns[ratings_pt.loc[1].isna().to_numpy()]
not_rated_iids

Int64Index([ 273,  274,  275,  276,  277,  278,  279,  280,  281,  282,
            ...
            1673, 1674, 1675, 1676, 1677, 1678, 1679, 1680, 1681, 1682],
           dtype='int64', name='itemId', length=1410)

In [196]:
# Recommendations for user 1, returned as (itemId, estimated rating) pairs.
rec = get_recommendation(1, ratings_pt, similarities)
rec

[(285, 5.0),
 (331, 5.0),
 (492, 5.0),
 (508, 5.0),
 (512, 5.0),
 (513, 5.0),
 (518, 5.0),
 (647, 5.0),
 (653, 5.0),
 (654, 5.0)]