In [1]:
# polara
!pip install --upgrade git+https://github.com/evfro/polara.git@develop#egg=polara

# ipypb:
!pip install ipypb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting polara
  Cloning https://github.com/evfro/polara.git (to revision develop) to /tmp/pip-install-xlzcfus6/polara_f46fc0427005444db8595f149c3f299e
  Running command git clone --filter=blob:none --quiet https://github.com/evfro/polara.git /tmp/pip-install-xlzcfus6/polara_f46fc0427005444db8595f149c3f299e
  Running command git checkout -b develop --track origin/develop
  Switched to a new branch 'develop'
  Branch 'develop' set up to track remote branch 'develop' from 'origin'.
  Resolved https://github.com/evfro/polara.git to commit 8e48cfd88e616ca53f8bbda1702a3e2c8abaf38e
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: polara
  Building wheel for polara (setup.py) ... [?25l[?25hdone
  Created wheel for polara: filename=polara-0.7.2.dev0-py3-none-any.whl size=89470 sha256=1adf677c7aa9bb011227e24cb58241ebc68042e3176d21b2f7380510c002535f


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/MyDrive/Skoltech/RecSys/Assignment 1

/content/drive/MyDrive/Skoltech/RecSys/Assignment 1


In [4]:
import numpy as np
import pandas as pd

from polara import get_movielens_data
from polara.preprocessing.dataframes import leave_one_out, reindex
from scipy.sparse import csr_matrix, diags, isspmatrix
from sklearn.metrics.pairwise import cosine_similarity


#from dataprep import transform_indices, verify_time_split, generate_interactions_matrix
from dataprep import transform_indices, verify_time_split
from evaluation import topn_recommendations, model_evaluate, downvote_seen_items

# Task

Implement two variants of user-based KNN for the top-$n$ recommendations task when:
1. similarity matrix is symmetric,
2. similarity matrix is asymmetric.

Recall, there's no reason for implementing row-wise weighting scheme in user-based KNN. So choose the weighting scheme wisely.

 In your experiments:  
- Test your solution against both weak and strong generalization. 
  - In total you'll have 4 different experiments.
- Follow the "most-recent-item" sampling strategy for constructing holdout.
  - Explain potential issues of this scheme in relation to both weak and strong generalization.  
- Report evaluation metrics, compare the models, and analyse the results.  
- Use Movielens-1M data.

**Note**: you can reuse some code from seminars if necessary.

In [5]:
data = get_movielens_data(include_time=True)

In [6]:
data.head()

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


# Weak generalization test

## Preparing data (1 pts)

Your task is
- split data into training and holdout parts
- build a new internal contiguous representation of user and item index based on the training data
- make sure same index is used in the holdout data

In [7]:
def generate_interactions_matrix(data, data_description, rebase_users=False):
    '''
    Convert pandas dataframe with interactions into a sparse matrix.
    Allows reindexing user ids, which help ensure data consistency
    at the scoring stage (assumes user ids are sorted in scoring array).
    '''
    n_users = data_description['n_users']
    n_items = data_description['n_items']
    # get indices of observed data
    user_idx = data[data_description['users']].values
    if rebase_users:
        user_idx, user_index = pd.factorize(user_idx, sort=True)
        n_users = len(user_index)
    item_idx = data[data_description['items']].values
    feedback = data[data_description['feedback']].values
    # construct rating matrix
    return csr_matrix((feedback, (user_idx, item_idx)), shape=(n_users, n_items))

In [8]:
# split most recent holdout item from each user
training_, holdout_ = leave_one_out(
    data,
    target='timestamp',
    sample_top=True,
    random_state=0
)

# check correct time splitting
verify_time_split(training_, holdout_)

In [9]:
holdout_

Unnamed: 0,userid,movieid,rating,timestamp
324593,1922,506,4,978163419
818775,4918,1617,3,968087239
148613,957,1260,4,980731400
778833,4653,1836,3,975535839
525463,3245,3646,3,968308837
...,...,...,...,...
928100,5606,318,5,959194215
18265,141,2203,5,977357662
440014,2690,32,3,973344717
964963,5819,1580,4,957906688


In [10]:
training_

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [11]:
# reindex data to make contiguous index starting from 0 for user and item IDs
training, data_index = transform_indices(training_, 'userid', 'movieid')

# apply new index to the holdout data
holdout = reindex(holdout_, data_index.values(), filter_invalid=True)
holdout = holdout.sort_values('userid')

Filtered 2 invalid observations.


In [12]:
training

Unnamed: 0,userid,movieid,rating,timestamp
0,0,1104,5,978300760
1,0,639,3,978302109
2,0,853,3,978301968
3,0,3175,4,978300275
4,0,2161,5,978824291
...,...,...,...,...
1000204,6039,1019,1,956716541
1000205,6039,1022,5,956704887
1000206,6039,548,5,956704746
1000207,6039,1024,4,956715648


In [13]:
data_index

{'users': Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
             ...
             6031, 6032, 6033, 6034, 6035, 6036, 6037, 6038, 6039, 6040],
            dtype='int64', name='userid', length=6040),
 'items': Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
             ...
             3943, 3944, 3945, 3946, 3947, 3948, 3949, 3950, 3951, 3952],
            dtype='int64', name='movieid', length=3704)}

In [14]:
holdout

Unnamed: 0,userid,movieid,rating,timestamp
25,0,47,5,978824351
87,1,1420,4,978300174
232,2,1900,4,978298504
235,3,2742,4,978294282
258,4,279,2,978246585
...,...,...,...,...
999251,6035,2440,1,956755196
999684,6036,421,3,956801840
999731,6037,1094,5,956717204
999826,6038,1162,4,956758029


- Let's also populate data description dictionary for convenience.
- It allows using uniform names for users and items field.
  - This way the code does't depend on the actual names in you dataset.
  - So later you can easily switch to another dataset without changing the code fo the pipeline.


In [15]:
data_description = dict(
    users = data_index['users'].name,
    items = data_index['items'].name,
    feedback = 'rating',
    n_users = len(data_index['users']),
    n_items = len(data_index['items']),
    test_users = holdout[data_index['users'].name].values
)

As previously, let's also explicitly store our testset (i.e., ratings of test users excluding holdout items).

In [16]:
userid = data_description['users']
seen_idx_mask = training[userid].isin(data_description['test_users'])
testset = training[seen_idx_mask]

## Models implementation

### Symmetric case (5 pts)

- You can consult the code from seminars or implement your own solution as long as it is fast enough.

- Recall that subsampling of the neighborhood not only makes the algorithm run faster, but can also improve the results.  
- **Make sure to implement some kind of neighborhood subsampling.**

In [17]:
def truncate_similarity(similarity, k):
    '''
    For every row in similarity matrix, pick at most k entities
    with the highest similarity scores. Disregard everything else.
    '''
    similarity = similarity.tocsr()
    inds = similarity.indices
    ptrs = similarity.indptr
    data = similarity.data
    new_ptrs = [0]
    new_inds = []
    new_data = []
    for i in range(len(ptrs)-1):
        start, stop = ptrs[i], ptrs[i+1]
        if start < stop:
            data_ = data[start:stop]
            topk = min(len(data_), k)
            idx = np.argpartition(data_, -topk)[-topk:]
            new_data.append(data_[idx])
            new_inds.append(inds[idx+start])
            new_ptrs.append(new_ptrs[-1]+len(idx))
        else:
            new_ptrs.append(new_ptrs[-1])
    new_data = np.concatenate(new_data)
    new_inds = np.concatenate(new_inds)
    truncated = csr_matrix(
        (new_data, new_inds, new_ptrs),
        shape=similarity.shape
    )
    return truncated 

In [18]:
def cosine_similarity_zd(matrix):
    '''Build cosine similarity matrix with zero diagonal.'''
    similarity = cosine_similarity(matrix, dense_output=False)
    similarity.setdiag(0)
    similarity.eliminate_zeros()
    return similarity.tocsr()

In [20]:
def build_uknn_model(config, data, data_description):
    user_item_mtx = generate_interactions_matrix(data, data_description)


    # compute similarity matrix
    user_similarity = cosine_similarity_zd(user_item_mtx)
    #print('user_similarity', user_similarity)
    user_similarity = truncate_similarity(user_similarity, config['n_neighbors']) 
    #print('user_similarity after truncate_similarity', user_similarity)
    weighting = config['weighting']
    weights = None

    if weighting == 'elementwise':
        weights = user_similarity.dot(user_item_mtx.astype('bool')) # BS.T
    return user_item_mtx, user_similarity, weights


def uknn_model_scoring(params, testset, testset_description):
    # implement the scoring function to assign scores
    # to all items for test users
    user_item_mtx, user_similarity, weights = params
    test_users = data_description['test_users']
    user_similarity = user_similarity[test_users, :][:, test_users]
    user_item_mtx = generate_interactions_matrix(
        testset, testset_description, rebase_users=True)
    # write your code for scoring, don't forget to return a dense array
    if weights is None:
      scores = user_similarity.dot(user_item_mtx).A
      return scores

    # if weights is None: #if not weighted:
    #     return user_similarity.dot(user_item_mtx).A
    
    # if isspmatrix(weights):
    #   test_users = testset_description['test_users']
    #   assert len(test_users) == user_item_mtx.shape[0]
    #   test_weights = weights[test_users, :].A
    #   scores = user_similarity.dot(user_item_mtx).A # KA ./ KB
    #   test_scores = np.divide(
    #       scores,
    #       test_weights,
    #       where = (test_weights!=0)
    #     )   
    #   return test_scores
    # normalizer = diags(np.divide(1., weights, where=weights!=0))
    # return ...

In [21]:
n_neighbors = 100

config = {'weighting': None, 'n_neighbors': n_neighbors}

uknn_params = build_uknn_model(config, training, data_description)

In [22]:
uknn_scores = uknn_model_scoring(uknn_params, testset, data_description)

In [23]:
downvote_seen_items(uknn_scores, testset, data_description)

Note: recommending items from user history doesn't make sense.

### Asymmetric case (5 pts)

- Your task here is to implement user-based KNN with asymmetric similarity.

In [27]:
def build_uknn_model_asym(config, data, data_description):
    user_item_mtx = generate_interactions_matrix(data, data_description)
    user_similarity = cosine_similarity_zd(user_item_mtx)
    user_similarity = truncate_similarity(user_similarity, config['n_neighbors']) 
    weighting = config['weighting']
    weights = None
    if weighting == 'columnwise':
      weights = (
            user_similarity
            ._with_data(np.abs(user_similarity.data))
            .sum(axis=0)
            .A.squeeze()
        )
    
    return user_item_mtx, user_similarity, weights


def uknn_model_scoring_asym(params, testset, testset_description):
    user_item_mtx, user_similarity, weights = params
    test_users = data_description['test_users']
    user_similarity = user_similarity[test_users, :][:, test_users]
    #print(user_similarity.shape)
    user_item_mtx = generate_interactions_matrix(
        testset, testset_description, rebase_users=True)
   # print(user_item_mtx.shape)
    if isinstance(weights, np.ndarray):
      weights = weights[test_users]
      normalizer = diags(np.divide(1., weights, where=weights!=0))
     # print(normalizer.shape)
      return user_similarity.dot(normalizer.dot(user_item_mtx)).A

In [25]:
config = {'weighting': 'columnwise', 'n_neighbors': n_neighbors}
uknn_params_asym = build_uknn_model_asym(config, training, data_description)

In [28]:
uknn_scores_asym = uknn_model_scoring_asym(uknn_params_asym, testset, data_description)

In [29]:
downvote_seen_items(uknn_scores_asym, testset, data_description)

 ## Evaluation (1 pts)

#### Generate top-$n$ recommendations for both models

In [30]:
uknn_recs = topn_recommendations(uknn_scores)

In [31]:
uknn_recs

array([[ 354, 1106, 1900, ..., 1915, 1897, 1906],
       [ 253,  513, 1449, ..., 2556, 1476, 1563],
       [2373,  575, 1848, ...,  578,  106, 1820],
       ...,
       [1107, 2650,  708, ..., 1106, 2784, 2202],
       [ 849,  908,  847, ...,  837,  580,  593],
       [1131, 1288, 1175, ..., 1143, 1114, 1773]])

In [32]:
uknn_recs.shape

(6038, 10)

In [33]:
uknn_recs_asym = topn_recommendations(uknn_scores_asym)

In [34]:
uknn_recs_asym

array([[1106, 1900,  354, ...,  309, 1915, 1899],
       [ 513,  253, 1449, ...,  593,  802, 1025],
       [2373, 1050, 2707, ..., 2425, 2709,  575],
       ...,
       [ 708,    0, 1107, ...,  309,   33, 1139],
       [ 849,  580, 2845, ...,  859, 1186, 1155],
       [1883, 1107, 1131, ..., 1143, 1114, 1212]])

### Calculate metrics

In [35]:
modes = ['symmetric', 'asymmetric']
#modes = ['symmetric']
uknn_recs = dict(zip(modes, [uknn_recs, uknn_recs_asym]))
#uknn_recs = dict(zip(modes, [uknn_recs]))

uknn_metrics = {}
for mode, recs in uknn_recs.items():
    if recs is None: continue
    uknn_metrics[mode] = metrics = model_evaluate(recs, holdout, data_description)
    print(
        f'Similarity type: {mode}\n'\
        'HR={:.3}, MRR={:.3}, COV={:.3}\n'.format(*metrics)
    )

Similarity type: symmetric
HR=0.085, MRR=0.0286, COV=0.176

Similarity type: asymmetric
HR=0.0886, MRR=0.0308, COV=0.275



# Strong generalization test

- Recall that in the strong generalization test you work with the warm-start scenario.
- It means that the set of test users is disjoint from the set of users in the training.
- You're provided with the basic functions to help you perform correct splitting, but there're still a few places where your input is required. Make sure you understand the logic of data splitting in this scenario.

## Preparing data (2 pts)

- Your task is to select a subset of users who have the most recent interactions in their history across entire dataset.
- You will apply holdout splitting to only this subset.
  - Think, why simply taking all users (as in weak generalization test) makes no sense in this scenario. 

In [36]:
def split_by_time(data, time_q=0.95, timeid='timestamp'):
    '''
    Split the input `data` DataFrame into two parts based on the timestamp, with the split point
    being determined by the quantile value `time_q`. The function returns a tuple `(before, after)`
    containing the two DataFrames. The `after` DataFrame contains the rows with timestamps greater
    than or equal to the split point, while the `before` DataFrame contains the remaining rows. 

    Details:
    The `quantile` method of the pandas DataFrame is used to calculate the time point (i.e., timestamp)
    that divides the data into two parts based on the given quantile value `time_q`. Specifically,
    the time point `split_timepoint` is calculated as the `time_q`th quantile of the values in the `timeid`
    column of the `data` DataFrame, using the interpolation method of `nearest`. This means that
    `split_timepoint` is the timestamp at or immediately after which `time_q` percent of the data points occur.    
    '''
    split_timepoint = data[timeid].quantile(q=time_q, interpolation='nearest')
    after = data.query(f'{timeid} >= @split_timepoint') 
    before = data.drop(after.index)
    return before, after

Firstly, you need to select a candidate subset of observations, from which you'll construct the the training, testset, and holdout datssets. Check the `split_by_time` function below and its description in the above cell.

In [37]:
before, after = split_by_time(data, time_q=0.95)

In [38]:
before

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [39]:
after

Unnamed: 0,userid,movieid,rating,timestamp
2327,19,318,4,994556598
2492,19,1234,5,994556636
2503,20,1694,3,1009669071
2504,20,2641,4,1009669115
2507,20,3527,4,1009669181
...,...,...,...,...
1000126,6040,1333,4,997454140
1000162,6040,3362,4,997453796
1000167,6040,161,3,997454486
1000169,6040,2725,4,997454180


- Now it's time to perform holdout sampling based on the obtained timepoint splitting. 
- Remember, you only sample from the test users.

In [40]:
testset_part_, holdout_ = leave_one_out(
    after,
    target='timestamp',
    sample_top=True,
    random_state=0
) # your code for holdout sampling

In [41]:
one_rating_only = np.setdiff1d(holdout_['userid'].unique(), testset_part_['userid'].unique())

In [42]:
one_rating_only

array([  80,  228,  351,  829,  838,  996, 1139, 1519, 1716, 2330, 2350,
       2390, 2505, 2535, 2593, 2753, 2896, 3072, 3096, 3150, 3233, 3279,
       3430, 3492, 3916, 4030, 4079, 4585, 4805, 5118, 5173, 5202, 5367,
       5923])

In [43]:
holdout_ = holdout_[~holdout_['userid'].isin(one_rating_only)]

In [44]:
# verify correctness of time-based splitting,
# i.e., for each test user, the holdout contains only future interactions w.r.t to testset
# and drop only one rating user's from the holdout
verify_time_split(testset_part_, holdout_)

In [45]:
training_ = before[~before['userid'].isin(after['userid'].unique())] # recall that training and testset must be disjoint by users

- Note that `testset_part_` only contains interactions of the test users **after the timepoint**.
- You need to combine it with the remaining histories of these users.

In [46]:
testset_part_.shape

(49198, 4)

In [47]:
[before[before['userid'].isin(testset_part_.userid.unique())]]

[         userid  movieid  rating  timestamp
 2245         19     2987       4  978555881
 2246         19     2989       4  978147099
 2247         19     3421       3  983074250
 2248         19     2622       5  978300144
 2249         19      648       3  978147357
 ...         ...      ...     ...        ...
 1000204    6040     1091       1  956716541
 1000205    6040     1094       5  956704887
 1000206    6040      562       5  956704746
 1000207    6040     1096       4  956715648
 1000208    6040     1097       4  956715569
 
 [218079 rows x 4 columns]]

In [48]:
np.array(before[before['userid'].isin(testset_part_.userid.unique())]).shape

(218079, 4)

In [49]:
# combine all test users data into a single `testset_` Dataframe.
testset_ = pd.concat(
    [before[before['userid'].isin(testset_part_['userid'].unique())], testset_part_],
    axis = 0,
    ignore_index=False
)

In [50]:
testset_.shape

(267277, 4)

### Building internal representation of user and item index

Use the `transform_indices` function for building a contiguous index starting from 0.

In [51]:
training, data_index = transform_indices(training_, 'userid', 'movieid')

- Before applying new index to the test data:
  - note that the users in the `testset` must be the same as the users in the `holdout`.
- Below is the corresponding function `align_test_by_users` that ensures these two datasets' alignment.

In [52]:
def align_test_by_users(testset, holdout):
    test_users = np.intersect1d(holdout['userid'].values, testset['userid'].values)
    # only allow the same users to be present in both datasets
    testset = testset.query('userid in @test_users').sort_values('userid')
    holdout = holdout.query('userid in @test_users').sort_values('userid')
    return testset, holdout

Let's apply new item index to test data and finalize the test split:

In [53]:
holdout = reindex(holdout_, data_index['items'], filter_invalid=True)
testset = reindex(testset_, data_index['items'], filter_invalid=True)

testset, holdout = align_test_by_users(testset, holdout)

Filtered 5 invalid observations.
Filtered 109 invalid observations.


- Think why we do not apply new index to users here.

## Models implementation

- In this section you'll need to implement user-based KNN models for the warm-start scenario.
- Think carefully which data must be generated at the build time and which data must be generated in the scoring function.

In [54]:
training_description = dict(
    users = data_index['users'].name,
    items = data_index['items'].name,
    feedback = 'rating',
    n_users = len(data_index['users']),
    n_items = len(data_index['items'])
)

test_description = dict(
    users = data_index['users'].name,
    items = data_index['items'].name,
    feedback = 'rating',
    n_users = testset.userid.nunique(),
    n_items = len(data_index['items'])
)

### Symmetric case (5 pts)

In [55]:
from scipy.sparse import csr_matrix, diags, hstack, vstack

In [89]:
def build_uknn_model(config, data, data_description):
    user_item_mtx = generate_interactions_matrix(data, data_description)

    return user_item_mtx


def uknn_model_scoring(params, testset, testset_description, config):
    user_item_mtx = params
    
    user_item_mtx_for_test = generate_interactions_matrix(testset, testset_description, rebase_users = True)
    user_item_mtx = vstack([user_item_mtx, user_item_mtx_for_test])
    
    user_similarity = cosine_similarity(user_item_mtx, dense_output=False)
    user_similarity.setdiag(0)
    user_similarity.eliminate_zeros()
    
    user_similarity = user_similarity[-testset_description['n_users']:]
    user_similarity = truncate_similarity(user_similarity, config['n_neighbors'])
    scores = user_similarity.dot(user_item_mtx).A
    return scores

In [91]:
config = {'weighting': None, 'n_neighbors': n_neighbors}
uknn_params =  build_uknn_model(config, training, training_description)

In [92]:
uknn_scores = uknn_model_scoring(uknn_params, testset, test_description, config)

In [93]:
downvote_seen_items(uknn_scores, testset, test_description)

In [104]:
# def build_uknn_model(config, data, data_description):
#   user_item_mtx = generate_interactions_matrix(data, data_description, rebase_users=True)
#   user_similarity = cosine_similarity_zd(user_item_mtx)
#   user_similarity = truncate_similarity(
#       user_similarity, 
#       config['n_neighbors'])  
#   weights = user_similarity.dot(user_item_mtx.astype('bool')) 

#   return user_similarity, weights   

# def uknn_model_scoring(params, testset, testset_description):
#   user_similarity, weights = params
#   user_item_mtx = generate_interactions_matrix(
#       testset, testset_description, rebase_users=True)
#   test_users = testset_description['test_users']

#   test_weights = weights[test_users, :]
#   if weights is None:
#       scores = user_similarity.dot(user_item_mtx).A

#   scores = user_item_mtx.T.dot(user_similarity.A)
#   test_scores = np.divide(
#           scores.T,
#           test_weights,
#           where = (test_weights!=0)
#         )  
#   return test_scores


# def build_uknn_model(config, data, data_description):
#     user_item_mtx = generate_interactions_matrix(data, data_description, rebase_users=True)
    
#     user_similarity = cosine_similarity_zd(user_item_mtx)
#     user_similarity = truncate_similarity(user_similarity, config['n_neighbors'])     
    
#     weighting = config['weighting']
#     weights = None
#     if weighting == 'elementwise':
#       weights = user_similarity.dot(user_item_mtx.astype('bool'))
    
#     return user_item_mtx, user_similarity, weights


# def uknn_model_scoring(params, testset, testset_description, config):
#     user_item_mtx, user_similarity, weights = params
#     # print('user_item_mtx', user_item_mtx.shape)
#     # print('user_similarity', user_similarity.shape)
#     user_item_mtx_for_test = generate_interactions_matrix(testset, testset_description, rebase_users=True)
#     # print('user_item_mtx_for_test', user_item_mtx_for_test.shape)
#     user_item_mtx = vstack([user_item_mtx, user_item_mtx_for_test])
#     # print('user_item_mtx', user_item_mtx.shape)
#     # print(-testset_description['n_users'])
#     # print(-len(testset.userid.unique()))

#     user_similarity = cosine_similarity_zd(user_item_mtx)
#     user_similarity = truncate_similarity(user_similarity, config['n_neighbors'])


#     if weights is None:
#       scores = user_similarity.dot(user_item_mtx).A
#       # print(scores)
#       # return scores[-testset_description['n_users']:] 
#       return scores[-len(testset.userid.unique()):]

# uknn_params =  build_uknn_model({'weighting': None, 'n_neighbors': n_neighbors}, training, training_description)
# uknn_scores = uknn_model_scoring(uknn_params, testset, test_description, {'weighting': None, 'n_neighbors': n_neighbors})
# print(uknn_scores.shape)
# downvote_seen_items(uknn_scores, testset, test_description)

### Asymmetric case (5 pts)

In [98]:
def build_uknn_model_asym(config, data, data_description):
    user_item_mtx = generate_interactions_matrix(data, data_description, rebase_users=True)
    # user_similarity = cosine_similarity_zd(user_item_mtx)
    # user_similarity = truncate_similarity(user_similarity, config['n_neighbors']) 
    # weighting = config['weighting']
    # weights = None
    # if weighting == 'columnwise':
    #   weights = (
    #         user_similarity
    #         ._with_data(np.abs(user_similarity.data))
    #         .sum(axis=1)
    #         .A.squeeze()
    #     )
    
    return user_item_mtx #, user_similarity, weights

def uknn_model_scoring_asym(params, testset, testset_description, config):
    user_item_mtx = params

    user_item_mtx_for_test = generate_interactions_matrix(testset, testset_description, rebase_users=True)
    user_item_mtx = vstack([user_item_mtx, user_item_mtx_for_test])

    user_similarity = cosine_similarity_zd(user_item_mtx)
    user_similarity = truncate_similarity(user_similarity, config['n_neighbors'])
    
    weights = (
            user_similarity
            ._with_data(np.abs(user_similarity.data))
            .sum(axis=0)
            .A.squeeze()
        )
    normalizer = diags(np.divide(1., weights, where=weights!=0))
    # print(normalizer.shape)
    scores = user_similarity.dot(normalizer.dot(user_item_mtx)).A
    return scores[-testset_description['n_users']:] 


In [99]:
config= {'n_neighbors', n_neighbors}
uknn_params_asym = build_uknn_model_asym(config, training, training_description)

In [101]:
uknn_scores_asym = uknn_model_scoring_asym(uknn_params_asym, testset, test_description, {'weighting': None, 'n_neighbors': n_neighbors})

In [102]:
downvote_seen_items(uknn_scores_asym, testset, test_description)

 ## Evaluation (1 pts)

### Generate recommendations for both models

In [103]:
uknn_recs = topn_recommendations(uknn_scores)

In [104]:
uknn_recs

array([[2666, 1040,  286, ..., 3291,  587, 1112],
       [ 252, 1100, 1086, ...,  953, 1087, 3497],
       [1100, 1253, 1158, ..., 1040, 2471,  510],
       ...,
       [ 829, 1088,  825, ..., 1155, 1087, 3379],
       [  49, 2546,  524, ...,  106, 1478, 1887],
       [1111, 1266, 1097, ..., 1123, 1094, 1742]])

In [105]:
uknn_recs.shape

(774, 10)

In [106]:
uknn_recs_asym = topn_recommendations(uknn_scores_asym)

In [107]:
uknn_recs_asym

array([[2666,  723,  286, ...,  524, 1743, 1757],
       [ 252, 1086,  687, ...,  286, 3497, 1249],
       [1253, 1100, 1158, ..., 1040,  308, 2733],
       ...,
       [ 829, 1160, 1088, ..., 2072, 1120, 3379],
       [  49,  524, 3084, ..., 3379,  106, 1887],
       [1851, 1087, 1111, ..., 1094, 1123, 1192]])

### Calculate metrics

In [108]:
modes = ['symmetric', 'asymmetric']
uknn_recs = dict(zip(modes, [uknn_recs, uknn_recs_asym]))


uknn_metrics = {}
for mode, recs in uknn_recs.items():
    if recs is None: continue
    uknn_metrics[mode] = metrics = model_evaluate(recs, holdout, data_description)
    print(
        f'Similarity type: {mode}\n'\
        'HR={:.3}, MRR={:.3}, COV={:.3}\n'.format(*metrics)
    )

Similarity type: symmetric
HR=0.0504, MRR=0.0215, COV=0.111

Similarity type: asymmetric
HR=0.0543, MRR=0.0228, COV=0.148



## Tuning (2 pts)
- Try to find a neighborhood size that gives you better results.
- Perform a simple grid-search experiment and report your findings.

In [109]:
modes = ['symmetric', 'asymmetric']
neigborhood_list = [10, 20, 30, 40, 100, 200] 

for neighbours in neigborhood_list:
  uknn_scores = uknn_model_scoring(uknn_params, testset, test_description, {'weighting': None, 'n_neighbors': neighbours})
  downvote_seen_items(uknn_scores, testset, test_description)
  uknn_recs = topn_recommendations(uknn_scores)

  uknn_scores_asym = uknn_model_scoring_asym(uknn_params_asym, testset, test_description, {'weighting': None, 'n_neighbors': neighbours})
  downvote_seen_items(uknn_scores_asym, testset, test_description)
  uknn_recs_asym = topn_recommendations(uknn_scores_asym)

  uknn_recs = dict(zip(modes, [uknn_recs, uknn_recs_asym]))


  uknn_metrics = {}
  for mode, recs in uknn_recs.items():
      if recs is None: continue
      uknn_metrics[mode] = metrics = model_evaluate(recs, holdout, data_description)
      print(
          f'Similarity type: {mode}, numbers of neighbours : {neighbours}\n'\
          'HR={:.3}, MRR={:.3}, COV={:.3}\n'.format(*metrics)
      )

Similarity type: symmetric, numbers of neighbours : 10
HR=0.0465, MRR=0.0183, COV=0.198

Similarity type: asymmetric, numbers of neighbours : 10
HR=0.0401, MRR=0.0166, COV=0.256

Similarity type: symmetric, numbers of neighbours : 20
HR=0.0517, MRR=0.0172, COV=0.162

Similarity type: asymmetric, numbers of neighbours : 20
HR=0.0504, MRR=0.0161, COV=0.223

Similarity type: symmetric, numbers of neighbours : 30
HR=0.0556, MRR=0.0211, COV=0.149

Similarity type: asymmetric, numbers of neighbours : 30
HR=0.053, MRR=0.0185, COV=0.204

Similarity type: symmetric, numbers of neighbours : 40
HR=0.0517, MRR=0.0213, COV=0.135

Similarity type: asymmetric, numbers of neighbours : 40
HR=0.0517, MRR=0.0188, COV=0.188

Similarity type: symmetric, numbers of neighbours : 100
HR=0.0504, MRR=0.0215, COV=0.111

Similarity type: asymmetric, numbers of neighbours : 100
HR=0.0543, MRR=0.0228, COV=0.148

Similarity type: symmetric, numbers of neighbours : 200
HR=0.0568, MRR=0.0225, COV=0.0923

Similarity ty

For 200 neighbours best HR, fir 200 neighbours best MRR, for 200 neighbours best COV

# Final analysis (3 pts)

1. Provide an analysis on which model performs the best and explain why.
2. Explain the difference in computational complexity of your models. Consider how the training and the recommendation generation differ for different models in terms of
    - the amount of RAM,
    - the amount of disk storage,
    - the load on CPU.
3. How else would you modify the model to improve either the quality of recommendations or computational performance? Describe at least one modification and its envisioned effect.