**Import libraries**

In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm

from scipy.sparse import coo_matrix

from sklearn.preprocessing import minmax_scale

from itertools import groupby

import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ ['MKL_NUM_THREADS'] = '1'

from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import CosineRecommender

**Functions we'll need**

In [2]:
def get_masked(arr, mask):
    """
    Maskes sparse matrix
    """
    return coo_matrix(
        (
            [np.float32(item) for item in arr.data[mask]],
            (arr.row[mask], arr.col[mask])
        ), arr.shape
    )

In [3]:
def get_recs(user, model):
    """
    Returns recomendations for given users by given model
    """
    return {
        user: model.recommend(userid=user, user_items=train_csr, N=50)
        for user in users
    }

In [4]:
def hitrate(k, recs, users):
    """
    Returns hitrate
    """
    hits = 0
    for user in users:
        if recs[user]:
            rec_items, _ = zip(*recs[user])
            hits += len(set(rec_items[:k]).intersection(set(test_csr[user].indices))) > 0
    return hits / len(users) 

In [5]:
def normalize(alg, users):
    """
    Normalizes data in recommendations 'cause metrics for scores are not the same for each model
    """
    for user in users:
        if alg[user]:
            rec_items, rec_us = zip(*alg[user])
            rec_us = list(minmax_scale(list(rec_us)))
            for i in range(len(alg[user])):
                alg[user][i] = (rec_items[i], rec_us[i])
    return alg

**Read data**

In [6]:
data = pd.read_csv('data_ml-1m/ratings.csv')
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
data['userId'].nunique()

6040

**Create sparse matrix**

In [8]:
user_item_matrix = coo_matrix((
    (data["rating"]>=4).astype(np.float32),
    (data["userId"], data["movieId"])))

user_item_matrix.eliminate_zeros()

**Split data onto train and test**

In [9]:
total_len = user_item_matrix.data.size
train_len = int(total_len * 0.8)
all_indices = np.arange(total_len)
np.random.seed(42)
train_indices = np.random.choice(all_indices, train_len, replace = False)
train_mask = np.in1d(all_indices, train_indices)

**Preparing train and test matrices**

In [10]:
train_csr = get_masked(user_item_matrix, train_mask).tocsr()
train = train_csr.T
test_coo = get_masked(user_item_matrix, ~train_mask)
test_csr = test_coo.tocsr()

**Preparing data for testing models**

In [11]:
users = list(set(test_coo.row))

**Let's try to use each algorithm separately at first**

Cosine

In [12]:
cosine = CosineRecommender()

In [13]:
%%time
cosine.fit(train)

HBox(children=(FloatProgress(value=0.0, max=3953.0), HTML(value='')))


Wall time: 174 ms


In [14]:
#getting recommendations
recs_cosine = get_recs(users, cosine)

In [15]:
#hitrate for 50
print('hitrate=50  ', hitrate(50, recs_cosine, users))

hitrate=50   0.9349498327759197


ALS

In [16]:
als = AlternatingLeastSquares(factors = 50, random_state = 42)

In [17]:
%%time
als.fit(train)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


Wall time: 965 ms


In [18]:
#getting recommendations
recs_als = get_recs(users, als)

In [19]:
#hitrate for 50
print('hitrate=50  ', hitrate(50, recs_als, users))

hitrate=50   0.9441471571906355


**Now we need to combine our algorithms**

In [20]:
#at first we need to normalize recommendations
recs_cosine_norm = normalize(recs_cosine, users)
recs_als_norm = normalize(recs_als, users)

In [21]:
user_keys = recs_als.keys()

summed_recs = dict()

for user in user_keys:
    new_rec = recs_cosine_norm[user] + recs_als_norm[user]
    summed_recs[user] = sorted(new_rec, key=lambda pair: -pair[1])[:50]

In [22]:
print('hitrate=50  ', hitrate(50, summed_recs, users))

hitrate=50   0.9359531772575251


In [23]:
#hitrate is better, but let's try to improve it

#in code below we kinda combine weighting and blending
#recommendations given by both algorithms we weight and add to final list of recommendations
#we add others to final list too, but multiply them by coefficient less than 1 
#to underline the fact that we're not sure about them

In [24]:
user_keys = recs_als.keys()

mixed_recs = dict()

i = 0.36 

for user in tqdm(user_keys):
    new_rec = []
    items = set(list(dict(recs_cosine_norm[user]).keys()) + list(dict(recs_als_norm[user]).keys()))
    for item in items:
        if item in dict(recs_cosine_norm[user]) and item in dict(recs_als_norm[user]):
            new_rec.append((item, dict(recs_cosine_norm[user]).get(item) * i + dict(recs_als_norm[user]).get(item) * (1 - i)))
        elif item in dict(recs_cosine_norm[user]):
            new_rec.append((item, dict(recs_cosine_norm[user]).get(item) * (1 - i/2)))
        else:
            new_rec.append((item, dict(recs_als_norm[user]).get(item) * (1 - i/2)))
    mixed_recs[user] = sorted(new_rec, key=lambda pair: -pair[1])[:50]
print('i = ', i, 'res = ', hitrate(50, mixed_recs, users))

100%|██████████| 5980/5980 [00:06<00:00, 925.89it/s]


i =  0.36 res =  0.9464882943143813


In [25]:
print('Mixed: ', hitrate(50, mixed_recs, users))
print('Sum: ', hitrate(50, summed_recs, users))
print('ALS: ', hitrate(50, recs_als, users))
print('Cosine: ', hitrate(50, recs_cosine, users))

Mixed:  0.9464882943143813
Sum:  0.9359531772575251
ALS:  0.9441471571906355
Cosine:  0.9349498327759197
