In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
data = pd.read_csv('data/dataset.csv')
data = data.sort_values(['timestamp'])

In [3]:
train = data[:80000]
test = data[80000:]

In [4]:
train.head()

Unnamed: 0,user_id,item_id,rating,timestamp
217,259,255,4,874724710
83968,259,286,4,874724727
43030,259,298,4,874724754
21399,259,185,4,874724781
82658,259,173,4,874724843


In [5]:
test.head()

Unnamed: 0,user_id,item_id,rating,timestamp
1346,3,245,1,889237247
27978,3,355,3,889237247
1260,3,335,1,889237269
38673,3,322,3,889237269
3761,3,323,2,889237269


In [6]:
user_index = {}
user_list = train['user_id'].unique()

for user_id in user_list:
    user_index[user_id] = len(user_index)

user_number = len(user_index)

In [7]:
item_index = {}
item_list = train['item_id'].unique()

for item_id in item_list:
    item_index[item_id] = len(item_index)

item_number = len(item_index)

In [8]:
rating_matrix = np.empty((user_number, item_number))
rating_matrix.fill(False)
rating_matrix.shape

(752, 1616)

In [9]:
for index, row in tqdm(train.iterrows()):
    rating_matrix[user_index[row['user_id']], item_index[row['item_id']]] = True

rating_matrix

80000it [00:01, 44929.38it/s]


array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
users_rated_sum = np.sum(rating_matrix, axis=1)

In [11]:
item_popularity = np.sum(rating_matrix.T, axis=1).tolist()
for i in range(len(item_list)):
    item_popularity[i] = (item_list[i], item_popularity[i])

item_popularity.sort(key=lambda x: x[1], reverse=True)

In [12]:
users_mean_rated = np.mean(rating_matrix, axis=1)[np.newaxis, :].T

In [13]:
jaccard_similarity_matrix = rating_matrix @ rating_matrix.T
diag = jaccard_similarity_matrix.diagonal()
for i in range(len(jaccard_similarity_matrix)):
    for j in range(len(jaccard_similarity_matrix[0])):
        jaccard_similarity_matrix[i, j] /= diag[i] + diag[j] - jaccard_similarity_matrix[i, j]

jaccard_similarity_matrix

array([[1.        , 0.0875    , 0.06535948, ..., 0.11888112, 0.33333333,
        0.11111111],
       [0.0875    , 1.        , 0.08666667, ..., 0.15942029, 0.84615385,
        0.42857143],
       [0.06535948, 0.08666667, 1.        , ..., 0.15942029, 0.        ,
        0.03448276],
       ...,
       [0.11888112, 0.15942029, 0.15942029, ..., 1.        , 0.41176471,
        0.15384615],
       [0.33333333, 0.84615385, 0.        , ..., 0.41176471, 1.        ,
        0.5       ],
       [0.11111111, 0.42857143, 0.03448276, ..., 0.15384615, 0.5       ,
        1.        ]])

In [14]:
def average_precision(actual, recommended, k=30):
    ap_sum = 0
    hits = 0
    for i in range(k):
        product_id = recommended[i] if i < len(recommended) else None
        if product_id is not None and product_id in actual:
            hits += 1
            ap_sum += hits / (i + 1)
    return ap_sum / k


def normalized_average_precision(actual, recommended, k=30):
    actual = set(actual)
    if len(actual) == 0:
        return 0.0

    ap = average_precision(actual, recommended, k=k)
    ap_ideal = average_precision(actual, list(actual)[:k], k=k)
    return ap / ap_ideal

In [15]:
def recommend(user):
    item_scores = []
    
    if user not in user_list:
        return [pair[0] for pair in item_popularity[:30]]
    
    if users_rated_sum[user_index[user]] < 7:
        return [pair[0] for pair in item_popularity[:30]]
    
    for item_id in item_list:
        users_rated = rating_matrix.T[item_index[item_id]][np.newaxis, :].T
        
        if users_rated[user_index[user]] > 0:
            item_scores.append((item_id, 0))
            continue
        
        similarity_coef = np.sum(jaccard_similarity_matrix[user_index[user]] @ users_rated)
        if similarity_coef == 0:
            item_scores.append((item_id, 0))
            continue

        weighted_ratings_sum = jaccard_similarity_matrix[user_index[user]] @ \
                               (users_rated - users_mean_rated)
        
        item_scores.append((item_id, (users_mean_rated[user_index[user]] + weighted_ratings_sum / similarity_coef)[0]))
    
    item_scores.sort(key=lambda x: x[1], reverse=True)
    
    return [pair[0] for pair in item_scores[:30]]

In [16]:
scores = []
for user in tqdm(test['user_id'].unique()):
    actual = list(test[test['user_id'] == user]['item_id'])
    recommended = recommend(user)

    scores.append(normalized_average_precision(actual, recommended))

np.mean(scores)

100%|██████████| 301/301 [00:02<00:00, 135.98it/s]


0.17296968581677968

In [17]:
# Задача: Обучить модель так, чтобы мера была больше 0.1