In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
data = pd.read_csv('data/dataset.csv')
data = data.sort_values(['timestamp'])

In [3]:
train = data[:80000]
test = data[80000:]

In [4]:
train.head()

Unnamed: 0,user_id,item_id,rating,timestamp
217,259,255,4,874724710
83968,259,286,4,874724727
43030,259,298,4,874724754
21399,259,185,4,874724781
82658,259,173,4,874724843


In [5]:
test.head()

Unnamed: 0,user_id,item_id,rating,timestamp
1346,3,245,1,889237247
27978,3,355,3,889237247
1260,3,335,1,889237269
38673,3,322,3,889237269
3761,3,323,2,889237269


In [6]:
user_index = {}
user_list = train['user_id'].unique()

for user_id in user_list:
    user_index[user_id] = len(user_index)

user_number = len(user_index)
user_index

{259: 0,
 851: 1,
 712: 2,
 119: 3,
 640: 4,
 594: 5,
 23: 6,
 276: 7,
 913: 8,
 532: 9,
 821: 10,
 291: 11,
 157: 12,
 817: 13,
 195: 14,
 756: 15,
 893: 16,
 933: 17,
 543: 18,
 506: 19,
 346: 20,
 21: 21,
 130: 22,
 297: 23,
 621: 24,
 1: 25,
 860: 26,
 342: 27,
 76: 28,
 833: 29,
 407: 30,
 941: 31,
 870: 32,
 30: 33,
 397: 34,
 188: 35,
 113: 36,
 793: 37,
 422: 38,
 658: 39,
 275: 40,
 337: 41,
 759: 42,
 703: 43,
 221: 44,
 24: 45,
 279: 46,
 452: 47,
 145: 48,
 468: 49,
 798: 50,
 919: 51,
 268: 52,
 514: 53,
 265: 54,
 633: 55,
 612: 56,
 287: 57,
 597: 58,
 738: 59,
 67: 60,
 118: 61,
 671: 62,
 943: 63,
 490: 64,
 35: 65,
 693: 66,
 517: 67,
 779: 68,
 207: 69,
 233: 70,
 172: 71,
 691: 72,
 152: 73,
 5: 74,
 92: 75,
 593: 76,
 307: 77,
 300: 78,
 398: 79,
 760: 80,
 319: 81,
 837: 82,
 694: 83,
 634: 84,
 63: 85,
 839: 86,
 286: 87,
 684: 88,
 102: 89,
 364: 90,
 477: 91,
 382: 92,
 715: 93,
 912: 94,
 770: 95,
 43: 96,
 472: 97,
 777: 98,
 802: 99,
 661: 100,
 886: 101,
 2

In [7]:
item_index = {}
item_list = train['item_id'].unique()

for item_id in item_list:
    item_index[item_id] = len(item_index)

item_number = len(item_index)
item_index

{255: 0,
 286: 1,
 298: 2,
 185: 3,
 173: 4,
 772: 5,
 108: 6,
 288: 7,
 928: 8,
 117: 9,
 200: 10,
 405: 11,
 1074: 12,
 176: 13,
 357: 14,
 210: 15,
 687: 16,
 696: 17,
 284: 18,
 295: 19,
 544: 20,
 473: 21,
 290: 22,
 147: 23,
 121: 24,
 717: 25,
 220: 26,
 1040: 27,
 510: 28,
 417: 29,
 50: 30,
 731: 31,
 385: 32,
 623: 33,
 97: 34,
 66: 35,
 553: 36,
 96: 37,
 79: 38,
 969: 39,
 181: 40,
 172: 41,
 42: 42,
 421: 43,
 402: 44,
 739: 45,
 234: 46,
 498: 47,
 423: 48,
 174: 49,
 99: 50,
 692: 51,
 392: 52,
 1220: 53,
 812: 54,
 202: 55,
 82: 56,
 61: 57,
 215: 58,
 432: 59,
 195: 60,
 462: 61,
 746: 62,
 69: 63,
 1055: 64,
 88: 65,
 177: 66,
 776: 67,
 4: 68,
 238: 69,
 785: 70,
 1469: 71,
 660: 72,
 419: 73,
 365: 74,
 585: 75,
 1503: 76,
 72: 77,
 560: 78,
 71: 79,
 228: 80,
 622: 81,
 73: 82,
 662: 83,
 141: 84,
 393: 85,
 584: 86,
 716: 87,
 378: 88,
 949: 89,
 732: 90,
 191: 91,
 196: 92,
 83: 93,
 59: 94,
 136: 95,
 230: 96,
 404: 97,
 655: 98,
 1053: 99,
 568: 100,
 729: 101,

In [8]:
rating_matrix = np.empty((user_number, item_number))
rating_matrix.fill(False)
rating_matrix.shape

(752, 1616)

In [9]:
for index, row in tqdm(train.iterrows()):
    rating_matrix[user_index[row['user_id']], item_index[row['item_id']]] = True

rating_matrix

80000it [00:01, 43370.47it/s]


array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
users_rated_sum = np.sum(rating_matrix, axis=1)
users_rated_sum

array([ 46., 173., 162., 165., 108.,  25., 150., 504., 131., 168.,  62.,
       296.,  51.,  36.,  83., 109.,  59., 184., 195., 238., 193., 179.,
       353., 192., 167., 265.,  65., 201.,  82., 267., 216.,  22., 250.,
        43.,  93., 112.,  51.,  55.,  97.,  71.,  95.,  34.,  32.,  47.,
       146.,  68., 353., 206., 313., 143., 239., 217., 328., 192.,  46.,
        58.,  27.,  62.,  41., 112.,  30.,  71., 124., 168.,  57.,  25.,
       154.,   1.,  37., 225., 108.,  27.,  32., 102., 175., 371., 156.,
       112.,  20., 172.,  41.,  13.,  46., 157., 131.,  93.,  57., 252.,
        86., 179.,  20.,  35.,  53., 167.,  53.,  64., 217., 208.,  36.,
        75., 118., 240., 207., 129.,  25., 216.,  66., 153., 114., 109.,
        33.,  68., 168., 148., 237.,  28., 131.,  89.,  32., 166.,  58.,
        58., 259., 147.,  36.,  58.,  67., 279.,  53.,  40., 120.,  27.,
        38.,  73.,  43.,  66.,  44.,  40., 138.,  30.,  61.,  34.,  65.,
        20.,  24.,  81., 233., 208.,  38., 124.,  6

In [11]:
item_popularity = np.sum(rating_matrix.T, axis=1).tolist()
for i in range(len(item_list)):
    item_popularity[i] = (item_list[i], item_popularity[i])

item_popularity.sort(key=lambda x: x[1], reverse=True)
item_popularity

[(50, 474.0),
 (181, 423.0),
 (100, 417.0),
 (294, 396.0),
 (258, 394.0),
 (288, 383.0),
 (1, 374.0),
 (286, 370.0),
 (121, 366.0),
 (174, 341.0),
 (127, 332.0),
 (56, 329.0),
 (7, 328.0),
 (237, 321.0),
 (117, 319.0),
 (300, 315.0),
 (98, 313.0),
 (222, 310.0),
 (172, 300.0),
 (405, 291.0),
 (204, 290.0),
 (79, 275.0),
 (151, 270.0),
 (210, 268.0),
 (69, 267.0),
 (173, 266.0),
 (168, 255.0),
 (257, 252.0),
 (15, 252.0),
 (25, 252.0),
 (118, 252.0),
 (423, 251.0),
 (748, 250.0),
 (9, 250.0),
 (195, 248.0),
 (276, 247.0),
 (96, 240.0),
 (64, 240.0),
 (111, 236.0),
 (318, 236.0),
 (22, 236.0),
 (176, 233.0),
 (183, 232.0),
 (234, 230.0),
 (202, 230.0),
 (742, 228.0),
 (12, 228.0),
 (216, 226.0),
 (28, 225.0),
 (89, 223.0),
 (191, 220.0),
 (275, 219.0),
 (475, 219.0),
 (546, 218.0),
 (328, 215.0),
 (302, 215.0),
 (289, 213.0),
 (135, 212.0),
 (357, 211.0),
 (97, 209.0),
 (269, 209.0),
 (238, 208.0),
 (125, 208.0),
 (82, 206.0),
 (186, 205.0),
 (228, 203.0),
 (196, 203.0),
 (153, 203.0),
 

In [12]:
users_mean_rated = np.mean(rating_matrix, axis=1)[np.newaxis, :].T
users_mean_rated

array([[0.02846535],
       [0.10705446],
       [0.10024752],
       [0.10210396],
       [0.06683168],
       [0.0154703 ],
       [0.09282178],
       [0.31188119],
       [0.08106436],
       [0.1039604 ],
       [0.03836634],
       [0.18316832],
       [0.03155941],
       [0.02227723],
       [0.05136139],
       [0.0674505 ],
       [0.0365099 ],
       [0.11386139],
       [0.12066832],
       [0.14727723],
       [0.11943069],
       [0.11076733],
       [0.21844059],
       [0.11881188],
       [0.10334158],
       [0.16398515],
       [0.04022277],
       [0.12438119],
       [0.05074257],
       [0.16522277],
       [0.13366337],
       [0.01361386],
       [0.15470297],
       [0.02660891],
       [0.0575495 ],
       [0.06930693],
       [0.03155941],
       [0.03403465],
       [0.06002475],
       [0.04393564],
       [0.05878713],
       [0.0210396 ],
       [0.01980198],
       [0.02908416],
       [0.09034653],
       [0.04207921],
       [0.21844059],
       [0.127

In [13]:
jaccard_similarity_matrix = rating_matrix @ rating_matrix.T
diag = jaccard_similarity_matrix.diagonal()
for i in range(len(jaccard_similarity_matrix)):
    for j in range(len(jaccard_similarity_matrix[0])):
        jaccard_similarity_matrix[i, j] /= diag[i] + diag[j] - jaccard_similarity_matrix[i, j]

jaccard_similarity_matrix

array([[1.        , 0.0875    , 0.06535948, ..., 0.11888112, 0.33333333,
        0.11111111],
       [0.0875    , 1.        , 0.08666667, ..., 0.15942029, 0.84615385,
        0.42857143],
       [0.06535948, 0.08666667, 1.        , ..., 0.15942029, 0.        ,
        0.03448276],
       ...,
       [0.11888112, 0.15942029, 0.15942029, ..., 1.        , 0.41176471,
        0.15384615],
       [0.33333333, 0.84615385, 0.        , ..., 0.41176471, 1.        ,
        0.5       ],
       [0.11111111, 0.42857143, 0.03448276, ..., 0.15384615, 0.5       ,
        1.        ]])

In [14]:
def average_precision(actual, recommended, k=30):
    ap_sum = 0
    hits = 0
    for i in range(k):
        product_id = recommended[i] if i < len(recommended) else None
        if product_id is not None and product_id in actual:
            hits += 1
            ap_sum += hits / (i + 1)
    return ap_sum / k


def normalized_average_precision(actual, recommended, k=30):
    actual = set(actual)
    if len(actual) == 0:
        return 0.0

    ap = average_precision(actual, recommended, k=k)
    ap_ideal = average_precision(actual, list(actual)[:k], k=k)
    return ap / ap_ideal

In [15]:
def recommend(user):
    item_scores = []
    
    if user not in user_list:
        return [pair[0] for pair in item_popularity[:30]]
    
    if users_rated_sum[user_index[user]] < 7:
        return [pair[0] for pair in item_popularity[:30]]
    
    for item_id in item_list:
        users_rated = rating_matrix.T[item_index[item_id]][np.newaxis, :].T
        
        if users_rated[user_index[user]] > 0:
            item_scores.append((item_id, 0))
            continue
        
        similarity_coef = np.sum(jaccard_similarity_matrix[user_index[user]] @ users_rated)
        if similarity_coef == 0:
            item_scores.append((item_id, 0))
            continue

        weighted_ratings_sum = jaccard_similarity_matrix[user_index[user]] @ \
                               (users_rated - users_mean_rated)
        
        item_scores.append((item_id, (users_mean_rated[user_index[user]] + weighted_ratings_sum / similarity_coef)[0]))
    
    item_scores.sort(key=lambda x: x[1], reverse=True)
    
    return [pair[0] for pair in item_scores[:30]]

In [16]:
scores = []
for user in tqdm(test['user_id'].unique()):
    actual = list(test[test['user_id'] == user]['item_id'])
    recommended = recommend(user)

    scores.append(normalized_average_precision(actual, recommended))

np.mean(scores)

100%|██████████| 301/301 [00:02<00:00, 132.08it/s]


0.17296968581677968

In [17]:
# Задача: Обучить модель так, чтобы мера была больше 0.1